## Imports and downloads

In [1]:
%pip install jpype1

Collecting jpype1
  Downloading JPype1-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading JPype1-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.9/488.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: jpype1
Successfully installed jpype1-1.5.0
Note: you may need to restart the kernel to use updated packages.


In [23]:
import numpy as np 
import pandas as pd
from pathlib import Path 
import os 
from functools import partial 

data_dir = csv_path = Path(os.getcwd()) / 'data' 

## Helper Functions

In [22]:
def remove_white_spaces(df):
    # remove all the white space characters inside df
    df = df.replace(r'\s+', ' ', regex=True)
    return df 
    
def merge_slots(df, col_names):
    # merge all the rows that have COURSE CODE AND SLOT AND COURSE TITLE as SAME
    # append all the SLOT values to the SLOT column SLOT col is a list 
    _df = df.groupby([col_names["code"], col_names["title"]])\
        .agg({col_names["slot"]: lambda x: list(x)})\
        .reset_index()
    return _df
    
def filter_course_slots(df, col_names):
    # convert the course-slots into a set to remove duplicates and remove nan and the empty slots
    df[col_names["slot"]] = df[col_names["slot"]].apply(lambda x: set(x) - {np.nan, " "})
    
    # filter out columns with any cell containing '--' or sequences of repeated '-'
    _df = df[~df[col_names["slot"]].astype(str).str.contains('-{2,}')]
    return _df 

def merge_course_code_and_title(df, col_names, is_lab):
    _df = df
    # merge course-code and course-title into a single column called course 
    _df["course"] = _df[col_names["code"]] + "-" + _df[col_names["title"]] + ("( LAB )" if is_lab else "") 
    # drop the course-code and course-title columns
    _df.drop(columns=[col_names["code"], col_names["title"]], inplace=True)
    return df 

def to_json(df, col_names):
    _df = df
    slot_col_name = col_names["slot"]
    # convert slot from set to list
    _df[slot_col_name] = _df[slot_col_name].apply(lambda x: list(x))
    return _df.set_index("course").to_dict()[slot_col_name]

def save_json(df_dict, name, **kwargs):
    import json
    with open(name, "w") as f:
        json.dump(df_dict, f, indent=4)

## Parsing the Lab slots 
- Change the `path`
- Change the `num pages`

Saves the parsed data into `lab.json` in the current working directory.

In [29]:
# read the pdf and convert to dataframe
NUM_PAGES = 8
THEORY_PATH = data_dir / 'ANNEXURE - II(B) LAB SLOTS.pdf'
dfs = io.read_pdf(
    THEORY_PATH.resolve(), 
    pages=[i+1 for i in range(NUM_PAGES)], 
    multiple_tables=False, guess=False, encoding='cp1252'
)
df = dfs[0]

In [None]:

is_lab = True
FILE_NAME = "lab.json"
col_names = {
    "slot" : "LAB SLOT",
    "code" : "COURSE CODE",
    "title": "COURSE TITLE"
}

fns = [
    merge_slots, 
    filter_course_slots,
    partial(merge_course_code_and_title, is_lab=is_lab),
    to_json,
    partial(save_json, name=FILE_NAME)
]

_df = df.copy()
for fn in fns:
    _df = fn(_df, col_names=col_names)

## Parsing the Theory slots 
- Change the `path`
- Change the `num pages`

Saves the parsed data into `lab.json` in the current working directory.

In [32]:
# read the pdf and convert to dataframe
NUM_PAGES = 12
THEORY_PATH = data_dir / 'ANNEXURE - II(A) THEORY SLOTS.pdf'
dfs = io.read_pdf(
    THEORY_PATH.resolve(), 
    pages=[i+1 for i in range(NUM_PAGES)], 
    multiple_tables=False, guess=False, encoding='cp1252'
)
df = dfs[0]

In [33]:
is_lab = False
FILE_NAME = "theory.json"
col_names = {
    "slot" : "SLOT",
    "code" : "COURSE CODE",
    "title": "COURSE TITLE"
}

fns = [
    merge_slots, 
    filter_course_slots,
    partial(merge_course_code_and_title, is_lab=is_lab),
    to_json,
    partial(save_json, name=FILE_NAME)
]

_df = df.copy()
for fn in fns:
    _df = fn(_df, col_names=col_names)

In [None]:

df[df['COURSE CODE'] == 'CHY1001']

## MISC

In [7]:
data_dir = csv_path = Path(os.getcwd()) / 'data' 
is_lab = False
pdf_path = data_dir/ 'ANNEXURE - II THEORY AND LAB SLOTS.pdf'

In [None]:
from tabula import io
is_lab: bool = False
dfs = io.read_pdf(pdf_path.resolve(), pages=[i+1 for i in range(2)], multiple_tables=False, guess=False, encoding='cp1252')

In [4]:
len(dfs)

1

In [6]:
df = dfs[0]

In [7]:
# get the col where code is chy1001
df[df['COURSE CODE'] == 'CHY1001']

Unnamed: 0,COURSE CODE,COURSE TITLE,THEORY SLOT,LAB SLOT
0,CHY1001,Environmental Studies,D,--


In [19]:
df.head()

Unnamed: 0,COURSE CODE,COURSE TITLE,SLOT
0,CSE1005,Software Engineering,L10+L11
1,CSE1005,Software Engineering,L28+L29
2,CSE1005,Software Engineering,L10+L11
3,CSE1005,Software Engineering,L43+L44
4,CSE1005,Software Engineering,L22+L23


In [8]:
df.shape

(56, 4)

In [20]:
df.isna().sum()

COURSE CODE     0
COURSE TITLE    0
SLOT            0
dtype: int64

In [10]:
# drop all the isna 
df = df.dropna()

In [11]:
# give me the last element
df.iloc[-1, :]# give me the last element
df.iloc[-1, :]

COURSE CODE                     MGT1029
COURSE TITLE    Basic Law for Engineers
THEORY SLOT                        D+TD
LAB SLOT                             --
Name: 55, dtype: object

In [34]:
# remove all the white space characters inside df
df = df.replace(r'\s+', ' ', regex=True)

In [12]:
df.head()# merge all the rows that have COURSE CODE AND SLOT AND COURSE TITLE as SAME
# append all the SLOT values to the SLOT column SLOT col is a list 
_df = df.groupby(["COURSE CODE", "TITLE"])\
        .agg({"SLOT": lambda x: list(x)})\
        .reset_index()

KeyError: 'TITLE'

In [21]:
df[~df["LAB SLOT"].astype(str).str.contains('-{2,}')]

KeyError: 'LAB SLOT'

In [15]:
df.columns

Index(['COURSE CODE', 'COURSE TITLE', 'THEORY SLOT', 'LAB SLOT'], dtype='object')

In [14]:
# read the ANNEXURE-II_A_-Theory-Slots.csv 
lab_path = data_dir / 'ANNEXURE - II(B) LAB SLOTS.pdf'
dfs = io.read_pdf(lab_path.resolve(), pages=[i+1 for i in range(8)], multiple_tables=False, guess=False, encoding='cp1252')

In [15]:
df = dfs[0]
df.head()

Unnamed: 0,COURSE CODE,COURSE TITLE,SLOT
0,CSE1005,Software Engineering,L10+L11
1,CSE1005,Software Engineering,L28+L29
2,CSE1005,Software Engineering,L10+L11
3,CSE1005,Software Engineering,L43+L44
4,CSE1005,Software Engineering,L22+L23


In [59]:
from functools import partial 

is_lab = False
col_names = {
    "slot" : "THEORY SLOT",
    "code" : "COURSE CODE",
    "title": "COURSE TITLE"
}

fns = [
    merge_slots, 
    filter_course_slots,
    partial(merge_course_code_and_title, is_lab=is_lab),
    to_json,
    partial(save_json, name="theory.json")
]

_df = df.copy()
for fn in fns:
    _df = fn(_df, col_names=col_names)

In [58]:
# give me the last element
df.iloc[-1, :]

COURSE CODE                                        SWE4003
COURSE TITLE    Concepts of Mobile Application Development
SLOT                                       L31+L32+L43+L44
Name: 55, dtype: object

In [None]:
df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,STS3006,Basic Competitive Coding - I,G1+TG1
1,CHY1001,Environmental Studies,E1
2,CHY1005,Industrial Chemistry for Engineers,C1+TC1
3,CHY1005,Industrial Chemistry for Engineers,G2+TG2
4,CHY1005,Industrial Chemistry for Engineers,G1+TG1


In [33]:
csv_path = data_dir / 'ANNEXURE-II(B) Lab Slots.csv'
df = pd.read_csv(csv_path.resolve())

In [36]:
# print all the courses whose course code aren't in the format of [A-Z]{3}[0-9]{3}
df[~df["course code".upper()].str.match(r"^[A-Z]{3}[0-9]{4}$")]

Unnamed: 0,COURSE CODE,COURSE TITLE,SLOT


In [35]:
# merge all the rows that have COURSE CODE AND SLOT AND COURSE TITLE as SAME
# append all the SLOT values to the SLOT column SLOT col is a list 
_df = df.groupby(["COURSE CODE", "TITLE"])\
        .agg({"SLOT": lambda x: list(x)})\
        .reset_index()

In [36]:
_df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,CSE1005,Software Engineering,"[L1+L2, L3+L4, L5+L6, L7+L8, L9+L10, L11+L12, ..."
1,CSE1006,Foundations for Data Analytics,"[L1+L2, L5+L6, L27+L28, L31+L32, L47+L48, L57+..."
2,CSE1007,Introduction to Cryptography,"[L37+L38, L7+L8, L51+L52]"
3,CSE1020,Procedural Programming Using C,"[L47+L48, L43+L44, L15+L16]"
4,CSE2006,Computer Graphics,[L9+L10]


In [37]:
# convert the course-slots into a set to remove duplicates and remove nan and the empty slots
_df["SLOT"] = _df["SLOT"].apply(lambda x: set(x) - {np.nan, " "})

In [38]:
_df.isnull().sum()

COURSE CODE    0
TITLE          0
SLOT           0
dtype: int64

In [39]:
# merge course-code and course-title into a single column called course 
_df["course"] = _df["COURSE CODE"] + "-" + _df["TITLE"] + ("( LAB )" if is_lab else "") 

In [40]:
# drop the course-code and course-title columns
_df.drop(columns=["COURSE CODE", "TITLE"], inplace=True)

In [41]:
# convert slot from set to list
_df["SLOT"] = _df["SLOT"].apply(lambda x: list(x))

In [42]:
# convert into a dict where course title is key and value is slots 
course_dict = _df.set_index("course").to_dict()["SLOT"]
course_dict

{'CSE1005-Software Engineering': ['L27+L28',
  'L35+L36',
  'L19+L20',
  'L3+L4',
  'L7+L8',
  'L1+L2',
  'L43+L44',
  'L21+L22',
  'L33+L34',
  'L39+L40',
  'L37+L38',
  'L5+L6',
  'L47+L48',
  'L9+L10',
  'L45+L46',
  'L23+L24',
  'L49+L50',
  'L13+L14',
  'L53+L54',
  'L15+L16',
  'L17+L18',
  'L29+L30',
  'L55+L56',
  'L25+L26',
  'L11+L12',
  'L31+L32',
  'L57+L58',
  'L51+L52',
  'L41+L42'],
 'CSE1006-Foundations for Data Analytics': ['L23+L24',
  'L27+L28',
  'L5+L6',
  'L13+L14',
  'L3+L4',
  'L1+L2',
  'L33+L34',
  'L31+L32',
  'L57+L58',
  'L51+L52',
  'L41+L42',
  'L47+L48'],
 'CSE1007-Introduction to Cryptography': ['L51+L52', 'L7+L8', 'L37+L38'],
 'CSE1020-Procedural Programming Using C': ['L47+L48', 'L43+L44', 'L15+L16'],
 'CSE2006-Computer Graphics': ['L9+L10'],
 'CSE2007-Database Management Systems': ['L27+L28',
  'L35+L36',
  'L19+L20',
  'L3+L4',
  'L7+L8',
  'L1+L2',
  'L43+L44',
  'L21+L22',
  'L33+L34',
  'L39+L40',
  'L37+L38',
  'L5+L6',
  'L47+L48',
  'L9+L10',


In [43]:
# save this as json 
import json
with open("course_dict.json", "w") as f:
    json.dump(course_dict, f, indent=4)