In [69]:
import numpy as np 
import pandas as pd

In [41]:
import tabula
is_lab: bool = False
pdf_path = "./data/ANNEXURE-II(A) Theory Slots.pdf"
dfs = tabula.io.read_pdf(pdf_path, pages=[i+1 for i in range(9)], multiple_tables=False)

In [3]:
len(dfs)

1

In [4]:
df = dfs[0]

In [68]:
# get the col where code is chy1001
df[df['COURSE CODE'] == 'CHY1001']

Unnamed: 0,COURSE CODE,TITLE,SLOT
257,CHY1001,Environmental Studies,E1
258,CHY1001,Environmental Studies,


In [5]:
# give me the last element
df.iloc[-1, :]

COURSE CODE                         STS3006
TITLE          Basic Competitive Coding - I
SLOT                                 G1+TG1
Name: 394, dtype: object

In [6]:
# remove all the white space characters inside df
df = df.replace(r'\s+', ' ', regex=True)

In [8]:
df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,CSE2001,Data Structures and Algorithms,A1+TA1+G1
1,CSE2001,Data Structures and Algorithms,A1+TA1+G1
2,CSE2001,Data Structures and Algorithms,A1+TA1+G1
3,CSE2001,Data Structures and Algorithms,A1+TA1+G1
4,CSE2001,Data Structures and Algorithms,A1+TA1+G1


In [10]:
# print all the courses whose course code aren't in the format of [A-Z]{3}[0-9]{3}
df[~df["course code".upper()].str.match(r"^[A-Z]{3}[0-9]{4}$")]

Unnamed: 0,COURSE CODE,TITLE,SLOT


In [66]:
# merge all the rows that have COURSE CODE AND SLOT AND COURSE TITLE as SAME
# append all the SLOT values to the SLOT column SLOT col is a list 
_df = df.groupby(["COURSE CODE", "TITLE"])\
        .agg({"SLOT": lambda x: list(x)})\
        .reset_index()

In [67]:
_df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,CHY1001,Environmental Studies,"[E1, nan]"
1,CHY1005,Industrial Chemistry for Engineers,"[C1+TC1, G2+TG2, G1+TG1]"
2,CHY1006,Corrosion Science and Engineering,"[E2+TE2, F2+TF2, D1+TD1]"
3,CHY1007,Energy Materials and Their Applications,"[B2+TB2, D2+TD2, F1+TF1]"
4,CHY2001,Chemical and Biosensors,[A2+TA2]


In [58]:
_df = _df.drop(columns=[0]) # remove the column that is there because of the groupby

In [47]:
_df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,CHY1001,Environmental Studies,E1
1,CHY1005,Industrial Chemistry for Engineers,C1+TC1
2,CHY1005,Industrial Chemistry for Engineers,G1+TG1
3,CHY1005,Industrial Chemistry for Engineers,G2+TG2
4,CHY1006,Corrosion Science and Engineering,D1+TD1


In [70]:
# convert the course-slots into a set to remove duplicates and remove nan and the empty slots
_df["SLOT"] = _df["SLOT"].apply(lambda x: set(x) - {np.nan, " "})

In [72]:
_df.isnull().sum()

COURSE CODE    0
TITLE          0
SLOT           0
dtype: int64

In [73]:
# merge course-code and course-title into a single column called course 
_df["course"] = _df["COURSE CODE"] + "-" + _df["TITLE"] + ("( LAB )" if is_lab else "") 

In [74]:
# drop the course-code and course-title columns
_df.drop(columns=["COURSE CODE", "TITLE"], inplace=True)

In [78]:
# convert slot from set to list
_df["SLOT"] = _df["SLOT"].apply(lambda x: list(x))

In [79]:
# convert into a dict where course title is key and value is slots 
course_dict = _df.set_index("course").to_dict()["SLOT"]
course_dict

{'CHY1001-Environmental Studies': ['E1'],
 'CHY1005-Industrial Chemistry for Engineers': ['G1+TG1', 'C1+TC1', 'G2+TG2'],
 'CHY1006-Corrosion Science and Engineering': ['E2+TE2', 'D1+TD1', 'F2+TF2'],
 'CHY1007-Energy Materials and Their Applications': ['D2+TD2',
  'B2+TB2',
  'F1+TF1'],
 'CHY2001-Chemical and Biosensors': ['A2+TA2'],
 'CHY2002-Organic Electronics': ['E1+TE1', 'B2+TB2'],
 'CSE1006-Foundations for Data Analytics': ['E1'],
 'CSE1008-Theory of Computation': ['C1+TC1+TCC1', 'D2+TD2+TDD2'],
 'CSE2001-Data Structures and Algorithms': ['B1+TB1+TBB1+TG1',
  'A1+TA1+G1',
  'B2+TB2+TBB2+TG2',
  'A2+TA2+G2'],
 'CSE2009-Soft Computing': ['C1+TC1', 'C2+TC2'],
 'CSE2010-Secure Coding': ['C2+TC2'],
 'CSE2011-Cyber Security and Digital Forensics': ['D2+TD2'],
 'CSE2013-Information Theory and Coding': ['B1+TB1+TBB1'],
 'CSE3001-Agile Development Process': ['A2+TA2', 'F1+TF1'],
 'CSE3002-Artificial Intelligence': ['A1+TA1', 'B2+TB2'],
 'CSE3003-Computer Networks': ['G1+TG1'],
 'CSE3004-De

In [80]:
# save this as json 
import json
with open("course_dict.json", "w") as f:
    json.dump(course_dict, f, indent=4)