In [17]:
import numpy as np 
import pandas as pd

In [18]:
import tabula
is_lab: bool = True
pdf_path = "./data/ANNEXURE-II(B) Lab Slots.pdf"
dfs = tabula.io.read_pdf(pdf_path, pages=[i+1 for i in range(4)], multiple_tables=False)

In [19]:
len(dfs)

1

In [20]:
df = dfs[0]

In [21]:
# get the col where code is chy1001
df[df['COURSE CODE'] == 'CHY1001']

Unnamed: 0,COURSE CODE,TITLE,SLOT


In [22]:
# give me the last element
df.iloc[-1, :]

COURSE CODE                   MEC3012
TITLE          Additive Manufacturing
SLOT                          L33+L34
Name: 156, dtype: object

In [23]:
# remove all the white space characters inside df
df = df.replace(r'\s+', ' ', regex=True)

In [24]:
df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,CSE2001,Data Structures and Algorithms,L1+L2
1,CSE2001,Data Structures and Algorithms,L3+L4
2,CSE2001,Data Structures and Algorithms,L5+L6
3,CSE2001,Data Structures and Algorithms,L7+L8
4,CSE2001,Data Structures and Algorithms,L9+L10


In [25]:
# print all the courses whose course code aren't in the format of [A-Z]{3}[0-9]{3}
df[~df["course code".upper()].str.match(r"^[A-Z]{3}[0-9]{4}$")]

Unnamed: 0,COURSE CODE,TITLE,SLOT


In [26]:
# merge all the rows that have COURSE CODE AND SLOT AND COURSE TITLE as SAME
# append all the SLOT values to the SLOT column SLOT col is a list 
_df = df.groupby(["COURSE CODE", "TITLE"])\
        .agg({"SLOT": lambda x: list(x)})\
        .reset_index()

In [27]:
_df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,CSE1006,Foundations for Data Analytics,"[L53+L54, L33+L34]"
1,CSE2001,Data Structures and Algorithms,"[L1+L2, L3+L4, L5+L6, L7+L8, L9+L10, L11+L12, ..."
2,CSE2010,Secure Coding,[L9+L10]
3,CSE2011,Cyber Security and Digital Forensics,[L37+L38]
4,CSE3001,Agile Development Process,"[L47+L48, L11+L12]"


In [28]:
_df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,CSE1006,Foundations for Data Analytics,"[L53+L54, L33+L34]"
1,CSE2001,Data Structures and Algorithms,"[L1+L2, L3+L4, L5+L6, L7+L8, L9+L10, L11+L12, ..."
2,CSE2010,Secure Coding,[L9+L10]
3,CSE2011,Cyber Security and Digital Forensics,[L37+L38]
4,CSE3001,Agile Development Process,"[L47+L48, L11+L12]"


In [29]:
# convert the course-slots into a set to remove duplicates and remove nan and the empty slots
_df["SLOT"] = _df["SLOT"].apply(lambda x: set(x) - {np.nan, " "})

In [30]:
_df.isnull().sum()

COURSE CODE    0
TITLE          0
SLOT           0
dtype: int64

In [31]:
# merge course-code and course-title into a single column called course 
_df["course"] = _df["COURSE CODE"] + "-" + _df["TITLE"] + ("( LAB )" if is_lab else "") 

In [32]:
# drop the course-code and course-title columns
_df.drop(columns=["COURSE CODE", "TITLE"], inplace=True)

In [33]:
# convert slot from set to list
_df["SLOT"] = _df["SLOT"].apply(lambda x: list(x))

In [34]:
# convert into a dict where course title is key and value is slots 
course_dict = _df.set_index("course").to_dict()["SLOT"]
course_dict

{'CSE1006-Foundations for Data Analytics( LAB )': ['L33+L34', 'L53+L54'],
 'CSE2001-Data Structures and Algorithms( LAB )': ['L17+L18',
  'L35+L36',
  'L13+L14',
  'L25+L26',
  'L53+L54',
  'L47+L48',
  'L57+L58',
  'L33+L34',
  'L39+L40',
  'L51+L52',
  'L43+L44',
  'L37+L38',
  'L49+L50',
  'L29+L30',
  'L23+L24',
  'L7+L8',
  'L11+L12',
  'L27+L28',
  'L31+L32',
  'L59+L60',
  'L41+L42',
  'L21+L22',
  'L45+L46',
  'L3+L4',
  'L1+L2',
  'L5+L6',
  'L15+L16',
  'L19+L20',
  'L55+L56',
  'L9+L10'],
 'CSE2010-Secure Coding( LAB )': ['L9+L10'],
 'CSE2011-Cyber Security and Digital Forensics( LAB )': ['L37+L38'],
 'CSE3001-Agile Development Process( LAB )': ['L47+L48', 'L11+L12'],
 'CSE3002-Artificial Intelligence( LAB )': ['L15+L16', 'L55+L56'],
 'CSE3003-Computer Networks( LAB )': ['L45+L46'],
 'CSE3004-Design and Analysis of Algorithms( LAB )': ['L39+L40'],
 'CSE3006-Data Visualization( LAB )': ['L39+L40'],
 'CSE3008-Introduction to Machine Learning( LAB )': ['L23+L24', 'L53+L54'],
 '

In [35]:
# save this as json 
import json
with open("course_dict.json", "w") as f:
    json.dump(course_dict, f, indent=4)