In [4]:
%pip install jpype1

Collecting jpype1
  Downloading JPype1-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading JPype1-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.9/488.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: jpype1
Successfully installed jpype1-1.5.0
Note: you may need to restart the kernel to use updated packages.


In [26]:
import numpy as np 
import pandas as pd
from pathlib import Path 
import os 

data_dir = csv_path = Path(os.getcwd()) / 'data' 
is_lab = False
pdf_path = data_dir/ 'ANNEXURE - II(B) THEORY SLOTS.pdf'

In [30]:
from tabula import io
is_lab: bool = False
dfs = io.read_pdf(pdf_path.resolve(), pages=[i+1 for i in range(3)], multiple_tables=False, guess=False, encoding='cp1252')

In [31]:
len(dfs)

1

In [32]:
df = dfs[0]

In [23]:
# get the col where code is chy1001
df[df['COURSE CODE'] == 'CHY1001']

Unnamed: 0,COURSE CODE,COURSE TITLE,SLOT
0,CHY1001,Environmental Studies,A+TA


In [10]:
df.shape

(648, 3)

In [19]:
df.isna().sum()

COURSE CODE     0
COURSE TITLE    0
SLOT            0
dtype: int64

In [7]:
# drop all the isna 
df = df.dropna()

In [33]:
# give me the last element
df.iloc[-1, :]# give me the last element
df.iloc[-1, :]

COURSE CODE             TEC3001
COURSE TITLE    Design Thinking
SLOT                   C+TC+TCC
Name: 118, dtype: object

In [34]:
# remove all the white space characters inside df
df = df.replace(r'\s+', ' ', regex=True)

In [38]:
df.head()# merge all the rows that have COURSE CODE AND SLOT AND COURSE TITLE as SAME
# append all the SLOT values to the SLOT column SLOT col is a list 
_df = df.groupby(["COURSE CODE", "TITLE"])\
        .agg({"SLOT": lambda x: list(x)})\
        .reset_index()

Unnamed: 0,COURSE CODE,COURSE TITLE,SLOT
0,CHY1001,Environmental Studies,A+TA
1,CHY1004,Engineering Chemistry,C+TC
2,CHY1005,Industrial Chemistry for Engineers,A+TA+TAA
3,CHY1006,Corrosion Science and Engineering,C+TC+TCC
4,CHY2001,Chemical and Biosensors,A+TA+TAA


In [49]:
def remove_white_spaces(df):
    # remove all the white space characters inside df
    df = df.replace(r'\s+', ' ', regex=True)
    return df 
    
def merge_slots(df):
    # merge all the rows that have COURSE CODE AND SLOT AND COURSE TITLE as SAME
    # append all the SLOT values to the SLOT column SLOT col is a list 
    _df = df.groupby(["COURSE CODE", "COURSE TITLE"])\
        .agg({"SLOT": lambda x: list(x)})\
        .reset_index()
    return _df
    
def filter_course_slots(df):
    # convert the course-slots into a set to remove duplicates and remove nan and the empty slots
    df["SLOT"] = df["SLOT"].apply(lambda x: set(x) - {np.nan, " "})
    return df 

def merge_course_code_and_title(df, is_lab):
    _df = df
    # merge course-code and course-title into a single column called course 
    _df["course"] = _df["COURSE CODE"] + "-" + _df["COURSE TITLE"] + ("( LAB )" if is_lab else "") 
    # drop the course-code and course-title columns
    _df.drop(columns=["COURSE CODE", "COURSE TITLE"], inplace=True)
    return df 

def to_json(df):
    _df = df
    # convert slot from set to list
    _df["SLOT"] = _df["SLOT"].apply(lambda x: list(x))
    return _df.set_index("course").to_dict()["SLOT"]

def save_json(df_dict, name):
    import json
    with open(name, "w") as f:
        json.dump(df_dict, f, indent=4)

In [None]:
is_lab = False
fns = [
    merge_slots, 
    filter_course_slots,
    lambda df : merge_course_code_and_title(df, is_lab),
    to_json,
    lambda df : save_json(df, "theory.json")
]

_df = df.copy()
for fn in fns:
    _df = fn(_df)

In [56]:
# read the ANNEXURE-II_A_-Theory-Slots.csv 
lab_path = data_dir / 'ANNEXURE - II(A) LAB SLOTS.pdf'
dfs = io.read_pdf(lab_path.resolve(), pages=[i+1 for i in range(2)], multiple_tables=False, guess=False, encoding='cp1252')

In [57]:
df = dfs[0]
df.head()

Unnamed: 0,COURSE CODE,COURSE TITLE,SLOT
0,CHY1004,Engineering Chemistry,L35+L36+L47+L48
1,CSE1007,Introduction to Cryptography,L31+L32+L43+L44
2,CSE1012,Problem Solving using Python,L27+L28+L39+L40
3,CSE1012,Problem Solving using Python,L31+L32+L43+L44
4,CSE2001,Data Structures and Algorithms,L35+L36+L47+L48


In [59]:
is_lab = True
fns = [
    remove_white_spaces,
    merge_slots, 
    filter_course_slots,
    lambda df : merge_course_code_and_title(df, is_lab),
    to_json,
    lambda df : save_json(df, "lab.json")
]

_df = df.copy()
for fn in fns:
    _df = fn(_df)

In [58]:
# give me the last element
df.iloc[-1, :]

COURSE CODE                                        SWE4003
COURSE TITLE    Concepts of Mobile Application Development
SLOT                                       L31+L32+L43+L44
Name: 55, dtype: object

In [None]:
df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,STS3006,Basic Competitive Coding - I,G1+TG1
1,CHY1001,Environmental Studies,E1
2,CHY1005,Industrial Chemistry for Engineers,C1+TC1
3,CHY1005,Industrial Chemistry for Engineers,G2+TG2
4,CHY1005,Industrial Chemistry for Engineers,G1+TG1


In [33]:
csv_path = data_dir / 'ANNEXURE-II(B) Lab Slots.csv'
df = pd.read_csv(csv_path.resolve())

In [36]:
# print all the courses whose course code aren't in the format of [A-Z]{3}[0-9]{3}
df[~df["course code".upper()].str.match(r"^[A-Z]{3}[0-9]{4}$")]

Unnamed: 0,COURSE CODE,COURSE TITLE,SLOT


In [35]:
# merge all the rows that have COURSE CODE AND SLOT AND COURSE TITLE as SAME
# append all the SLOT values to the SLOT column SLOT col is a list 
_df = df.groupby(["COURSE CODE", "TITLE"])\
        .agg({"SLOT": lambda x: list(x)})\
        .reset_index()

In [36]:
_df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,CSE1005,Software Engineering,"[L1+L2, L3+L4, L5+L6, L7+L8, L9+L10, L11+L12, ..."
1,CSE1006,Foundations for Data Analytics,"[L1+L2, L5+L6, L27+L28, L31+L32, L47+L48, L57+..."
2,CSE1007,Introduction to Cryptography,"[L37+L38, L7+L8, L51+L52]"
3,CSE1020,Procedural Programming Using C,"[L47+L48, L43+L44, L15+L16]"
4,CSE2006,Computer Graphics,[L9+L10]


In [37]:
# convert the course-slots into a set to remove duplicates and remove nan and the empty slots
_df["SLOT"] = _df["SLOT"].apply(lambda x: set(x) - {np.nan, " "})

In [38]:
_df.isnull().sum()

COURSE CODE    0
TITLE          0
SLOT           0
dtype: int64

In [39]:
# merge course-code and course-title into a single column called course 
_df["course"] = _df["COURSE CODE"] + "-" + _df["TITLE"] + ("( LAB )" if is_lab else "") 

In [40]:
# drop the course-code and course-title columns
_df.drop(columns=["COURSE CODE", "TITLE"], inplace=True)

In [41]:
# convert slot from set to list
_df["SLOT"] = _df["SLOT"].apply(lambda x: list(x))

In [42]:
# convert into a dict where course title is key and value is slots 
course_dict = _df.set_index("course").to_dict()["SLOT"]
course_dict

{'CSE1005-Software Engineering': ['L27+L28',
  'L35+L36',
  'L19+L20',
  'L3+L4',
  'L7+L8',
  'L1+L2',
  'L43+L44',
  'L21+L22',
  'L33+L34',
  'L39+L40',
  'L37+L38',
  'L5+L6',
  'L47+L48',
  'L9+L10',
  'L45+L46',
  'L23+L24',
  'L49+L50',
  'L13+L14',
  'L53+L54',
  'L15+L16',
  'L17+L18',
  'L29+L30',
  'L55+L56',
  'L25+L26',
  'L11+L12',
  'L31+L32',
  'L57+L58',
  'L51+L52',
  'L41+L42'],
 'CSE1006-Foundations for Data Analytics': ['L23+L24',
  'L27+L28',
  'L5+L6',
  'L13+L14',
  'L3+L4',
  'L1+L2',
  'L33+L34',
  'L31+L32',
  'L57+L58',
  'L51+L52',
  'L41+L42',
  'L47+L48'],
 'CSE1007-Introduction to Cryptography': ['L51+L52', 'L7+L8', 'L37+L38'],
 'CSE1020-Procedural Programming Using C': ['L47+L48', 'L43+L44', 'L15+L16'],
 'CSE2006-Computer Graphics': ['L9+L10'],
 'CSE2007-Database Management Systems': ['L27+L28',
  'L35+L36',
  'L19+L20',
  'L3+L4',
  'L7+L8',
  'L1+L2',
  'L43+L44',
  'L21+L22',
  'L33+L34',
  'L39+L40',
  'L37+L38',
  'L5+L6',
  'L47+L48',
  'L9+L10',


In [43]:
# save this as json 
import json
with open("course_dict.json", "w") as f:
    json.dump(course_dict, f, indent=4)