In [25]:
import numpy as np 
import pandas as pd
from pathlib import Path 
data_dir = csv_path = Path(__vsc_ipynb_file__).resolve().parent / 'data' 
is_lab = False
pdf_path = data_dir/ 'ANNEXURE-II(A) Theory Slots.pdf'

In [2]:
from tabula import io
is_lab: bool = False
dfs = io.read_pdf(pdf_path.resolve(), pages=[i+1 for i in range(9)], multiple_tables=False, guess=False, encoding='cp1252')

JVMNotFoundException: No JVM shared library file (jvm.dll) found. Try setting up the JAVA_HOME environment variable properly.

In [None]:
len(dfs)

1

In [None]:
df = dfs[0]

In [None]:
# get the col where code is chy1001
df[df['COURSE CODE'] == 'CHY1001']

Unnamed: 0,COURSE CODE,TITLE,SLOT
1,CHY1001,Environmental Studies,E1


In [None]:
df.isna().sum()

COURSE CODE    0
TITLE          0
SLOT           0
dtype: int64

In [None]:
# drop all the isna 
df = df.dropna()

In [None]:
# give me the last element
df.iloc[-1, :]

COURSE CODE                    MGT1029
TITLE          Basic Law for Engineers
SLOT                            F1+TF1
Name: 365, dtype: object

In [None]:
# remove all the white space characters inside df
df = df.replace(r'\s+', ' ', regex=True)

In [32]:
# read the ANNEXURE-II_A_-Theory-Slots.csv 
csv_path = data_dir / 'ANNEXURE-II(B) Lab Slots.csv'
with open(csv_path, 'r') as f:
    csv_data = f.read()

# split the csv_data by \n
csv_data = csv_data.split('\n')
parsed_data = []
for line in csv_data:
    parts = line.replace("\"", "").split(" ")
    parts = [part.strip() for part in parts if part != '']
    p1, p2, p3 = parts[0], parts[1:-1], parts[-1]
    parts = [p1, " ".join(p2), p3]
    parsed_data.append(",".join(parts))

# save the file as csv
csv_path = data_dir / 'ANNEXURE-II_A_parsed-Lab-Slots.csv'
with open(csv_path, 'w') as f:
    f.write('\n'.join(parsed_data))

IndexError: list index out of range

In [None]:
df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,STS3006,Basic Competitive Coding - I,G1+TG1
1,CHY1001,Environmental Studies,E1
2,CHY1005,Industrial Chemistry for Engineers,C1+TC1
3,CHY1005,Industrial Chemistry for Engineers,G2+TG2
4,CHY1005,Industrial Chemistry for Engineers,G1+TG1


In [33]:
csv_path = data_dir / 'ANNEXURE-II(B) Lab Slots.csv'
df = pd.read_csv(csv_path.resolve())

In [34]:
# print all the courses whose course code aren't in the format of [A-Z]{3}[0-9]{3}
df[~df["course code".upper()].str.match(r"^[A-Z]{3}[0-9]{4}$")]

Unnamed: 0,COURSE CODE,TITLE,SLOT


In [35]:
# merge all the rows that have COURSE CODE AND SLOT AND COURSE TITLE as SAME
# append all the SLOT values to the SLOT column SLOT col is a list 
_df = df.groupby(["COURSE CODE", "TITLE"])\
        .agg({"SLOT": lambda x: list(x)})\
        .reset_index()

In [36]:
_df.head()

Unnamed: 0,COURSE CODE,TITLE,SLOT
0,CSE1005,Software Engineering,"[L1+L2, L3+L4, L5+L6, L7+L8, L9+L10, L11+L12, ..."
1,CSE1006,Foundations for Data Analytics,"[L1+L2, L5+L6, L27+L28, L31+L32, L47+L48, L57+..."
2,CSE1007,Introduction to Cryptography,"[L37+L38, L7+L8, L51+L52]"
3,CSE1020,Procedural Programming Using C,"[L47+L48, L43+L44, L15+L16]"
4,CSE2006,Computer Graphics,[L9+L10]


In [37]:
# convert the course-slots into a set to remove duplicates and remove nan and the empty slots
_df["SLOT"] = _df["SLOT"].apply(lambda x: set(x) - {np.nan, " "})

In [38]:
_df.isnull().sum()

COURSE CODE    0
TITLE          0
SLOT           0
dtype: int64

In [39]:
# merge course-code and course-title into a single column called course 
_df["course"] = _df["COURSE CODE"] + "-" + _df["TITLE"] + ("( LAB )" if is_lab else "") 

In [40]:
# drop the course-code and course-title columns
_df.drop(columns=["COURSE CODE", "TITLE"], inplace=True)

In [41]:
# convert slot from set to list
_df["SLOT"] = _df["SLOT"].apply(lambda x: list(x))

In [42]:
# convert into a dict where course title is key and value is slots 
course_dict = _df.set_index("course").to_dict()["SLOT"]
course_dict

{'CSE1005-Software Engineering': ['L27+L28',
  'L35+L36',
  'L19+L20',
  'L3+L4',
  'L7+L8',
  'L1+L2',
  'L43+L44',
  'L21+L22',
  'L33+L34',
  'L39+L40',
  'L37+L38',
  'L5+L6',
  'L47+L48',
  'L9+L10',
  'L45+L46',
  'L23+L24',
  'L49+L50',
  'L13+L14',
  'L53+L54',
  'L15+L16',
  'L17+L18',
  'L29+L30',
  'L55+L56',
  'L25+L26',
  'L11+L12',
  'L31+L32',
  'L57+L58',
  'L51+L52',
  'L41+L42'],
 'CSE1006-Foundations for Data Analytics': ['L23+L24',
  'L27+L28',
  'L5+L6',
  'L13+L14',
  'L3+L4',
  'L1+L2',
  'L33+L34',
  'L31+L32',
  'L57+L58',
  'L51+L52',
  'L41+L42',
  'L47+L48'],
 'CSE1007-Introduction to Cryptography': ['L51+L52', 'L7+L8', 'L37+L38'],
 'CSE1020-Procedural Programming Using C': ['L47+L48', 'L43+L44', 'L15+L16'],
 'CSE2006-Computer Graphics': ['L9+L10'],
 'CSE2007-Database Management Systems': ['L27+L28',
  'L35+L36',
  'L19+L20',
  'L3+L4',
  'L7+L8',
  'L1+L2',
  'L43+L44',
  'L21+L22',
  'L33+L34',
  'L39+L40',
  'L37+L38',
  'L5+L6',
  'L47+L48',
  'L9+L10',


In [43]:
# save this as json 
import json
with open("course_dict.json", "w") as f:
    json.dump(course_dict, f, indent=4)