In [1]:
import pandas as pd 

In [2]:
with open("ANNEXURE-II-List-of-Courses-_Lab_-FASTTRACK-Fall.csv", "r") as f:
    text = f.readlines()

In [3]:
text = text[4:]
text

['"CSE3003   Computer Networks                      L1+L2     L17+L18   L49+L50   L35+L36   L1+L2     L11+L12"\n',
 '"CSE3003   Computer Networks                      L43+L44   L35+L36   L1+L2     L11+L12   L1+L2     L11+L12"\n',
 '"CSE3003   Computer Networks                      L27+L28   L39+L40   L11+L12   L25+L26   L1+L2     L39+L40"\n',
 '"CSE3003   Computer Networks                      L25+L26   L35+L36   L1+L2     L25+L26   L39+L40   L35+L36"\n',
 '"CSE3003   Computer Networks                      L11+L12   L25+L26   L39+L40   L25+L26   L39+L40   L1+L2"\n',
 '"CSE3003   Computer Networks                      L25+L26   L39+L40   L11+L12"\n',
 '"CSE4004   Web Technologies                       L31+L32   L9+L10    L47+L48   L37+L38   L13+L14   L29+L30"\n',
 '"CSE4004   Web Technologies                       L21+L22   L15+L16   L45+L46   L43+L44   L29+L30   L7+L8"\n',
 '"CSE4002   Mobile Application Development         L41+L42   L27+L28   L5+L6     L31+L32   L3+L4     L41+L42"\n',

In [4]:
text = [ t.replace("\n", "").replace("\"", "") for t in text]
text[:5]

['CSE3003   Computer Networks                      L1+L2     L17+L18   L49+L50   L35+L36   L1+L2     L11+L12',
 'CSE3003   Computer Networks                      L43+L44   L35+L36   L1+L2     L11+L12   L1+L2     L11+L12',
 'CSE3003   Computer Networks                      L27+L28   L39+L40   L11+L12   L25+L26   L1+L2     L39+L40',
 'CSE3003   Computer Networks                      L25+L26   L35+L36   L1+L2     L25+L26   L39+L40   L35+L36',
 'CSE3003   Computer Networks                      L11+L12   L25+L26   L39+L40   L25+L26   L39+L40   L1+L2']

In [5]:
import re 
def is_empty(s: str):
    # check if all the characters are empty using regex
    return re.match(r"^\s*$", s) is not None
df_dict = {
    "course-code": [],
    "course-title": [],
    "course-slots": [],
}
for row in text:
    if row == "": continue
    row = row.split("  ")
    if len(row) < 3: 
        print(row)
        continue
    df_dict["course-code"].append(row[0].strip())
    df_dict["course-title"].append(row[1].strip())
    df_dict["course-slots"].append([ s.strip() for s in row[2:] if not is_empty(s)])

df = pd.DataFrame(df_dict)

In [6]:
df

Unnamed: 0,course-code,course-title,course-slots
0,CSE3003,Computer Networks,"[L1+L2, L17+L18, L49+L50, L35+L36, L1+L2, L11+..."
1,CSE3003,Computer Networks,"[L43+L44, L35+L36, L1+L2, L11+L12, L1+L2, L11+..."
2,CSE3003,Computer Networks,"[L27+L28, L39+L40, L11+L12, L25+L26, L1+L2, L3..."
3,CSE3003,Computer Networks,"[L25+L26, L35+L36, L1+L2, L25+L26, L39+L40, L3..."
4,CSE3003,Computer Networks,"[L11+L12, L25+L26, L39+L40, L25+L26, L39+L40, ..."
5,CSE3003,Computer Networks,"[L25+L26, L39+L40, L11+L12]"
6,CSE4004,Web Technologies,"[L31+L32, L9+L10, L47+L48, L37+L38, L13+L14, L..."
7,CSE4004,Web Technologies,"[L21+L22, L15+L16, L45+L46, L43+L44, L29+L30, ..."
8,CSE4002,Mobile Application Development,"[L41+L42, L27+L28, L5+L6, L31+L32, L3+L4, L41+..."
9,CSE4002,Mobile Application Development,"[L15+L16, L45+L46]"


In [7]:
# print all the courses whose course code aren't in the format of [A-Z]{3}[0-9]{3}
df[~df["course-code"].str.match(r"^[A-Z]{3}[0-9]{4}$")]

Unnamed: 0,course-code,course-title,course-slots


In [8]:
# merge all repeated course-code if matches the course-code 
_df = df.groupby("course-code").agg({ "course-title": "first", "course-slots": "sum" }).reset_index()

In [9]:
# convert the course-slots into a set to remove duplicates
_df["course-slots"] = _df["course-slots"].apply(lambda x: list(set(x)))

In [10]:
# merge course-code and course-title into a single column called course 
_df["course"] = _df["course-code"] + "-" + _df["course-title"] + "( LAB )"

In [11]:
# drop the course-code and course-title columns
_df.drop(columns=["course-code", "course-title"], inplace=True)

In [12]:
_df.head()

Unnamed: 0,course-slots,course
0,[L11+L12],CHY1009-Chemistry and Environmental Studies( L...
1,[L49+L50],CSE1005-Software Engineering( LAB )
2,"[L21+L22, L3+L4, L33+L34, L55+L56, L47+L48, L7...",CSE1006-Foundations for Data Analytics( LAB )
3,"[L27+L28, L49+L50, L17+L18]",CSE1007-Introduction to Cryptography( LAB )
4,"[L31+L32, L5+L6]",CSE1018-Cyber Physical System Security( LAB )


In [13]:
# convert into a dict where course title is key and value is slots 
course_dict = _df.set_index("course").to_dict()["course-slots"]
course_dict

{'CHY1009-Chemistry and Environmental Studies( LAB )': ['L11+L12'],
 'CSE1005-Software Engineering( LAB )': ['L49+L50'],
 'CSE1006-Foundations for Data Analytics( LAB )': ['L21+L22',
  'L3+L4',
  'L33+L34',
  'L55+L56',
  'L47+L48',
  'L7+L8'],
 'CSE1007-Introduction to Cryptography( LAB )': ['L27+L28',
  'L49+L50',
  'L17+L18'],
 'CSE1018-Cyber Physical System Security( LAB )': ['L31+L32', 'L5+L6'],
 'CSE2005-Object Oriented Programming( LAB )': ['L33+L34'],
 'CSE2008-Operating Systems( LAB )': ['L27+L28'],
 'CSE2019-Machine Learning( LAB )': ['L57+L58', 'L33+L34'],
 'CSE3002-Artificial Intelligence( LAB )': ['L45+L46',
  'L49+L50',
  'L17+L18',
  'L15+L16'],
 'CSE3003-Computer Networks( LAB )': ['L1+L2',
  'L49+L50',
  'L11+L12',
  'L35+L36',
  'L39+L40',
  'L25+L26',
  'L17+L18',
  'L27+L28',
  'L43+L44'],
 'CSE3005-Distributed Systems( LAB )': ['L55+L56'],
 'CSE3014-Software Design with UML( LAB )': ['L45+L46', 'L31+L32'],
 'CSE3024-Compiler Design( LAB )': ['L55+L56', 'L49+L50'],


In [14]:
# save this as json 
import json
with open("course_dict.json", "w") as f:
    json.dump(course_dict, f, indent=4)