# Python notebook that is used to get all the default data from the list of courses with department and difficulty

## Import the libraries

In [None]:
%pip install pandas
%pip install numpy
%pip install nltk
%pip install torch
%pip install transformers

In [None]:
import itertools as it
from functools import lru_cache

import pandas as pd
import numpy as np

import nltk

import torch
from transformers import AutoTokenizer, AutoModel

### nltk requires that these modules be downloaded separately

In [None]:
# Fetch the NLTK data packages this notebook needs (stop-word corpus,
# tokenizer models and the WordNet data) before any NLTK helper is used.
for nltk_resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(nltk_resource)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Read the file and store it in a pandas DataFrame

In [None]:
# Map the source column headers onto the snake_case names used by every later cell.
column_names = {
    "Offering Dept": "degree_type",
    "Difficulty": "difficulty",
    "Course Title": "course_name",
}
df = pd.read_json("list of courses with dept and difficulty.json").rename(columns=column_names)

# Lower-case the two text columns so later comparisons and tags are case-insensitive.
for text_column in ("degree_type", "course_name"):
    df[text_column] = df[text_column].str.lower()

df.head()

### Get all the unique degree types and set their index starting from 1

In [None]:
# One row per distinct degree type, in order of first appearance,
# labelled with a 1-based "id" index that is written out as the first CSV column.
unique_degree_types = df["degree_type"].unique()
degree_types = pd.DataFrame(
    {"degree_type": unique_degree_types},
    index=pd.RangeIndex(start=1, stop=len(unique_degree_types) + 1, name="id"),
)
degree_types.to_csv("degree_type.csv")

### Get all the courses and set their index starting from 1

In [None]:
# Keep one row per course name (the first occurrence wins) together with its difficulty.
# The chain returns a new frame instead of calling drop_duplicates(inplace=True) on a
# slice of `df`, which can raise SettingWithCopyWarning; the explicit copy() makes
# `course` fully independent of `df`, so later cells can add columns to it safely.
course = (
    df[["course_name", "difficulty"]]
    .drop_duplicates(subset="course_name")
    .copy()
)

# 1-based ids, named "id" so the index becomes the id column of course.csv.
course.index = pd.RangeIndex(start=1, stop=len(course) + 1, name="id")

Generate an embedding for each course name to make content-based filtering more accurate

In [None]:
# Load the pretrained encoder and its matching tokenizer.
model_name = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

course_names = course["course_name"].tolist()

# Tokenize every course name in a single padded batch of PyTorch tensors.
encoded_input = tokenizer(course_names, padding=True, truncation=True, return_tensors="pt")

# Forward pass without gradient tracking; only the final hidden states are used.
with torch.no_grad():
    model_output = model(**encoded_input)

token_embeddings = model_output.last_hidden_state
attention_mask = encoded_input["attention_mask"]

# Mean pooling: weight each token vector by its attention-mask value, sum over the
# token axis and divide by the mask total. The clamp guards against division by zero.
mask_expanded = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
sum_embeddings = (token_embeddings * mask_expanded).sum(dim=1)
sum_mask = mask_expanded.sum(dim=1).clamp(min=1e-9)
sentence_embeddings = sum_embeddings / sum_mask

print("shape of sentence embeddings:")
print(sentence_embeddings.shape)

print("\nEmbedding for the first sentence (first 10 values):")
print(sentence_embeddings[0][:10])

In [None]:
# Convert the pooled tensor into plain Python lists (one list of floats per course).
embeddings_list = sentence_embeddings.cpu().tolist()

# assign() returns a new frame rather than writing a column into `course` in place,
# which avoids SettingWithCopyWarning when `course` was derived from a slice of `df`
# and keeps this cell safe to re-run.
course = course.assign(embedding=embeddings_list)

course.to_csv("course.csv")

course.head()

### Make a junction table which would store the id of degree_type and course so that they can be stored efficiently

In [None]:
# Course ids keyed by course name. Only the id and the name are needed for the join,
# so the difficulty and (if already computed) the embedding column are dropped here;
# otherwise the embedding would be carried through both merges into the junction CSV.
course_with_id = (
    course.reset_index()
    .rename(columns={"id": "course_id"})
    .drop(columns=["difficulty", "embedding"], errors="ignore")
)

# Degree-type ids keyed by degree-type name.
degree_with_id = degree_types.reset_index().rename(columns={"id": "degree_type_id"})

df_modified = df.drop(columns="difficulty")

# Left merges keep the row order of `course_with_id`, i.e. ascending course_id.
course_merged = pd.merge(course_with_id, df_modified, on="course_name", how="left")
degree_course = pd.merge(course_merged, degree_with_id, on="degree_type", how="left")

# The junction table holds exactly the two foreign keys, one row per distinct pair.
degree_course_modified = degree_course[["course_id", "degree_type_id"]].drop_duplicates()
degree_course_modified.to_csv("degree_course.csv", index=False)

### Process of making tags of each course

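The steps to generate the tags are:

1. combine the degree name and the course name
2. lower-case the combined string (the names are lower-cased when the data is loaded)
3. tokenize the combined string
4. remove non-alphabetic tokens and stop words (the standard English list plus some of our own)
5. lemmatize the remaining words
6. drop words of three letters or fewer, except for a short list of allowed abbreviations
7. remove duplicates and return the list of tags for each course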

In [None]:
@lru_cache(maxsize=1)
def _tag_stop_words() -> frozenset:
    """Return the English stop words plus course-title filler words.

    Cached so the stop-word corpus is read once instead of once per row.
    """
    custom_words = {"principles", "principle", "introduction", "introductory", "intro", "of", "to", "and", "in", "for", "the", "with", "a", "an", "i", "ii"}
    return frozenset(stopwords.words("english")) | custom_words


def generate_tags(row) -> list[str]:
    """Build the list of tags for one course row.

    Args:
        row: mapping-like row (e.g. a DataFrame row) with "degree_type" and
            "course_name" entries.

    Returns:
        The unique tags for the course, in order of first appearance in the
        combined "degree_type course_name" text. The order is deterministic, so
        repeated runs produce the same output.
    """
    # Skip missing (non-string) parts so a NaN name does not break the concatenation.
    text_parts = (row["degree_type"], row["course_name"])
    complete_text = " ".join(part for part in text_parts if isinstance(part, str)).lower()

    # tokenize the complete text
    tokens = word_tokenize(complete_text)

    # keep purely alphabetic tokens that are not stop words
    stop_words = _tag_stop_words()
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Lemmatize the remaining words to their root form
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    # short words that are still meaningful tags and therefore exempt from the length filter
    length_exception = {"art", "ai", "ml", "war", "law", "lab", "jog", "iot"}

    # drop words of three letters or fewer unless they are in the exception list
    limit_length_tokens = [word for word in lemmatized_tokens if ((len(word) > 3) or (word in length_exception))]

    # dict.fromkeys removes duplicates while keeping first-occurrence order
    # (a plain set would make the tag order vary between interpreter runs)
    final_tags = list(dict.fromkeys(limit_length_tokens))

    return final_tags

In [None]:
# One row per course (first occurrence), indexed by the real course_id column so the
# ids never depend on the row order produced by the earlier merges.
course_tags = (
    degree_course.drop_duplicates(subset="course_name")
    .set_index("course_id")
    .apply(generate_tags, axis=1)
    .rename("course_tags")
)
course_tags

## We need to make the tags unique for the tag table

In [None]:
# itertools flattens the per-course tag lists into one stream of strings and the set
# removes duplicates. Sorting the result makes the tag ids reproducible: set iteration
# order for strings can change from one interpreter run to the next.
unique_tag_names = sorted(set(it.chain.from_iterable(course_tags)))
tag = pd.DataFrame(unique_tag_names, columns=["tag_name"])
tag.index = range(1, len(tag) + 1)
tag.index.name = "id"
tag.to_csv("tag.csv")

### Build the course_tag junction table, which links each course id to the ids of its tags

In [None]:
tag_with_id = tag.reset_index().rename(columns={"id": "tag_id"})

course_tags_with_id = course_tags.reset_index()

# One row per (course, tag) pair: explode the tag lists, drop the placeholder rows that
# explode() produces for courses with no tags, then look every tag id up with a single
# merge. This replaces growing the frame with pd.concat inside a nested loop.
course_tag = (
    course_tags_with_id.explode("course_tags")
    .dropna(subset=["course_tags"])
    .rename(columns={"course_tags": "tag_name"})
    .merge(tag_with_id, on="tag_name", how="left")
)

# Every tag should be present in the tag table; fail loudly if one is not.
unknown_tags = course_tag.loc[course_tag["tag_id"].isna(), "tag_name"].unique()
if len(unknown_tags) > 0:
    raise KeyError(f"tags missing from the tag table: {list(unknown_tags)}")

course_tag = course_tag[["course_id", "tag_id"]].astype({"tag_id": "int64"})

course_tag.to_csv("course_tag.csv", index=False)