# Python notebook that is used to get all the default data from the list of courses with department and difficulty

## Import the libraries

In [1]:
%pip install pandas
%pip install numpy
%pip install nltk
%pip install torch
%pip install transformers

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting torch
  Downloading torch-2.8.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting filelock (from torch)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.8.0-cp312-cp312-win_amd64.whl (241.3 MB)
   ---------------------------------------- 0.0/241.3 MB ? eta -:--:--
   ---------------------------------------- 1.6/241.3 MB 14.1 MB/s eta 0

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
accelerate 1.2.1 requires huggingface-hub>=0.21.0, which is not installed.
accelerate 1.2.1 requires safetensors>=0.4.3, which is not installed.


Collecting transformers
  Using cached transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.35.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Using cached transformers-4.56.1-py3-none-any.whl (11.6 MB)
Using cached huggingface_hub-0.35.0-py3-none-any.whl (563 kB)
Using cached tokenizers-0.22.0-cp39-abi3-win_amd64.whl (2.7 MB)
Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl (320 kB)
Installing collected packages: safetensors, huggingface-hub, tokenizers, transformers

   ---------- ----------------------------- 1/4 [huggingface-hub]
   ---------- ----------------------------- 1/4 [huggingface-hub]
   ---------- ----------------------------- 1/4

In [2]:
import pandas as pd
import numpy as np
import itertools as it

import nltk

import torch
from transformers import AutoTokenizer, AutoModel

### nltk requires that these modules be downloaded separately

In [3]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ribba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\ribba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ribba/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ribba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Read the file and store them in a pandas data frame

In [5]:
df = pd.read_json("list of courses with dept and difficulty.json")
df.rename(columns={"Offering Dept": "degree_type", "Difficulty": "difficulty", "Course Title": "course_name"}, inplace=True)
df["degree_type"] = df["degree_type"].str.lower()
df["course_name"] = df["course_name"].str.lower()
df.head()

Unnamed: 0,degree_type,difficulty,course_name
0,accounting and finance,1,principles of financial accounting
1,accounting and finance,1,principles of management accounting
2,accounting and finance,2,theory and concepts of accounting - islamic pe...
3,accounting and finance,2,corporate financial reporting i
4,accounting and finance,4,applied financial analysis


### Get all the unique degree types and set their index starting from 1

In [6]:
degree_types = pd.DataFrame(df["degree_type"].unique(), columns=["degree_type"])
degree_types.index = range(1, len(degree_types) + 1)
degree_types.index.name = "id"
degree_types.to_csv("degree_type.csv")

### Get all the courses and set their index starting from 1

In [7]:
course = df[["course_name", "difficulty"]]
# remove duplicates
course.drop_duplicates(subset="course_name", inplace=True)
course.index = range(1, len(course) + 1)
course.index.name = "id"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  course.drop_duplicates(subset="course_name", inplace=True)


For courses generate embeddings to make content based filtering more accurate

In [8]:
model_name = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
course_names = course["course_name"].tolist()

encoded_input = tokenizer(
    course_names,
    padding=True,
    truncation=True,
    return_tensors='pt'
)

with torch.no_grad():
    model_output = model(**encoded_input)

token_embeddings = model_output.last_hidden_state

attention_mask = encoded_input['attention_mask']

mask_expended = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

sum_embeddings = torch.sum(token_embeddings * mask_expended, 1)

sum_mask = torch.clamp(mask_expended.sum(1), min=1e-9)
sentence_embeddings = sum_embeddings / sum_mask

print("shape of sentence embeddings:")
print(sentence_embeddings.shape)

print("\nEmbedding for the first sentence (first 10 values):")
print(sentence_embeddings[0][:10])

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

shape of sentence embeddings:
torch.Size([405, 768])

Embedding for the first sentence (first 10 values):
tensor([-0.0308,  0.0594,  0.0208,  0.0224,  0.1083, -0.0609, -0.0001,  0.0020,
        -0.0348,  0.0147])


In [None]:
embeddings_list = sentence_embeddings.cpu().tolist()

course['embedding'] = embeddings_list

course.to_csv("course.csv")

course.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  course['embedding'] = embeddings_list


                                          course_name  difficulty  \
id                                                                  
1                  principles of financial accounting           1   
2                 principles of management accounting           1   
3   theory and concepts of accounting - islamic pe...           2   
4                     corporate financial reporting i           2   
5                          applied financial analysis           4   

                                            embedding  
id                                                     
1   [-0.03077944554388523, 0.059383489191532135, 0...  
2   [-0.025201469659805298, 0.045207493007183075, ...  
3   [-0.049371156841516495, 0.06671033799648285, 0...  
4   [-0.016406942158937454, 0.07581979781389236, -...  
5   [-0.024054786190390587, 0.11262382566928864, 0...  


### Make a junction table which would store the id of degree_type and course so that they can be stored efficiently

In [None]:
course_with_id = course.reset_index()\
    .rename(columns={"id": "course_id"})\
    .drop("difficulty", axis=1)

degree_with_id = degree_types.reset_index()\
    .rename(columns={"id": "degree_type_id"})

df_modified = df.drop("difficulty", axis=1)

course_merged = pd.merge(course_with_id, df_modified, on="course_name", how="left")
degree_course = pd.merge(course_merged, degree_with_id, on="degree_type", how="left")

degree_course_modified = degree_course.drop(["degree_type", "course_name"], axis=1)
degree_course_modified.to_csv("degree_course.csv", index=False)

: 

: 

### Process of making tags of each course

steps to get tags are:

1. combine degree name and course name
2. lower case the combined strings
3. tokenize the combined string
4. remove stop words, add some of our own
5. lemmatize the words
6. return the list of tags for each course

In [None]:
def generate_tags(row) -> list[str]:
    degree_type = row["degree_type"]
    course_name = row["course_name"]
    complete_text = degree_type + " " + course_name
    
    # tokenize the complete text
    tokens = word_tokenize(complete_text)

    # Define and remove stop words
    stop_words = set(stopwords.words("english"))
    custom_words = {"principles", "principle", "introduction", "introductory", "intro", "of", "to", "and", "in", "for", "the", "with", "a", "an", "i", "ii"}
    stop_words.update(custom_words)
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Lemmatize the remaining words to their root form
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    # list of words to exclude from minimum word length
    length_exception = ["art", "ai", "ml", "war", "law", "lab", "jog", "iot"]

    # remove tags that are below the maximum word limit and print them
    limit_length_tokens = [word for word in lemmatized_tokens if ((len(word) > 3) or (word in length_exception))]

    # use set to remove any duplicate and list to turn into a list again
    final_tags = list(set(limit_length_tokens))

    return final_tags

: 

: 

In [None]:
course_tags = degree_course.drop_duplicates(subset="course_name").apply(generate_tags, axis=1)
course_tags.index = range(1, len(course_tags) + 1)
course_tags.index.name = "course_id"
course_tags.name = "course_tags"
course_tags

course_id
1                       [financial, accounting, finance]
2                      [accounting, management, finance]
3      [accounting, concept, perspective, finance, is...
4      [accounting, financial, corporate, finance, re...
5      [accounting, analysis, financial, finance, app...
                             ...                        
401                 [tutorial, year, engineering, first]
402                                 [swimming, beginner]
403    [swimming, conditioning, strength, beginner, t...
404                     [swimming, basketball, beginner]
405              [swimming, walk, stretch, jog, fitness]
Name: course_tags, Length: 405, dtype: object

: 

: 

## We need to make the tags unique for tag table

In [None]:
# using itertools we will unpack all strings from the course_tag dataframe
# putting them in a set will remove all duplicate values and give us unique tags
tag = pd.DataFrame(set(it.chain.from_iterable(course_tags)), columns=["tag_name"])
tag.index = range(1, len(tag) + 1)
tag.index.name = "id"
tag.to_csv("tag.csv")

: 

: 

### make course tags which would have id of all the courses linked with their respective tags

In [None]:
tag_with_id = tag.reset_index()\
    .rename(columns={"id": "tag_id"})

course_tags_with_id = course_tags.reset_index()

# make the dataframe which would store the id of the tags and courses
course_tag = pd.DataFrame(columns=["course_id", "tag_id"])

for course_id, tags in zip(course_tags_with_id["course_id"], course_tags_with_id["course_tags"]):
    for tag_name in tags:
        tag_id = tag_with_id.loc[tag_with_id["tag_name"] == tag_name, "tag_id"].values[0]
        new_row = pd.DataFrame({"course_id": [course_id], "tag_id": [tag_id]})
        course_tag = pd.concat([course_tag, new_row], ignore_index=True)

course_tag.to_csv("course_tag.csv", index=False)

: 

: 