In [1]:
import pandas as pd
import numpy as np

import nltk

In [2]:
# Fetch the NLTK data packages this notebook relies on. Each call returns a
# status flag; the cell displays the flag of the last call.
# NOTE(review): presumably "stopwords" backs `stopwords.words`, "punkt" backs
# `word_tokenize` and "wordnet" backs `WordNetLemmatizer`, which are imported
# in the next cell - confirm against the installed NLTK version.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ribba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\ribba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ribba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
# Load the raw course list, switch to snake_case column names and lower-case
# the two text columns so that later joins and tag generation compare
# case-insensitively.
df = (
    pd.read_json("list of courses with dept and difficulty.json")
    .rename(
        columns={
            "Offering Dept": "degree_type",
            "Difficulty": "difficulty",
            "Course Title": "course_name",
        }
    )
    .assign(
        degree_type=lambda frame: frame["degree_type"].str.lower(),
        course_name=lambda frame: frame["course_name"].str.lower(),
    )
)
df.head()

Unnamed: 0,degree_type,difficulty,course_name
0,accounting and finance,1,principles of financial accounting
1,accounting and finance,1,principles of management accounting
2,accounting and finance,2,theory and concepts of accounting - islamic pe...
3,accounting and finance,2,corporate financial reporting i
4,accounting and finance,4,applied financial analysis


In [5]:
# Lookup table of distinct degree types, keyed by a 1-based "id" index that is
# written out as the first CSV column.
degree_types = pd.DataFrame({"degree_type": df["degree_type"].unique()})
degree_types.index = pd.RangeIndex(start=1, stop=len(degree_types) + 1, name="id")
degree_types.to_csv("degree_type.csv")

In [6]:
# Course table: one row per row of `df`, in the same order, keyed by a 1-based
# "id" index that is written out as the first CSV column.
course_columns = ["course_name", "difficulty"]
course = df[course_columns]
course.index = pd.RangeIndex(start=1, stop=len(course) + 1, name="id")
course.to_csv("course.csv")

In [7]:
# Build the course <-> degree-type link table and write it to degree_course.csv.

# Course ids, exposed as a regular "course_id" column.
course_with_id = (
    course.reset_index()
    .rename(columns={"id": "course_id"})
    .drop(columns="difficulty")
)

# Degree-type ids, exposed as a regular "degree_type_id" column.
degree_with_id = degree_types.reset_index().rename(columns={"id": "degree_type_id"})

df_modified = df.drop(columns="difficulty")

# `course` is a row-for-row projection of `df` (ids 1..n in the same order),
# so ids are attached by position. Joining on "course_name" instead would pair
# every row with every id sharing that title whenever a title occurs more than
# once, producing duplicated and mismatched links.
if len(course_with_id) != len(df_modified):
    raise ValueError(
        "course and df have different lengths; re-run the cells that build them"
    )
course_merged = df_modified.reset_index(drop=True).assign(
    course_id=course_with_id["course_id"].to_numpy()
)

# Each course row maps to exactly one degree type; `validate` makes pandas
# raise if the lookup table ever contains a repeated degree_type.
degree_course = pd.merge(
    course_merged,
    degree_with_id,
    on="degree_type",
    how="inner",
    validate="many_to_one",
).drop(columns=["degree_type", "course_name"])

degree_course.to_csv("degree_course.csv", index=False)

In [8]:
# Filler words common in course titles, removed in addition to NLTK's English
# stop-word list.
_CUSTOM_TAG_STOP_WORDS = frozenset(
    {"principles", "principle", "introduction", "of", "to", "and", "in", "for", "the", "with", "a", "an", "i", "ii"}
)

# Filled on first use so the stop-word list and the lemmatizer are built once
# instead of once per row.
_TAG_RESOURCES: dict = {}


def _tag_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TAG_RESOURCES:
        stop_words = frozenset(stopwords.words("english")) | _CUSTOM_TAG_STOP_WORDS
        lemmatizer = WordNetLemmatizer()
        # Publish both entries together so a failure above leaves the cache empty.
        _TAG_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TAG_RESOURCES["stop_words"], _TAG_RESOURCES["lemmatizer"]


def generate_tags(row) -> list[str]:
    """Derive keyword tags from a row's degree type and course name.

    The two fields are joined, tokenized, stripped of non-alphabetic tokens
    and stop words, and lemmatized.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with "degree_type"
            and "course_name" entries. Entries that are not strings, such as
            missing values, are skipped instead of raising.

    Returns:
        The unique lemmatized tags in first-seen order, so repeated runs give
        the same list.
    """
    text_parts = [
        value
        for value in (row["degree_type"], row["course_name"])
        if isinstance(value, str)
    ]
    complete_text = " ".join(text_parts)

    stop_words, lemmatizer = _tag_resources()

    # Keep purely alphabetic tokens that are not stop words.
    filtered_tokens = [
        token
        for token in word_tokenize(complete_text)
        if token.isalpha() and token not in stop_words
    ]

    # Reduce the remaining words to their lemma.
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would return the tags in an order that differs between interpreter runs.
    return list(dict.fromkeys(lemmatized_tokens))