# **Calculating Course Similarity using BoW Features**


In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import gensim
import pandas as pd
import nltk as nltk
import scipy

from scipy.spatial.distance import cosine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
from gensim import corpora

%matplotlib inline

In [3]:
# Fixed random-state value for reproducibility.
# NOTE(review): no other cell shown in this notebook references `rs`.
rs = 123

In [8]:
def generate_sparse_bow(course, vocabulary=None):
    """
    Generate a binary bag-of-words (BoW) presence vector for a course text.

    Parameters:
    course (str): The course text; it is tokenized by splitting on whitespace.
    vocabulary (iterable of str, optional): Tokens that define the positions of the
        returned vector. When omitted, the vocabulary is the set of unique tokens of
        `course` itself, so every entry is 1 (one entry per unique token).

    Returns:
    list: One entry per vocabulary token: 1 if the token occurs in `course`, otherwise 0.
    """

    # Tokenize once and keep the unique tokens in a set so that each
    # membership check is O(1) instead of a scan over the full word list
    present_tokens = set(course.split())

    if vocabulary is None:
        # Without an external vocabulary every position refers to a token of the
        # course itself, so each token is present by construction
        return [1] * len(present_tokens)

    # Encode presence (1) or absence (0) for every token of the given vocabulary
    return [1 if token in present_tokens else 0 for token in vocabulary]


Finding courses similar to the course `Machine Learning with Python`


In [13]:
# Read the BoW features CSV and keep only the three columns used below
bows_url = "courses_bows.csv"
bows_df = pd.read_csv(bows_url).loc[:, ['doc_id', 'token', 'bow']]

In [14]:
# Preview the first 10 rows of the BoW table
bows_df.head(10)

Unnamed: 0,doc_id,token,bow
0,ML0201EN,ai,2
1,ML0201EN,apps,2
2,ML0201EN,build,2
3,ML0201EN,cloud,1
4,ML0201EN,coming,1
5,ML0201EN,create,1
6,ML0201EN,data,1
7,ML0201EN,developer,1
8,ML0201EN,found,1
9,ML0201EN,fun,1


The `bows_df` dataframe contains the BoW feature vectors for each course in a vertical (long) and dense format, with one row per course–token pair. It has three columns: `doc_id` is the course id, `token` is the token value, and `bow` is the BoW value (the token count).


Then, let's load another course content dataset which contains the course title and description:


In [15]:
# Load the course dataframe
# Path of the course content CSV; the preview below shows COURSE_ID, TITLE and DESCRIPTION columns
course_url = "course_processed.csv"
course_df = pd.read_csv(course_url)

In [16]:
# Preview the first 10 courses
course_df.head(10)

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...
5,CNSC02EN,cloud native security conference data security,introduction to data security on cloud
6,DX0106EN,data science bootcamp with r for university pr...,a multi day intensive in person data science ...
7,GPXX0FTCEN,learn how to use docker containers for iterati...,learn how to use docker containers for iterati...
8,RAVSCTEST1,scorm test 1,scron test course
9,GPXX06RFEN,create your first mongodb database,in this guided project you will get started w...


Given course ID `ML0101ENv3`, let's find out its title and description:


In [17]:
# Select the row(s) of the course table whose COURSE_ID is ML0101ENv3
course_df[course_df['COURSE_ID'] == 'ML0101ENv3']

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


We can see that it is a "Machine Learning with Python" course, so we can expect other machine learning or Python-related courses to be similar to it.


Then, let's print its associated BoW features:


In [18]:
# Boolean mask marking the BoW rows that belong to course ML0101ENv3
is_ml_course = bows_df['doc_id'].eq('ML0101ENv3')
ml_course = bows_df[is_ml_course]
ml_course

Unnamed: 0,doc_id,token,bow
2747,ML0101ENv3,course,1
2748,ML0101ENv3,learning,4
2749,ML0101ENv3,machine,3
2750,ML0101ENv3,need,1
2751,ML0101ENv3,get,1
2752,ML0101ENv3,started,1
2753,ML0101ENv3,python,2
2754,ML0101ENv3,tool,1
2755,ML0101ENv3,tools,1
2756,ML0101ENv3,predict,1


We can see the BoW feature vector is in vertical format but normally feature vectors are in horizontal format. One way to transpose the feature vector from vertical to horizontal is to use the Pandas `pivot()` method:


In [19]:
# Spread the tokens across columns (one row per doc_id) ...
ml_pivot = ml_course.pivot(index=['doc_id'], columns='token')
# ... then move doc_id from the index back into a regular column
ml_courseT = ml_pivot.reset_index(level=[0])
ml_courseT

Unnamed: 0_level_0,doc_id,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow
token,Unnamed: 1_level_1,beneficial,course,free,future,get,give,hidden,insights,learning,machine,need,predict,python,started,supervised,tool,tools,trends,unsupervised
0,ML0101ENv3,1,1,1,1,1,1,1,1,4,3,1,1,2,1,1,1,1,1,1


To compare the BoWs of any two courses, which normally have different sets of tokens, we need to build the union of their tokens and then pivot both courses onto that shared set of columns, filling missing tokens with 0. We have provided a function called `pivot_two_bows` that does this:


In [20]:
def pivot_two_bows(basedoc, comparedoc):
    """
    Pivot two bag-of-words (BoW) representations onto a shared set of token columns.

    Parameters:
    basedoc (DataFrame): Long-format BoW rows of the base document; must contain the
        columns 'doc_id', 'token' and 'bow'.
    comparedoc (DataFrame): Long-format BoW rows of the document to compare; same columns.

    Returns:
    DataFrame: One row per (doc_id, type) pair, where type is 'base' or 'compare'.
    The first two columns are 'doc_id' and 'type'; the remaining columns are the union
    of the tokens of both documents, holding the BoW value or 0 where a document does
    not contain the token. Rows come out in the order produced by the pivot, which is
    not necessarily base first.
    """

    # assign() returns new frames, so the caller's DataFrames are not modified
    base = basedoc.assign(type='base')
    compare = comparedoc.assign(type='compare')

    # Stack the two long-format frames vertically
    join = pd.concat([base, compare])

    # Pivot only the 'bow' values: any additional column in the inputs (e.g. a saved
    # index column) is ignored instead of being pivoted into duplicate token columns
    joinT = (
        join.pivot(index=['doc_id', 'type'], columns='token', values='bow')
        .fillna(0)  # tokens missing from one of the documents count as 0
        .reset_index(level=[0, 1])
    )

    # Plain list of labels: 'doc_id', 'type', then one column per token
    # (this also drops the 'token' name the pivot leaves on the column index)
    joinT.columns = list(joinT.columns)

    return joinT


In [21]:
# BoW rows of the two courses to compare: ML0151EN and ML0101ENv3
course1, course2 = (
    bows_df[bows_df['doc_id'] == doc_id] for doc_id in ('ML0151EN', 'ML0101ENv3')
)

In [22]:
# Align both courses on the union of their tokens (course1 is the base document)
bow_vectors = pivot_two_bows(basedoc=course1, comparedoc=course2)
bow_vectors

Unnamed: 0,doc_id,type,approachable,basics,beneficial,comparison,course,dives,free,future,...,relates,started,statistical,supervised,tool,tools,trends,unsupervised,using,vs
0,ML0101ENv3,compare,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,ML0151EN,base,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0


Now that both courses are aligned on the same token columns, we can use SciPy's `cosine` distance to calculate their similarity (similarity = 1 − cosine distance):


In [23]:
# Token-count vectors of the two rows (the first two columns are doc_id and type)
first_vector = bow_vectors.iloc[0, 2:]
second_vector = bow_vectors.iloc[1, 2:]

# scipy's cosine() is a distance, so similarity is its complement
similarity = 1 - cosine(first_vector, second_vector)
similarity

0.662622139954909

Finding all courses similar to the course `Machine Learning with Python`:


In [24]:
# Show the title and description of the reference course ML0101ENv3
course_df[course_df['COURSE_ID'] == 'ML0101ENv3']

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


We can set a similarity threshold such as 0.5 to determine if two courses are similar enough.


Find the courses that are similar to the course `Machine Learning with Python (ML0101ENv3)`:

In [25]:
# Minimum cosine similarity for two courses to be reported as similar
SIMILARITY_THRESHOLD = 0.5

# Step 1: Retrieve the BoW feature vector for the course ML0101ENv3
ml_bow = bows_df[bows_df['doc_id'] == 'ML0101ENv3']

# Step 2: Retrieve the BoW feature vectors for all other courses
course_list = [x for x in bows_df['doc_id'].unique().tolist() if x != 'ML0101ENv3']

# Split the BoW table by course once, instead of re-filtering the whole
# table with a boolean mask on every loop iteration
bows_by_course = {doc_id: rows for doc_id, rows in bows_df.groupby('doc_id', sort=False)}

# Step 3 & 4: Calculate cosine similarity between ML0101ENv3 and each other course
similar_courses = []

for course in course_list:
    other_bow = bows_by_course[course]
    pivoted_bows = pivot_two_bows(ml_bow, other_bow)
    # Columns 0 and 1 are doc_id/type; the rest are the aligned token counts
    similarity = 1 - cosine(pivoted_bows.iloc[0, 2:], pivoted_bows.iloc[1, 2:])
    if similarity > SIMILARITY_THRESHOLD:
        similar_courses.append((course, similarity))


In [26]:
# Step 5: Report all courses with similarities larger than 0.5
# `similar_courses` holds (course id, similarity) tuples; the similarity is
# printed rounded to two decimals
for course_id, similarity in similar_courses:
    print(f"Course ID: {course_id}, Similarity: {similarity:.2f}")

Course ID: ML0109EN, Similarity: 0.52
Course ID: ML0151EN, Similarity: 0.66
Course ID: excourse46, Similarity: 0.61
Course ID: excourse47, Similarity: 0.63
Course ID: excourse60, Similarity: 0.55


### Computing the full course similarity matrix

In [27]:

# Full pairwise course similarity matrix
# -------------------------------------------------------
# Rows and columns follow the row order of course_df and are labelled with its index
course_ids = course_df.index
n_courses = len(course_ids)

# Map every COURSE_ID to its row *position* in course_df (first occurrence wins).
# Positions, not index labels, are what address the matrix cells, so this stays
# correct even when course_df does not have a default 0..n-1 index.
course_position = {}
for position, course_id in enumerate(course_df['COURSE_ID']):
    course_position.setdefault(course_id, position)

course_list = bows_df['doc_id'].unique().tolist()

# Split the BoW table by course once instead of re-filtering it for every pair
bows_by_course = {doc_id: rows for doc_id, rows in bows_df.groupby('doc_id', sort=False)}

# Fill a plain NumPy array and wrap it in a DataFrame once at the end;
# courses without BoW rows keep a similarity of 0
similarity_values = np.zeros((n_courses, n_courses))

for first_pos, first_course in enumerate(course_list):
    first_bow = bows_by_course[first_course]
    # A course id missing from course_df raises KeyError naming that id
    i = course_position[first_course]

    # Cosine similarity is symmetric, so each unordered pair (including the
    # course with itself) is computed once and written to both cells
    for other_course in course_list[first_pos:]:
        other_bow = bows_by_course[other_course]
        pivoted_bows = pivot_two_bows(first_bow, other_bow)
        similarity = 1 - cosine(pivoted_bows.iloc[0, 2:], pivoted_bows.iloc[1, 2:])
        j = course_position[other_course]
        similarity_values[i, j] = similarity
        similarity_values[j, i] = similarity

similarity_matrix = pd.DataFrame(similarity_values, index=course_ids, columns=course_ids)


In [28]:
# Save the similarity matrix to a CSV file in the current working directory
# (to_csv is called with its defaults, so the row labels are written as well)
csv_file = 'my_sim.csv'
similarity_matrix.to_csv(csv_file)
print(f"Similarity matrix saved to {csv_file}")

#-----------------------------------------------------

Similarity matrix saved to my_sim.csv
