# Calculate Course Similarity using BoW Features

Similarity is a central concept in recommendation systems. In essence, when a new user likes Course 1, the system responds by proposing a similar Course 2.


# Prepare the lab

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import gensim
import pandas as pd
import nltk as nltk

from scipy.spatial.distance import euclidean
from sklearn.metrics import jaccard_score

from scipy.spatial.distance import cosine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
from gensim import corpora

%matplotlib inline

In [2]:
# Seed value, presumably meant for random_state arguments; it is not referenced in the
# cells shown here — NOTE(review): confirm it is still needed.
rs = 42

In [4]:
# Read the course bag-of-words table, keeping only the three feature columns
bows_df = pd.read_csv("courses_bows.csv").loc[:, ['doc_id', 'token', 'bow']]
bows_df.head(n=10)

Unnamed: 0,doc_id,token,bow
0,ML0201EN,ai,2
1,ML0201EN,apps,2
2,ML0201EN,build,2
3,ML0201EN,cloud,1
4,ML0201EN,coming,1
5,ML0201EN,create,1
6,ML0201EN,data,1
7,ML0201EN,developer,1
8,ML0201EN,found,1
9,ML0201EN,fun,1


The `bows_df` dataframe contains the BoW feature vectors for each course, in a vertical and dense format. It has three columns: `doc_id` is the course id, `token` is the token value, and `bow` is the BoW value (token count).


In [6]:
# Load the course metadata table; the preview below shows its COURSE_ID, TITLE and
# DESCRIPTION columns.
course_df = pd.read_csv("course_processed.csv")
course_df.head(10)

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...
5,CNSC02EN,cloud native security conference data security,introduction to data security on cloud
6,DX0106EN,data science bootcamp with r for university pr...,a multi day intensive in person data science ...
7,GPXX0FTCEN,learn how to use docker containers for iterati...,learn how to use docker containers for iterati...
8,RAVSCTEST1,scorm test 1,scron test course
9,GPXX06RFEN,create your first mongodb database,in this guided project you will get started w...


In [7]:
# Look up a single course by id to see its title and description
course_df.loc[course_df['COURSE_ID'].eq('ML0101ENv3')]

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


In [9]:
# Slice out the vertical BoW rows that belong to the machine learning course
machine_learning_course = bows_df.loc[bows_df['doc_id'].eq('ML0101ENv3')]
machine_learning_course

Unnamed: 0,doc_id,token,bow
2747,ML0101ENv3,course,1
2748,ML0101ENv3,learning,4
2749,ML0101ENv3,machine,3
2750,ML0101ENv3,need,1
2751,ML0101ENv3,get,1
2752,ML0101ENv3,started,1
2753,ML0101ENv3,python,2
2754,ML0101ENv3,tool,1
2755,ML0101ENv3,tools,1
2756,ML0101ENv3,predict,1


Convert it to a horizontal (wide) format, with one column per token.

In [10]:
# Pivot the long table (one row per token) into a single wide row with one column per token.
# No `values=` is passed, so the result has a two-level column index ("bow", <token>),
# as the output below shows; reset_index turns doc_id back into a regular column.
machine_learning_courseT = machine_learning_course.pivot(index=['doc_id'], columns='token').reset_index(level=[0])
machine_learning_courseT

Unnamed: 0_level_0,doc_id,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow
token,Unnamed: 1_level_1,beneficial,course,free,future,get,give,hidden,insights,learning,machine,need,predict,python,started,supervised,tool,tools,trends,unsupervised
0,ML0101ENv3,1,1,1,1,1,1,1,1,4,3,1,1,2,1,1,1,1,1,1


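To compare the BoWs of any two courses, which normally have different sets of tokens, we first build the union of their token sets and then pivot both courses onto it, so that each course becomes one row over the same token columns (a token missing from a course gets a count of 0).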

In [11]:
def pivot_two_bows(basedoc, comparedoc):
    """
    Pivot two vertical bag-of-words (BoW) tables into one wide table for comparison.

    Parameters:
    basedoc (DataFrame): Vertical BoW rows of the base document. Must contain the
        columns 'doc_id', 'token' and 'bow'; any other columns are ignored.
    comparedoc (DataFrame): Vertical BoW rows of the document to compare, same layout.

    Returns:
    DataFrame: One row per document with the columns 'doc_id' and 'type'
    ('base' or 'compare'), followed by one column per token in the union of both
    documents' tokens. A token that a document does not contain gets a count of 0.
    Rows are ordered by 'doc_id', so the base document is not necessarily row 0.

    Raises:
    KeyError: If either input lacks one of the required columns.
    ValueError: If a token occurs more than once within the same document
        (raised by DataFrame.pivot).
    """
    required = ['doc_id', 'token', 'bow']

    # Selecting the required columns yields new frames, so the callers' inputs are
    # never modified, and stray columns (e.g. a CSV index column) cannot leak into
    # the pivot and duplicate the token columns.
    base = basedoc[required].assign(type='base')
    compare = comparedoc[required].assign(type='compare')

    # Stack both documents, then spread the tokens out as columns.
    join = pd.concat([base, compare])
    joinT = (
        join.pivot(index=['doc_id', 'type'], columns='token', values='bow')
        .fillna(0)                  # token absent from a document -> count 0
        .reset_index()              # doc_id / type back to ordinary columns
        .rename_axis(columns=None)  # drop the leftover 'token' axis label
    )

    return joinT


In [12]:
# Select the vertical BoW rows of the two courses to compare
course1 = bows_df.loc[bows_df['doc_id'].eq('ML0151EN')]
course2 = bows_df.loc[bows_df['doc_id'].eq('ML0101ENv3')]

# Put both courses on a shared token axis (course1 is the base, course2 the comparison)
bow_vectors = pivot_two_bows(basedoc=course1, comparedoc=course2)
bow_vectors

Unnamed: 0,doc_id,type,approachable,basics,beneficial,comparison,course,dives,free,future,...,relates,started,statistical,supervised,tool,tools,trends,unsupervised,using,vs
0,ML0101ENv3,compare,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,ML0151EN,base,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0


In [13]:
# Cosine similarity between the two courses' token-count vectors.
# The first two columns (doc_id, type) are labels, so only columns 2 onward are used.
# They are converted to a float array up front: a single-row slice of this mixed-dtype
# frame would otherwise be an object-dtype Series, which is a fragile input for SciPy.
token_counts = bow_vectors.iloc[:, 2:].to_numpy(dtype=float)
similarity = float(1 - cosine(token_counts[0], token_counts[1]))
similarity

0.662622139954909

Find the courses most similar to "machine learning with python"

In [14]:
# Show the course that the similarity search below is based on
course_df.loc[lambda frame: frame['COURSE_ID'] == 'ML0101ENv3']

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


Set a threshold, such as 0.5, to decide whether two courses are similar

In [15]:
def pivot_course_rows(bows_df, course_id):
    """
    Convert one course's vertical bag-of-words rows into a single horizontal row.

    Parameters:
    bows_df (DataFrame): Vertical BoW table with 'doc_id', 'token' and 'bow' columns.
    course_id: Value of 'doc_id' identifying the course to extract.

    Returns:
    DataFrame: Indexed by 'doc_id', with one column per token holding its 'bow'
    value; missing values are replaced by 0.
    """
    belongs_to_course = bows_df['doc_id'] == course_id
    wide = bows_df[belongs_to_course].pivot(index='doc_id', columns='token', values='bow')
    return wide.fillna(0)

In [16]:
# Build the horizontal BoW row of the course that every other course is compared against
base_course_id = 'ML0101ENv3'
base_vector = pivot_course_rows(bows_df=bows_df, course_id=base_course_id)

In [17]:
# Compare the base course against every other course in the catalogue.
# `results` collects (course_id, cosine similarity) pairs; `skipped_courses` records
# every course that could not be scored, with the reason, so nothing is dropped silently.
results = []
skipped_courses = []
for other_id in course_df['COURSE_ID'].unique():
    if other_id == base_course_id:
        continue
    try:
        other_vector = pivot_course_rows(bows_df, other_id)
        # Align both vectors on the same set of tokens (absent tokens count as 0)
        all_tokens = base_vector.columns.union(other_vector.columns)
        base_aligned = base_vector.reindex(columns=all_tokens, fill_value=0)
        other_aligned = other_vector.reindex(columns=all_tokens, fill_value=0)
        # Compute cosine similarity
        similarity = 1 - cosine(base_aligned.values[0], other_aligned.values[0])
    except (ValueError, IndexError) as exc:
        # Expected failures only: IndexError when a course has no BoW rows (`.values[0]`
        # on an empty frame), ValueError when pivot meets a duplicated token.
        # Anything else is a real bug and is allowed to surface.
        skipped_courses.append((other_id, repr(exc)))
        continue
    results.append((other_id, similarity))

if skipped_courses:
    print(f"Skipped {len(skipped_courses)} course(s) without a usable BoW vector; see skipped_courses")

In [18]:
# Collect the (course id, similarity) pairs into a two-column table
similarities_dataframe = pd.DataFrame(
    data=results,
    columns=["COURSE_ID", "Cosine_Similarity"],
)


In [19]:
# Keep only the courses whose similarity to the base course exceeds the cut-off
threshold = 0.5
similar_courses = similarities_dataframe.loc[
    similarities_dataframe["Cosine_Similarity"].gt(threshold)
]

In [20]:
# Rank the remaining courses from most to least similar, then print the table
similar_courses = similar_courses.sort_values("Cosine_Similarity", ascending=False)
print(similar_courses)

      COURSE_ID  Cosine_Similarity
199    ML0151EN           0.662622
259  excourse47           0.634755
258  excourse46           0.612054
272  excourse60           0.549040
157    ML0109EN           0.521749


These courses likely cover overlapping topics or use similar wording in their descriptions.
