# Calculate Course Similarity using BoW Features

Similarity is a very important topic for recommendation systems. In essence when we have a new user who likes Course 1 our system tries to respond to the user and propose a similar course 2.


# Prepare lab enviroment

In [12]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import gensim
import pandas as pd
import nltk as nltk

from scipy.spatial.distance import euclidean
from sklearn.metrics import jaccard_score

from scipy.spatial.distance import cosine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
from gensim import corpora

%matplotlib inline

In [2]:
rs = 42

# Example understanding how to calculate similarities

Consine similarity calculation

In [4]:
course1 = "machine learning for everyone"
course2 = "machine learning for beginners"

In [6]:
# tokenize the words
tokens = set(course1.split() + course2.split())
# Make the tokens a list
tokens = list(tokens)
tokens

['beginners', 'learning', 'machine', 'for', 'everyone']

In [7]:
# Generate the Bag Of Word features
def generate_sparse_bow(course):
    """
    Generate a sparse bag-of-words (BoW) representation for a given course.

    Parameters:
    course (str): The input course text to generate the BoW representation for.

    Returns:
    list: A sparse BoW representation where each element corresponds to the presence (1) or absence (0)
    of a word in the input course text.
    """

    # Initialize an empty list to store the BoW vector
    bow_vector = []

    # Tokenize the course text by splitting it into words
    words = course.split()

    # Iterate through all unique words (tokens) in the course
    for token in set(words):
        # Check if the token is present in the course text
        if token in words:
            # If the token is present, append 1 to the BoW vector
            bow_vector.append(1)
        else:
            # If the token is not present, append 0 to the BoW vector
            bow_vector.append(0)

    # Return the sparse BoW vector
    return bow_vector


In [8]:
bow1 = generate_sparse_bow(course1)
bow1

[1, 1, 1, 1]

In [9]:
bow2 = generate_sparse_bow(course2)
bow2

[1, 1, 1, 1]

Consine Similarities

In [10]:
cos_sim = 1 - cosine(bow1, bow2)

In [11]:
print(f"The cosine similarity between course `{course1}` and course `{course2}` is {round(cos_sim, 2) * 100}%")

The cosine similarity between course `machine learning for everyone` and course `machine learning for beginners` is 100.0%


Euclidean Distance

In [None]:
euclidean_dist = euclidean(bow1, bow2)

Jaccard index using Bag of Words

In [14]:
jaccard_idx_bow = jaccard_score(bow1, bow2)

In [15]:
# Calculate Jaccard Index using set similarity
set1 = set(course1.split())
set2 = set(course2.split())
jaccard_idx_set = len(set1 & set2) / len(set1 | set2)

In [16]:
# Print  results
print("Euclidean Distance:", euclidean_dist)
print("Jaccard Index (BoW):", jaccard_idx_bow)
print("Jaccard Index (Set):", jaccard_idx_set)

Euclidean Distance: 0.0
Jaccard Index (BoW): 1.0
Jaccard Index (Set): 0.6


Euclidean distance between 2 points $p$ and $q$ can be summarized by this equation: $d(p,q)={\sqrt {(p_{1}-q_{1})^{2}+(p_{2}-q_{2})^{2}+(p_{3}-q_{3})^{2}}}$.