# Course Recommendation Using Cosine Similarity

In [1]:
import pandas as pd #importing dependencies

In [2]:
df = pd.read_csv("../datasets/Coursera.csv")
print("Columns : ",df.columns)

Columns :  Index(['Course Name', 'University', 'Difficulty Level', 'Course Rating',
       'Course URL', 'Course Description', 'Skills'],
      dtype='object')


In [3]:
s = df.duplicated(subset=["Course Name"])
print(s.value_counts())

s = df.duplicated(subset=["Course URL"])
print(s.value_counts())

s = df.duplicated(subset=["Course URL","Course Name"])
print(s.value_counts())

False    3416
True      106
Name: count, dtype: int64
False    3424
True       98
Name: count, dtype: int64
False    3424
True       98
Name: count, dtype: int64


In [4]:
courses = df[['Course Name', 'Course Description']].to_dict(orient='records')

In [5]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [6]:
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [7]:
# Preprocess and tokenize course descriptions
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform([course['Course Description'] for course in courses])
course_indices = {course['Course Name']: index for index, course in enumerate(courses)}

# Tokenize and preprocess the course description
user_description = "python and computer graphics"
# Calculate TF-IDF vectors for the course description
user_vector = tfidf.transform([user_description])

# Calculate cosine similarity between user description and course descriptions
cosine_sim = linear_kernel(user_vector, tfidf_matrix)
# Get recommended courses
recommended_courses = []
for idx in cosine_sim.argsort()[0][::-1][:5]:
    course_title = courses[idx]['Course Name']
    recommended_courses.append(course_title)

print("Recommended Courses:", recommended_courses)

Recommended Courses: ['Advanced App Development in Android Capstone', 'Python Data Analysis', 'Python Data Representations', 'Python Programming Essentials', 'Interactive Computer Graphics']


# Combining Coursera and Udemy datasets

In [8]:
# making c_df and u_df as coursera and udemy dataframes
c_df  = df
u_df = pd.read_csv('../datasets/udemy_courses.csv')

## Analyzing the datasets used

In [9]:
print(u_df['level'].unique())
print(c_df['Difficulty Level'].unique())
print(c_df['Difficulty Level'].eq('Not Calibrated').sum())
print(c_df['Difficulty Level'].eq('Conversant').sum())
print(c_df.shape)

['All Levels' 'Intermediate Level' 'Beginner Level' 'Expert Level']
['Beginner' 'Advanced' 'Intermediate' 'Not Calibrated' 'Conversant']
50
186
(3522, 7)


## Data preprocessing & cleaning

In [10]:
# Renaming the values to make them consistent data
u_df['level'] = u_df['level'].replace('All Levels','Advanced')
u_df['level'] = u_df['level'].replace('Intermediate Level','Intermediate')
u_df['level'] = u_df['level'].replace('Beginner Level','Beginner')
u_df['level'] = u_df['level'].replace('Expert Level','Advanced')
c_df['Difficulty Level'] = c_df['Difficulty Level'].replace('Conversant','Advanced')
c_df['Difficulty Level'] = c_df['Difficulty Level'].replace('Not Calibrated','Beginner')

In [11]:
print(c_df['Difficulty Level'].value_counts())
print(u_df['level'].value_counts())
print(u_df.columns)
print(c_df.columns)

Difficulty Level
Beginner        1494
Advanced        1191
Intermediate     837
Name: count, dtype: int64
level
Advanced        1987
Beginner        1270
Intermediate     421
Name: count, dtype: int64
Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')
Index(['Course Name', 'University', 'Difficulty Level', 'Course Rating',
       'Course URL', 'Course Description', 'Skills'],
      dtype='object')


In [12]:
# removing unnecessary columns
u_df = u_df[['course_title','url','level']]

# renaming columns
u_df.rename(columns={'course_title': 'Course Name','url': 'Course URL','level':'Difficulty Level'}, inplace=True)
print(u_df.columns)
print(c_df.columns)

Index(['Course Name', 'Course URL', 'Difficulty Level'], dtype='object')
Index(['Course Name', 'University', 'Difficulty Level', 'Course Rating',
       'Course URL', 'Course Description', 'Skills'],
      dtype='object')


In [13]:
# Merge both dataframes
combined_df = pd.concat([u_df, c_df], ignore_index=True)

#dropping duplicates
combined_df = combined_df.drop_duplicates(subset=["Course Name", "Course URL"], keep='first')

combined_df['Difficulty Level'].value_counts()

Difficulty Level
Advanced        3128
Beginner        2724
Intermediate    1244
Name: count, dtype: int64

In [14]:
# Adding course name to description and skills columns as they also describe the course and skills
combined_df['Course Description'].fillna(combined_df['Course Name'], inplace=True)
combined_df['Skills'].fillna(combined_df['Course Name'], inplace=True)

# Extracting Course Name & Course Description for content based filtering
courses = combined_df[['Course Name', 'Course Description']].to_dict(orient='records')

## Tranforming the data & Creating the recommendation system

In [15]:
import random #for shuffling indices to get more deversity in recommendations

# Preprocess and tokenize course descriptions
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform([course['Course Description'] for course in courses])
course_indices = {course['Course Name']: index for index, course in enumerate(courses)}

# Tokenize and preprocess the course description
user_description = "python"
# Calculate TF-IDF vectors for the course description
user_vector = tfidf.transform([user_description])

# Calculate cosine similarity between user description and course descriptions
cosine_sim = linear_kernel(user_vector, tfidf_matrix)
indices = list(range(len(courses)))  # Create a list of indices

# Shuffle the indices to randomize the order
random.shuffle(indices)

# Get recommended courses
recommended_courses = []
for idx in cosine_sim.argsort()[0][::-1][:5]:
    course_title = courses[idx]['Course Name']
    recommended_courses.append(course_title)

# show recommended courses
filtered_df = combined_df[combined_df['Course Name'].isin(recommended_courses)]
filtered_df

Unnamed: 0,Course Name,Course URL,Difficulty Level,University,Course Rating,Course Description,Skills
2681,Python for Beginners: Python Programming Langu...,https://www.udemy.com/python-course/,Beginner,,,Python for Beginners: Python Programming Langu...,Python for Beginners: Python Programming Langu...
3138,Complete Python Web Course: Build 8 Python Web...,https://www.udemy.com/the-complete-python-web-...,Advanced,,,Complete Python Web Course: Build 8 Python Web...,Complete Python Web Course: Build 8 Python Web...
3694,Python Programming Essentials,https://www.coursera.org/learn/python-programming,Beginner,Rice University,4.8,This course will introduce you to the wonderfu...,semantics Python Programming coding conventi...
4418,Python Data Representations,https://www.coursera.org/learn/python-represen...,Advanced,Rice University,4.7,This course will continue the introduction to ...,immutable object Python Programming Smoothin...
6007,Python Data Analysis,https://www.coursera.org/learn/python-analysis,Advanced,Rice University,4.6,This course will continue the introduction to ...,Data Visualization jpeg dict Python Program...


In [16]:
# defining a function to get recommendations
# provide df from pickel file or csv file and make sure it has columns 'Course Name' and 'Course Description'
# courses = combined_df[['Course Name', 'Course Description']].to_dict(orient='records')
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform([course['Course Description'] for course in courses])

def get_recommendations(user_description, n=5):
    recommended_courses = []
    course_title = ''
    user_vector = tfidf.transform([user_description])
    cosine_sim = linear_kernel(user_vector, tfidf_matrix)
    for idx in cosine_sim.argsort()[0][::-1][:n]:
        course_title = courses[idx]['Course Name']
        recommended_courses.append(course_title)
    filtered_df = combined_df[combined_df['Course Name'].isin(recommended_courses)]
    return filtered_df

# get recommendations
user_description = "JavaScript"
recommended_courses = get_recommendations(user_description, 5)
print("Recommended Courses : ")
recommended_courses

Recommended Courses : 


Unnamed: 0,Course Name,Course URL,Difficulty Level,University,Course Rating,Course Description,Skills
2557,JavaScript the Basics - JavaScript for Beginners,https://www.udemy.com/javascript-the-basics-fo...,Beginner,,,JavaScript the Basics - JavaScript for Beginners,JavaScript the Basics - JavaScript for Beginners
2626,JavaScript For Beginners : Learn JavaScript Fr...,https://www.udemy.com/javascript-course-for-be...,Advanced,,,JavaScript For Beginners : Learn JavaScript Fr...,JavaScript For Beginners : Learn JavaScript Fr...
3069,JavaScript in Action JavaScript Projects,https://www.udemy.com/javascript-in-action-lea...,Beginner,,,JavaScript in Action JavaScript Projects,JavaScript in Action JavaScript Projects
3210,JavaScript programming: JavaScript for beginners,https://www.udemy.com/learn-javascript-online/,Beginner,,,JavaScript programming: JavaScript for beginners,JavaScript programming: JavaScript for beginners
3283,JavaScript Complete Guide to learning JavaScript,https://www.udemy.com/javascript-complete-guid...,Beginner,,,JavaScript Complete Guide to learning JavaScript,JavaScript Complete Guide to learning JavaScript


In [17]:
import pickle
# Saving the combined_df as csv and pickle file
pickle.dump(combined_df,open('../models/combined_df.pkl','wb'))
combined_df.to_csv('../datasets/combined_df.csv') 