In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
df = pd.read_csv("complete_course_data.csv")

In [18]:
df.head()

Unnamed: 0,index,course_title,url,certification,level,organization,platform
0,0,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,COURSE,All Levels,Business Finance,udemy
1,1,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,COURSE,All Levels,Business Finance,udemy
2,2,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,COURSE,Intermediate Level,Business Finance,udemy
3,3,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,COURSE,All Levels,Business Finance,udemy
4,4,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,COURSE,Intermediate Level,Business Finance,udemy


In [19]:
df

Unnamed: 0,index,course_title,url,certification,level,organization,platform
0,0,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,COURSE,All Levels,Business Finance,udemy
1,1,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,COURSE,All Levels,Business Finance,udemy
2,2,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,COURSE,Intermediate Level,Business Finance,udemy
3,3,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,COURSE,All Levels,Business Finance,udemy
4,4,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,COURSE,Intermediate Level,Business Finance,udemy
...,...,...,...,...,...,...,...
4904,175,Windows Performance Monitoring Fundamentals,https://www.pluralsight.com/courses/windows-pe...,COURSE,Beginner,pluralsight,pluralsight
4905,176,Windows PowerShell Toolmaking Fundamentals,https://www.pluralsight.com/courses/powershell...,COURSE,Intermediate,pluralsight,pluralsight
4906,177,Windows Server 2012 R2 (70-413) Server Deployment,https://www.pluralsight.com/courses/windows-se...,COURSE,Intermediate,pluralsight,pluralsight
4907,178,Windows Server Administration Fundamentals Usi...,https://www.pluralsight.com/courses/windows-se...,COURSE,Beginner,pluralsight,pluralsight


In [48]:
features = ["course_title","platform","level"]

In [49]:
def combine_features(row):
    return row['course_title']+" "+row['platform']+" "+row['level']

In [50]:
for feature in features:
    df[feature] = df[feature].fillna('') #filling all NaNs with blank string

df["combined_features"] = df.apply(combine_features,axis=1)
#applying combined_features() method over each rows of dataframe and storing the combined string in "combined_features" column


In [51]:
df.head()

Unnamed: 0,index,course_title,url,certification,level,organization,platform,combined_features
0,0,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,COURSE,All Levels,Business Finance,udemy,Ultimate Investment Banking Course udemy All L...
1,1,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,COURSE,All Levels,Business Finance,udemy,Complete GST Course & Certification - Grow You...
2,2,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,COURSE,Intermediate Level,Business Finance,udemy,Financial Modeling for Business Analysts and C...
3,3,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,COURSE,All Levels,Business Finance,udemy,Beginner to Pro - Financial Analysis in Excel ...
4,4,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,COURSE,Intermediate Level,Business Finance,udemy,How To Maximize Your Profits Trading Options u...


In [52]:
df.iloc[0].combined_features

'Ultimate Investment Banking Course udemy All Levels'

In [53]:
cv = CountVectorizer() #creating new CountVectorizer() object
count_matrix = cv.fit_transform(df["combined_features"]) #feeding combined strings(movie contents) to CountVectorizer() object

In [54]:

cosine_sim = cosine_similarity(count_matrix) #cosine similarity matrix for count matrix

In [60]:
#functions to get course title from course index and vice-versa.

def get_title_from_index(index):
    return df[df.index == index]["course_title"].values[0]
def get_index_from_title(title):
    return df[df.course_title == title]["index"].values[0]

#  Tip: add a titile field and keywords field in data for improvement

# Our next step is to get the title of the course that the user currently likes. Then we will find the index of that title. After that, we will access the row corresponding to this titile in the similarity matrix. Thus, we will get the similarity scores of all other title from the current course. Then we will enumerate through all the similarity scores of that movie to make a tuple of course index and similarity score. This will convert a row of similarity scores like this- [1 0.5 0.2 0.9] to this- [(0, 1) (1, 0.5) (2, 0.2) (3, 0.9)] . Here, each item is in this form- (course index, similarity score).

In [71]:
course_user_likes = "Ultimate Investment Banking Course" # here take the inputs of user
course_index = get_index_from_title(course_user_likes)
similar_courses = list(enumerate(cosine_sim[course_index])) #accessing the row corresponding to given course to find all the similarity scores for that course and then enumerating over it

#  We will sort the list similar_course according to similarity scores in descending order. Since the most similar course to a given course will be itself, we will discard the first element after sorting the courses.

In [73]:
sorted_similar_courses = sorted(similar_courses,key=lambda x:x[1],reverse=True)[1:]

In [74]:
#we will run a loop to print first 5 entries from sorted_similar_courses list.

i=0
print("Top 5 similar courses to "+course_user_likes+" are:\n")
for element in sorted_similar_courses:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>5:
        break

Top 5 similar courses to Ultimate Investment Banking Course are:

The Complete Investment Banking Course 2017
Ultimate WordPress Plugin Course
The Ultimate jQuery Course
The Investment Banking Recruitment Series
The Ultimate Web Development Course
The Ultimate Vue JS 2 Developers Course
