In [5]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
%matplotlib inline

In [6]:
os.listdir()

['.git',
 '.ipynb_checkpoints',
 'BusinessFinance.csv',
 'Course_Recommendation_System.ipynb',
 'Data.zip',
 'GraphicDesign.csv',
 'MusicInstraments.csv',
 'phoenix-buildwithai.ipynb',
 'WebDevelopment.csv']

In [7]:
#Read data
WebDev = pd.read_csv('WebDevelopment.csv')
Business = pd.read_csv('BusinessFinance.csv')
GraphicDesign = pd.read_csv('GraphicDesign.csv')
Music = pd.read_csv('MusicInstraments.csv')

In [8]:
#Add Subject column to the 4 datasets
WebDev['Subject']= 'Web Development'
Business['Subject']= 'Business'
GraphicDesign['Subject']= 'Graphic Design'
Music['Subject']= 'Music Instr'

In [9]:
#Combine all 4 datasets into a single one using concat()
data=pd.concat([WebDev,Business,GraphicDesign,Music],axis=0)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3683 entries, 0 to 680
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3683 non-null   int64  
 1   title                 3683 non-null   object 
 2   url                   3683 non-null   object 
 3   isPaid                3683 non-null   object 
 4   price                 3683 non-null   object 
 5   numSubscribers        3683 non-null   int64  
 6   numReviews            3683 non-null   int64  
 7   numPublishedLectures  3683 non-null   int64  
 8   instructionalLevel    3683 non-null   object 
 9   contentInfo           3683 non-null   object 
 10  publishedTime         3683 non-null   object 
 11  Unnamed: 11           0 non-null      float64
 12  Unnamed: 12           0 non-null      float64
 13  Is Paid               8 non-null      object 
 14  Total                 8 non-null      float64
 15  Percent               

## Removing Duplicates

In [11]:
#Finding number of duplicates
data.id.duplicated().sum()

6

In [12]:
#List of duplicates
pd.concat(g for _, g in data.groupby("id") if len(g) > 1)

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,Unnamed: 11,Unnamed: 12,Is Paid,Total,Percent,Subject,Column1
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,,,Yes,1067.0,89%,Web Development,
83,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,,,,,,Web Development,
39,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,37 mins,2014-04-15T21:48:55Z,,,,,,Graphic Design,
278,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,37 mins,2014-04-15T21:48:55Z,,,,,,Graphic Design,
455,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,37 mins,2016-05-16T18:28:30Z,,,,,,Business,
789,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,37 mins,2016-05-16T18:28:30Z,,,,,,Business,
780,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1 hour,2016-12-15T14:56:17Z,,,,,,Business,
897,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1 hour,2016-12-15T14:56:17Z,,,,,,Business,
465,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5 hours,2017-07-02T14:29:35Z,,,,,,Business,
1104,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5 hours,2017-07-02T14:29:35Z,,,,,,Business,


In [13]:
#Removing duplicates
data=data.drop_duplicates(subset=['id'], keep='first')

In [14]:
len(data)

3677

# Dropping Unnecessary Columns

In [15]:
data.columns

Index(['id', 'title', 'url', 'isPaid', 'price', 'numSubscribers', 'numReviews',
       'numPublishedLectures', 'instructionalLevel', 'contentInfo',
       'publishedTime', 'Unnamed: 11', 'Unnamed: 12', 'Is Paid', 'Total',
       'Percent', 'Subject', 'Column1'],
      dtype='object')

In [16]:
data_final=data[['id','title','url','Subject','instructionalLevel']]

In [17]:
data_final.isnull().sum()

id                    0
title                 0
url                   0
Subject               0
instructionalLevel    0
dtype: int64

In [18]:
#getting raw text from the URL (can be useful for our analysis)
data_final['url']=data_final['url'].replace({"https://www.udemy.com/":"",
                              "-":" ","/":""},regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final['url']=data_final['url'].replace({"https://www.udemy.com/":"",


In [19]:
data_final['corpus']=data_final['title'
                               ]+" "+data_final['url'
                                               ]+data_final['Subject'
                                                           ]+data_final['instructionalLevel']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final['corpus']=data_final['title'


In [20]:
data_final

Unnamed: 0,id,title,url,Subject,instructionalLevel,corpus
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,build beautiful html5 website,Web Development,All Levels,Learn Web Designing & HTML5/CSS3 Essentials in...
1,19603,Learning Dynamic Website Design - PHP MySQL an...,learning dynamic website design,Web Development,All Levels,Learning Dynamic Website Design - PHP MySQL an...
2,889438,ChatBots: Messenger ChatBot with API.AI and No...,chatbots,Web Development,All Levels,ChatBots: Messenger ChatBot with API.AI and No...
3,197836,Projects in HTML5,projects in html5,Web Development,Intermediate Level,Projects in HTML5 projects in html5Web Develop...
4,505208,Programming Foundations: HTML5 + CSS3 for Entr...,html css more,Web Development,Beginner Level,Programming Foundations: HTML5 + CSS3 for Entr...
...,...,...,...,...,...,...
676,513528,Curso de guitarra para principiantes.,curso de guitarra para principiantes2,Music Instr,Beginner Level,Curso de guitarra para principiantes. curso de...
677,211026,Aprende a tocar Jazz y Ritmos Latinos en la gu...,aprende a tocar jazz y ritmos latinos en la gu...,Music Instr,All Levels,Aprende a tocar Jazz y Ritmos Latinos en la gu...
678,847808,こどもギターレッスン　リトルギタリストⅢ（完結編）,takalittle3,Music Instr,Beginner Level,こどもギターレッスン　リトルギタリストⅢ（完結編） takalittle3Music Ins...
679,218856,Aprende a tocar la flauta dulce de forma senci...,aprende a tocar la flauta dulce curso en video,Music Instr,All Levels,Aprende a tocar la flauta dulce de forma senci...


# Creating a cosine similarity matrix from the corpus

In [21]:
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  TfidfVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |  
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to :class:`CountVectorizer` followed by
 |  :class:`TfidfTransformer`.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : {'filename', 'file', 'content'}, default='content'
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |  

In [22]:
def create_similarity_matrix(new_description, overall_descriptions):
#Append the new description to the overall set.
    overall_descriptions=list(overall_descriptions)
    overall_descriptions.append(new_description)
    # Define a tfidf vectorizer and remove all stopwords.
    tfidf = TfidfVectorizer(stop_words="english")
    #Convert tfidf matrix by fitting and transforming the data.
    tfidf_matrix = tfidf.fit_transform(overall_descriptions)
    # output the shape of the matrix.
    tfidf_matrix.shape
    # calculating the cosine similarity matrix.
    cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)
    return cosine_sim

In [35]:
def get_recommendations(new_description,overall_descriptions):
    cosine_sim = create_similarity_matrix(new_description,overall_descriptions)
    # Get pairwise similarity scores of all the students with new student.
    sim_scores = list(enumerate(cosine_sim[-1]))
    
    sim_scores = sorted(sim_scores,key =lambda x:x[1],reverse= True)
    # Get the scores of top 10 descriptions.
    sim_scores = sim_scores[1:10]
    # Get the student indices.
    indices = [i[0]for i in sim_scores]
    return data[['title','url','instructionalLevel']].iloc[indices]

In [42]:
title="HTML"
subject="Web Development"
level="All Levels"

In [43]:
get_recommendations(title+" "+subject+" "+level,data_final['corpus'])

Unnamed: 0,title,url,instructionalLevel
612,The All-In-One Web Development Course,https://www.udemy.com/the-all-in-one-web-devel...,All Levels
852,"Complete Web Development with HTML, CSS and Ja...",https://www.udemy.com/complete-web-development...,Intermediate Level
460,HTML Web Development Crash Course,https://www.udemy.com/html-tutorials/,Beginner Level
281,Web Development With HTML CSS BOOTSTRAP JQUERY...,https://www.udemy.com/web-development-with-htm...,Beginner Level
929,Core HTML How to get online quickly HTML to HT...,https://www.udemy.com/web-development-introduc...,Beginner Level
16,Introduction to Web Development: HTML,https://www.udemy.com/webdevelopment101_html/,Beginner Level
15,Learn Complete Web Development From Scratch,https://www.udemy.com/learn-complete-web-devel...,All Levels
361,Beginning Web development Learn the basics of ...,https://www.udemy.com/learn-the-basics-of-html...,Beginner Level
663,The Complete Web Development Course - Build 1...,https://www.udemy.com/complete-web-development...,All Levels


In [44]:
data['corpus']=data_final['corpus']
data_for_use=data[['title','url','instructionalLevel','corpus']]

In [48]:
data_for_use.to_csv('courses.csv',index=False)

### Actual Code for use

In [49]:
data=pd.read_csv('courses.csv')

In [50]:
data

Unnamed: 0,title,url,instructionalLevel,corpus
0,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,All Levels,Learn Web Designing & HTML5/CSS3 Essentials in...
1,Learning Dynamic Website Design - PHP MySQL an...,https://www.udemy.com/learning-dynamic-website...,All Levels,Learning Dynamic Website Design - PHP MySQL an...
2,ChatBots: Messenger ChatBot with API.AI and No...,https://www.udemy.com/chatbots/,All Levels,ChatBots: Messenger ChatBot with API.AI and No...
3,Projects in HTML5,https://www.udemy.com/projects-in-html5/,Intermediate Level,Projects in HTML5 projects in html5Web Develop...
4,Programming Foundations: HTML5 + CSS3 for Entr...,https://www.udemy.com/html-css-more/,Beginner Level,Programming Foundations: HTML5 + CSS3 for Entr...
...,...,...,...,...
3672,Curso de guitarra para principiantes.,https://www.udemy.com/curso-de-guitarra-para-p...,Beginner Level,Curso de guitarra para principiantes. curso de...
3673,Aprende a tocar Jazz y Ritmos Latinos en la gu...,https://www.udemy.com/aprende-a-tocar-jazz-y-r...,All Levels,Aprende a tocar Jazz y Ritmos Latinos en la gu...
3674,こどもギターレッスン　リトルギタリストⅢ（完結編）,https://www.udemy.com/takalittle3/,Beginner Level,こどもギターレッスン　リトルギタリストⅢ（完結編） takalittle3Music Ins...
3675,Aprende a tocar la flauta dulce de forma senci...,https://www.udemy.com/aprende-a-tocar-la-flaut...,All Levels,Aprende a tocar la flauta dulce de forma senci...


In [51]:
def create_similarity_matrix(new_description, overall_descriptions):
    #Append the new description to the overall set
    overall_descriptions=list(overall_descriptions)
    overall_descriptions.append(new_description)
    # Define a tfidf vectorizer and remove all stopwords.
    tfidf = TfidfVectorizer(stop_words="english")
    #Convert tfidf matrix by fitting and transforming the data.
    tfidf_matrix = tfidf.fit_transform(overall_descriptions)
    # output the shape of the matrix.
    tfidf_matrix.shape
    # calculating the cosine similarity matrix.
    cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)
    return cosine_sim

In [52]:
def get_recommendations(new_description,overall_descriptions):
    cosine_sim = create_similarity_matrix(new_description,overall_descriptions)
    # Get pairwise similarity scores of all the students with new student.
    sim_scores = list(enumerate(cosine_sim[-1]))
    
    sim_scores = sorted(sim_scores,key =lambda x:x[1],reverse= True)
    # Get the scores of top 10 descriptions.
    sim_scores = sim_scores[1:10]
    # Get the student indices.
    indices = [i[0]for i in sim_scores]
    return data[['title','url','instructionalLevel']].iloc[indices]

In [53]:
title="HTML"
subject="Web Development"
level="All Levels"

In [56]:
a=get_recommendations(title+" "+subject+" "+level,data['corpus'])

In [55]:
data['instructionalLevel'].value_counts()

All Levels            1928
Beginner Level        1269
Intermediate Level     421
Expert Level            58
52                       1
Name: instructionalLevel, dtype: int64

In [79]:
a

Unnamed: 0,title,url,instructionalLevel
611,The All-In-One Web Development Course,https://www.udemy.com/the-all-in-one-web-devel...,All Levels
851,"Complete Web Development with HTML, CSS and Ja...",https://www.udemy.com/complete-web-development...,Intermediate Level
459,HTML Web Development Crash Course,https://www.udemy.com/html-tutorials/,Beginner Level
280,Web Development With HTML CSS BOOTSTRAP JQUERY...,https://www.udemy.com/web-development-with-htm...,Beginner Level
928,Core HTML How to get online quickly HTML to HT...,https://www.udemy.com/web-development-introduc...,Beginner Level
16,Introduction to Web Development: HTML,https://www.udemy.com/webdevelopment101_html/,Beginner Level
15,Learn Complete Web Development From Scratch,https://www.udemy.com/learn-complete-web-devel...,All Levels
360,Beginning Web development Learn the basics of ...,https://www.udemy.com/learn-the-basics-of-html...,Beginner Level
662,The Complete Web Development Course - Build 1...,https://www.udemy.com/complete-web-development...,All Levels


In [96]:
context={'courses':list(a['title'].values
                       ),'urls':list(a['url'].values)}


In [112]:
for i in context.values():
    print(i)
    

['The All-In-One Web Development Course', 'Complete Web Development with HTML, CSS and Javascript', 'HTML Web Development Crash Course', 'Web Development With HTML CSS BOOTSTRAP JQUERY for Beginners', 'Core HTML How to get online quickly HTML to HTML5 ', 'Introduction to Web Development: HTML', 'Learn Complete Web Development From Scratch', 'Beginning Web development Learn the basics of HTML', 'The Complete Web Development Course -  Build 15 Projects']
['https://www.udemy.com/the-all-in-one-web-development-course/', 'https://www.udemy.com/complete-web-development-with-html-css-and-javascript/', 'https://www.udemy.com/html-tutorials/', 'https://www.udemy.com/web-development-with-html-css-bootstrap-jquery-for-beginners/', 'https://www.udemy.com/web-development-introduction-html-html5-course/', 'https://www.udemy.com/webdevelopment101_html/', 'https://www.udemy.com/learn-complete-web-development-from-scratch/', 'https://www.udemy.com/learn-the-basics-of-html-in-less-than-a-week/', 'https:

In [106]:
context

{'courses': ['The All-In-One Web Development Course',
  'Complete Web Development with HTML, CSS and Javascript',
  'HTML Web Development Crash Course',
  'Web Development With HTML CSS BOOTSTRAP JQUERY for Beginners',
  'Core HTML How to get online quickly HTML to HTML5 ',
  'Introduction to Web Development: HTML',
  'Learn Complete Web Development From Scratch',
  'Beginning Web development Learn the basics of HTML',
  'The Complete Web Development Course -  Build 15 Projects'],
 'urls': ['https://www.udemy.com/the-all-in-one-web-development-course/',
  'https://www.udemy.com/complete-web-development-with-html-css-and-javascript/',
  'https://www.udemy.com/html-tutorials/',
  'https://www.udemy.com/web-development-with-html-css-bootstrap-jquery-for-beginners/',
  'https://www.udemy.com/web-development-introduction-html-html5-course/',
  'https://www.udemy.com/webdevelopment101_html/',
  'https://www.udemy.com/learn-complete-web-development-from-scratch/',
  'https://www.udemy.com/lea

In [121]:
a.to_dict('records')

[{'title': 'The All-In-One Web Development Course',
  'url': 'https://www.udemy.com/the-all-in-one-web-development-course/',
  'instructionalLevel': 'All Levels'},
 {'title': 'Complete Web Development with HTML, CSS and Javascript',
  'url': 'https://www.udemy.com/complete-web-development-with-html-css-and-javascript/',
  'instructionalLevel': 'Intermediate Level'},
 {'title': 'HTML Web Development Crash Course',
  'url': 'https://www.udemy.com/html-tutorials/',
  'instructionalLevel': 'Beginner Level'},
 {'title': 'Web Development With HTML CSS BOOTSTRAP JQUERY for Beginners',
  'url': 'https://www.udemy.com/web-development-with-html-css-bootstrap-jquery-for-beginners/',
  'instructionalLevel': 'Beginner Level'},
 {'title': 'Core HTML How to get online quickly HTML to HTML5 ',
  'url': 'https://www.udemy.com/web-development-introduction-html-html5-course/',
  'instructionalLevel': 'Beginner Level'},
 {'title': 'Introduction to Web Development: HTML',
  'url': 'https://www.udemy.com/we