In [63]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
%matplotlib inline

In [64]:
os.listdir()

['.git',
 '.ipynb_checkpoints',
 'BusinessFinance.csv',
 'Course_Recommendation_System.ipynb',
 'Data.zip',
 'GraphicDesign.csv',
 'MusicInstraments.csv',
 'phoenix-buildwithai.ipynb',
 'WebDevelopment.csv']

In [65]:
#Read data
WebDev = pd.read_csv('WebDevelopment.csv')
Business = pd.read_csv('BusinessFinance.csv')
GraphicDesign = pd.read_csv('GraphicDesign.csv')
Music = pd.read_csv('MusicInstraments.csv')

In [66]:
#Add Subject column to the 4 datasets
WebDev['Subject']= 'Web Development'
Business['Subject']= 'Business'
GraphicDesign['Subject']= 'Graphic Design'
Music['Subject']= 'Music Instr'

In [67]:
#Combine all 4 datasets into a single one using concat()
data=pd.concat([WebDev,Business,GraphicDesign,Music],axis=0)

In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3683 entries, 0 to 680
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3683 non-null   int64  
 1   title                 3683 non-null   object 
 2   url                   3683 non-null   object 
 3   isPaid                3683 non-null   object 
 4   price                 3683 non-null   object 
 5   numSubscribers        3683 non-null   int64  
 6   numReviews            3683 non-null   int64  
 7   numPublishedLectures  3683 non-null   int64  
 8   instructionalLevel    3683 non-null   object 
 9   contentInfo           3683 non-null   object 
 10  publishedTime         3683 non-null   object 
 11  Unnamed: 11           0 non-null      float64
 12  Unnamed: 12           0 non-null      float64
 13  Is Paid               8 non-null      object 
 14  Total                 8 non-null      float64
 15  Percent               

## Removing Duplicates

In [69]:
#Finding number of duplicates
data.id.duplicated().sum()

6

In [70]:
#List of duplicates
pd.concat(g for _, g in data.groupby("id") if len(g) > 1)

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,Unnamed: 11,Unnamed: 12,Is Paid,Total,Percent,Subject,Column1
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,,,Yes,1067.0,89%,Web Development,
83,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,,,,,,Web Development,
39,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,37 mins,2014-04-15T21:48:55Z,,,,,,Graphic Design,
278,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,37 mins,2014-04-15T21:48:55Z,,,,,,Graphic Design,
455,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,37 mins,2016-05-16T18:28:30Z,,,,,,Business,
789,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,37 mins,2016-05-16T18:28:30Z,,,,,,Business,
780,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1 hour,2016-12-15T14:56:17Z,,,,,,Business,
897,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1 hour,2016-12-15T14:56:17Z,,,,,,Business,
465,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5 hours,2017-07-02T14:29:35Z,,,,,,Business,
1104,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5 hours,2017-07-02T14:29:35Z,,,,,,Business,


In [71]:
#Removing duplicates
data=data.drop_duplicates(subset=['id'], keep='first')

In [72]:
len(data)

3677

# Dropping Unnecessary Columns

In [73]:
data.columns

Index(['id', 'title', 'url', 'isPaid', 'price', 'numSubscribers', 'numReviews',
       'numPublishedLectures', 'instructionalLevel', 'contentInfo',
       'publishedTime', 'Unnamed: 11', 'Unnamed: 12', 'Is Paid', 'Total',
       'Percent', 'Subject', 'Column1'],
      dtype='object')

In [74]:
data_final=data[['id','title','url','Subject','instructionalLevel']]

In [75]:
data_final.isnull().sum()

id                    0
title                 0
url                   0
Subject               0
instructionalLevel    0
dtype: int64

In [76]:
#getting raw text from the URL (can be useful for our analysis)
data_final['url']=data_final['url'].replace({"https://www.udemy.com/":"",
                              "-":" ","/":""},regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final['url']=data_final['url'].replace({"https://www.udemy.com/":"",


In [77]:
data_final['corpus']=data_final['title'
                               ]+" "+data_final['url'
                                               ]+data_final['Subject'
                                                           ]+data_final['instructionalLevel']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final['corpus']=data_final['title'


In [78]:
data_final

Unnamed: 0,id,title,url,Subject,instructionalLevel,corpus
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,build beautiful html5 website,Web Development,All Levels,Learn Web Designing & HTML5/CSS3 Essentials in...
1,19603,Learning Dynamic Website Design - PHP MySQL an...,learning dynamic website design,Web Development,All Levels,Learning Dynamic Website Design - PHP MySQL an...
2,889438,ChatBots: Messenger ChatBot with API.AI and No...,chatbots,Web Development,All Levels,ChatBots: Messenger ChatBot with API.AI and No...
3,197836,Projects in HTML5,projects in html5,Web Development,Intermediate Level,Projects in HTML5 projects in html5Web Develop...
4,505208,Programming Foundations: HTML5 + CSS3 for Entr...,html css more,Web Development,Beginner Level,Programming Foundations: HTML5 + CSS3 for Entr...
...,...,...,...,...,...,...
676,513528,Curso de guitarra para principiantes.,curso de guitarra para principiantes2,Music Instr,Beginner Level,Curso de guitarra para principiantes. curso de...
677,211026,Aprende a tocar Jazz y Ritmos Latinos en la gu...,aprende a tocar jazz y ritmos latinos en la gu...,Music Instr,All Levels,Aprende a tocar Jazz y Ritmos Latinos en la gu...
678,847808,こどもギターレッスン　リトルギタリストⅢ（完結編）,takalittle3,Music Instr,Beginner Level,こどもギターレッスン　リトルギタリストⅢ（完結編） takalittle3Music Ins...
679,218856,Aprende a tocar la flauta dulce de forma senci...,aprende a tocar la flauta dulce curso en video,Music Instr,All Levels,Aprende a tocar la flauta dulce de forma senci...


# Creating a cosine similarity matrix from the corpus

In [79]:
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  TfidfVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |  
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to :class:`CountVectorizer` followed by
 |  :class:`TfidfTransformer`.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : {'filename', 'file', 'content'}, default='content'
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |  

In [105]:
def create_similarity_matrix(new_description, overall_descriptions):
#Append the new description to the overall set.
    overall_descriptions=list(overall_descriptions)
    overall_descriptions.append(new_description)
    # Define a tfidf vectorizer and remove all stopwords.
    tfidf = TfidfVectorizer(stop_words="english")
    #Convert tfidf matrix by fitting and transforming the data.
    tfidf_matrix = tfidf.fit_transform(overall_descriptions)
    # output the shape of the matrix.
    tfidf_matrix.shape
    # calculating the cosine similarity matrix.
    cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)
    return cosine_sim

In [106]:
def get_recommendations(new_description,overall_descriptions):
    cosine_sim = create_similarity_matrix(new_description,overall_descriptions)
    # Get pairwise similarity scores of all the students with new student.
    sim_scores = list(enumerate(cosine_sim[-1]))
    
    sim_scores = sorted(sim_scores,key =lambda x:x[1],reverse= True)
    # Get the scores of top 10 descriptions.
    sim_scores = sim_scores[1:10]
    # Get the student indices.
    indices = [i[0]for i in sim_scores]
    return data_final['corpus'].iloc[indices]

In [110]:
get_recommendations("guitar",data_final['corpus'])

26     Guitar: The Next Level guitar the next levelMu...
428    Guitar Essentials guitar essentialsMusic Instr...
58     Guitar Chord Riot! Learn to Play Guitar Like a...
156    Beginner Guitar Lessons: Learn To Play Guitar ...
565    Guitar for Christmas! guitar for christmasMusi...
610    Fingerpicking Blues Guitar Lessons - Bottlenec...
438    Beginner Guitar Essentials beginner guitar ess...
4      Your First 10 Guitar Lessons - Learn how to pl...
288    Guitar for Music Educators: Learning How to Te...
Name: corpus, dtype: object