# Udemy Model

### Importing Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
#pip install neattext

In [4]:
import neattext as nt

In [5]:
dir(nt)

['AUTOMATED_READ_INDEX',
 'BTC_ADDRESS_REGEX',
 'CONTRACTIONS_DICT',
 'CURRENCY_REGEX',
 'CURRENCY_SYMB_REGEX',
 'Callable',
 'Counter',
 'CreditCard_REGEX',
 'DATE_REGEX',
 'EMAIL_REGEX',
 'EMOJI_REGEX',
 'FUNCTORS_WORDLIST',
 'HASTAG_REGEX',
 'HTML_TAGS_REGEX',
 'List',
 'MASTERCard_REGEX',
 'MD5_SHA_REGEX',
 'MOST_COMMON_PUNCT_REGEX',
 'NUMBERS_REGEX',
 'PHONE_REGEX',
 'PUNCT_REGEX',
 'PoBOX_REGEX',
 'SPECIAL_CHARACTERS_REGEX',
 'STOPWORDS',
 'STOPWORDS_de',
 'STOPWORDS_en',
 'STOPWORDS_es',
 'STOPWORDS_fr',
 'STOPWORDS_ru',
 'STOPWORDS_yo',
 'STREET_ADDRESS_REGEX',
 'TextCleaner',
 'TextExtractor',
 'TextFrame',
 'TextMetrics',
 'TextPipeline',
 'Tuple',
 'URL_PATTERN',
 'USER_HANDLES_REGEX',
 'VISACard_REGEX',
 'ZIP_REGEX',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'clean_text',
 'defaultdict',
 'digit2words',
 'emoji_explainer',
 'emojify',
 'explainer',
 'extract_btc_address',
 

### Loading Data

In [6]:
df = pd.read_csv('udemy_courses-cleaning.csv')

In [7]:
df['course_title']

0                                Javascript for Beginners
1                 HTML Tutorial: HTML & CSS for Beginners
2                     Become a Web Developer from Scratch
3          Learn jQuery: An In-depth Course For Beginners
4                        Become a Certified Web Developer
                              ...                        
3667                              Angular 4 Front To Back
3668    ABRSM Grade III Piano Class - Handel Sonatina ...
3669    Condor Broken Wing Butterfly Options Trading C...
3670    Forex how traders beat the markest with little...
3671    Cryptocurrency (BTC & ETH) Investment & Tradin...
Name: course_title, Length: 3672, dtype: object

In [8]:
df['course_title'][2]

'Become a Web Developer from Scratch'

### text cleaning

In [9]:
df['course_title_cleaned'] =df['course_title'].apply(nt.remove_stopwords)

In [10]:
df.sample(2)

Unnamed: 0,published_timestamp,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,...,published_timestamp.1,subject,year,months,day,quarter,profit,duration_categroy,price_Cat,course_title_cleaned
774,2015-01-14 22:25:58+00:00,370752,How to Make a Wordpress Website 2017,https://www.udemy.com/wordpress-4-for-beginners/,False,0,12293,235,18,All Levels,...,2015-01-14 22:25:58+00:00,Web Development,2015,January,Wednesday,1,0,12:20,Free,Wordpress Website 2017
1706,2015-12-08 17:12:06+00:00,607800,Introduction To The World Of Equity Finance,https://www.udemy.com/equity-finance-basics/,True,20,29,9,32,All Levels,...,2015-12-08 17:12:06+00:00,Business Finance,2015,December,Tuesday,4,580,7:12,20:50,Introduction World Equity Finance


### text pre-processing

In [11]:
df['course_title_cleaned'] =df['course_title'].apply(nt.remove_special_characters)

In [12]:
df.sample(2)

Unnamed: 0,published_timestamp,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,...,published_timestamp.1,subject,year,months,day,quarter,profit,duration_categroy,price_Cat,course_title_cleaned
2283,2016-05-27 17:16:58+00:00,754524,Fundraising Success: The Art & Science of Majo...,https://www.udemy.com/fundraising-success-the-...,True,50,38,9,25,Intermediate Level,...,2016-05-27 17:16:58+00:00,Business Finance,2016,May,Friday,2,1900,12:20,20:50,Fundraising Success The Art Science of Major ...
913,2015-03-13 18:53:07+00:00,311638,Joomla 3 : Develop a Professional Website in 3...,https://www.udemy.com/develop-a-professional-h...,True,35,1309,74,43,All Levels,...,2015-03-13 18:53:07+00:00,Web Development,2015,March,Friday,1,45815,20:30,20:50,Joomla 3 Develop a Professional Website in 3 ...


### text vectorization

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
cv = CountVectorizer()

In [15]:
df['course_title_cleaned']

0                                Javascript for Beginners
1                   HTML Tutorial HTML  CSS for Beginners
2                     Become a Web Developer from Scratch
3            Learn jQuery An Indepth Course For Beginners
4                        Become a Certified Web Developer
                              ...                        
3667                              Angular 4 Front To Back
3668    ABRSM Grade III Piano Class  Handel Sonatina i...
3669    Condor Broken Wing Butterfly Options Trading C...
3670    Forex how traders beat the markest with little...
3671    Cryptocurrency BTC  ETH Investment  Trading Co...
Name: course_title_cleaned, Length: 3672, dtype: object

In [16]:
df['course_title_cleaned'].duplicated().sum()

35

In [17]:
title_matrix =cv.fit_transform(df['course_title_cleaned'].drop_duplicates()).toarray()

In [18]:
title_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
title_matrix.shape

(3637, 3676)

### Building Recommender system

In [20]:
sim_matrix= cosine_similarity(title_matrix)

In [21]:
sim_matrix

array([[1.        , 0.40824829, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.40824829, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.26726124],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.26726124, 0.        ,
        1.        ]])

In [22]:
sim_matrix.shape

(3637, 3637)

In [23]:
df.sample(2)

Unnamed: 0,published_timestamp,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,...,published_timestamp.1,subject,year,months,day,quarter,profit,duration_categroy,price_Cat,course_title_cleaned
2550,2016-08-26 22:57:33+00:00,824034,Technical Analysis: Understanding Price Action,https://www.udemy.com/technical-analysis-under...,True,45,39,6,37,All Levels,...,2016-08-26 22:57:33+00:00,Business Finance,2016,August,Friday,3,1755,12:20,20:50,Technical Analysis Understanding Price Action
3505,2017-05-28 23:56:09+00:00,1226008,Superb jQuery Course - Become Certified jQuery...,https://www.udemy.com/superb-jquery-course/,True,95,817,6,32,All Levels,...,2017-05-28 23:56:09+00:00,Web Development,2017,May,Sunday,2,77615,3:7,55:100,Superb jQuery Course Become Certified jQuery ...


In [24]:
course_index=pd.Series(df.index, index=df['course_title']).drop_duplicates()

In [25]:
course_index

course_title
Javascript for Beginners                                           0
HTML Tutorial: HTML & CSS for Beginners                            1
Become a Web Developer from Scratch                                2
Learn jQuery: An In-depth Course For Beginners                     3
Become a Certified Web Developer                                   4
                                                                ... 
Angular 4 Front To Back                                         3667
ABRSM Grade III Piano Class - Handel Sonatina in G 2017-2018    3668
Condor Broken Wing Butterfly Options Trading Course System      3669
Forex how traders beat the markest with little experience       3670
Cryptocurrency (BTC & ETH) Investment & Trading Course 2017     3671
Length: 3672, dtype: int64

In [26]:
title = 'Become a Certified Web Developer'

In [27]:
course_index[title]

4

In [28]:
sim_matrix[4]

array([0.        , 0.        , 0.67082039, ..., 0.        , 0.        ,
       0.        ])

In [29]:
scores = list(enumerate(sim_matrix[4]))

In [30]:
scores

[(0, 0.0),
 (1, 0.0),
 (2, 0.6708203932499369),
 (3, 0.0),
 (4, 1.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.25),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.1889822365046136),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.16666666666666666),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.20412414523193154),
 (37, 0.0),
 (38, 0.0),
 (39, 0.4082482904638631),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.15811388300841897),
 (48, 0.0),
 (49, 0.0),
 (50, 0.1889822365046136),
 (51, 0.0),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0),
 (60, 0.0),
 (61, 0.0),
 (62, 0.0),
 (63, 0.0),
 (64, 0.0),
 (65, 0.0),
 (66, 0.0),
 (67, 0.0),
 (68, 0.0),
 (69, 0.0),
 (70, 0.0),
 (71, 0.0),
 (72, 0.0),
 (73, 0.0),
 (74, 0.0),
 

In [31]:
sorted(scores, key =lambda x:x[1], reverse=True)

[(4, 1.0),
 (954, 0.7559289460184544),
 (2, 0.6708203932499369),
 (264, 0.6123724356957946),
 (2045, 0.5669467095138407),
 (3379, 0.5669467095138407),
 (586, 0.5303300858899106),
 (2810, 0.5303300858899106),
 (3470, 0.5303300858899106),
 (190, 0.5),
 (787, 0.5),
 (1568, 0.5),
 (3472, 0.5),
 (524, 0.4472135954999579),
 (1098, 0.4472135954999579),
 (39, 0.4082482904638631),
 (1997, 0.4082482904638631),
 (3397, 0.4082482904638631),
 (3436, 0.4082482904638631),
 (1671, 0.3779644730092272),
 (2585, 0.3779644730092272),
 (3042, 0.3779644730092272),
 (3276, 0.3779644730092272),
 (778, 0.35355339059327373),
 (940, 0.35355339059327373),
 (1345, 0.35355339059327373),
 (1565, 0.35355339059327373),
 (2247, 0.35355339059327373),
 (2267, 0.35355339059327373),
 (2536, 0.35355339059327373),
 (3118, 0.35355339059327373),
 (3596, 0.35355339059327373),
 (2396, 0.3333333333333333),
 (2490, 0.3333333333333333),
 (2669, 0.3333333333333333),
 (470, 0.31622776601683794),
 (977, 0.31622776601683794),
 (3199, 0