# UDEMY ML MODELS

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import neattext as ntx

In [4]:
dir(ntx)

['AUTOMATED_READ_INDEX',
 'BTC_ADDRESS_REGEX',
 'CONTRACTIONS_DICT',
 'CURRENCY_REGEX',
 'CURRENCY_SYMB_REGEX',
 'Callable',
 'Counter',
 'CreditCard_REGEX',
 'DATE_REGEX',
 'EMAIL_REGEX',
 'EMOJI_REGEX',
 'FUNCTORS_WORDLIST',
 'HASTAG_REGEX',
 'HTML_TAGS_REGEX',
 'List',
 'MASTERCard_REGEX',
 'MD5_SHA_REGEX',
 'MOST_COMMON_PUNCT_REGEX',
 'NUMBERS_REGEX',
 'PHONE_REGEX',
 'PUNCT_REGEX',
 'PoBOX_REGEX',
 'SPECIAL_CHARACTERS_REGEX',
 'STOPWORDS',
 'STOPWORDS_de',
 'STOPWORDS_en',
 'STOPWORDS_es',
 'STOPWORDS_fr',
 'STOPWORDS_ru',
 'STOPWORDS_yo',
 'STREET_ADDRESS_REGEX',
 'TextCleaner',
 'TextExtractor',
 'TextFrame',
 'TextMetrics',
 'TextPipeline',
 'Tuple',
 'URL_PATTERN',
 'USER_HANDLES_REGEX',
 'VISACard_REGEX',
 'ZIP_REGEX',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'clean_text',
 'defaultdict',
 'digit2words',
 'emoji_explainer',
 'emojify',
 'explainer',
 'extract_btc_address',
 

## Loading & Inspecting Data

In [5]:
df = pd.read_csv('udemy_courses-cleaned.csv')

In [6]:
df.sample(2)

Unnamed: 0,published_timestamp,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp.1,subject,year,month,day,quarter,profit,duration_category,price_category
3554,2014-01-21 03:56:12+00:00,149042,JavaScript for Absolute Beginners,https://www.udemy.com/javascript-for-absolute-...,True,20,10689,394,17,Beginner Level,2.0,2014-01-21 03:56:12+00:00,Web Development,2014,January,Tuesday,1,213780,3:7,20:50
2354,2016-03-29 00:11:57+00:00,806642,Blues Master - Professional Techniques Piano C...,https://www.udemy.com/blues-master-professiona...,True,75,4,0,16,All Levels,1.0,2016-03-29 00:11:57+00:00,Musical Instruments,2016,March,Tuesday,1,300,0:1,55:100


## Text Pre-processing

##### Text Cleaning

In [7]:
df['course_title']#[0]

0                      Ultimate Investment Banking Course
1       Complete GST Course & Certification - Grow You...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro - Financial Analysis in Excel ...
4            How To Maximize Your Profits Trading Options
                              ...                        
3667    Learn jQuery from Scratch - Master of JavaScri...
3668    How To Design A WordPress Website With No Codi...
3669                        Learn and Build using Polymer
3670    CSS Animations: Create Amazing Effects on Your...
3671    Using MODX CMS to Build Websites: A Beginner's...
Name: course_title, Length: 3672, dtype: object

In [8]:
df['course_title'][2]

'Financial Modeling for Business Analysts and Consultants'

In [9]:
df['course_title_cleaned'] = df['course_title'].apply(ntx.remove_stopwords)

In [10]:
df['course_title_cleaned'] = df['course_title'].apply(ntx.remove_special_characters)

In [11]:
df.sample(2)

Unnamed: 0,published_timestamp,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,...,published_timestamp.1,subject,year,month,day,quarter,profit,duration_category,price_category,course_title_cleaned
2985,2016-03-14 22:22:28+00:00,791610,Create a business website with WordPress,https://www.udemy.com/create-a-business-websit...,True,50,1624,31,38,All Levels,...,2016-03-14 22:22:28+00:00,Web Development,2016,March,Monday,1,81200,12:20,20:50,Create a business website with WordPress
2487,2016-11-07 17:53:40+00:00,987064,Learn by Doing: Vue JS 2.0 the Right Way,https://www.udemy.com/learn-by-doing-vue-js-2-...,True,95,1087,220,32,Intermediate Level,...,2016-11-07 17:53:40+00:00,Web Development,2016,November,Monday,4,103265,7:12,55:100,Learn by Doing Vue JS 20 the Right Way


##### Text Vectorization

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
cv = CountVectorizer()

In [14]:
df['course_title_cleaned']

0                      Ultimate Investment Banking Course
1       Complete GST Course  Certification  Grow Your ...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro  Financial Analysis in Excel 2017
4            How To Maximize Your Profits Trading Options
                              ...                        
3667    Learn jQuery from Scratch  Master of JavaScrip...
3668    How To Design A WordPress Website With No Codi...
3669                        Learn and Build using Polymer
3670    CSS Animations Create Amazing Effects on Your ...
3671    Using MODX CMS to Build Websites A Beginners G...
Name: course_title_cleaned, Length: 3672, dtype: object

In [15]:
df['course_title_cleaned'].duplicated().sum()

35

In [16]:
title_matrix = cv.fit_transform(df['course_title_cleaned'].drop_duplicates()).toarray()

In [17]:
title_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
title_matrix.shape

(3637, 3676)

In [19]:
# pd.DataFrame(title_matrix)

## Building Recommender System

In [20]:
sim_matrix = cosine_similarity(title_matrix)

In [21]:
sim_matrix

array([[1.        , 0.1767767 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.1767767 , 1.        , 0.        , ..., 0.        , 0.125     ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.16903085, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.16903085, ..., 1.        , 0.        ,
        0.31622777],
       [0.        , 0.125     , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.31622777, 0.        ,
        1.        ]])

In [22]:
sim_matrix.shape

(3637, 3637)

In [23]:
# pd.DataFrame(sim_matrix)

In [24]:
# plt.figure(figsize=(30,15))
# sns.heatmap(sim_matrix)

In [25]:
df.sample(2)

Unnamed: 0,published_timestamp,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,...,published_timestamp.1,subject,year,month,day,quarter,profit,duration_category,price_category,course_title_cleaned
3370,2016-10-12 23:24:28+00:00,982328,How to Build an Autocomplete System Like Google,https://www.udemy.com/how-to-build-an-autocomp...,True,30,255,5,9,All Levels,...,2016-10-12 23:24:28+00:00,Web Development,2016,October,Wednesday,4,7650,0:1,20:50,How to Build an Autocomplete System Like Google
282,2015-11-28 06:25:44+00:00,666254,Curso Avanzado de Trading,https://www.udemy.com/curso-avanzado-de-trading/,True,125,106,11,100,All Levels,...,2015-11-28 06:25:44+00:00,Business Finance,2015,November,Saturday,4,13250,20+,105:150,Curso Avanzado de Trading


In [26]:
df['course_title']

0                      Ultimate Investment Banking Course
1       Complete GST Course & Certification - Grow You...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro - Financial Analysis in Excel ...
4            How To Maximize Your Profits Trading Options
                              ...                        
3667    Learn jQuery from Scratch - Master of JavaScri...
3668    How To Design A WordPress Website With No Codi...
3669                        Learn and Build using Polymer
3670    CSS Animations: Create Amazing Effects on Your...
3671    Using MODX CMS to Build Websites: A Beginner's...
Name: course_title, Length: 3672, dtype: object

In [27]:
df.index

RangeIndex(start=0, stop=3672, step=1)

In [34]:
course_index = pd.Series(df.index, index=df['course_title']).drop_duplicates()

In [35]:
course_index

course_title
Ultimate Investment Banking Course                                0
Complete GST Course & Certification - Grow Your CA Practice       1
Financial Modeling for Business Analysts and Consultants          2
Beginner to Pro - Financial Analysis in Excel 2017                3
How To Maximize Your Profits Trading Options                      4
                                                               ... 
Learn jQuery from Scratch - Master of JavaScript library       3667
How To Design A WordPress Website With No Coding At All        3668
Learn and Build using Polymer                                  3669
CSS Animations: Create Amazing Effects on Your Website         3670
Using MODX CMS to Build Websites: A Beginner's Guide           3671
Length: 3672, dtype: int64

In [36]:
title = 'How To Maximize Your Profits Trading Options'

In [37]:
course_index[title]

4

In [40]:
sim_matrix[course_index[title]]

array([0.        , 0.13363062, 0.        , ..., 0.        , 0.13363062,
       0.13363062])

In [43]:
scores = list(enumerate(sim_matrix[course_index[title]]))

In [44]:
scores

[(0, 0.0),
 (1, 0.13363062095621217),
 (2, 0.0),
 (3, 0.13363062095621217),
 (4, 0.9999999999999998),
 (5, 0.1259881576697424),
 (6, 0.13363062095621217),
 (7, 0.13363062095621217),
 (8, 0.26726124191242434),
 (9, 0.1259881576697424),
 (10, 0.1259881576697424),
 (11, 0.3380617018914066),
 (12, 0.0),
 (13, 0.1259881576697424),
 (14, 0.2519763153394848),
 (15, 0.1259881576697424),
 (16, 0.0),
 (17, 0.13363062095621217),
 (18, 0.2519763153394848),
 (19, 0.0),
 (20, 0.2182178902359924),
 (21, 0.2519763153394848),
 (22, 0.1259881576697424),
 (23, 0.1543033499620919),
 (24, 0.11952286093343936),
 (25, 0.11952286093343936),
 (26, 0.0),
 (27, 0.1091089451179962),
 (28, 0.0),
 (29, 0.3585685828003181),
 (30, 0.3779644730092272),
 (31, 0.0),
 (32, 0.1091089451179962),
 (33, 0.28571428571428564),
 (34, 0.0),
 (35, 0.26726124191242434),
 (36, 0.20965696734438366),
 (37, 0.14285714285714282),
 (38, 0.1259881576697424),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.5976143046671969),
 (44,

In [46]:
scores[1][1]

0.13363062095621217

In [48]:
# for i in scores:
#     print(scores[i][1]) # Not working

In [50]:
sorted_selected_course = sorted(scores, key= lambda x: x[1], reverse=True)

In [51]:
sorted_selected_course

[(4, 0.9999999999999998),
 (43, 0.5976143046671969),
 (461, 0.5714285714285713),
 (59, 0.5698028822981898),
 (414, 0.5669467095138407),
 (67, 0.5345224838248487),
 (116, 0.50709255283711),
 (385, 0.50709255283711),
 (111, 0.5039526306789696),
 (165, 0.5039526306789696),
 (1116, 0.47809144373375745),
 (68, 0.4629100498862757),
 (86, 0.4629100498862757),
 (1883, 0.4629100498862757),
 (145, 0.4558423058385518),
 (203, 0.4364357804719848),
 (408, 0.4364357804719848),
 (428, 0.4364357804719848),
 (185, 0.4285714285714285),
 (220, 0.4285714285714285),
 (692, 0.4285714285714285),
 (1818, 0.4285714285714285),
 (3543, 0.4285714285714285),
 (1764, 0.4193139346887673),
 (648, 0.40089186286863654),
 (699, 0.40089186286863654),
 (1120, 0.40089186286863654),
 (1127, 0.40089186286863654),
 (1354, 0.40089186286863654),
 (1517, 0.40089186286863654),
 (2839, 0.40089186286863654),
 (3022, 0.40089186286863654),
 (804, 0.3903600291794132),
 (1897, 0.3903600291794132),
 (2502, 0.3903600291794132),
 (30, 0.3

In [64]:
# for i in sorted_selected_course[1:]:
#     print(i[0])

In [58]:
selected_index = [i[0] for i in sorted_selected_course[1:]]

In [59]:
selected_scores = [i[1] for i in sorted_selected_course[1:]]

In [63]:
df.iloc[selected_index]['course_title'].head().values

array(['Options Trading - How to Win with Weekly Options',
       'Options Trading Foundation: Your journey to competency...',
       'How to Buy Cheap Options - Options Trading Pricing Model',
       'How to trade options', 'How to Win 97% of Your Options Trades'],
      dtype=object)

In [None]:
# 'How To Maximize Your Profits Trading Options'

In [67]:
def my_rec_sys(my_title):
    course_index = pd.Series(df.index, index=df['course_title']).drop_duplicates()
    title = my_title
    course_index[title]
    scores = list(enumerate(sim_matrix[course_index[title]])) 
    sorted_selected_course = sorted(scores, key= lambda x: x[1], reverse=True)
    selected_index = [i[0] for i in sorted_selected_course[1:]]
    selected_scores = [i[1] for i in sorted_selected_course[1:]]
    rec_courses = df.iloc[selected_index]['course_title'].head().values
    return rec_courses