In [21]:
import numpy as np
import pandas as pd

import neattext.functions as nfx


In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

In [22]:
df = pd.read_csv('udemy_courses.csv')
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance


In [23]:
df.shape

(3683, 12)

In [24]:
df.isnull().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [25]:
#important feature
df['course_title']

0                      Ultimate Investment Banking Course
1       Complete GST Course & Certification - Grow You...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro - Financial Analysis in Excel ...
4            How To Maximize Your Profits Trading Options
                              ...                        
3678    Learn jQuery from Scratch - Master of JavaScri...
3679    How To Design A WordPress Website With No Codi...
3680                        Learn and Build using Polymer
3681    CSS Animations: Create Amazing Effects on Your...
3682    Using MODX CMS to Build Websites: A Beginner's...
Name: course_title, Length: 3683, dtype: object

In [26]:
df['course_title_name'] = df['course_title'].apply(nfx.remove_stopwords)

In [27]:
df['course_title_name'] = df['course_title'].apply(nfx.remove_special_characters)

In [28]:
df[['course_title','course_title_name']]

Unnamed: 0,course_title,course_title_name
0,Ultimate Investment Banking Course,Ultimate Investment Banking Course
1,Complete GST Course & Certification - Grow You...,Complete GST Course Certification Grow Your ...
2,Financial Modeling for Business Analysts and C...,Financial Modeling for Business Analysts and C...
3,Beginner to Pro - Financial Analysis in Excel ...,Beginner to Pro Financial Analysis in Excel 2017
4,How To Maximize Your Profits Trading Options,How To Maximize Your Profits Trading Options
...,...,...
3678,Learn jQuery from Scratch - Master of JavaScri...,Learn jQuery from Scratch Master of JavaScrip...
3679,How To Design A WordPress Website With No Codi...,How To Design A WordPress Website With No Codi...
3680,Learn and Build using Polymer,Learn and Build using Polymer
3681,CSS Animations: Create Amazing Effects on Your...,CSS Animations Create Amazing Effects on Your ...


In [32]:
# Vectorize text

count_vector = CountVectorizer()
cv_mat = count_vector.fit_transform(df['course_title_name'])

In [33]:
cv_mat

<3683x3680 sparse matrix of type '<class 'numpy.int64'>'
	with 23448 stored elements in Compressed Sparse Row format>

In [34]:
cv_mat.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
df_cv_words= pd.DataFrame(cv_mat.todense(),columns=count_vector.get_feature_names())



In [37]:
df_cv_words.head()

Unnamed: 0,000005,001,01,02,10,100,101,101master,102,10k,...,zend,zero,zerotohero,zf2,zinsen,zoho,zombie,zu,zuhause,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# cosine Similarity
cosine_similar_mat = cosine_similarity(cv_mat)

In [39]:
cosine_similar_mat

array([[1.        , 0.1767767 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.1767767 , 1.        , 0.        , ..., 0.        , 0.125     ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.16903085, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.16903085, ..., 1.        , 0.        ,
        0.31622777],
       [0.        , 0.125     , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.31622777, 0.        ,
        1.        ]])

In [42]:
#import seaborn as sns
#sns.heatmap(cosine_similar_mat[0:10], annot=True)

In [44]:
course_indices = pd.Series(df.index, index=df['course_title']).drop_duplicates()

In [45]:
course_indices

course_title
Ultimate Investment Banking Course                                0
Complete GST Course & Certification - Grow Your CA Practice       1
Financial Modeling for Business Analysts and Consultants          2
Beginner to Pro - Financial Analysis in Excel 2017                3
How To Maximize Your Profits Trading Options                      4
                                                               ... 
Learn jQuery from Scratch - Master of JavaScript library       3678
How To Design A WordPress Website With No Coding At All        3679
Learn and Build using Polymer                                  3680
CSS Animations: Create Amazing Effects on Your Website         3681
Using MODX CMS to Build Websites: A Beginner's Guide           3682
Length: 3683, dtype: int64

In [47]:
course_indices['Learn and Build using Polymer']

3680

In [49]:
id_s = course_indices['Learn and Build using Polymer']

In [50]:
id_s

3680

In [86]:
scores = list(enumerate(cosine_similar_mat[id_s]))

In [83]:
scores

[(0, 0.0),
 (1, 0.0),
 (2, 0.1690308509457033),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.15811388300841894),
 (7, 0.0),
 (8, 0.15811388300841894),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.1690308509457033),
 (13, 0.0),
 (14, 0.0),
 (15, 0.14907119849998596),
 (16, 0.13483997249264842),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.3162277660168379),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.1690308509457033),
 (38, 0.14907119849998596),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.14907119849998596),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.0),
 (50, 0.13483997249264842),
 (51, 0.25819888974716115),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.15811388300841894),
 (56, 0.22360679774997896),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0),
 (60, 0.0),
 (61, 0.0),
 (62, 0.0),
 (63, 0.15811388300841894),
 (64, 

In [55]:
#score of the id_s with above search
scores[id_s]

(3680, 0.9999999999999999)

In [56]:
sorted_scores = sorted(scores, key=lambda x:x[1], reverse=True)

In [74]:
sorted_scores

[(3680, 0.9999999999999999),
 (2727, 0.50709255283711),
 (3042, 0.50709255283711),
 (3484, 0.50709255283711),
 (3560, 0.50709255283711),
 (3506, 0.47809144373375745),
 (3170, 0.47434164902525683),
 (3303, 0.47434164902525683),
 (3393, 0.47434164902525683),
 (2623, 0.4472135954999579),
 (3251, 0.4472135954999579),
 (3256, 0.4472135954999579),
 (2555, 0.44721359549995787),
 (3202, 0.44721359549995787),
 (3349, 0.44721359549995787),
 (2695, 0.4242640687119285),
 (3153, 0.4242640687119285),
 (3408, 0.4242640687119285),
 (3576, 0.4242640687119285),
 (1845, 0.40451991747794525),
 (3665, 0.40451991747794525),
 (1723, 0.39999999999999997),
 (2774, 0.39999999999999997),
 (2794, 0.39999999999999997),
 (1686, 0.3872983346207417),
 (2066, 0.3730019232961255),
 (890, 0.36514837167011077),
 (1587, 0.36514837167011077),
 (2169, 0.36514837167011077),
 (2273, 0.36514837167011077),
 (2352, 0.36514837167011077),
 (2533, 0.36514837167011077),
 (2608, 0.36514837167011077),
 (2942, 0.36514837167011077),
 (2

In [61]:
# selected courses Indices
selected_course_indices = [i[0] for i in sorted_scores[1:]]

In [79]:
selected_course_score = [i[1] for i in sorted_scores[1:]]

In [80]:
selected_course_score

[0.50709255283711,
 0.50709255283711,
 0.50709255283711,
 0.50709255283711,
 0.47809144373375745,
 0.47434164902525683,
 0.47434164902525683,
 0.47434164902525683,
 0.4472135954999579,
 0.4472135954999579,
 0.4472135954999579,
 0.44721359549995787,
 0.44721359549995787,
 0.44721359549995787,
 0.4242640687119285,
 0.4242640687119285,
 0.4242640687119285,
 0.4242640687119285,
 0.40451991747794525,
 0.40451991747794525,
 0.39999999999999997,
 0.39999999999999997,
 0.39999999999999997,
 0.3872983346207417,
 0.3730019232961255,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.36514837167011077,
 0.3380617018914066,
 0.3380617018914066,
 0.3380617018914066,
 0.3380617018914066,
 0.3380617018914066,
 0.33806170189

In [62]:
selected_course_indices

[2727,
 3042,
 3484,
 3560,
 3506,
 3170,
 3303,
 3393,
 2623,
 3251,
 3256,
 2555,
 3202,
 3349,
 2695,
 3153,
 3408,
 3576,
 1845,
 3665,
 1723,
 2774,
 2794,
 1686,
 2066,
 890,
 1587,
 2169,
 2273,
 2352,
 2533,
 2608,
 2942,
 2995,
 3072,
 3308,
 3358,
 3361,
 3386,
 3545,
 3613,
 312,
 533,
 737,
 1665,
 2024,
 2734,
 2897,
 3039,
 3270,
 3322,
 3474,
 3524,
 3577,
 3633,
 28,
 268,
 443,
 871,
 1231,
 1343,
 1460,
 1519,
 1555,
 1666,
 1743,
 2458,
 2490,
 2545,
 2565,
 2575,
 2769,
 2790,
 2837,
 2928,
 2967,
 3004,
 3046,
 3106,
 3111,
 3207,
 3302,
 3423,
 3439,
 3533,
 3535,
 3575,
 3600,
 3676,
 3682,
 1752,
 2115,
 2357,
 2507,
 2588,
 2595,
 2639,
 2658,
 2697,
 2760,
 2878,
 2900,
 2966,
 2973,
 3096,
 3107,
 3121,
 3163,
 3254,
 3330,
 3421,
 3514,
 3553,
 3557,
 3563,
 438,
 478,
 515,
 753,
 1395,
 1420,
 1517,
 1956,
 1990,
 2078,
 2153,
 2232,
 2350,
 2622,
 2645,
 2788,
 2947,
 2989,
 3044,
 3078,
 3084,
 3158,
 3185,
 3327,
 3463,
 346,
 923,
 942,
 2084,
 2253,
 

In [65]:
recommended_results = df['course_title'].iloc[selected_course_indices]

In [69]:
rec_df = pd.DataFrame(recommended_results)

In [70]:
rec_df.head()

Unnamed: 0,course_title
2727,Learn to Build Websites using Twitter Bootstrap
3042,Learn to Build Web Apps using D3JS
3484,Learn To Build Apps Using Play Framework
3560,"Learn animation using CSS3, Javascript and HTML5"
3506,Learn to build 20 websites and build 14 iOS9 a...


In [81]:
rec_df['similarity_scores'] = selected_course_score

In [82]:
rec_df

Unnamed: 0,course_title,similarity_scores
2727,Learn to Build Websites using Twitter Bootstrap,0.507093
3042,Learn to Build Web Apps using D3JS,0.507093
3484,Learn To Build Apps Using Play Framework,0.507093
3560,"Learn animation using CSS3, Javascript and HTML5",0.507093
3506,Learn to build 20 websites and build 14 iOS9 a...,0.478091
...,...,...
3671,XPATH :basics for beginners,0.000000
3673,A how to guide in HTML,0.000000
3674,Building Better APIs with GraphQL,0.000000
3679,How To Design A WordPress Website With No Codi...,0.000000


In [100]:
def recommend_courses(title,num_of_remonedation=10):
    #id for title
    id_s = course_indices[title]
    #course indice
    # search indice cosine_mat
    scores = list(enumerate(cosine_similar_mat[id_s]))
    #$score
    #sort_score
    sorted_scores = sorted(scores, key=lambda x:x[1], reverse=True)
    #recommend
    selected_course_indices = [i[0] for i in sorted_scores[1:]]
    selected_course_score = [i[1] for i in sorted_scores[1:]]
    recommended_results = df['course_title'].iloc[selected_course_indices]
    rec_df = pd.DataFrame(recommended_results)
    rec_df['similarity_scores'] = selected_course_score
    rec_df.reset_index(drop=True, inplace=True)
    print("Your courses...")
    return rec_df.head()

In [101]:
recommend_courses('XPATH :basics for beginners')

Your courses...


Unnamed: 0,course_title,similarity_scores
0,Jquery :basics for beginners,0.75
1,AJAX :basics for beginners,0.75
2,Bootstrap :basics for beginners,0.75
3,PHP :basics for beginners,0.75
4,JSON :basics for beginners,0.75
