In [1]:
import pandas as pd
import numpy as np

In [2]:
import pymysql

In [3]:
from rake_nltk import Rake
from random import randint

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

In [5]:
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='',
                             db='times_shiksha2')

df = pd.read_sql('SELECT * FROM mdl_course', con=connection)

In [6]:
df = df.copy()[['id', 'fullname', 'summary']]

In [7]:
summarylist = []
for line in df['summary']:
    line = line.replace('\n','').replace('\r','').replace('\t','').lower()
    summarylist.append(line)
    
df['summary'] = summarylist

In [8]:
df['fullname'] = df['fullname'].str.lower()

In [9]:
df.head()

Unnamed: 0,id,fullname,summary
0,1,timesshiksha,a great place to start
1,6,microsoft word,microsoft word is agraphicalwordprocessing pro...
2,7,microsoft excel,microsoftexcelis aspreadsheet programusedto st...
3,8,microsoft powerpoint,power pointis apresentation program softwarepa...
4,12,powerful speaking,powerful speakingis an invaluable set of skill...


In [10]:
df.isna().sum()

id          0
fullname    0
summary     0
dtype: int64

In [11]:
def concatenate_list(list):
    result= ''
    for element in list:
        result += ' ' + str(element)
    return result

In [12]:
# initializing the new column
df['Key_words'] = ""

for index, row in df.iterrows():
    plot = str(row['summary'])
    
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all puntuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary whith key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column for the corresponding movie
    row['Key_words'] = concatenate_list(list(key_words_dict_scores.keys()))
    df['Key_words'][index] = row['Key_words'].strip()
    
# dropping the Plot column
df.drop(columns = ['summary'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [13]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

In [14]:
tfidf_matrix = tf.fit_transform(df['Key_words'])

In [15]:
df.head()

Unnamed: 0,id,fullname,Key_words
0,1,timesshiksha,great place start
1,6,microsoft word,itspurpose helpful tools type save documents l...
2,7,microsoft excel,favourite feature rowssome features1 excel for...
3,8,microsoft powerpoint,display information inserted andformatted allo...
4,12,powerful speaking,thispowerful speakingcourse should1focus peopl...


In [16]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix) 
results = {}
for idx, row in df.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1] 
    similar_items = [(cosine_similarities[idx][i], df['fullname'][i]) for i in similar_indices] 
    results[row['fullname']] = {'id': row['id'], 'similarities': similar_items[1:]}

In [17]:
results

{'timesshiksha': {'id': 1,
  'similarities': [(0.03108797179007013, 'microsoft powerpoint'),
   (0.0, 'adwords essential training'),
   (0.0, 'microsoft dynamics crm customization config'),
   (0.0, 'time management'),
   (0.0, 'project management fundamentals'),
   (0.0, 'yammer 2018 essential training'),
   (0.0, 'email communication'),
   (0.0, 'google drive essential training tutorial'),
   (0.0, 'analytics essential training'),
   (0.0, 'creating a business plan'),
   (0.0, 'sales secrets for small business'),
   (0.0, 'cbt nuggets microsoft sharepoint'),
   (0.0, 'cbt nuggets - mpls fundamentals'),
   (0.0, 'cbt nuggets cloud computing'),
   (0.0, 'powerful speaking'),
   (0.0, 'microsoft excel'),
   (0.0, 'microsoft word'),
   (0.0, 'google cloud platform fundamentals'),
   (0.0, 'cloud computing'),
   (0.0, 'html'),
   (0.0, 'microsoft office 365'),
   (0.0, 'tpl - learning management system'),
   (0.0, 'stress management at workplace'),
   (0.0, 'campusnexus crm'),
   (0.0, 'l

In [23]:
corr = []
for progName, details in results.items():
#     print(details)
#     print(progName)
#     break
    corr.append({'id': details['id'], 'programName': progName, 'similarities': details['similarities']})

In [24]:
corr

[{'id': 1,
  'programName': 'timesshiksha',
  'similarities': [(0.03108797179007013, 'microsoft powerpoint'),
   (0.0, 'adwords essential training'),
   (0.0, 'microsoft dynamics crm customization config'),
   (0.0, 'time management'),
   (0.0, 'project management fundamentals'),
   (0.0, 'yammer 2018 essential training'),
   (0.0, 'email communication'),
   (0.0, 'google drive essential training tutorial'),
   (0.0, 'analytics essential training'),
   (0.0, 'creating a business plan'),
   (0.0, 'sales secrets for small business'),
   (0.0, 'cbt nuggets microsoft sharepoint'),
   (0.0, 'cbt nuggets - mpls fundamentals'),
   (0.0, 'cbt nuggets cloud computing'),
   (0.0, 'powerful speaking'),
   (0.0, 'microsoft excel'),
   (0.0, 'microsoft word'),
   (0.0, 'google cloud platform fundamentals'),
   (0.0, 'cloud computing'),
   (0.0, 'html'),
   (0.0, 'microsoft office 365'),
   (0.0, 'tpl - learning management system'),
   (0.0, 'stress management at workplace'),
   (0.0, 'campusnexus c

In [21]:
for i in corr:
    print(i['programName'])

timesshiksha
microsoft word
microsoft excel
microsoft powerpoint
powerful speaking
cbt nuggets cloud computing
cbt nuggets - mpls fundamentals
cbt nuggets microsoft sharepoint
creating a business plan
adwords essential training
analytics essential training
google drive essential training tutorial
email communication
yammer 2018 essential training
project management fundamentals
time management
microsoft dynamics crm customization config
google cloud platform fundamentals
sales secrets for small business
marketing with facebook and twitter
techniques and concepts of big data
business-intellegence features in depth
analytics
hrm
strategy - innovation
et cases
sample
complete beginners introduction to sql
excel 2016 pivot tables in depth
microsoft office 365
microsoft sharepoint
microsoft onedrive
helpful technique while operating computers
leadership
campusnexus crm
stress management at workplace
tpl - learning management system
html
cloud computing


In [30]:
corr

[{'id': 1,
  'programName': 'timesshiksha',
  'similarities': [(0.03108797179007013, 'microsoft powerpoint'),
   (0.0, 'adwords essential training'),
   (0.0, 'microsoft dynamics crm customization config'),
   (0.0, 'time management'),
   (0.0, 'project management fundamentals'),
   (0.0, 'yammer 2018 essential training'),
   (0.0, 'email communication'),
   (0.0, 'google drive essential training tutorial'),
   (0.0, 'analytics essential training'),
   (0.0, 'creating a business plan'),
   (0.0, 'sales secrets for small business'),
   (0.0, 'cbt nuggets microsoft sharepoint'),
   (0.0, 'cbt nuggets - mpls fundamentals'),
   (0.0, 'cbt nuggets cloud computing'),
   (0.0, 'powerful speaking'),
   (0.0, 'microsoft excel'),
   (0.0, 'microsoft word'),
   (0.0, 'google cloud platform fundamentals'),
   (0.0, 'cloud computing'),
   (0.0, 'html'),
   (0.0, 'microsoft office 365'),
   (0.0, 'tpl - learning management system'),
   (0.0, 'stress management at workplace'),
   (0.0, 'campusnexus c

In [38]:
for i in corr:
    if i['programName'] == 'timesshiksha':
        i['id']

1


In [39]:
def getRecommendations(userData, corr=corr):
    recommendations = []
    
    print(userData['skills'])
    for i in userData['skills']:
        for j in corr:
            
            if i.strip() in j['programName'].strip():
                for k in j['similarities']:
                    
                    if k[0] > 0:
                        if k[1] not in recommendations:
                            for l in corr:
                                if l['programName'] == k[1]:
                                    recommendations.append(dict({'id': l['id'], 'courseName': k[1]}))
                    else:
                        break       
                
    return recommendations

In [40]:
userData = {
#     'aoi': 'finance',
    'skills': ['microsoft', 'cloud']
}

In [41]:
getRecommendations(userData)

['microsoft', 'cloud']


[{'id': 8, 'courseName': 'microsoft powerpoint'},
 {'id': 43, 'courseName': 'microsoft office 365'},
 {'id': 26, 'courseName': 'microsoft dynamics crm customization config'},
 {'id': 23, 'courseName': 'yammer 2018 essential training'},
 {'id': 44, 'courseName': 'microsoft sharepoint'},
 {'id': 21, 'courseName': 'google drive essential training tutorial'},
 {'id': 22, 'courseName': 'email communication'},
 {'id': 42, 'courseName': 'excel 2016 pivot tables in depth'},
 {'id': 22, 'courseName': 'email communication'},
 {'id': 44, 'courseName': 'microsoft sharepoint'},
 {'id': 43, 'courseName': 'microsoft office 365'},
 {'id': 21, 'courseName': 'google drive essential training tutorial'},
 {'id': 45, 'courseName': 'microsoft onedrive'},
 {'id': 26, 'courseName': 'microsoft dynamics crm customization config'},
 {'id': 41, 'courseName': 'complete beginners introduction to sql'},
 {'id': 32, 'courseName': 'business-intellegence features in depth'},
 {'id': 1, 'courseName': 'timesshiksha'},
 {

In [114]:
a = 'a, v,v, c'
list(pd.Series(a.split(',')).apply(lambda x: x.strip()))

['a', 'v', 'v', 'c']

In [128]:
a = ['<a dsa>Hello, wow</a>', '<p><h1>Nope,                        "", kil</h1></p>']

In [129]:
import re
import string

def cleanHtml(sentence):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '',sentence)
    return cleantext

In [130]:
for i in a:
    print(cleanHtml(i).translate(str.maketrans('', '', string.punctuation)))

Hello wow
Nope                         kil
