## Recommendation system

In [272]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [273]:
df=pd.read_csv('friends.csv')
df

Unnamed: 0,index,name,experience,interest-1,interest-2,interest-3,linkedIn,github,proficiency-1,proficiency-2,about-me
0,1,George,3,Deep Learning,Machine Learning,Natural Language Processing,500,50,Python,R,I am passionate in data science would like a c...
1,2,Suresh,2,Deep Learning,Full Stack web development,Natural Language Processing,350,20,Octave,MEAN stack,I am flexible with both deep learning and the ...
2,3,Geeta,5,Internet of things,Computer Vision,App development,800,40,Arduino,Python,I am really into real world applications of Io...
3,4,Suneeta\t,1,Frontend development,App development,Backend development,100,5,MERN stack,Flutter,I am person with more taste in web development...
4,5,Girish,10,Deep Learning,Machine Learning,Natural Language Processing,1000,100,Python,C++,I am passionate in data science would like a c...


In [274]:
df['name']

0       George
1       Suresh
2        Geeta
3    Suneeta\t
4       Girish
Name: name, dtype: object

In [275]:
#converting all integer values to string so that they can be concattinated later in the program
for column in df.columns:
    df[column]=df[column].apply(str)

In [276]:
user = 'George'
userspecs = list(np.array(df[df['name']==user])[0])[1:]
userspecs

['George',
 '3',
 'Deep Learning',
 'Machine Learning',
 'Natural Language Processing',
 '500',
 '50',
 'Python',
 'R',
 'I am passionate in data science would like a carrier in the same. I am in the 5th semester of engineering. I like data science because of the way it has changed the world. I am passionate in data science would like a carrier in the same. I am in the 5th semester of engineering. I like data science because of the way it has changed the world. I am passionate in data science would like a carrier in the same. I am in the 5th semester of engineering. I like data science because of the way it has changed the world.']

In [277]:
#a function to combine the values of the different columns into a single string
def combined_specs(row):
    return(row['experience']+" "+row['interest-1']+" "+row['interest-2']+" "+row['interest-3']+" "+row['linkedIn']+" "+row['github']+" "+row['proficiency-1']+" "+row['proficiency-2'])

#apply the function to each row in the dataframe to store the combined strings into a new column called combimed_specs
df['combined_specs']=df.apply(combined_specs,axis=1)

In [278]:
#convert a collection of data into count_matrix
count_matrix=CountVectorizer()
count_matrix=count_matrix.fit_transform(df['combined_specs'])

#convert a the count_matrix into cosine similarity matrix
cosine_sim_mat=cosine_similarity(count_matrix)
print(cosine_sim_mat)

[[1.         0.4330127  0.08703883 0.         0.80064077]
 [0.4330127  1.         0.07537784 0.3125     0.41602515]
 [0.08703883 0.07537784 1.         0.30151134 0.0836242 ]
 [0.         0.3125     0.30151134 1.         0.06933752]
 [0.80064077 0.41602515 0.0836242  0.06933752 1.        ]]


In [279]:
friends = cos_mat[df.index[df['name']==user]][0].tolist()
friends_list = []
for i,j in enumerate(friends):
    friends_list.append((i,j));
friends_list.sort(key=lambda x:x[1],reverse=True)
friends_list

[(0, 1.0000000000000002),
 (4, 0.8006407690254358),
 (1, 0.43301270189221935),
 (2, 0.08703882797784893),
 (3, 0.0)]

In [280]:
# find top 3 friends
for i,j in friends_list[1:4]:
    print(df.iloc[i]['name'])

Girish
Suresh
Geeta


## Topic modelling

In [281]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Sumukh Raju
[nltk_data]     Bhat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [282]:
documents = df[['about-me']]
documents.head()

Unnamed: 0,about-me
0,I am passionate in data science would like a c...
1,I am flexible with both deep learning and the ...
2,I am really into real world applications of Io...
3,I am person with more taste in web development...
4,I am passionate in data science would like a c...


In [283]:
my_stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer() 
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=True):
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    tweet_token_list = [lemmatizer.lemmatize(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

In [284]:
df['about-me'] = df['about-me'].apply(clean_tweet)
df['about-me']

0    passionate data science would like carrier th ...
1    flexible deep learning full stack web developm...
2    really real world application iot computer vis...
3    person taste web development frontend backend ...
4    passionate data science would like carrier th ...
Name: about-me, dtype: object

In [285]:
# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(df['about-me']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

In [286]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 10

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [287]:
model.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [290]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [291]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,development,4.1,science,0.1,science,0.1,science,0.1,computer,6.1,science,0.1,full,6.1,science,0.1,science,14.1,science,0.1
1,good,2.1,world,0.1,world,0.1,world,0.1,like,6.1,world,0.1,computer_science,3.1,world,0.1,data,14.1,world,0.1
2,also,2.1,like,0.1,like,0.1,like,0.1,iot,6.1,like,0.1,web,3.1,like,0.1,data_science,14.1,like,0.1
3,good_app,2.1,web_development,0.1,web_development,0.1,web_development,0.1,world,3.1,web_development,0.1,web_development,3.1,web_development,0.1,like,14.1,web_development,0.1
4,frontend_backend,2.1,web,0.1,web,0.1,web,0.1,real,3.1,web,0.1,flexible_deep,3.1,web,0.1,would,7.1,web,0.1
5,frontend,2.1,good,0.1,good,0.1,good,0.1,really,3.1,good,0.1,stack_web,3.1,good,0.1,passionate,7.1,good,0.1
6,efficiently_good,2.1,development,0.1,development,0.1,development,0.1,application_iot,3.1,development,0.1,stack,3.1,development,0.1,passionate_data,7.1,development,0.1
7,efficiently,2.1,world_,0.1,world_,0.1,world_,0.1,sensor_iot,3.1,world_,0.1,fledged,3.1,world_,0.1,engineering_like,7.1,world_,0.1
8,taste,2.1,computer_science,0.1,computer_science,0.1,computer_science,0.1,sensor,3.1,computer_science,0.1,fledged_computer,3.1,computer_science,0.1,engineering,7.1,computer_science,0.1
9,taste_web,2.1,computer,0.1,computer,0.1,computer,0.1,computer_vision,3.1,computer,0.1,flexible,3.1,computer,0.1,changed_world,7.1,computer,0.1
