In [2]:
import pandas as pd
import string
import re
import nltk
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
import itertools
from math import isnan

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/utkarsh.verma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/utkarsh.verma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Data for the model
articles = pd.read_csv (r'archiveNLP/shared_articles.csv')
users = pd.read_csv (r'archiveNLP/users_interactions.csv')

# combining the articles and users on contentid
combined = pd.merge(articles, users,how='left', on='contentId')
#combined = combined.iloc[1:1000,]

In [4]:
# Creating Dictionary 
dict_article1 = dict(zip(combined.title , combined.text))
dict_article=dict(itertools.islice(dict_article1.items(),500))

In [5]:
# Data Preprocessing 

SPECIAL_CHARS = '[^A-Za-z0-9\d]+'
def preprocess(text):
    tokenized = [word for sent in [re.sub(SPECIAL_CHARS, '', element).split(' ') for 
                                   element in nltk.word_tokenize(text)] for word in sent]
    #tokens_without_sw = [word for word in tokenized if not word in stopwords.words()]
    lowered = [word.lower() for word in tokenized]
    return lowered

In [6]:
# Load Google's pre-trained Word2Vec model.

model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [7]:
model.init_sims(replace=True)

In [8]:
# Generating User2Title Dictionary

usertitle = combined[['title','personId']].groupby("personId")['title'].apply(lambda x: x.tolist()).to_dict()

# Generating User2Title Dictionary

title2text = {k:preprocess (v) for (k, v) in dict_article.items()}

In [9]:
def titlevec_cal(first_map,second_map):
    temp = list()
    new_vec = dict()
    for (k, v) in first_map.items():
        for w in v:
            if w in second_map.vocab and len(w)<20:
                temp.append(second_map[w])
        new_vec[k]= np.mean(temp,axis=0)    
    return new_vec 

In [10]:
def uservec_cal(first_map,second_map):
    temp = list()
    new_vec = dict()
    for (k, v) in first_map.items():
        for w in v:
            if w in second_map.keys() :
                temp.append(second_map[w])
        new_vec[k]= np.mean(temp,axis=0)    
    return new_vec 

In [11]:
titlevec=titlevec_cal(title2text,model)
Uservec=uservec_cal(usertitle,titlevec)

In [12]:
# Removing the nan value from dictionary

clean_dict = {k:[elem for elem in v if elem is not np.nan] for k,v in Uservec.items()}

In [13]:
# function to calculate Similar users

from sklearn.metrics.pairwise import cosine_similarity
def similar_users(user_vec,users):
    sim=list()
    for i,v in users.items():
        sim.append(cosine_similarity(user_vec.reshape(1, -1), v.reshape(1, -1))[0][0] )
    return sim    

In [14]:
x=similar_users(Uservec[-9.223121837663644e+18],Uservec)


In [15]:
sorted(x, key=float)[-9:]

[0.9998336, 0.9998336, 0.9998524, 0.9999914, 1.0, 1.0, 1.0, 1.0, 1.0]

In [16]:
title2text['IDEO founder David Kelley talks design, Steve Jobs, cancer, and the importance of empathy']


['cbs',
 'posted',
 'an',
 'excellent',
 'interview',
 'of',
 'david',
 'kelley',
 'this',
 'evening',
 '',
 'kelley',
 'discusses',
 'steve',
 'jobs',
 'at',
 '300',
 'and',
 'then',
 'again',
 'at',
 '740',
 '',
 'but',
 'the',
 'whole',
 'video',
 'is',
 'definitely',
 'worth',
 'a',
 'watch',
 '',
 'a',
 'longer',
 'jobs',
 'clip',
 'and',
 'the',
 'transcript',
 'is',
 'below',
 '',
 'it',
 'is',
 'a',
 'concept',
 'that',
 'had',
 'its',
 'genesis',
 'in',
 '1978',
 '',
 'when',
 'kelley',
 'and',
 'some',
 'stanford',
 'pals',
 'took',
 'the',
 'notion',
 'of',
 'mixing',
 'human',
 'behavior',
 'and',
 'design',
 'and',
 'started',
 'the',
 'company',
 'that',
 'would',
 'eventually',
 'become',
 'ideo',
 '',
 'one',
 'of',
 'their',
 'first',
 'clients',
 'was',
 'the',
 'owner',
 'of',
 'a',
 'fastgrowing',
 'personal',
 'computer',
 'manufacturer',
 'by',
 'the',
 'name',
 'of',
 'steve',
 'jobs',
 '',
 'david',
 'kelley',
 '',
 'he',
 'made',
 'ideo',
 '',
 'because',
 'he'

In [17]:
dict_article['IDEO founder David Kelley talks design, Steve Jobs, cancer, and the importance of empathy']


'CBS posted an excellent interview of David Kelley this evening. Kelley discusses Steve Jobs at 3:00 and then again at 7:40, but the whole video is definitely worth a watch. A longer Jobs clip and the transcript is below: It is a concept that had its genesis in 1978, when Kelley and some Stanford pals took the notion of mixing human behavior and design and started the company that would eventually become IDEO. One of their first clients was the owner of a fast-growing personal computer manufacturer by the name of Steve Jobs. David Kelley: He made IDEO. Because he was such a good client. We did our best work for him. We became friends and he\'d call me at 3 o\'clock in the morning. Charlie Rose: At 3 a.m.? David Kelley: Yeah, we were both bachelors so he knew he could call me, right? So he\'d call me at 3 o\'clock and he\'d just like, with no preamble, say, "Hey, it\'s Steve." First, I knew if it was 3 o\'clock in the morning, it was him. There was no preamble. And he\'d just start- and

In [16]:
titlevec['IDEO founder David Kelley talks design, Steve Jobs, cancer, and the importance of empathy']

array([ 0.0187978 ,  0.02755446,  0.03082108,  0.08414944, -0.06050459,
       -0.00446651,  0.03056705, -0.06865846,  0.06067567,  0.0459625 ,
       -0.05670228, -0.09718478, -0.03218032,  0.01864124, -0.09078405,
        0.06107385,  0.05197671,  0.09060356,  0.00245518, -0.03630758,
       -0.03516839,  0.01928091,  0.00056261,  0.01595686,  0.02263595,
       -0.01612492, -0.08167408,  0.06536686, -0.00046865, -0.01875781,
       -0.00406033,  0.00423141, -0.03360781, -0.02252589,  0.00691593,
       -0.01770249, -0.00407463, -0.00927517,  0.05780309,  0.03115172,
        0.06065028, -0.01827352,  0.09166411,  0.01095194, -0.01659064,
       -0.05043962, -0.0379487 ,  0.00572253, -0.00594457,  0.03361903,
       -0.00199291,  0.04135054, -0.01797899, -0.02144322, -0.01523225,
        0.0538193 , -0.04883977, -0.06041277,  0.03819546, -0.05442293,
       -0.03895511,  0.06473847, -0.08369755, -0.06173811, -0.02723093,
       -0.02492643, -0.04472785,  0.07779798, -0.03304167,  0.05