This notebook mainly focus on how to use Word2Vec.

So basically,there are two ways of using word2vec

1. Use pretrained embedddings

2. Create manually depending on your content

## USE PRE-TRAINED EMBEDDINGS

In [81]:
import gensim.downloader as api

wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [83]:
wiki_embeddings['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [84]:
len(wiki_embeddings['king'])

100

In [85]:
wiki_embeddings.most_similar('king')

[('prince', 0.7682328820228577),
 ('queen', 0.7507690787315369),
 ('son', 0.7020888328552246),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175),
 ('throne', 0.6919989585876465),
 ('kingdom', 0.6811409592628479),
 ('father', 0.6802029013633728),
 ('emperor', 0.6712858080863953),
 ('ii', 0.6676074266433716)]

In [109]:
from scipy import spatial

cosine_similarity = lambda x,y: 1 - spatial.distance.cosine(x,y)

def find_similar_words(embeddings, new_vector):
    
    computed_similarities = []
    
    available_words = wiki_embeddings.index_to_key
    
    for word in available_words:
        v1 = embeddings[word]
        similarity = cosine_similarity(v1,new_vector)
        computed_similarities.append((word,similarity))
        
    sorted_similarities = sorted(computed_similarities,key = lambda x:-x[1])
    
    top_5_similar_words = [i[0] for i in sorted_similarities[0:5]]
    
    return top_5_similar_words
        
    

In [106]:
cosine_similarity(wiki_embeddings['warm'],wiki_embeddings['hot'])

0.6978106498718262

In [110]:
new_vector = wiki_embeddings['king'] - wiki_embeddings['man'] + wiki_embeddings['woman']

print(find_similar_words(wiki_embeddings,new_vector))

['king', 'queen', 'monarch', 'throne', 'daughter']


## ===============================================================

## ===============================================================

## MANUAL TRAIN

In [1]:
import pandas as pd
import numpy as np
import gensim
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('spam.csv',encoding='latin-1')

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [6]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)

In [7]:
df.columns = ['label','text']

In [8]:
df['text_cleaned'] = df['text'].apply(lambda x : gensim.utils.simple_preprocess(x))

In [9]:
df.head()

Unnamed: 0,label,text,text_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."


In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['text_cleaned'],
                                                    df['label'], test_size=0.2)

In [11]:
w2v_model = gensim.models.Word2Vec(X_train, min_count=2,vector_size= 100, window= 5)

In [12]:
w2v_model.wv['king']

array([-0.0205045 ,  0.05365179,  0.01489247, -0.0183033 ,  0.02203726,
       -0.12637927,  0.036799  ,  0.14808877, -0.04618878, -0.0425126 ,
       -0.03814064, -0.09280475,  0.00620271,  0.03084498,  0.02735333,
       -0.03231999,  0.01190963, -0.07291749, -0.0035638 , -0.14075959,
        0.05446997,  0.03892641,  0.03451211, -0.02152217, -0.00127978,
        0.00117782, -0.03681574, -0.01903795, -0.04066343,  0.00361291,
        0.04974056,  0.031753  ,  0.05143105, -0.05182542, -0.01480591,
        0.07828163,  0.0247078 , -0.05979678, -0.03966893, -0.11662914,
       -0.00379102, -0.06532697, -0.00861912,  0.00850444,  0.07078084,
       -0.0305617 , -0.04925452, -0.00786092,  0.0363782 ,  0.02567681,
        0.05085293, -0.05620576, -0.0155253 ,  0.00563988, -0.0197465 ,
        0.03479912,  0.0221549 , -0.0119337 , -0.06157551,  0.03844644,
        0.02864727, -0.00434565,  0.00095439,  0.01757558, -0.09153175,
        0.06214089,  0.01362321,  0.08902425, -0.09478061,  0.10

In [13]:
len(w2v_model.wv['king'])

100

In [14]:
all_words = w2v_model.wv.index_to_key
len(all_words)

3417

In [15]:
w2v_model.wv.most_similar('king')

[('would', 0.9944824576377869),
 ('ya', 0.9944550395011902),
 ('big', 0.9943901896476746),
 ('hey', 0.9943796992301941),
 ('see', 0.9943765997886658),
 ('about', 0.9943297505378723),
 ('all', 0.9943192005157471),
 ('missing', 0.9943186044692993),
 ('its', 0.9943153262138367),
 ('wid', 0.9943034648895264)]

### Preprocessing for ML

In [16]:
X_test.shape

(1115,)

In [20]:
print(X_test.iloc[0],"----------------",len(X_test.iloc[0]),end="\n========\n")
print(X_test.iloc[1],"----------------",len(X_test.iloc[1]))

['double', 'mins', 'double', 'txt', 'price', 'linerental', 'on', 'latest', 'orange', 'bluetooth', 'mobiles', 'call', 'mobileupd', 'for', 'the', 'very', 'latest', 'offers', 'or', 'call', 'optout', 'lf'] ---------------- 22
['for', 'ur', 'chance', 'to', 'win', 'cash', 'every', 'wk', 'txt', 'action', 'to', 'www', 'movietrivia', 'tv', 'custcare', 'wk'] ---------------- 16


In [None]:
# test_w2v = np.array([[w2v_model.wv[w] for w in txt if w in all_words] for txt in X_test])
# print(test_w2v.shape)       (1115,)
# print(type(test_w2v[0]))     list
# print(len(test_w2v[0]))       21 i.e the no. of tokens in first sentence or might be less than that depending on the
#                                     availability in vocab


In [36]:
test_w2v =  np.array([np.array([w2v_model.wv[w] for w in txt if w in all_words]) for txt in X_test])

  test_w2v =  np.array([np.array([w2v_model.wv[w] for w in txt if w in all_words]) for txt in X_test])


In [37]:
test_w2v.shape

(1115,)

In [38]:
test_w2v[0].shape

(21, 100)

In [39]:
test_w2v[1].shape

(15, 100)

We can see that every row of test_w2v has different shapes. But in order to fit into a ML model,its size must be same. So we would try to convert it into a fized size of shape 100 by averaging them along axis =1

In [40]:
test_w2v_avg = []

for vect in test_w2v:
    if len(vect)!=0:
        test_w2v_avg.append(vect.mean(axis=0))
    else:
        test_w2v_avg.append(np.zeros(100))

In [42]:
test_w2v_avg = np.array(test_w2v_avg)

In [43]:
test_w2v_avg.shape

(1115, 100)

Here, averaging the vectors of the words might lead to loss in information. So we can also opt for doc2vec.