## Basics on Word2Vec

In [1]:
import pandas as pd
import gensim
import gensim.downloader as api

In [3]:
# Load the pre-trained "glove-wiki-gigaword-100" vectors through the gensim downloader API.
# NOTE(review): going by the model name these are GloVe vectors rather than Word2Vec —
# confirm the section title is intentional.
wiki_embeddings = api.load("glove-wiki-gigaword-100")

In [4]:
# Look up the embedding vector stored for the word "king".
wiki_embeddings['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [5]:
# Find the words most similar to "king" based on their numeric representation;
# the output pairs each neighbouring word with its similarity score.
wiki_embeddings.most_similar('king')

[('prince', 0.7682329416275024),
 ('queen', 0.7507690787315369),
 ('son', 0.7020887732505798),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175),
 ('throne', 0.691999077796936),
 ('kingdom', 0.6811410188674927),
 ('father', 0.680202841758728),
 ('emperor', 0.6712858080863953),
 ('ii', 0.6676074266433716)]

In [7]:
# Same nearest-neighbour query, this time for the word "senate".
wiki_embeddings.most_similar('senate')

[('congressional', 0.8280037641525269),
 ('republican', 0.8142463564872742),
 ('republicans', 0.8140667676925659),
 ('democrats', 0.8006194829940796),
 ('senator', 0.7951719760894775),
 ('congress', 0.7897807359695435),
 ('gop', 0.7813129425048828),
 ('legislature', 0.7724141478538513),
 ('lawmakers', 0.7692404389381409),
 ('sen.', 0.7573816180229187)]

### Training a Word2Vec Model

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import string
from gensim.models import Word2Vec
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_fscore_support

In [16]:
# Read the spam data from the CSV in the working directory, decoding it as latin-1.
spamData = pd.read_csv('spam.csv', encoding='latin-1')
# English stopwords from NLTK and the ASCII punctuation characters; both names are
# read as globals by cleanData further down.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded already — confirm.
list_stopwords = list(stopwords.words("english"))
punctuations = string.punctuation

In [17]:
# Drop the three 'Unnamed: N' columns so only the label and text columns remain.
# errors='ignore' keeps the cell idempotent: because the result is assigned back to
# `spamData`, running the cell a second time would otherwise raise KeyError (the
# columns are already gone).
spamData = spamData.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [18]:
# Report (rows, columns) of the frame after the column drop above.
print("Shape Of the dataset is : ", spamData.shape)

Shape Of the dataset is :  (5572, 2)


In [19]:
# Preview the first five rows.
spamData.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# List the column labels that remain in the frame.
print("Columns in the dataset are : ", spamData.columns)

Columns in the dataset are :  Index(['label', 'text'], dtype='object')


In [21]:
# Class balance: count how many messages carry each label value.
print("Distribution of Spam and Ham in the dataset are : ")
spamData['label'].value_counts()

Distribution of Spam and Ham in the dataset are : 


ham     4825
spam     747
Name: label, dtype: int64

### Cleaning Data

In [23]:
# function to clean the text data...
def cleanData(text, stopword_list=None, punctuation_chars=None):
    """Lower-case ``text``, strip punctuation, tokenize it and drop stopwords.

    Parameters
    ----------
    text : str
        Raw message text.
    stopword_list : iterable of str, optional
        Tokens to discard. Defaults to the notebook-level ``list_stopwords``.
    punctuation_chars : str or iterable of str, optional
        Characters removed before tokenizing. Defaults to the notebook-level
        ``punctuations``.

    Returns
    -------
    list of str
        The surviving lower-cased tokens, in their original order. Empty
        tokens (from leading/trailing separators) are never returned.
    """
    # Fall back to the notebook globals only when the caller did not supply values,
    # so existing one-argument calls behave as before.
    if stopword_list is None:
        stopword_list = list_stopwords
    if punctuation_chars is None:
        punctuation_chars = punctuations

    # Sets give constant-time membership tests instead of rescanning a list per token.
    stopword_set = set(stopword_list)
    punctuation_set = set(punctuation_chars)

    # Iterating over a string yields single characters, so this removes punctuation
    # character by character while lower-casing the rest.
    text = "".join(char.lower() for char in text if char not in punctuation_set)

    # Bug fix: re.split needs the string to split as its second argument; the
    # original call passed only the pattern and raised TypeError. The pattern is a
    # raw string so that \W is not treated as a string escape.
    tokens = re.split(r'\W+', text)

    # Drop stopwords and the empty strings re.split produces at the string edges.
    return [token for token in tokens if token and token not in stopword_set]

In [26]:
# Tokenize every message with gensim's simple_preprocess; the function is handed to
# apply directly, so each text value is passed to it as the only argument.
raw_messages = spamData['text']
spamData['text_clean'] = raw_messages.apply(gensim.utils.simple_preprocess)

In [27]:
# Preview the frame again to check the new text_clean column of token lists.
spamData.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."


In [28]:
# Split the data into train and test (80% / 20%).
# random_state pins the shuffle so that a fresh Restart & Run All yields the same
# split; without it the trained vocabulary and every downstream output change
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    spamData['text_clean'],
    spamData['label'],
    test_size=0.2,
    random_state=42,
)

In [31]:
# Train the Word2Vec model on the tokenized training messages only.
# NOTE(review): `size` is the gensim 3.x keyword for the embedding dimensionality;
# gensim 4 renamed it to `vector_size` — confirm the pinned gensim version.
model_word2vec = Word2Vec(X_train, 
                          size=100,      # embedding dimensionality
                          window=5,      # context window passed to the trainer
                          min_count=2)   # minimum token frequency passed to the trainer

In [35]:
# Look up the vector the freshly trained model stores for "king".
# NOTE(review): presumably this fails if "king" did not make it into the training
# vocabulary (which depends on the train/test split and min_count) — confirm.
model_word2vec.wv['king']

array([-3.61793488e-02, -2.82642078e-02,  3.43020819e-02, -2.67879758e-02,
       -3.79222035e-02,  4.73653190e-02, -1.06185623e-01,  5.62766520e-03,
        2.83254776e-02, -1.80747658e-02,  4.33870479e-02, -1.24006877e-02,
       -2.14063842e-02, -2.35639587e-02,  6.71626031e-02, -3.23265791e-02,
        1.46245107e-03, -4.53999266e-02, -1.84117798e-02,  4.53259610e-02,
        4.43041772e-02, -1.16688553e-02, -5.49235847e-03, -6.08196668e-02,
        7.40364566e-02, -4.28445455e-05,  9.80522260e-02, -4.53325175e-02,
        6.93175290e-03, -3.95511538e-02,  7.85833821e-02,  2.89071482e-02,
        8.20717250e-04,  6.79363012e-02, -2.20183353e-03,  1.02490976e-01,
       -4.78897467e-02,  4.46676165e-02, -3.78106572e-02, -2.59843171e-02,
        6.99566826e-02, -6.73679039e-02, -4.52250987e-02, -2.98104472e-02,
        3.83341461e-02, -2.77774706e-02, -1.23660475e-01,  7.54287280e-03,
        2.05309764e-02,  1.16777676e-03, -1.77274793e-02,  3.23546827e-02,
        1.27530480e-02,  

In [36]:
# Find most similar words to king...
# In the recorded output every neighbour scores about 0.998 and none is related to
# "king" in meaning, unlike the pre-trained vectors queried at the top of the notebook.
# NOTE(review): this looks like the embeddings are barely differentiated on such a
# small corpus — treat these neighbours as uninformative.
model_word2vec.wv.most_similar('king')

[('liao', 0.998287558555603),
 ('play', 0.9982121586799622),
 ('hello', 0.9982022643089294),
 ('cost', 0.9981951713562012),
 ('yeah', 0.9981865882873535),
 ('car', 0.9981831312179565),
 ('before', 0.998181164264679),
 ('everything', 0.9981784820556641),
 ('already', 0.9981728792190552),
 ('those', 0.9981702566146851)]

In [38]:
# Generate the list of words the word2vec model learned word vectors for...
# This prints the whole vocabulary in one go.
# NOTE(review): `index2word` is the gensim 3.x attribute name (gensim 4 calls it
# `index_to_key`) — confirm against the installed version.
print(model_word2vec.wv.index2word)



In [39]:
# Collect, for every test sentence, the word vectors of its in-vocabulary tokens...
# Build the vocabulary lookup once as a set: `index2word` is a list, so testing
# membership against it directly rescans the whole vocabulary for every token.
w2v_vocab = set(model_word2vec.wv.index2word)

# Sentences keep different numbers of known tokens, so the result is ragged.
# Filling a pre-allocated 1-D object array element by element avoids np.array()
# trying to build a rectangular array from unequal rows (an error in recent NumPy)
# and always yields one entry per sentence.
w2v_vect = np.empty(len(X_test), dtype=object)
for sent_idx, sent_tokens in enumerate(X_test):
    # A sentence with no known tokens becomes an empty array (len 0).
    w2v_vect[sent_idx] = np.array(
        [model_word2vec.wv[tok] for tok in sent_tokens if tok in w2v_vocab]
    )

In [46]:
# Per test sentence: number of tokens vs. number of word vectors actually found.
# A smaller second number means some tokens were missing from the vocabulary.
for sent_tokens, sent_vectors in zip(X_test, w2v_vect):
    n_tokens = len(sent_tokens)
    n_vectors = len(sent_vectors)
    print(n_tokens, n_vectors)

16 15
5 4
8 8
12 11
14 12
3 2
26 25
11 11
9 7
17 17
20 19
5 5
27 25
12 12
24 20
21 20
5 4
21 20
25 22
7 7
6 6
11 11
10 10
25 21
7 7
14 13
25 24
8 8
23 23
6 6
30 30
24 24
30 30
27 26
18 18
42 39
12 10
27 24
3 3
24 23
6 6
12 10
10 8
4 3
35 28
5 5
24 24
9 7
10 9
8 7
21 21
29 29
6 6
28 26
26 25
24 20
21 19
13 10
17 14
16 15
26 24
34 30
11 9
16 15
4 3
18 18
20 15
16 12
3 3
19 19
4 3
30 30
12 11
5 5
4 4
22 21
1 1
10 9
3 3
11 11
29 28
17 13
7 7
5 5
11 10
10 10
23 18
14 12
28 23
11 10
17 17
8 8
18 18
10 10
25 20
10 10
20 18
6 5
17 13
11 11
16 13
5 5
4 3
4 4
5 3
11 11
14 14
13 12
10 9
14 11
6 6
17 17
5 5
7 7
17 13
4 4
28 27
8 7
6 6
7 7
10 9
23 22
11 10
7 6
7 7
26 23
22 19
16 14
21 17
18 18
4 4
14 11
23 23
22 21
6 6
11 10
11 10
12 12
20 20
14 12
28 27
31 28
9 8
5 5
1 0
12 8
5 5
23 22
10 10
21 20
9 9
22 18
7 7
14 11
33 30
5 5
24 20
22 20
21 20
4 3
4 4
11 8
5 5
26 21
8 8
15 15
4 4
19 19
3 2
5 5
4 4
30 30
9 8
7 7
8 8
5 4
23 19
30 25
9 9
6 6
22 17
11 11
6 4
6 4
21 21
8 7
30 28
20 16
13 11
9 7
21 9
7

In [49]:
# Compute the average of the word vectors for the words contained in the sentence.
# The fallback length and dtype are read from the trained vectors instead of
# repeating the literal 100, so the zero vector always matches the real ones.
embedding_dim = model_word2vec.wv.vector_size
embedding_dtype = model_word2vec.wv['king'].dtype if 'king' in model_word2vec.wv else np.float32

w2v_vect_avg = [
    # Sentences with no in-vocabulary token have nothing to average: use a zero vector.
    vect.mean(axis=0) if len(vect) != 0 else np.zeros(embedding_dim, dtype=embedding_dtype)
    for vect in w2v_vect
]


In [54]:
# Per test sentence: number of tokens vs. length of its averaged vector.
# The second number should be identical on every row.
for sent_tokens, avg_vector in zip(X_test, w2v_vect_avg):
    n_tokens = len(sent_tokens)
    avg_len = len(avg_vector)
    print(n_tokens, avg_len)

16 100
5 100
8 100
12 100
14 100
3 100
26 100
11 100
9 100
17 100
20 100
5 100
27 100
12 100
24 100
21 100
5 100
21 100
25 100
7 100
6 100
11 100
10 100
25 100
7 100
14 100
25 100
8 100
23 100
6 100
30 100
24 100
30 100
27 100
18 100
42 100
12 100
27 100
3 100
24 100
6 100
12 100
10 100
4 100
35 100
5 100
24 100
9 100
10 100
8 100
21 100
29 100
6 100
28 100
26 100
24 100
21 100
13 100
17 100
16 100
26 100
34 100
11 100
16 100
4 100
18 100
20 100
16 100
3 100
19 100
4 100
30 100
12 100
5 100
4 100
22 100
1 100
10 100
3 100
11 100
29 100
17 100
7 100
5 100
11 100
10 100
23 100
14 100
28 100
11 100
17 100
8 100
18 100
10 100
25 100
10 100
20 100
6 100
17 100
11 100
16 100
5 100
4 100
4 100
5 100
11 100
14 100
13 100
10 100
14 100
6 100
17 100
5 100
7 100
17 100
4 100
28 100
8 100
6 100
7 100
10 100
23 100
11 100
7 100
7 100
26 100
22 100
16 100
21 100
18 100
4 100
14 100
23 100
22 100
6 100
11 100
11 100
12 100
20 100
14 100
28 100
31 100
9 100
5 100
1 100
12 100
5 100
23 100
10 100
21 10