In [1]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [60]:
df = pd.read_csv('spam.csv')
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [61]:
df['msg_clean'] = df['Message'].apply(lambda x: gensim.utils.simple_preprocess(x))

In [62]:
df.head()

Unnamed: 0,Category,Message,msg_clean
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."


In [63]:
# Encoding the label column
df['Category'] = df['Category'].map({'ham':1,'spam':0})

In [64]:
df.head()

Unnamed: 0,Category,Message,msg_clean
0,1,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,1,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,1,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,1,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."


In [65]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (df['msg_clean'], df['Category'] , test_size=0.2)

In [66]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [67]:
w2v_model.wv.index_to_key

['you',
 'to',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'are',
 'now',
 'can',
 'so',
 'or',
 'but',
 'not',
 'do',
 'we',
 'if',
 'at',
 'get',
 'will',
 'no',
 'just',
 'with',
 'be',
 'ur',
 'this',
 'up',
 'how',
 'when',
 'gt',
 'free',
 'lt',
 'all',
 'what',
 'go',
 'll',
 'from',
 'ok',
 'out',
 'know',
 'day',
 'like',
 'come',
 'am',
 'then',
 'he',
 'got',
 'there',
 'good',
 'was',
 'its',
 'only',
 'time',
 'love',
 'send',
 'text',
 'one',
 'want',
 'txt',
 'by',
 'as',
 'going',
 'stop',
 'don',
 'see',
 'she',
 'our',
 'need',
 'lor',
 'sorry',
 'about',
 'home',
 'today',
 'back',
 'still',
 'da',
 'they',
 'dont',
 'mobile',
 'take',
 'her',
 'reply',
 'later',
 'any',
 'tell',
 'dear',
 'been',
 'phone',
 'well',
 'new',
 'did',
 'hi',
 'think',
 'please',
 'week',
 'who',
 'great',
 'has',
 'here',
 'an',
 'some',
 'much',
 'pls',
 'claim',
 're',
 'night',
 'hey',
 'msg',
 'where',
 'him',
 'oh',


In [68]:
len(w2v_model.wv.index_to_key)

3371

In [69]:
w2v_model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x2167c93c610>

In [70]:
# Find the most similar words to "king" based on word vectors from our trained model
w2v_model.wv.most_similar('king')

[('fine', 0.9935320615768433),
 ('hope', 0.9934976100921631),
 ('time', 0.9932515025138855),
 ('doing', 0.9932102560997009),
 ('having', 0.9931671619415283),
 ('xmas', 0.9931668639183044),
 ('late', 0.9930840134620667),
 ('missing', 0.993073046207428),
 ('do', 0.993070662021637),
 ('life', 0.9930705428123474)]

In [71]:
words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])


In [72]:
X_train_vect[0].shape

(13, 100)

In [73]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(X_train_vect):
    print(len(X_train.iloc[i]), len(v))

13 13
12 12
8 7
22 20
17 13
8 7
11 10
8 7
21 19
25 25
24 23
6 6
13 13
9 9
26 26
6 6
24 16
12 10
7 3
22 22
44 43
18 18
17 17
11 11
24 24
30 27
4 4
23 23
25 25
77 77
27 25
7 7
27 23
30 28
16 16
4 4
20 19
6 6
7 7
10 7
26 26
25 25
11 10
19 16
7 7
46 42
9 6
8 7
1 1
23 21
7 6
5 5
27 26
18 17
20 18
5 5
9 9
30 28
4 4
23 23
23 20
12 10
3 3
8 8
6 6
21 18
12 10
27 25
6 6
12 10
7 7
15 15
7 7
20 20
24 23
5 4
26 24
6 6
3 3
13 13
7 7
10 10
17 17
9 9
6 6
5 5
11 11
18 17
14 14
22 19
5 5
6 5
10 9
4 4
10 10
5 5
14 13
19 19
8 7
21 21
2 2
26 25
12 11
2 2
21 21
5 5
4 4
12 12
6 6
12 10
19 18
5 5
11 9
31 31
5 5
20 20
4 3
28 24
6 6
1 1
8 8
23 23
13 13
6 5
12 11
5 4
3 3
8 7
5 5
22 22
8 8
9 9
35 32
4 4
12 12
6 6
24 20
22 20
11 11
4 4
32 31
22 20
28 25
19 16
5 4
16 16
24 24
11 11
21 21
29 28
7 6
46 46
8 5
22 19
4 4
25 25
6 6
31 30
26 26
4 4
23 23
27 26
22 17
5 5
7 7
24 23
11 11
6 6
8 7
8 8
5 3
5 3
25 23
8 8
10 9
8 8
18 14
15 14
22 22
11 11
28 27
5 5
6 6
27 27
7 7
26 25
8 6
27 27
23 23
8 7
2 2
5 5
22 22
9 9
12 11


16 16
23 23
23 22
5 5
13 13
9 7
5 5
9 9
16 13
5 5
10 10
22 22
24 24
7 7
29 26
8 8
8 7
12 8
8 7
20 18
23 23
7 7
6 6
22 22
7 7
20 20
19 17
8 7
47 47
7 7
9 9
5 5
7 7
21 21
19 16
11 9
11 10
6 6
11 11
4 3
13 13
30 30
25 24
4 4
23 23
21 19
5 5
11 11
8 8
23 23
8 8
27 27
9 9
6 5
17 16
3 2
9 5
12 11
30 29
31 31
9 8
24 22
6 6
6 6
8 7
1 1
6 6
11 11
9 9
12 12
5 4
18 18
25 21
15 15
9 9
6 6
5 4
38 38
7 6
17 13
7 7
4 4
5 5
3 3
11 11
27 27
5 4
13 13
6 5
7 7
13 13
30 27
5 5
8 8
16 14
6 4
8 8
17 14
35 35
21 21
29 27
27 27
15 14
8 8
8 8
9 7
17 16
11 9
31 28
8 8
12 10
5 5
7 7
8 8
5 3
5 5
56 54
23 20
4 2
22 21
31 29
30 26
21 19
3 3
16 14
6 5
9 9
9 8
10 8
15 15
4 4
9 9
16 15
19 17
8 7
16 14
7 7
42 39
24 16
13 12
11 11
24 21
3 3
6 5
13 13
16 15
26 25
13 13
12 12
15 15
30 30
7 7
28 28
7 6
5 5
22 22
25 21
15 15
6 6
23 21
6 6
11 9
31 29
36 36
43 38
16 16
22 22
4 4
8 8
23 23
22 22
7 7
8 8
16 15
5 5
5 5
14 5
65 60
31 31
13 13
2 2
24 22
5 5
8 8
26 23
5 5
15 15
17 14
7 6
10 8
9 9
6 6
9 9
22 22
9 8
21 21
5 4
5 5
31 

In [53]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [54]:
X_train_vect_avg

[array([-0.11345411,  0.31056267, -0.0739532 ,  0.138403  ,  0.07677114,
        -0.7389774 ,  0.30005726,  1.0703418 , -0.5152435 , -0.2992939 ,
        -0.17930987, -0.68584716, -0.1301287 ,  0.30504507,  0.1815455 ,
        -0.31283483,  0.20898958, -0.464072  ,  0.01254195, -1.0445825 ,
         0.15394624,  0.16447453,  0.3291494 , -0.26924446, -0.1774149 ,
         0.00461579, -0.32273322, -0.35646942, -0.3439196 ,  0.03808224,
         0.47810468, -0.08029602,  0.08720156, -0.3513367 , -0.22505413,
         0.4869344 ,  0.09851891, -0.44967633, -0.32972902, -0.66916865,
         0.15088394, -0.34718105, -0.21158639,  0.16987877,  0.2914795 ,
        -0.03058648, -0.36637583, -0.05643476,  0.2403307 ,  0.25105244,
         0.30802613, -0.25051787, -0.04490549,  0.02098401, -0.22577368,
         0.04444182,  0.29713112, -0.05207008, -0.37165087,  0.13887005,
         0.02148348,  0.17637387,  0.15065582, -0.10050351, -0.4263757 ,
         0.31470662,  0.12171485,  0.38999873, -0.6

In [55]:
X_test_vect_avg

[array([-0.11576758,  0.27637216, -0.06562949,  0.12181182,  0.07387395,
        -0.68194604,  0.26640514,  0.985887  , -0.47545698, -0.28242597,
        -0.16613406, -0.6413036 , -0.11581535,  0.2835354 ,  0.1698355 ,
        -0.27568027,  0.18317392, -0.42552355,  0.01043977, -0.9783138 ,
         0.14375907,  0.1580139 ,  0.30073133, -0.24224399, -0.16072129,
        -0.00885275, -0.29648057, -0.3274037 , -0.30866092,  0.03737387,
         0.43710142, -0.0669709 ,  0.06411816, -0.3210639 , -0.20516281,
         0.45463288,  0.08472661, -0.43079928, -0.2926087 , -0.6288039 ,
         0.13840328, -0.31678376, -0.205102  ,  0.14839777,  0.2720308 ,
        -0.01538   , -0.3361149 , -0.04228699,  0.22703482,  0.2318676 ,
         0.27012378, -0.23475116, -0.04742818,  0.04404397, -0.2129038 ,
         0.04790141,  0.26449212, -0.05210072, -0.33221433,  0.14227693,
         0.02089363,  0.16285464,  0.13403052, -0.08647265, -0.38936684,
         0.28514677,  0.11796986,  0.35124782, -0.5

In [56]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]), len(v))

7 100
10 100
13 100
6 100
7 100
13 100
12 100
21 100
16 100
7 100
10 100
9 100
18 100
3 100
3 100
22 100
26 100
14 100
20 100
16 100
30 100
5 100
36 100
23 100
20 100
15 100
28 100
13 100
19 100
17 100
10 100
10 100
3 100
13 100
5 100
4 100
14 100
3 100
44 100
23 100
30 100
25 100
15 100
31 100
20 100
5 100
17 100
25 100
4 100
9 100
1 100
6 100
34 100
23 100
6 100
9 100
9 100
4 100
3 100
11 100
8 100
44 100
22 100
19 100
19 100
6 100
4 100
14 100
5 100
7 100
14 100
6 100
9 100
55 100
3 100
10 100
9 100
15 100
7 100
2 100
4 100
8 100
33 100
26 100
6 100
4 100
24 100
8 100
4 100
6 100
13 100
7 100
8 100
8 100
11 100
19 100
14 100
21 100
7 100
14 100
10 100
12 100
4 100
8 100
7 100
24 100
5 100
29 100
7 100
5 100
11 100
4 100
1 100
7 100
17 100
3 100
7 100
7 100
7 100
15 100
14 100
58 100
27 100
7 100
17 100
11 100
11 100
21 100
29 100
4 100
10 100
11 100
4 100
5 100
11 100
18 100
6 100
14 100
12 100
5 100
4 100
24 100
16 100
25 100
6 100
5 100
13 100
9 100
9 100
5 100
18 100
26 100
10 10

4 100
20 100
11 100
22 100
8 100
16 100
26 100
5 100
7 100
15 100
2 100
21 100
12 100
13 100
24 100
13 100
4 100
17 100
13 100
19 100
26 100
9 100
9 100
26 100
9 100
4 100
10 100
8 100
6 100
24 100
28 100
6 100
30 100
6 100
14 100
4 100
23 100
24 100
2 100
9 100
4 100
8 100
30 100
14 100
11 100
9 100
18 100
7 100
5 100
25 100
13 100
9 100
14 100
18 100
22 100
5 100
29 100
7 100
6 100
15 100
17 100
23 100
14 100
8 100
1 100
16 100
25 100
9 100
29 100
1 100
25 100
86 100
7 100
43 100
19 100
4 100
24 100
7 100
12 100
22 100
22 100
6 100
19 100
24 100
14 100
5 100
18 100
4 100
12 100
15 100
5 100
11 100
4 100
29 100
13 100
4 100
4 100
5 100
4 100
23 100
11 100
6 100
31 100
5 100
11 100
5 100
11 100
18 100
21 100
23 100
7 100
31 100
19 100
7 100
21 100
11 100
5 100
13 100
3 100
9 100
21 100
5 100
8 100
28 100
15 100
18 100
16 100
27 100
21 100
9 100
0 100
8 100
24 100
19 100
3 100
8 100
15 100
6 100
77 100
6 100
7 100
29 100
17 100
5 100
29 100
6 100
12 100
21 100
8 100
12 100
22 100
36 100

8 100
25 100
23 100
5 100
6 100
11 100
8 100
23 100
8 100
8 100
6 100
8 100
6 100
20 100
14 100
12 100
17 100
27 100
5 100
6 100
25 100
14 100
5 100
1 100
22 100
22 100
22 100
24 100
23 100
8 100
13 100
11 100
11 100
12 100
14 100
11 100
6 100
7 100
10 100
9 100
9 100
2 100
13 100
20 100
6 100
13 100
3 100
5 100
7 100
4 100
8 100
12 100
16 100
3 100
18 100
22 100
22 100
17 100
3 100
3 100
14 100
21 100
23 100
20 100
9 100
8 100
23 100
9 100
17 100
19 100
14 100
17 100
16 100
17 100
29 100
5 100
18 100
20 100
13 100
7 100
8 100
9 100
16 100
7 100
55 100
22 100
29 100
27 100
6 100
6 100
15 100
11 100
6 100
9 100
9 100
7 100
18 100
5 100
9 100
6 100
5 100
19 100
8 100
13 100
19 100
16 100
5 100
13 100
7 100
10 100
34 100
23 100
23 100
8 100
101 100
31 100
11 100
6 100
4 100
30 100
17 100
7 100
7 100
6 100
25 100
21 100
6 100
30 100
18 100
23 100
10 100
4 100
13 100
17 100
15 100
8 100
3 100
7 100
7 100
4 100
15 100
24 100
1 100
17 100
9 100
9 100
6 100
27 100
19 100
6 100
9 100
24 100
10 

7 100
22 100
6 100
15 100
29 100
12 100
7 100
22 100
9 100
20 100
6 100
2 100
12 100
4 100
29 100
13 100
8 100
7 100
17 100
25 100
25 100
8 100
5 100
8 100
21 100
11 100
4 100
23 100
18 100
9 100
10 100
19 100
11 100
9 100
4 100
2 100
8 100
5 100
9 100
13 100
6 100
33 100
6 100
6 100
16 100
16 100
29 100
25 100
10 100
26 100
26 100
9 100
5 100
6 100
11 100
13 100
29 100
7 100
11 100
5 100
15 100
13 100
6 100
4 100
12 100
10 100
25 100
10 100
47 100
6 100
11 100
23 100
19 100
25 100
11 100
7 100
21 100
4 100
22 100
21 100
9 100
2 100
7 100
4 100
12 100
22 100
5 100
6 100
13 100
6 100
8 100
25 100
23 100
25 100
22 100
10 100
56 100
22 100
24 100
6 100
24 100
29 100
26 100
13 100
24 100
9 100
15 100
25 100
20 100
5 100
14 100
10 100
9 100
15 100
14 100
29 100
14 100
25 100
4 100
6 100
7 100
13 100
5 100
9 100
5 100
16 100
7 100
1 100
20 100
8 100
21 100
7 100
23 100
16 100
4 100
13 100
14 100
21 100
6 100
5 100
30 100
17 100
11 100
9 100
10 100
1 100
27 100
12 100
24 100
18 100
6 100
19 1

In [57]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [58]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [59]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.968 / Recall: 0.995 / Accuracy: 0.967
