# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In And Clean Text

In [9]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [14]:
# Load the SMS spam dataset and reduce it to two columns: label and message text
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Show up to 100 characters per cell so the long message text is readable in previews
pd.set_option('display.max_colwidth', 100)

# The file is read as latin-1; the three "Unnamed" columns are dropped right after loading
messages = (
    pd.read_csv('../../../data/spam.csv', encoding='latin-1')
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
)
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [15]:
# Tokenize every message with gensim's built-in cleaner; each row of `text_clean`
# becomes a list of lowercase tokens (punctuation, digits and one-letter tokens are
# gone in the preview output).
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [16]:
# Encode the label column as integers: ham -> 1, spam -> 0.
# With this encoding the "positive" class (label 1) is ham, so metrics that default to
# pos_label=1 describe ham detection rather than spam detection.
# NOTE(review): the column is overwritten in place, so re-running this cell maps the
# already-encoded values to NaN; re-run the load cell first if you need to repeat it.
messages['label'] = messages['label'].map({'ham': 1, 'spam': 0})

# Hold out 20% of the messages for testing. The fixed random_state makes the split, and
# therefore every result derived from it, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    messages['text_clean'], messages['label'], test_size=0.2, random_state=42)

In [45]:
# Inspect the held-out labels (1 = ham, 0 = spam); the index is the original row number
y_test

646     1
3859    1
2643    1
5331    1
709     0
       ..
3521    1
5495    1
329     1
3306    1
3472    1
Name: label, Length: 1115, dtype: int64

### Create word2vec Vectors

In [17]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [21]:
# Sanity-check the embeddings: list the 10 words whose vectors are closest to "king"
# in our trained model.
# The neighbours shown are unrelated words with near-identical similarity scores, which
# suggests this small corpus is not enough to learn meaningful word relationships.
# NOTE(review): presumably this raises a KeyError when "king" did not make it into the
# training vocabulary (it depends on the train/test split) — confirm.
w2v_model.wv.most_similar('king', topn=10)

[('fuck', 0.9879058599472046),
 ('make', 0.9878668785095215),
 ('tell', 0.9876595139503479),
 ('babe', 0.9876301884651184),
 ('unsubscribe', 0.9876028299331665),
 ('last', 0.9875856637954712),
 ('need', 0.9875347018241882),
 ('as', 0.9875059723854065),
 ('didn', 0.9874580502510071),
 ('finally', 0.9874293804168701)]

In [29]:
def average_word_vectors(token_lists, keyed_vectors, vocabulary, vector_size):
    """Turn each tokenized message into one fixed-length vector.

    For every message, the vectors of the tokens that exist in ``vocabulary`` are
    averaged element-wise. A message with no known tokens becomes an all-zero vector
    so that every row has the same length.

    Parameters
    ----------
    token_lists : iterable of list of str
        One list of tokens per message.
    keyed_vectors : mapping from token to vector
        Lookup used to fetch the vector of a known token (here ``w2v_model.wv``).
    vocabulary : set of str
        Tokens that have a vector in ``keyed_vectors``.
    vector_size : int
        Length of the zero vector used for messages without known tokens.

    Returns
    -------
    numpy.ndarray
        One row per message, in the same order as ``token_lists``.
    """
    averaged = []
    for tokens in token_lists:
        # Filter once per message; out-of-vocabulary tokens are simply ignored
        known_vectors = [keyed_vectors[token] for token in tokens if token in vocabulary]
        if known_vectors:
            averaged.append(np.mean(known_vectors, axis=0))
        else:
            averaged.append(np.zeros(vector_size))
    return np.array(averaged)


# Words the model learned a vector for (those that met min_count in the training data)
words = set(w2v_model.wv.index_to_key)

# The same averaging is applied to train and test so both share one feature space
X_train_vect = average_word_vectors(X_train, w2v_model.wv, words, w2v_model.vector_size)
X_test_vect = average_word_vectors(X_test, w2v_model.wv, words, w2v_model.vector_size)

In [43]:
# Inspect the averaged training vectors: one row per training message, one column per
# embedding dimension. All-zero rows are messages with no in-vocabulary tokens.
X_train_vect

array([[-0.1379898 ,  0.30345789,  0.08002605, ..., -0.32526428,
         0.1298307 , -0.1111047 ],
       [-0.18580902,  0.40065333,  0.10245626, ..., -0.43599811,
         0.16912405, -0.15236582],
       [-0.15680124,  0.34029946,  0.08536308, ..., -0.3679474 ,
         0.14330894, -0.12657416],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.22458875,  0.44936523,  0.14282233, ..., -0.49510252,
         0.21083243, -0.15327717],
       [-0.14549118,  0.30408415,  0.08300597, ..., -0.3312577 ,
         0.13461675, -0.11248601]])

In [42]:
# What does a single averaged message vector look like?
# This is the element-wise mean of the word vectors of the first training message
# (one 100-dimensional row), not the unaveraged per-word vectors.
X_train_vect[0]

array([-1.37989804e-01,  3.03457886e-01,  8.00260529e-02,  9.81569886e-02,
        1.02108583e-01, -5.25503039e-01,  2.69987345e-01,  6.78740084e-01,
       -3.36819261e-01, -2.32861653e-01, -1.64803624e-01, -4.65315521e-01,
       -5.48042022e-02,  9.04158279e-02,  1.64244980e-01, -2.42927864e-01,
        6.73984215e-02, -4.13614988e-01,  1.45000883e-03, -5.68342328e-01,
        1.64556950e-01,  4.45453450e-02,  2.31211200e-01, -2.25041971e-01,
       -1.40416950e-01,  7.61006922e-02, -3.62688631e-01, -1.16342537e-01,
       -1.81547865e-01,  8.45461264e-02,  3.27828497e-01, -8.95109698e-02,
        1.31067485e-01, -2.16266006e-01, -9.00142342e-02,  3.41562927e-01,
        6.25119954e-02, -1.66623235e-01, -1.77214310e-01, -4.93053019e-01,
        1.48656920e-01, -3.22483927e-01, -2.04732567e-01,  9.80710029e-04,
        2.75866896e-01, -1.03654303e-01, -2.88018554e-01,  2.10054349e-02,
        2.22932026e-01,  1.51610151e-01,  1.57469153e-01, -2.91062295e-01,
       -5.79073210e-04, -

### Fit RandomForestClassifier On Top Of Word Vectors

In [37]:
# Instantiate and fit a basic Random Forest model on top of the averaged word vectors.
# Hyperparameters are left at their defaults; only random_state is fixed so that the
# bootstrap samples and feature choices — and therefore the scores — are repeatable.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
# .values.ravel() hands scikit-learn a flat 1-D array of labels
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [39]:
# Use the trained model to make predictions on the test data.
# X_test_vect was built with the same averaging as the training vectors, so each row
# yields one predicted label (1 = ham, 0 = spam).
y_pred = rf_model.predict(X_test_vect)

In [46]:
# Score the predictions against the held-out labels.
# precision_score / recall_score treat label 1 as the positive class, and label 1 is
# "ham" in this notebook, so these two numbers describe ham detection.
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Accuracy is the share of test messages whose predicted label matches the true label
accuracy = (y_pred == y_test).mean()

scores = [round(metric, 3) for metric in (precision, recall, accuracy)]
print('Precision: {} / Recall: {} / Accuracy: {}'.format(*scores))

Precision: 0.97 / Recall: 0.988 / Accuracy: 0.963
