# Model testing using embeddings computed using our own Word2Vec model

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

from gensim.models import Word2Vec
from helpers import *
from preprocessing import *

import pandas as pd
import numpy as np

_Loading model that was trained on all the training data. See file `word2vec_training.py`._

In [2]:
model = Word2Vec.load('word2vec.model')

_Let's see how many distinct words we have in the model._

In [3]:
len(model.wv.vocab)

94930

_The vectors are of size 250._

In [4]:
dim = 250

In [5]:
def toVector(tokens, model, dim):
    """
    This method convert an array of token into a vector of size dim given that word vector in model are of size dim.
    
    Attributes:
    
    tokens: arr
            tokens to consider for the outputed embedding
    
    model: gensin model
            used to convert a token to a vector of size din
            
    dim: int 
            size of the vector    
    """
    
    vector = np.zeros(dim)
    word_count = 0
    
    for token in tokens:
        if token in model.wv:
            vector += model.wv[token]
            word_count += 1
            
    if word_count == 0:
        return vector
    else:
        return vector / float(word_count)

_For testing purpose let's build model on the subset of tweets provided._

In [6]:
tweets = load_tweets(full = False)

## 1. tweets with just the tags <user\> and <url\> removed

_Let's first remove tags and tokenize the tweet._

In [7]:
tweets_1 = remove_tags(tweets)
tweets_1 = tokenize_tweets(tweets_1)
tweets_1.head()

Unnamed: 0,tweet,polarity,tokens
0,i dunno justin read my mention or not . only j...,1,"[i, dunno, justin, read, my, mention, or, not,..."
1,"because your logic is so dumb , i won't even c...",1,"[because, your, logic, is, so, dumb, i, wo, nt..."
2,just put casper in a box ! looved the battle ...,1,"[just, put, casper, in, a, box, looved, the, b..."
3,thanks sir > > don't trip lil mama ... just ke...,1,"[thanks, sir, do, nt, trip, lil, mama, just, k..."
4,visiting my brother tmr is the bestest birthda...,1,"[visiting, my, brother, tmr, is, the, bestest,..."


_Using the tokens we can now compute vector embedding._

In [8]:
tweets_1['vector'] = tweets_1['tokens'].apply(lambda tokens: toVector(tokens, model, dim))

In [83]:
tweets_1[['vector', 'polarity']].head()

Unnamed: 0,vector,polarity
0,"[-0.10623393052568038, 0.27006867496917647, 0....",1
1,"[0.3716979566961527, 0.009992905064589448, -0....",1
2,"[0.4348416645079851, 0.5073618728201836, 0.431...",1
3,"[-0.14007517136633396, -0.730455274383227, 0.3...",1
4,"[0.4355273447930813, 0.40050916727632285, 0.68...",1


_Let's obtain training and testing data._

In [84]:
train_1, test_1 = train_test_split(tweets_1[['vector', 'polarity']], test_size = 0.2)

In [85]:
print(len(train_1))

157576


In [86]:
x_train_1 = np.concatenate(train_1['vector'].values).reshape((len(train_1), dim))
y_train_1 = train_1['polarity'].values

In [87]:
x_test_1 = np.concatenate(test_1['vector'].values).reshape((len(test_1), dim))
y_test_1 = test_1['polarity'].values

We will now consider several models for classification.

### Logistic Regression

_Without regularization._

In [93]:
logistic_reg_1 = LogisticRegression(solver='lbfgs', max_iter = 500)
logistic_reg_1.fit(x_train_1, y_train_1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [122]:
print('Accuracy = {a}'.format(a = logistic_reg_1.score(x_test_1, y_test_1)))

Accuracy = 0.7754226531959182


_Tuning regularization parameter._

In [92]:
# C is the inverse of the regularization strength
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C)

# We first define a new logistic regression model
logistic_reg_1 = LogisticRegression(solver='lbfgs', max_iter = 500)
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(logistic_reg_1, hyperparameters, cv = 5)

In [95]:
best_logistic_1 = clf.fit(x_train_1, y_train_1)

In [96]:
print('Best C:', best_logistic_1.best_estimator_.get_params()['C'])

Best C: 21.544346900318832


In [123]:
print('Accuracy = {a}'.format(a = best_logistic_1.score(x_test_1, y_test_1)))

Accuracy = 0.779636492866934


### Ridge regression

_Without regularization._

In [125]:
ridge_clf_1 = RidgeClassifier().fit(x_train_1, y_train_1)
print('Accuracy = {a}'.format(a = ridge_clf_1.score(x_test_1, y_test_1)))

Accuracy = 0.7754480377722496


_Tuning regularization parameter._

In [100]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

ridge_clf_1 = RidgeClassifier()
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(ridge_clf_1, hyperparameters, cv = 5)

In [102]:
best_ridge_clf_1 = clf.fit(x_train_1, y_train_1)

In [103]:
print('Best alpha:', best_ridge_clf_1.best_estimator_.get_params()['alpha'])

Best alpha: 1.0


In [124]:
print('Accuracy = {a}'.format(a = best_ridge_clf_1.score(x_test_1, y_test_1)))

Accuracy = 0.7754480377722496


### Support Vector Machine

_Without hyperparameters tuning._

In [105]:
sgd_clf_1 = SGDClassifier(loss = 'hinge').fit(x_train_1, y_train_1)

In [126]:
print('Accuracy = {a}'.format(a = sgd_clf_1.score(x_test_1, y_test_1)))

Accuracy = 0.7726049652231304


_With hyperparameters tuning._

In [107]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

svm_clf_1 = SGDClassifier(loss = 'hinge')
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
svm_clf_1 = GridSearchCV(svm_clf_1, hyperparameters, cv = 5)

In [108]:
best_svm_clf_1 = svm_clf_1.fit(x_train_1, y_train_1)

In [109]:
print('Best alpha:', best_svm_clf_1.best_estimator_.get_params()['alpha'])

Best alpha: 1.0


In [128]:
print('Accuracy = {a}'.format(a = best_svm_clf_1.score(x_test_1, y_test_1)))

Accuracy = 0.6432197796618775


## 2. tweets with the tags <user\> and <url\> as well as english stop words removed

_Let's first remove tags and tokenize the tweets with stop words removal._

In [21]:
tweets_2 = remove_tags(tweets)
tweets_2 = tokenize_tweets(tweets_2, stop_words = True)
tweets_2.head()

Unnamed: 0,tweet,polarity,tokens
0,i dunno justin read my mention or not . only j...,1,"[dunno, justin, read, mention, justin, god, kn..."
1,"because your logic is so dumb , i won't even c...",1,"[logic, dumb, wo, nt, even, crop, name, photo,..."
2,just put casper in a box ! looved the battle ...,1,"[put, casper, box, looved, battle, crakkbitch]"
3,thanks sir > > don't trip lil mama ... just ke...,1,"[thanks, sir, nt, trip, lil, mama, keep, doin,..."
4,visiting my brother tmr is the bestest birthda...,1,"[visiting, brother, tmr, bestest, birthday, gi..."


_We then obtain tweets embeddings._

In [22]:
tweets_2['vector'] = tweets_2['tokens'].apply(lambda tokens: toVector(tokens, model, dim))

In [111]:
tweets_2[['vector', 'polarity']].head()

Unnamed: 0,vector,polarity
0,"[-0.2076540985568003, 0.03211568295955658, 0.8...",1
1,"[-0.1722315645052327, 0.040821442380547523, -0...",1
2,"[-0.010814150795340538, 0.20931822061538696, 1...",1
3,"[-0.2327376276254654, -0.8353355824947357, 0.4...",1
4,"[0.1854836568236351, 0.673973988209452, 1.2295...",1


In [112]:
train_2, test_2 = train_test_split(tweets_2[['vector', 'polarity']], test_size = 0.2)

In [113]:
print(len(train_2))

157576


In [114]:
x_train_2 = np.concatenate(train_2['vector'].values).reshape((len(train_2), dim))
y_train_2 = train_2['polarity'].values

In [115]:
x_test_2 = np.concatenate(test_2['vector'].values).reshape((len(test_2), dim))
y_test_2 = test_2['polarity'].values

### Logistic Regression

_Without regularization._

In [131]:
logistic_reg_2 = LogisticRegression(solver='lbfgs', max_iter = 1000)
logistic_reg_2.fit(x_train_2, y_train_2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [132]:
print('Accuracy = {a}'.format(a = logistic_reg_2.score(x_test_2, y_test_2)))

Accuracy = 0.7669188201248921


_Tuning regularization parameter._

In [118]:
# C is the inverse of the regularization strenght
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C)

logistic_reg_2 = LogisticRegression(solver='lbfgs', max_iter = 500)
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(logistic_reg_2, hyperparameters, cv = 5)

In [119]:
best_logistic_2 = clf.fit(x_train_2, y_train_2)

In [120]:
print('Best C:', best_logistic_2.best_estimator_.get_params()['C'])

Best C: 1291.5496650148827


In [133]:
print('Accuracy = {a}'.format(a = best_logistic_2.score(x_test_2, y_test_2)))

Accuracy = 0.766969589277555


### Ridge regression

_Without regularization._

In [134]:
ridge_clf_2 = RidgeClassifier().fit(x_train_2, y_train_2)
print('Accuracy = {a}'.format(a = ridge_clf_2.score(x_test_2, y_test_2)))

Accuracy = 0.7657257450373153


_Tuning regularization parameter._

In [43]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

ridge_clf_2 = RidgeClassifier()
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(ridge_clf_2, hyperparameters, cv = 5)

In [44]:
best_ridge_clf_2 = clf.fit(x_train_2, y_train_2)

In [45]:
print('Best alpha:', best_ridge_clf_2.best_estimator_.get_params()['alpha'])

Best alpha: 59.94842503189409


In [135]:
print('Accuracy = {a}'.format(a = best_ridge_clf_2.score(x_test_2, y_test_2)))

Accuracy = 0.7659288216479667


### Support Vector Machine

_Without hyperparameters tuning._

In [47]:
sgd_clf_2 = SGDClassifier(loss = 'hinge').fit(x_train_2, y_train_2)

In [136]:
print('Accuracy = {a}'.format(a = sgd_clf_2.score(x_test_2, y_test_2)))

Accuracy = 0.7657511296136468


_With hyperparameters tuning._

In [49]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

svm_clf_2 = SGDClassifier(loss = 'hinge')
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
svm_clf = GridSearchCV(svm_clf_2, hyperparameters, cv = 5)

In [50]:
best_svm_clf_2 = clf.fit(x_train_2, y_train_2)

In [51]:
print('Best alpha:', best_svm_clf_2.best_estimator_.get_params()['alpha'])

Best alpha: 59.94842503189409


In [138]:
print('Accuracy = {a}'.format(a = best_svm_clf_2.score(x_test_2, y_test_2)))

Accuracy = 0.7659288216479667


***Conclusion: Removing english stopwords decreases the performance.***

## 3. tweets with tags <user\> and <url\> removed, and also with word stemming

_We now remove stop words and add stemming too. We trained another Word2Vec model for this purpose._

In [7]:
model_stemming = Word2Vec.load('word2vec_stemming.model')

_The total number of entries in our Word2Vec model is now less because we used stemming._

In [8]:
len(model_stemming.wv.vocab)

77889

_We remove tags, stop words and perform stemming._

In [9]:
tweets_3 = remove_tags(tweets)
tweets_3 = tokenize_tweets(tweets_3, stop_words = False, stemming = True)
tweets_3.head()

Unnamed: 0,tweet,polarity,tokens
0,i dunno justin read my mention or not . only j...,1,"[i, dunno, justin, read, my, mention, or, not,..."
1,"because your logic is so dumb , i won't even c...",1,"[becaus, your, logic, is, so, dumb, i, wo, nt,..."
2,just put casper in a box ! looved the battle ...,1,"[just, put, casper, in, a, box, loov, the, bat..."
3,thanks sir > > don't trip lil mama ... just ke...,1,"[thank, sir, do, nt, trip, lil, mama, just, ke..."
4,visiting my brother tmr is the bestest birthda...,1,"[visit, my, brother, tmr, is, the, bestest, bi..."


In [10]:
tweets_3['vector'] = tweets_3['tokens'].apply(lambda tokens: toVector(tokens, model_stemming, dim))

In [11]:
tweets_3[['vector', 'polarity']].head()

Unnamed: 0,vector,polarity
0,"[0.12418689959061642, -0.15489598460650691, 0....",1
1,"[0.8505662182966868, -0.33197376662347877, -0....",1
2,"[0.20243584281868404, -0.6361863786975542, -0....",1
3,"[0.6778068460213641, -0.27396117507790524, 0.1...",1
4,"[0.343004809319973, -0.5250801883637906, 0.209...",1


In [12]:
train_3, test_3 = train_test_split(tweets_3[['vector', 'polarity']], test_size = 0.2)

In [13]:
print(len(train_3))

157576


In [14]:
x_train_3 = np.concatenate(train_3['vector'].values).reshape((len(train_3), dim))
y_train_3 = train_3['polarity'].values

In [15]:
x_test_3 = np.concatenate(test_3['vector'].values).reshape((len(test_3), dim))
y_test_3 = test_3['polarity'].values

### Logistic Regression

_Without regularization._

In [16]:
logistic_reg_3 = LogisticRegression(solver='lbfgs', max_iter = 1000)
logistic_reg_3.fit(x_train_3, y_train_3)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
print('Accuracy = {a}'.format(a = logistic_reg_3.score(x_test_3, y_test_3)))

Accuracy = 0.7776564959130832


_Tuning regularization parameter._

In [18]:
# C is the inverse of the regularization strenght
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C)

logistic_reg_3 = LogisticRegression(solver='lbfgs', max_iter = 500)
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(logistic_reg_3, hyperparameters, cv = 5)

In [19]:
best_logistic_3 = clf.fit(x_train_3, y_train_3)

In [20]:
print('Best C:', best_logistic_3.best_estimator_.get_params()['C'])

Best C: 2.7825594022071245


In [21]:
print('Accuracy = {a}'.format(a = best_logistic_3.score(x_test_3, y_test_3)))

Accuracy = 0.7774534193024318


### Ridge regression

_Without regularization._

In [22]:
ridge_clf_3 = RidgeClassifier().fit(x_train_3, y_train_3)
print('Accuracy = {a}'.format(a = ridge_clf_3.score(x_test_3, y_test_3)))

Accuracy = 0.7719195816621821


_Tuning regularization parameter._

In [23]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

ridge_clf_3 = RidgeClassifier()
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(ridge_clf_3, hyperparameters, cv = 5)

In [24]:
best_ridge_clf_3 = clf.fit(x_train_3, y_train_3)

In [25]:
print('Best alpha:', best_ridge_clf_3.best_estimator_.get_params()['alpha'])

Best alpha: 59.94842503189409


In [26]:
print('Accuracy = {a}'.format(a = best_ridge_clf_3.score(x_test_3, y_test_3)))

Accuracy = 0.7713357364065594


### Support Vector Machine

_Without hyperparameters tuning._

In [27]:
sgd_clf_3 = SGDClassifier(loss = 'hinge').fit(x_train_3, y_train_3)

In [28]:
print('Accuracy = {a}'.format(a = sgd_clf_3.score(x_test_3, y_test_3)))

Accuracy = 0.7748134233639641


_With hyperparameters tuning._

In [29]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

svm_clf_3 = SGDClassifier(loss = 'hinge')
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
svm_clf = GridSearchCV(svm_clf_3, hyperparameters, cv = 5)

In [30]:
best_svm_clf_3 = clf.fit(x_train_3, y_train_3)

In [31]:
print('Best alpha:', best_svm_clf_3.best_estimator_.get_params()['alpha'])

Best alpha: 59.94842503189409


In [33]:
print('Accuracy = {a}'.format(a = best_svm_clf_3.score(x_test_3, y_test_3)))

Accuracy = 0.7713357364065594
