# Model testing using embeddings computed using a pretrained GloVe model

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

from gensim.models import Word2Vec
import gensim.downloader as api
from helpers import *
from preprocessing import *

import pandas as pd
import numpy as np

_Loading model that was pretrained on 2 billion+ tweets._

In [4]:
model = api.load('glove-twitter-200')

_Let's see how many distinct words we have in the model._

In [9]:
len(model.wv.vocab)

  """Entry point for launching an IPython kernel.


1193514

_The vectors are of size 200._

In [10]:
dim = 200

In [11]:
def toVector(tokens, model, dim):
    """
    This method convert an array of token into a vector of size dim given that word vector in model are of size dim.
    
    Attributes:
    
    tokens: arr
            tokens to consider for the outputed embedding
    
    model: gensin model
            used to convert a token to a vector of size din
            
    dim: int 
            size of the vector    
    """
    
    vector = np.zeros(dim)
    word_count = 0
    
    for token in tokens:
        if token in model.wv:
            vector += model.wv[token]
            word_count += 1
            
    if word_count == 0:
        return vector
    else:
        return vector / float(word_count)

_For testing purpose let's build model on the subset of tweets provided._

In [8]:
tweets = load_tweets(full = False)

## 1. tweets with just the tags <user\> and <url\> removed

_Let's first remove tags and tokenize the tweet._

In [12]:
tweets_1 = remove_tags(tweets)
tweets_1 = tokenize_tweets(tweets_1)
tweets_1.head()

Unnamed: 0,tweet,polarity,tokens
0,i dunno justin read my mention or not . only j...,1,"[i, dunno, justin, read, my, mention, or, not,..."
1,"because your logic is so dumb , i won't even c...",1,"[because, your, logic, is, so, dumb, i, wo, nt..."
2,just put casper in a box ! looved the battle ...,1,"[just, put, casper, in, a, box, looved, the, b..."
3,thanks sir > > don't trip lil mama ... just ke...,1,"[thanks, sir, do, nt, trip, lil, mama, just, k..."
4,visiting my brother tmr is the bestest birthda...,1,"[visiting, my, brother, tmr, is, the, bestest,..."


_Using the tokens we can now compute vector embedding._

In [13]:
tweets_1['vector'] = tweets_1['tokens'].apply(lambda tokens: toVector(tokens, model, dim))



In [14]:
tweets_1[['vector', 'polarity']].head()

Unnamed: 0,vector,polarity
0,"[0.08971212630200645, 0.4280543470674235, -0.0...",1
1,"[0.1027381899766624, 0.19680555940916142, -0.0...",1
2,"[-0.0016766687234242756, 0.33439555598629844, ...",1
3,"[-0.09868999943137169, 0.20102208495760956, -0...",1
4,"[0.2870795548790031, 0.046254554556475744, 0.1...",1


_Let's obtain training and testing data._

In [15]:
train_1, test_1 = train_test_split(tweets_1[['vector', 'polarity']], test_size = 0.2)

In [17]:
print(len(train_1))

157576


In [18]:
x_train_1 = np.concatenate(train_1['vector'].values).reshape((len(train_1), dim))
y_train_1 = train_1['polarity'].values

In [19]:
x_test_1 = np.concatenate(test_1['vector'].values).reshape((len(test_1), dim))
y_test_1 = test_1['polarity'].values

We will now consider several models for classification.

### Logistic Regression

_Without regularization._

In [20]:
logistic_reg_1 = LogisticRegression(solver='lbfgs', max_iter = 500)
logistic_reg_1.fit(x_train_1, y_train_1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
print('Accuracy = {a}'.format(a = logistic_reg_1.score(x_test_1, y_test_1)))

Accuracy = 0.7734934253947302


_Tuning regularization parameter._

In [24]:
# C is the inverse of the regularization strenght
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C)

# We first define a new logistic regression model
logistic_reg_1 = LogisticRegression(solver='lbfgs', max_iter = 500)
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(logistic_reg_1, hyperparameters, cv = 5)

In [25]:
best_logistic_1 = clf.fit(x_train_1, y_train_1)

In [26]:
print('Best C:', best_logistic_1.best_estimator_.get_params()['C'])

Best C: 21.544346900318832


In [27]:
print('Accuracy = {a}'.format(a = best_logistic_1.score(x_test_1, y_test_1)))

Accuracy = 0.7750418845509468


### Ridge regression

_Without regularization._

In [28]:
ridge_clf_1 = RidgeClassifier().fit(x_train_1, y_train_1)
print('Accuracy = {a}'.format(a = ridge_clf_1.score(x_test_1, y_test_1)))

Accuracy = 0.7707772757272681


_Tuning regularization parameter._

In [29]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

ridge_clf_1 = RidgeClassifier()
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(ridge_clf_1, hyperparameters, cv = 5)

In [30]:
best_ridge_clf_1 = clf.fit(x_train_1, y_train_1)

In [31]:
print('Best alpha:', best_ridge_clf_1.best_estimator_.get_params()['alpha'])

Best alpha: 59.94842503189409


In [32]:
print('Accuracy = {a}'.format(a = best_ridge_clf_1.score(x_test_1, y_test_1)))

Accuracy = 0.7709803523379195


### Support Vector Machine

_Without hyperparameters tuning._

In [33]:
sgd_clf_1 = SGDClassifier(loss = 'hinge').fit(x_train_1, y_train_1)

In [34]:
print('Accuracy = {a}'.format(a = sgd_clf_1.score(x_test_1, y_test_1)))

Accuracy = 0.7706757374219424


_With hyperparameters tuning._

In [35]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

svm_clf_1 = SGDClassifier(loss = 'hinge')
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
svm_clf_1 = GridSearchCV(svm_clf_1, hyperparameters, cv = 5)

In [36]:
best_svm_clf_1 = svm_clf_1.fit(x_train_1, y_train_1)

In [37]:
print('Best alpha:', best_svm_clf_1.best_estimator_.get_params()['alpha'])

Best alpha: 1.0


In [38]:
print('Accuracy = {a}'.format(a = best_svm_clf_1.score(x_test_1, y_test_1)))

Accuracy = 0.6060313753363457


## 2. tweets with the tags <user\> and <url\> as well as english stop words removed

_Let's first remove tags and tokenize the tweets with stop words removal._

In [39]:
tweets_2 = remove_tags(tweets)
tweets_2 = tokenize_tweets(tweets_2, stop_words = True)
tweets_2.head()

Unnamed: 0,tweet,polarity,tokens
0,i dunno justin read my mention or not . only j...,1,"[dunno, justin, read, mention, justin, god, kn..."
1,"because your logic is so dumb , i won't even c...",1,"[logic, dumb, wo, nt, even, crop, name, photo,..."
2,just put casper in a box ! looved the battle ...,1,"[put, casper, box, looved, battle, crakkbitch]"
3,thanks sir > > don't trip lil mama ... just ke...,1,"[thanks, sir, nt, trip, lil, mama, keep, doin,..."
4,visiting my brother tmr is the bestest birthda...,1,"[visiting, brother, tmr, bestest, birthday, gi..."


_We then obtain tweets embeddings._

In [40]:
tweets_2['vector'] = tweets_2['tokens'].apply(lambda tokens: toVector(tokens, model, dim))



In [41]:
tweets_2[['vector', 'polarity']].head()

Unnamed: 0,vector,polarity
0,"[0.19875700175762176, 0.6318346992135048, 0.03...",1
1,"[0.04200667950014273, 0.19379967575271925, -0....",1
2,"[-0.10455999970436096, 0.3976099997758865, -0....",1
3,"[-0.06771099865436554, 0.172387501783669, -0.2...",1
4,"[0.2506899982690811, -0.03688483188549677, 0.0...",1


In [42]:
train_2, test_2 = train_test_split(tweets_2[['vector', 'polarity']], test_size = 0.2)

In [43]:
print(len(train_2))

157576


In [44]:
x_train_2 = np.concatenate(train_2['vector'].values).reshape((len(train_2), dim))
y_train_2 = train_2['polarity'].values

In [45]:
x_test_2 = np.concatenate(test_2['vector'].values).reshape((len(test_2), dim))
y_test_2 = test_2['polarity'].values

### Logistic Regression

_Without regularization._

In [46]:
logistic_reg_2 = LogisticRegression(solver='lbfgs', max_iter = 1000)
logistic_reg_2.fit(x_train_2, y_train_2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
print('Accuracy = {a}'.format(a = logistic_reg_2.score(x_test_2, y_test_2)))

Accuracy = 0.7628319033355333


_Tuning regularization parameter._

In [49]:
# C is the inverse of the regularization strenght
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C)

logistic_reg_2 = LogisticRegression(solver='lbfgs', max_iter = 500)
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(logistic_reg_2, hyperparameters, cv = 5)

In [50]:
best_logistic_2 = clf.fit(x_train_2, y_train_2)

In [51]:
print('Best C:', best_logistic_2.best_estimator_.get_params()['C'])

Best C: 1291.5496650148827


In [52]:
print('Accuracy = {a}'.format(a = best_logistic_2.score(x_test_2, y_test_2)))

Accuracy = 0.7629080570645276


### Ridge regression

_Without regularization._

In [53]:
ridge_clf_2 = RidgeClassifier().fit(x_train_2, y_train_2)
print('Accuracy = {a}'.format(a = ridge_clf_2.score(x_test_2, y_test_2)))

Accuracy = 0.7601919073970655


_Tuning regularization parameter._

In [54]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

ridge_clf_2 = RidgeClassifier()
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(ridge_clf_2, hyperparameters, cv = 5)

In [55]:
best_ridge_clf_2 = clf.fit(x_train_2, y_train_2)

In [56]:
print('Best alpha:', best_ridge_clf_2.best_estimator_.get_params()['alpha'])

Best alpha: 7.742636826811269


In [57]:
print('Accuracy = {a}'.format(a = best_ridge_clf_2.score(x_test_2, y_test_2)))

Accuracy = 0.7602680611260598


### Support Vector Machine

_Without hyperparameters tuning._

In [58]:
sgd_clf_2 = SGDClassifier(loss = 'hinge').fit(x_train_2, y_train_2)

In [59]:
print('Accuracy = {a}'.format(a = sgd_clf_2.score(x_test_2, y_test_2)))

Accuracy = 0.761968827740265


_With hyperparameters tuning._

In [60]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

svm_clf_2 = SGDClassifier(loss = 'hinge')
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
svm_clf = GridSearchCV(svm_clf_2, hyperparameters, cv = 5)

In [61]:
best_svm_clf_2 = clf.fit(x_train_2, y_train_2)

In [62]:
print('Best alpha:', best_svm_clf_2.best_estimator_.get_params()['alpha'])

Best alpha: 7.742636826811269


In [63]:
print('Accuracy = {a}'.format(a = best_svm_clf_2.score(x_test_2, y_test_2)))

Accuracy = 0.7602680611260598


## 3. tweets with tags <user\> and <url\> removed and also with word stemming

_We remove tags, stop words and perform stemming._

In [14]:
tweets_3 = remove_tags(tweets)
tweets_3 = tokenize_tweets(tweets_3, stop_words = False, stemming = True)
tweets_3.head()

Unnamed: 0,tweet,polarity,tokens
0,i dunno justin read my mention or not . only j...,1,"[i, dunno, justin, read, my, mention, or, not,..."
1,"because your logic is so dumb , i won't even c...",1,"[becaus, your, logic, is, so, dumb, i, wo, nt,..."
2,just put casper in a box ! looved the battle ...,1,"[just, put, casper, in, a, box, loov, the, bat..."
3,thanks sir > > don't trip lil mama ... just ke...,1,"[thank, sir, do, nt, trip, lil, mama, just, ke..."
4,visiting my brother tmr is the bestest birthda...,1,"[visit, my, brother, tmr, is, the, bestest, bi..."


In [15]:
tweets_3['vector'] = tweets_3['tokens'].apply(lambda tokens: toVector(tokens, model, dim))



In [16]:
tweets_3[['vector', 'polarity']].head()

Unnamed: 0,vector,polarity
0,"[0.09940403963074736, 0.4317200871427422, -0.0...",1
1,"[0.10893402367623316, 0.17930333719899258, -0....",1
2,"[0.014401104715135362, 0.23129667176140678, 0....",1
3,"[-0.09823416670163472, 0.22043208482985696, -0...",1
4,"[0.2913651106258233, 0.09299010949002372, 0.13...",1


In [17]:
train_3, test_3 = train_test_split(tweets_3[['vector', 'polarity']], test_size = 0.2)

In [18]:
print(len(train_3))

157576


In [19]:
x_train_3 = np.concatenate(train_3['vector'].values).reshape((len(train_3), dim))
y_train_3 = train_3['polarity'].values

In [20]:
x_test_3 = np.concatenate(test_3['vector'].values).reshape((len(test_3), dim))
y_test_3 = test_3['polarity'].values

### Logistic Regression

_Without regularization._

In [21]:
logistic_reg_3 = LogisticRegression(solver='lbfgs', max_iter = 1000)
logistic_reg_3.fit(x_train_3, y_train_3)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
print('Accuracy = {a}'.format(a = logistic_reg_3.score(x_test_3, y_test_3)))

Accuracy = 0.7551657612834441


_Tuning regularization parameter._

In [23]:
# C is the inverse of the regularization strenght
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C)

logistic_reg_3 = LogisticRegression(solver='lbfgs', max_iter = 500)
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(logistic_reg_3, hyperparameters, cv = 5)

In [24]:
best_logistic_3 = clf.fit(x_train_3, y_train_3)

In [25]:
print('Best C:', best_logistic_3.best_estimator_.get_params()['C'])

Best C: 2.7825594022071245


In [26]:
print('Accuracy = {a}'.format(a = best_logistic_3.score(x_test_3, y_test_3)))

Accuracy = 0.7549880692491242


### Ridge regression

_Without regularization._

In [27]:
ridge_clf_3 = RidgeClassifier().fit(x_train_3, y_train_3)
print('Accuracy = {a}'.format(a = ridge_clf_3.score(x_test_3, y_test_3)))

Accuracy = 0.7521957658526679


_Tuning regularization parameter._

In [28]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

ridge_clf_3 = RidgeClassifier()
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
clf = GridSearchCV(ridge_clf_3, hyperparameters, cv = 5)

In [29]:
best_ridge_clf_3 = clf.fit(x_train_3, y_train_3)

In [30]:
print('Best alpha:', best_ridge_clf_3.best_estimator_.get_params()['alpha'])

Best alpha: 21.544346900318832


In [31]:
print('Accuracy = {a}'.format(a = best_ridge_clf_3.score(x_test_3, y_test_3)))

Accuracy = 0.7519419200893537


### Support Vector Machine

_Without hyperparameters tuning._

In [32]:
sgd_clf_3 = SGDClassifier(loss = 'hinge').fit(x_train_3, y_train_3)

In [33]:
print('Accuracy = {a}'.format(a = sgd_clf_3.score(x_test_3, y_test_3)))

Accuracy = 0.7525765344976393


_With hyperparameters tuning._

In [34]:
# alpha is a constant that multiplies the regularization term
alpha = np.logspace(0, 4, 10)
hyperparameters = dict(alpha=alpha)

svm_clf_3 = SGDClassifier(loss = 'hinge')
# we now perform grid search to find optimal hyperparameter
# we use cross-validation with 5 folds
svm_clf_3 = GridSearchCV(svm_clf_3, hyperparameters, cv = 5)

In [35]:
best_svm_clf_3 = clf.fit(x_train_3, y_train_3)

In [36]:
print('Best alpha:', best_svm_clf_3.best_estimator_.get_params()['alpha'])

Best alpha: 21.544346900318832


In [37]:
print('Accuracy = {a}'.format(a = best_svm_clf_3.score(x_test_3, y_test_3)))

Accuracy = 0.7519419200893537
