In [1]:
# utilities
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier

# import xgboost

import gensim

In [2]:
# xgboost.__version__

In [3]:
df = pd.read_csv('twitter.cleaned.csv')
df


Unnamed: 0,text,target
0,love not wait se admin best server ever hapi,1.0
1,folowfriday,1.0
2,recruit comun server sharepoint dev gig austin...,1.0
3,not pay god seat watch miley gue bc switch god...,0.0
4,ah excit pushi home okay aw hapi right,1.0
...,...,...
1545966,go go big hous borow si guitar play mayb write,1.0
1545967,dish sen prepar websit would afraid recomend p...,1.0
1545968,birthday,1.0
1545969,huri not think jon kate,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1545971 entries, 0 to 1545970
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   text    1545971 non-null  object 
 1   target  1545971 non-null  float64
dtypes: float64(1), object(1)
memory usage: 23.6+ MB


### Analyzing sequences to use Word2Vec

In [5]:
val = max(df['text'].apply(lambda x: len(x.split())))
val

34

In [6]:
max_sequence_length = (int(val/4)+1)*4
print(f'Padding length is {max_sequence_length}')

Padding length is 36


#### Max sequence length is {{val}}, assume {{max_sequence_length}} to keep sufficient padding

In [7]:
data = [alp.split() for alp in df['text']]

# corpus_text=[elem for elem in alp for alp in data]

In [8]:
vector_size = 100
model_name = "wv_model_full.w2v"

# model = gensim.models.Word2Vec(data, min_count = 1,vector_size=vector_size, window = 10, sg = 1)
# model.save(model_name)

In [9]:
model = gensim.models.Word2Vec.load(model_name)

In [10]:
word = 'bad'

In [11]:
model.wv.most_similar(word)

[('horibl', 0.7457845211029053),
 ('teribl', 0.7002303600311279),
 ('god', 0.6490662693977356),
 ('shiti', 0.6369373202323914),
 ('crapi', 0.5898011922836304),
 ('motorcycleor', 0.5466794371604919),
 ('badli', 0.5404649376869202),
 ('guilti', 0.5315821766853333),
 ('upset', 0.5277889370918274),
 ('hangert', 0.5233110189437866)]

In [12]:
len(model.wv.index_to_key)

295926

In [12]:
model.wv['list'].shape

(100,)

## Splitting, vectorising, and flattening the data

In [13]:
minidf = df.sample(frac=0.1)
minidf['target'].value_counts()

0.0    77317
1.0    77280
Name: target, dtype: int64

Balanced!

In [14]:
X_train, X_test, y_train, y_test = train_test_split(minidf['text'].apply(lambda x: x.split()), 
                                    minidf['target'].values, test_size=0.2, random_state=42)

In [15]:
X_train_embed = [[model.wv[word] for word in sequ] for sequ in X_train]
X_test_embed = [[model.wv[word] for word in sequ] for sequ in X_test]

In [16]:
def zero_padding(vector):
    temp = [np.zeros((vector_size))]
    comp = max_sequence_length-len(vector)
    if(comp>0):
        vector += temp*comp
    else:
        vector = vector[:max_sequence_length]
    return np.array(vector).reshape(vector_size*max_sequence_length)

# It gives padding or truncation if needed to sequences

In [17]:
X_train_embed = np.array([zero_padding(vector) for vector in X_train_embed])
X_test_embed = np.array([zero_padding(vector) for vector in X_test_embed])


In [18]:
X_train_embed.shape

(123677, 3600)

In [19]:
X_test_embed.shape

(30920, 3600)

## Using w2v embeddings on models to check performance

In [37]:
from sklearn.svm import LinearSVC

svc = LinearSVC(C = 0.1)
svc.fit(X_train_embed, y_train)




LinearSVC(C=0.1)

In [38]:
y_pred = svc.predict(X_test_embed)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.77      0.74      0.75     15583
         1.0       0.74      0.78      0.76     15337

    accuracy                           0.76     30920
   macro avg       0.76      0.76      0.76     30920
weighted avg       0.76      0.76      0.76     30920

[[11474  4109]
 [ 3371 11966]]


In [41]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1, max_iter= 300)
lr.fit(X_train_embed, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1, max_iter=300)

In [42]:
y_pred2 = lr.predict(X_test_embed)
print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))

              precision    recall  f1-score   support

         0.0       0.77      0.75      0.76     15583
         1.0       0.75      0.77      0.76     15337

    accuracy                           0.76     30920
   macro avg       0.76      0.76      0.76     30920
weighted avg       0.76      0.76      0.76     30920

[[11631  3952]
 [ 3468 11869]]


In [25]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train_embed,y_train)


BernoulliNB()

In [26]:
y_pred3 = bnb.predict(X_test_embed)
print(classification_report(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))

              precision    recall  f1-score   support

         0.0       0.57      0.43      0.49     15583
         1.0       0.54      0.67      0.60     15337

    accuracy                           0.55     30920
   macro avg       0.55      0.55      0.54     30920
weighted avg       0.55      0.55      0.54     30920

[[ 6652  8931]
 [ 5027 10310]]


## Using "better" methods for Word2Vec

In [22]:
mlp = MLPClassifier(hidden_layer_sizes=(3600,100,10,1),activation='identity',solver='adam',
    learning_rate_init=0.01, verbose=True, early_stopping=True,validation_fraction=0.1)
mlp.fit(X_train_embed, y_train)

Iteration 1, loss = 0.58766601
Validation score: 0.747170
Iteration 2, loss = 0.53009372
Validation score: 0.750647
Iteration 3, loss = 0.53003242
Validation score: 0.752668
Iteration 4, loss = 0.52863508
Validation score: 0.753072
Iteration 5, loss = 0.52869412
Validation score: 0.756307
Iteration 6, loss = 0.52943644
Validation score: 0.754285
Iteration 7, loss = 0.52849058
Validation score: 0.756064
Iteration 8, loss = 0.52829204
Validation score: 0.754124
Iteration 9, loss = 0.52943896
Validation score: 0.756630
Iteration 10, loss = 0.53033565
Validation score: 0.757843
Iteration 11, loss = 0.52719528
Validation score: 0.754043
Iteration 12, loss = 0.52549439
Validation score: 0.758005
Iteration 13, loss = 0.52982174
Validation score: 0.752506
Iteration 14, loss = 0.53355626
Validation score: 0.755175
Iteration 15, loss = 0.52289118
Validation score: 0.755741
Iteration 16, loss = 0.52708446
Validation score: 0.756468
Iteration 17, loss = 0.52330158
Validation score: 0.757277
Iterat

MLPClassifier(activation='identity', early_stopping=True,
              hidden_layer_sizes=(3600, 100, 10, 1), learning_rate_init=0.01,
              verbose=True)

In [23]:
y_pred = mlp.predict(X_test_embed)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.75      0.76      0.75     15369
         1.0       0.76      0.75      0.76     15551

    accuracy                           0.76     30920
   macro avg       0.76      0.76      0.76     30920
weighted avg       0.76      0.76      0.76     30920

[[11625  3744]
 [ 3831 11720]]
