In [1]:
# utilities
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# from sklearn.neural_network import MLPClassifier

# import xgboost

import gensim

In [2]:
# xgboost.__version__

In [3]:
df = pd.read_csv('twitter.cleaned.csv')
df


Unnamed: 0,text,target
0,love not wait se admin best server ever hapi,1.0
1,folowfriday,1.0
2,recruit comun server sharepoint dev gig austin...,1.0
3,not pay god seat watch miley gue bc switch god...,0.0
4,ah excit pushi home okay aw hapi right,1.0
...,...,...
1545966,go go big hous borow si guitar play mayb write,1.0
1545967,dish sen prepar websit would afraid recomend p...,1.0
1545968,birthday,1.0
1545969,huri not think jon kate,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1545971 entries, 0 to 1545970
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   text    1545971 non-null  object 
 1   target  1545971 non-null  float64
dtypes: float64(1), object(1)
memory usage: 23.6+ MB


### Analyzing sequences to use Word2Vec

In [5]:
val = max(df['text'].apply(lambda x: len(x.split())))
val

34

In [6]:
max_sequence_length = (int(val/4)+1)*4
print(f'Padding length is {max_sequence_length}')

Padding length is 36


#### Max sequence length is {{val}}, assume {{max_sequence_length}} to keep sufficient padding

In [7]:
data = [alp.split() for alp in df['text']]

# corpus_text=[elem for elem in alp for alp in data]

In [8]:
vector_size = 1000
# model_name = "wv_model_2000.w2v"
model_name = "wv_model_1000_count5.w2v"

In [9]:
# model = gensim.models.Word2Vec(data, min_count = 5,vector_size=vector_size, window = 10, sg = 0)
# model.save(model_name)

In [10]:
model = gensim.models.Word2Vec.load(model_name)

In [11]:
word = 'bad'

model.wv.most_similar(word)

[('horibl', 0.6040530800819397),
 ('teribl', 0.5678789615631104),
 ('shiti', 0.5293053388595581),
 ('god', 0.4836724102497101),
 ('crapi', 0.4783765375614166),
 ('badli', 0.4530486464500427),
 ('icki', 0.4226621985435486),
 ('guilti', 0.4175047278404236),
 ('yucki', 0.40732482075691223),
 ('foul', 0.3995888829231262)]

In [12]:
word_list = model.wv.index_to_key

In [13]:
len(word_list)

39217

In [14]:
type(model.wv['list'])

numpy.ndarray

In [15]:
model.wv['list'].shape

(1000,)

## Splitting, vectorising, and flattening the data

In [16]:
minidf = df.sample(frac=0.1)
minidf['target'].value_counts()

1.0    77358
0.0    77239
Name: target, dtype: int64

Balanced!

In [17]:
X_train, X_test, y_train, y_test = train_test_split(minidf['text'].apply(lambda x: x.split()), 
                                    minidf['target'].values, test_size=0.2, random_state=42)

In [18]:
# this function is passed with a sequence- list of words and returns the vector
# If normal = True, vector will be averaged over all words, otherwise summed
def get_embedding(sequence, normal = False):
    vec = np.zeros((vector_size), dtype= float)
    count= 0
    for word in sequence:
        if word in word_list:
            vec += model.wv[word]
            count +=1
#             print(word)
    if(normal): return vec/(max(count,1))
    else: return vec

In [19]:
# get_embedding(X_train.iloc[0])

In [20]:
X_train_embed = np.array([get_embedding(sequ) for sequ in X_train])
X_test_embed = np.array([get_embedding(sequ) for sequ in X_test])

In [21]:
X_train_embed.shape

(123677, 1000)

In [22]:
X_test_embed.shape

(30920, 1000)

## Using w2v embeddings (summed) on models to check performance

In [23]:
from sklearn.svm import LinearSVC

svc = LinearSVC(C = 0.02 )
svc.fit(X_train_embed, y_train)

y_pred = svc.predict(X_test_embed)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.78      0.75      0.76     15420
         1.0       0.76      0.80      0.78     15500

    accuracy                           0.77     30920
   macro avg       0.77      0.77      0.77     30920
weighted avg       0.77      0.77      0.77     30920

[[11501  3919]
 [ 3163 12337]]




In [24]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1, max_iter= 1000)
lr.fit(X_train_embed, y_train)

y_pred2 = lr.predict(X_test_embed)
print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))

              precision    recall  f1-score   support

         0.0       0.78      0.75      0.77     15420
         1.0       0.76      0.79      0.78     15500

    accuracy                           0.77     30920
   macro avg       0.77      0.77      0.77     30920
weighted avg       0.77      0.77      0.77     30920

[[11590  3830]
 [ 3255 12245]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train_embed,y_train)

y_pred3 = bnb.predict(X_test_embed)
print(classification_report(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))

              precision    recall  f1-score   support

         0.0       0.70      0.74      0.72     15420
         1.0       0.72      0.68      0.70     15500

    accuracy                           0.71     30920
   macro avg       0.71      0.71      0.71     30920
weighted avg       0.71      0.71      0.71     30920

[[11401  4019]
 [ 4973 10527]]


## Using w2v embeddings (averaged)

In [26]:
X_train_embed = None
X_test_embed = None

X_train_embed = np.array([get_embedding(sequ, normal = True) for sequ in X_train])
X_test_embed = np.array([get_embedding(sequ, normal = True) for sequ in X_test])

In [27]:
X_train_embed.shape

(123677, 1000)

In [28]:
from sklearn.svm import LinearSVC

svc = None
svc = LinearSVC(C = 0.02 )
svc.fit(X_train_embed, y_train)


y_pred = svc.predict(X_test_embed)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.77      0.76      0.76     15420
         1.0       0.76      0.78      0.77     15500

    accuracy                           0.77     30920
   macro avg       0.77      0.77      0.77     30920
weighted avg       0.77      0.77      0.77     30920

[[11684  3736]
 [ 3463 12037]]


In [29]:
from sklearn.linear_model import LogisticRegression

lr = None
lr = LogisticRegression(C=1, max_iter= 1000)
lr.fit(X_train_embed, y_train)

y_pred2 = lr.predict(X_test_embed)
print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))

              precision    recall  f1-score   support

         0.0       0.77      0.76      0.77     15420
         1.0       0.77      0.78      0.77     15500

    accuracy                           0.77     30920
   macro avg       0.77      0.77      0.77     30920
weighted avg       0.77      0.77      0.77     30920

[[11754  3666]
 [ 3479 12021]]


In [30]:
from sklearn.naive_bayes import BernoulliNB

bnb = None
bnb = BernoulliNB()
bnb.fit(X_train_embed,y_train)


y_pred3 = bnb.predict(X_test_embed)
print(classification_report(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))

              precision    recall  f1-score   support

         0.0       0.70      0.74      0.72     15420
         1.0       0.72      0.68      0.70     15500

    accuracy                           0.71     30920
   macro avg       0.71      0.71      0.71     30920
weighted avg       0.71      0.71      0.71     30920

[[11401  4019]
 [ 4973 10527]]


## Using "better" methods for Word2Vec

In [31]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(1000,100,100,10),activation='identity',solver='adam',
    alpha=0.0001, batch_size='auto',learning_rate='constant',learning_rate_init=0.001,power_t=0.5,
    max_iter=100,shuffle=True,random_state=None,tol=0.0001,verbose=True,warm_start=False,momentum=0.9,
    nesterovs_momentum=True, early_stopping=True,validation_fraction=0.1,beta_1=0.9,beta_2=0.999,
    epsilon=1e-08,n_iter_no_change=10,max_fun=15000)
mlp.fit(X_train_embed, y_train)

Iteration 1, loss = 0.50697665
Validation score: 0.758490
Iteration 2, loss = 0.49855903
Validation score: 0.760349
Iteration 3, loss = 0.49646868
Validation score: 0.765201
Iteration 4, loss = 0.49505028
Validation score: 0.763745
Iteration 5, loss = 0.49466252
Validation score: 0.763907
Iteration 6, loss = 0.49476374
Validation score: 0.763988
Iteration 7, loss = 0.49400008
Validation score: 0.759945
Iteration 8, loss = 0.49382800
Validation score: 0.764311
Iteration 9, loss = 0.49392560
Validation score: 0.763017
Iteration 10, loss = 0.49316220
Validation score: 0.759945
Iteration 11, loss = 0.49265520
Validation score: 0.763179
Iteration 12, loss = 0.49273505
Validation score: 0.761400
Iteration 13, loss = 0.49234464
Validation score: 0.763745
Iteration 14, loss = 0.49219730
Validation score: 0.761643
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(activation='identity', early_stopping=True,
              hidden_layer_sizes=(1000, 100, 100, 10), max_iter=100,
              verbose=True)

In [32]:
y_pred = mlp.predict(X_test_embed)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.77      0.75      0.76     15420
         1.0       0.76      0.78      0.77     15500

    accuracy                           0.77     30920
   macro avg       0.77      0.77      0.76     30920
weighted avg       0.77      0.77      0.76     30920

[[11588  3832]
 [ 3433 12067]]
