# Word2Vec featurization and modeling

In [54]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [23]:
data = pd.read_csv("AFFR_preprocessed_100k.csv")

In [24]:
data.tail(3)

Unnamed: 0,Reviews,rating
99997,celestial chia tea celestial chia tea tea real...,1
99998,miracle fruit tablets miracle fruit tablets se...,1
99999,not taste like lipton yellow label tea sold it...,0


In [27]:
reviews = data["Reviews"].to_list()

In [33]:
# This is to tokenize each row 
import nltk
review_tokens=[]
for i in reviews:
    nltk_tokens = nltk.word_tokenize(str(i))
    review_tokens.append(nltk_tokens)

In [36]:
# To train own Word2Vec model we can used pretrained models also which are very good  
# vector_size is the size of vector for each word 
# Min_count means if any word with 2 counts occurs then this will consider that word
#
import gensim
model = gensim.models.Word2Vec(review_tokens, vector_size=100,min_count=2,window=10,workers=4)

In [40]:
# Lets Test our model 
model.wv.most_similar('terrible')

[('horrible', 0.8505693674087524),
 ('awful', 0.8473909497261047),
 ('disgusting', 0.7512609362602234),
 ('horrid', 0.6952306628227234),
 ('nasty', 0.6607276201248169),
 ('yuck', 0.6557612419128418),
 ('ugh', 0.6451859474182129),
 ('aweful', 0.6308556199073792),
 ('gross', 0.6239864230155945),
 ('inedible', 0.6189229488372803)]

In [48]:
w2v_words = list(model.wv.key_to_index)
sent_vectors = []; 
for sent in tqdm(review_tokens):
    sent_vec = np.zeros(100) 
    cnt_words =0;
    for word in sent: 
        if word in w2v_words:
            vec = model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [09:45<00:00, 170.80it/s]

100000
100





In [52]:
y_label = data["rating"]

In [56]:
x_train, x_test, y_train, y_test = train_test_split(sent_vectors, y_label, test_size = 0.20)

In [61]:
%%time
LR_classifier = LogisticRegression(n_jobs=-1)
LR_classifier.fit(x_train, y_train)
y_pred = LR_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")

Confusion matrix: 
 [[ 2249   954]
 [  460 16337]]
Classification report: 
               precision    recall  f1-score   support

           0       0.83      0.70      0.76      3203
           1       0.94      0.97      0.96     16797

    accuracy                           0.93     20000
   macro avg       0.89      0.84      0.86     20000
weighted avg       0.93      0.93      0.93     20000

Accuracy score is:  0.9293
Model Saving ...
Wall time: 47.3 s


['AFFR_LR_model.pkl']

In [62]:
%%time
# Here Gaussian Naive Bayes does not taking Sparse matrix it requires dense 
KNN_classifier = KNeighborsClassifier(n_jobs=-1)
KNN_classifier.fit(x_train, y_train)
y_pred = KNN_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")

Confusion matrix: 
 [[ 1751  1452]
 [  414 16383]]
Classification report: 
               precision    recall  f1-score   support

           0       0.81      0.55      0.65      3203
           1       0.92      0.98      0.95     16797

    accuracy                           0.91     20000
   macro avg       0.86      0.76      0.80     20000
weighted avg       0.90      0.91      0.90     20000

Accuracy score is:  0.9067
Model Saving ...
Wall time: 6min 10s


In [63]:
%%time
# Here Gaussian Naive Bayes does not taking Sparse matrix it requires dense 
RF_classifier = RandomForestClassifier(n_jobs=-1)
RF_classifier.fit(x_train, y_train)
y_pred = RF_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")

Confusion matrix: 
 [[ 1695  1508]
 [  251 16546]]
Classification report: 
               precision    recall  f1-score   support

           0       0.87      0.53      0.66      3203
           1       0.92      0.99      0.95     16797

    accuracy                           0.91     20000
   macro avg       0.89      0.76      0.80     20000
weighted avg       0.91      0.91      0.90     20000

Accuracy score is:  0.91205
Model Saving ...
Wall time: 1min 32s


In [64]:
%%time
DT_classifier = DecisionTreeClassifier()
DT_classifier.fit(x_train, y_train)
y_pred = DT_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")

Confusion matrix: 
 [[ 1813  1390]
 [ 1479 15318]]
Classification report: 
               precision    recall  f1-score   support

           0       0.55      0.57      0.56      3203
           1       0.92      0.91      0.91     16797

    accuracy                           0.86     20000
   macro avg       0.73      0.74      0.74     20000
weighted avg       0.86      0.86      0.86     20000

Accuracy score is:  0.85655
Model Saving ...
Wall time: 43.1 s


['AFFR_DT_model.pkl']

In [65]:
%%time
svmclassifier = SVC(kernel='rbf',verbose=True,gamma="auto")
svmclassifier.fit(x_train, y_train)
y_pred = svmclassifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")

[LibSVM]Confusion matrix: 
 [[ 2278   925]
 [  379 16418]]
Classification report: 
               precision    recall  f1-score   support

           0       0.86      0.71      0.78      3203
           1       0.95      0.98      0.96     16797

    accuracy                           0.93     20000
   macro avg       0.90      0.84      0.87     20000
weighted avg       0.93      0.93      0.93     20000

Accuracy score is:  0.9348
Model Saving ...
Wall time: 10min 22s


['AFFR_tfidfSVM_model.pkl']