# BOW Featurization and modeling

In [13]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("AFFR_preprocessed_100k.csv")

In [3]:
data.tail(3)

Unnamed: 0,Reviews,rating
99997,celestial chia tea celestial chia tea tea real...,1
99998,miracle fruit tablets miracle fruit tablets se...,1
99999,not taste like lipton yellow label tea sold it...,0


In [4]:
def text_splitter(text):
    return text.split()
# max_features = 20000 means we want only most useful(most occured) 20000 features not all 
vectorizer = CountVectorizer(tokenizer = text_splitter,ngram_range=(1, 3),max_features=20000,min_df=5, max_df=0.7)
review_vector = vectorizer.fit_transform(data['Reviews'].values.astype(str))

In [14]:
dump(vectorizer,"AFFR_vectorizer.pkl")

['AFFR_vectorizer.pkl']

In [5]:
review_vector.shape

(100000, 20000)

In [6]:
# Getting labels seperate 
y_label = data["rating"]

In [7]:
x_train, x_test, y_train, y_test = train_test_split(review_vector, y_label, test_size = 0.20)

### SVM Classifier with RBF kernel 

In [18]:
%%time
svmclassifier = SVC(kernel='rbf',verbose=True,gamma="auto")
svmclassifier.fit(x_train, y_train)
y_pred = svmclassifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")
dump(svmclassifier,"AFFR_SVM_model.pkl")

[LibSVM]Confusion matrix:  [[   54  3120]
 [    2 16824]]
Classification report: 
               precision    recall  f1-score   support

           0       0.96      0.02      0.03      3174
           1       0.84      1.00      0.92     16826

    accuracy                           0.84     20000
   macro avg       0.90      0.51      0.47     20000
weighted avg       0.86      0.84      0.78     20000

Accuracy score is:  0.8439
Model Saving ...
Wall time: 1h 18min 11s


['AFFR_SVM_model.pkl']

### Naive Bayes 

In [11]:
%%time
# Here Gaussian Naive Bayes does not taking Sparse matrix it requires dense 
NB_classifier = MultinomialNB()
NB_classifier.fit(x_train, y_train)
y_pred = NB_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")
dump(NB_classifier,"AFFR_NB_model.pkl")

Confusion matrix: 
 [[ 2808   370]
 [ 1041 15781]]
Classification report: 
               precision    recall  f1-score   support

           0       0.73      0.88      0.80      3178
           1       0.98      0.94      0.96     16822

    accuracy                           0.93     20000
   macro avg       0.85      0.91      0.88     20000
weighted avg       0.94      0.93      0.93     20000

Accuracy score is:  0.92945
Model Saving ...
Wall time: 187 ms


['AFFR_NB_model.pkl']

In [27]:
%%time
LR_classifier = LogisticRegression(n_jobs=-1)
LR_classifier.fit(x_train, y_train)
y_pred = LR_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")
dump(LR_classifier,"AFFR_LR_model.pkl")

Confusion matrix: 
 [[ 2579   599]
 [  412 16410]]
Classification report: 
               precision    recall  f1-score   support

           0       0.86      0.81      0.84      3178
           1       0.96      0.98      0.97     16822

    accuracy                           0.95     20000
   macro avg       0.91      0.89      0.90     20000
weighted avg       0.95      0.95      0.95     20000

Accuracy score is:  0.94945
Model Saving ...
Wall time: 8.22 s
Parser   : 169 ms


['AFFR_LR_model.pkl']

In [26]:
%%time
KNN_classifier = KNeighborsClassifier(n_jobs=-1)
KNN_classifier.fit(x_train, y_train)
y_pred = KNN_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")
dump(KNN_classifier,"AFFR_KNN_model.pkl")

Confusion matrix: 
 [[ 1224  1954]
 [  647 16175]]
Classification report: 
               precision    recall  f1-score   support

           0       0.65      0.39      0.48      3178
           1       0.89      0.96      0.93     16822

    accuracy                           0.87     20000
   macro avg       0.77      0.67      0.71     20000
weighted avg       0.85      0.87      0.86     20000

Accuracy score is:  0.86995
Model Saving ...
Wall time: 2min 27s


['AFFR_KNN_model.pkl']

In [18]:
%%time
RF_classifier = RandomForestClassifier(n_jobs=-1)
RF_classifier.fit(x_train, y_train)
y_pred = RF_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")
dump(RF_classifier,"AFFR_RF_model.pkl")

Confusion matrix: 
 [[ 1632  1546]
 [  119 16703]]
Classification report: 
               precision    recall  f1-score   support

           0       0.93      0.51      0.66      3178
           1       0.92      0.99      0.95     16822

    accuracy                           0.92     20000
   macro avg       0.92      0.75      0.81     20000
weighted avg       0.92      0.92      0.91     20000

Accuracy score is:  0.91675
Model Saving ...
Wall time: 1min 40s


['AFFR_RF_model.pkl']

In [17]:
%%time
DT_classifier = DecisionTreeClassifier()
DT_classifier.fit(x_train, y_train)
y_pred = DT_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")
dump(DT_classifier,"AFFR_DT_model.pkl")

Confusion matrix: 
 [[ 1930  1248]
 [ 1115 15707]]
Classification report: 
               precision    recall  f1-score   support

           0       0.63      0.61      0.62      3178
           1       0.93      0.93      0.93     16822

    accuracy                           0.88     20000
   macro avg       0.78      0.77      0.78     20000
weighted avg       0.88      0.88      0.88     20000

Accuracy score is:  0.88185
Model Saving ...
Wall time: 1min 16s


['AFFR_DT_model.pkl']