# TFIDF Featurization and Modeling

In [2]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv("AFFR_preprocessed_100k.csv")

In [4]:
data.tail(3)

Unnamed: 0,Reviews,rating
99997,celestial chia tea celestial chia tea tea real...,1
99998,miracle fruit tablets miracle fruit tablets se...,1
99999,not taste like lipton yellow label tea sold it...,0


In [5]:
def text_splitter(text):
    return text.split()
# max_features = 20000 means we want only most useful(most occured) 20000 features not all 
vectorizer = TfidfVectorizer(tokenizer = text_splitter,ngram_range=(1, 3),max_features=20000,min_df=5, max_df=0.7)
review_vector = vectorizer.fit_transform(data['Reviews'].values.astype(str))

In [6]:
dump(vectorizer,"AFFR_tfidfvectorizer.pkl")

['AFFR_tfidfvectorizer.pkl']

In [7]:
review_vector.shape

(100000, 20000)

In [8]:
# Getting labels seperate 
y_label = data["rating"]

In [9]:
x_train, x_test, y_train, y_test = train_test_split(review_vector, y_label, test_size = 0.20)

In [10]:
%%time
# Here Gaussian Naive Bayes does not taking Sparse matrix it requires dense 
NB_classifier = MultinomialNB()
NB_classifier.fit(x_train, y_train)
y_pred = NB_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")

Confusion matrix: 
 [[ 1830  1338]
 [  144 16688]]
Classification report: 
               precision    recall  f1-score   support

           0       0.93      0.58      0.71      3168
           1       0.93      0.99      0.96     16832

    accuracy                           0.93     20000
   macro avg       0.93      0.78      0.83     20000
weighted avg       0.93      0.93      0.92     20000

Accuracy score is:  0.9259
Model Saving ...
Wall time: 438 ms


In [11]:
%%time
LR_classifier = LogisticRegression(n_jobs=-1)
LR_classifier.fit(x_train, y_train)
y_pred = LR_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")

Confusion matrix: 
 [[ 2299   869]
 [  229 16603]]
Classification report: 
               precision    recall  f1-score   support

           0       0.91      0.73      0.81      3168
           1       0.95      0.99      0.97     16832

    accuracy                           0.95     20000
   macro avg       0.93      0.86      0.89     20000
weighted avg       0.94      0.95      0.94     20000

Accuracy score is:  0.9451
Model Saving ...
Wall time: 31.5 s


In [12]:
%%time
# Here Gaussian Naive Bayes does not taking Sparse matrix it requires dense 
KNN_classifier = KNeighborsClassifier(n_jobs=-1)
KNN_classifier.fit(x_train, y_train)
y_pred = KNN_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")


Confusion matrix: 
 [[  112  3056]
 [   35 16797]]
Classification report: 
               precision    recall  f1-score   support

           0       0.76      0.04      0.07      3168
           1       0.85      1.00      0.92     16832

    accuracy                           0.85     20000
   macro avg       0.80      0.52      0.49     20000
weighted avg       0.83      0.85      0.78     20000

Accuracy score is:  0.84545
Model Saving ...
Wall time: 1min 31s


In [13]:
%%time
# Here Gaussian Naive Bayes does not taking Sparse matrix it requires dense 
RF_classifier = RandomForestClassifier(n_jobs=-1)
RF_classifier.fit(x_train, y_train)
y_pred = RF_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")


Confusion matrix: 
 [[ 1695  1473]
 [  143 16689]]
Classification report: 
               precision    recall  f1-score   support

           0       0.92      0.54      0.68      3168
           1       0.92      0.99      0.95     16832

    accuracy                           0.92     20000
   macro avg       0.92      0.76      0.82     20000
weighted avg       0.92      0.92      0.91     20000

Accuracy score is:  0.9192
Model Saving ...
Wall time: 1min 38s


In [14]:
%%time
svmclassifier = SVC(kernel='rbf',verbose=True,gamma="auto")
svmclassifier.fit(x_train, y_train)
y_pred = svmclassifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")
dump(svmclassifier,"AFFR_tfidfSVM_model.pkl")

[LibSVM]Confusion matrix: 
 [[    0  3168]
 [    0 16832]]
Classification report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      3168
           1       0.84      1.00      0.91     16832

    accuracy                           0.84     20000
   macro avg       0.42      0.50      0.46     20000
weighted avg       0.71      0.84      0.77     20000

Accuracy score is:  0.8416
Model Saving ...
Wall time: 57min 40s


['AFFR_tfidfSVM_model.pkl']

In [15]:
%%time
DT_classifier = DecisionTreeClassifier()
DT_classifier.fit(x_train, y_train)
y_pred = DT_classifier.predict(x_test)
print("Confusion matrix: \n",confusion_matrix(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Accuracy score is: ",accuracy_score(y_test,y_pred))
print("Model Saving ...")

Confusion matrix: 
 [[ 1939  1229]
 [ 1176 15656]]
Classification report: 
               precision    recall  f1-score   support

           0       0.62      0.61      0.62      3168
           1       0.93      0.93      0.93     16832

    accuracy                           0.88     20000
   macro avg       0.77      0.77      0.77     20000
weighted avg       0.88      0.88      0.88     20000

Accuracy score is:  0.87975
Model Saving ...
Wall time: 6min 32s
