Import Packages

In [93]:
import os
import sys

import IPython
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import (
    RandomizedSearchCV,
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC

# Process 100,000 reviews at a time.
# NOTE(review): the previous comment said 50,000, which contradicted the value;
# no visible cell references this constant — confirm it is still needed.
CHUNK_SIZE = 100_000

Load data set

In [94]:
# header=None: the first row is read as data, so the columns get the integer
# labels 0, 1, 2 that the next cell renames.
train = pd.read_csv(
    "data/train.csv",
    header=None,
    encoding="utf-8",
)
test = pd.read_csv(
    "data/test.csv",
    header=None,
    encoding="utf-8",
)

Drop the `title` column: it contains null values and does not appear to add useful signal for training the model. Only the first 50,000 training reviews are kept.

In [95]:
# Name the integer-labelled columns, discard the title, and keep only the
# first 50,000 reviews.
train = (
    train
    .rename(columns={0: "label", 1: "title", 2: "content"})
    .drop(columns=["title"])
    .head(50000)
)
print(train.head())

   label                                            content
0      2  This sound track was beautiful! It paints the ...
1      2  I'm reading a lot of reviews saying that this ...
2      2  This soundtrack is my favorite music of all ti...
3      2  I truly like this soundtrack and I enjoy video...
4      2  If you've played the game, you know how divine...


Drop the `title` column from the test dataset as well, so that it has the same columns as the training data; the column had null values and is not used by the model.

In [96]:
# Apply the same column names to the test data and discard the title column.
test = (
    test
    .rename(columns={0: "label", 1: "title", 2: "content"})
    .drop(columns=["title"])
)
test

Unnamed: 0,label,content
0,2,My lovely Pat has one of the GREAT voices of h...
1,2,Despite the fact that I have only played a sma...
2,1,I bought this charger in Jul 2003 and it worke...
3,2,Check out Maha Energy's website. Their Powerex...
4,2,Reviewed quite a bit of the combo players and ...
...,...,...
399995,1,We bought this Thomas for our son who is a hug...
399996,1,My son recieved this as a birthday gift 2 mont...
399997,1,"I bought this toy for my son who loves the ""Th..."
399998,2,This is a compilation of a wide range of Mitfo...


In [97]:
# Count missing values per column in the training data.
train.isna().sum()

label      0
content    0
dtype: int64

In [98]:
# Count missing values per column in the test data.
test.isna().sum()

label      0
content    0
dtype: int64

In [99]:
# Summarise the training frame: row count, column dtypes, non-null counts and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    50000 non-null  int64 
 1   content  50000 non-null  object
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [100]:
# Hold out 20% of the sampled reviews; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    train,
    test_size=0.2,
    random_state=123,
)

In [101]:
# Separate the review text (features) from the label (target) for each split.
X_train, y_train = train_df["content"], train_df["label"]
# The held-out split must come from test_df; it was previously taken from
# train_df, which made X_test/y_test a copy of the training data.
X_test, y_test = test_df["content"], test_df["label"]

In [102]:
# Preview the first rows of the training split (the index shows the shuffled original row numbers).
train_df.head()


Unnamed: 0,label,content
2660,2,After many years of reading and prodding class...
32815,1,Thought it would be another great courtroom th...
35141,2,Essentially explains why many of us repeat des...
19390,1,Psychosexual? I figured out the plot twist in ...
34846,2,I have used this Gratitude Journal in the past...


In [103]:
# Tune BernoulliNB's smoothing parameter (alpha) with a cross-validated grid
# search over binary bag-of-words features.
# BernoulliNB, CountVectorizer, make_pipeline and GridSearchCV all come from
# the import cell at the top of the notebook.

# NOTE(review): results_dict is not populated by any visible cell — confirm it
# is still needed.
results_dict = {
    "alpha": [],
    "mean_train_score": [],
    "mean_cv_score": [],
    "std_cv_score": [],
    "std_train_score": [],
}

# make_pipeline names each step after its lower-cased class name, hence the
# "bernoullinb__" prefix on the parameter.
param_grid = {"bernoullinb__alpha": [0.1, 0.5, 1.0, 2.0, 5.0]}

pipe_nb = make_pipeline(
    # binary=True records word presence/absence rather than counts, matching
    # the binary features BernoulliNB models.
    CountVectorizer(max_features=10000, stop_words="english", binary=True),
    BernoulliNB(),
)

search = GridSearchCV(
    pipe_nb,
    param_grid,
    cv=5,  # same as the scikit-learn default, stated explicitly
    n_jobs=-1,  # use all available cores
    scoring="accuracy",
    return_train_score=True,
    verbose=1,
)

# Trailing semicolon suppresses the estimator repr in the cell output.
search.fit(X_train, y_train);

Fitting 5 folds for each of 5 candidates, totalling 25 fits


python(44217) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(44218) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(44219) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(44220) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(44221) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(44222) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(44223) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(44224) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [104]:
# Report the winning alpha and its mean cross-validated accuracy.
best_alpha = search.best_params_["bernoullinb__alpha"]
print(
    f"Best alpha parameter found: {best_alpha}",
    f"Best cross-validation score: {search.best_score_:.4f}",
    sep="\n",
)

Best alpha parameter found: 1.0
Best cross-validation score: 0.8257


In [105]:
# Evaluate the tuned pipeline on the external test file (test.csv), which took
# no part in the grid search.
# accuracy_score and classification_report come from sklearn.metrics, imported
# in the notebook's top import cell; without that import this cell raises
# NameError on a fresh kernel.

# With GridSearchCV's default refit=True, best_estimator_ is the best pipeline
# refit on all of X_train.
final_model = search.best_estimator_

X_test_final, y_test_final = test["content"], test["label"]

y_pred_final = final_model.predict(X_test_final)

final_accuracy = accuracy_score(y_test_final, y_pred_final)

# Per-class precision / recall / F1 for labels 1 and 2.
final_report = classification_report(y_test_final, y_pred_final)

print("Final Model Evaluation on External Test Dataset (test.csv) ---")
print(f"Total Test Reviews: {len(X_test_final)}")
print(f"Final Test Accuracy: {final_accuracy:.4f}")
print("\nClassification Report (Labels 1 and 2):")
print(final_report)

Final Model Evaluation on External Test Dataset (test.csv) ---
Total Test Reviews: 400000
Final Test Accuracy: 0.8151

Classification Report (Labels 1 and 2):
              precision    recall  f1-score   support

           1       0.82      0.80      0.81    200000
           2       0.81      0.83      0.82    200000

    accuracy                           0.82    400000
   macro avg       0.82      0.82      0.82    400000
weighted avg       0.82      0.82      0.82    400000

