# Import Packages

In [None]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import (
    RandomizedSearchCV,
    GridSearchCV,
)

from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline

CHUNK_SIZE = 100000 # Process 50,000 reviews at a time

# Load the datasets 

In [11]:
train = pd.read_csv('data/train.csv',header=None, encoding='utf-8')
test = pd.read_csv('data/test.csv',header=None, encoding='utf-8')

Dropped title column because it had null values and does not seem to add anything of value to train the model. Also taking a small chunk from the entire dataset.

In [21]:
train = train.rename(columns={0:"label",1:"title",2:"content"})
train = train.drop(["title"], axis=1)
train = train[:CHUNK_SIZE]
print(train.head())

KeyError: "['title'] not found in axis"

Dropped title column for test dataset as well because it had null values and does not seem to add anything of value to train the model

In [None]:
test = test.rename(columns={0:"label",1:"title",2:"content"})
test = test.drop(["title"], axis=1)
test

Unnamed: 0,label,content
0,2,My lovely Pat has one of the GREAT voices of h...
1,2,Despite the fact that I have only played a sma...
2,1,I bought this charger in Jul 2003 and it worke...
3,2,Check out Maha Energy's website. Their Powerex...
4,2,Reviewed quite a bit of the combo players and ...
...,...,...
399995,1,We bought this Thomas for our son who is a hug...
399996,1,My son recieved this as a birthday gift 2 mont...
399997,1,"I bought this toy for my son who loves the ""Th..."
399998,2,This is a compilation of a wide range of Mitfo...


In [None]:
train.isnull().sum()

label      0
content    0
dtype: int64

In [None]:
test.isnull().sum()

label      0
content    0
dtype: int64

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   label    100000 non-null  int64 
 1   content  100000 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


Checking if there is huge disparity in the label column for the two categorical values because this can cause problems when using Naive Bayes. There is no disparity between label 1 and 2 in the chunk I have taken

In [23]:
train['label'].value_counts()

label
2    51267
1    48733
Name: count, dtype: int64

# Taking the test and train df separately

In [27]:
X_train, y_train = train["content"], train["label"]
X_test, y_test = test["content"], test["label"]

Run the countVectoriser thru pipeline and then BernoulliNB

In [33]:
pipe_nb = Pipeline([
    # This step will be replaced by either CountVectorizer or TfidfVectorizer
    ('vectorizer', CountVectorizer()), 
    ('nb', BernoulliNB(alpha=1))
])

# Define the Expanded Parameter Grid:
param_grid = [
    # --- Option 1: Tune CountVectorizer ---
    {
        'vectorizer': [CountVectorizer(stop_words='english', binary=True)], # Use binary=True for BernoulliNB
        'vectorizer__max_features': [5000, 10000, 20000],
        'nb__alpha': [0.1, 1.0, 5.0]
    },
    
    # --- Option 2: Tune TfidfVectorizer ---
    {
        'vectorizer': [TfidfVectorizer(stop_words='english', binary=True)], # Also use binary=True
        'vectorizer__max_features': [5000, 10000, 20000],
        'vectorizer__ngram_range': [(1, 1), (1, 2)], # Test unigrams vs. unigrams+bigrams
        'nb__alpha': [0.1, 1.0, 5.0]
    }
]

search = GridSearchCV(
    pipe_nb, 
    param_grid,
    n_jobs=-1, 
    scoring='accuracy', 
    return_train_score=True,
    verbose=2
)

search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[CV] END nb__alpha=0.1, vectorizer=CountVectorizer(binary=True, stop_words='english'), vectorizer__max_features=5000; total time=   3.3s
[CV] END nb__alpha=0.1, vectorizer=CountVectorizer(binary=True, stop_words='english'), vectorizer__max_features=5000; total time=   3.2s
[CV] END nb__alpha=0.1, vectorizer=CountVectorizer(binary=True, stop_words='english'), vectorizer__max_features=5000; total time=   3.4s
[CV] END nb__alpha=0.1, vectorizer=CountVectorizer(binary=True, stop_words='english'), vectorizer__max_features=5000; total time=   3.4s
[CV] END nb__alpha=0.1, vectorizer=CountVectorizer(binary=True, stop_words='english'), vectorizer__max_features=5000; total time=   3.5s
[CV] END nb__alpha=0.1, vectorizer=CountVectorizer(binary=True, stop_words='english'), vectorizer__max_features=10000; total time=   3.3s
[CV] END nb__alpha=0.1, vectorizer=CountVectorizer(binary=True, stop_words='english'), vectorizer__max_features=10000; total time=   3.3s
[CV] END nb__alpha=0.1, vectorizer=Coun

0,1,2
,estimator,Pipeline(step...NB(alpha=1))])
,param_grid,"[{'nb__alpha': [0.1, 1.0, ...], 'vectorizer': [CountVectoriz...rds='english')], 'vectorizer__max_features': [5000, 10000, ...]}, {'nb__alpha': [0.1, 1.0, ...], 'vectorizer': [TfidfVectoriz...rds='english')], 'vectorizer__max_features': [5000, 10000, ...], 'vectorizer__ngram_range': [(1, ...), (1, ...)]}]"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,0.1
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


In [35]:
results = pd.DataFrame(search.cv_results_)

# 2. Find the index of the best score
best_index = search.best_index_

# 3. Extract key details for the best model configuration
best_model_details = {
    "Best Parameters": search.best_params_,
    "Best CV Score (Mean Validation)": search.best_score_,
    "Mean Training Score": results.loc[best_index, 'mean_train_score'],
    "Std Dev CV Score": results.loc[best_index, 'std_test_score'],
    "Std Dev Training Score": results.loc[best_index, 'std_train_score'],
    "Vectorization Method Used": search.best_params_['nb__alpha'], # Alpha doesn't identify the vectorizer, so we rely on the pipeline structure.
    "Best Alpha (Smoothing)": search.best_params_['nb__alpha']
}

print("--- Best Model Configuration ---")
for key, value in best_model_details.items():
    if isinstance(value, float):
        print(f"**{key}**: {value:.4f}")
    else:
        print(f"**{key}**: {value}")

# 4. (Optional) Display a summary table of the top 3 results
print("\n--- Top 3 Model Configurations ---")
top_3_results = results.sort_values('rank_test_score').head(3)

# Display relevant columns
summary = top_3_results[['params', 'mean_train_score', 'mean_test_score', 'rank_test_score']]
summary.columns = ['Parameters', 'Mean Train Accuracy', 'Mean CV Accuracy', 'Rank']
print(summary)

--- Best Model Configuration ---
**Best Parameters**: {'nb__alpha': 0.1, 'vectorizer': TfidfVectorizer(binary=True, stop_words='english'), 'vectorizer__max_features': 20000, 'vectorizer__ngram_range': (1, 2)}
**Best CV Score (Mean Validation)**: 0.8399
**Mean Training Score**: 0.8634
**Std Dev CV Score**: 0.0035
**Std Dev Training Score**: 0.0008
**Vectorization Method Used**: 0.1000
**Best Alpha (Smoothing)**: 0.1000

--- Top 3 Model Configurations ---
                                           Parameters  Mean Train Accuracy  \
14  {'nb__alpha': 0.1, 'vectorizer': TfidfVectoriz...             0.863410   
20  {'nb__alpha': 1.0, 'vectorizer': TfidfVectoriz...             0.861962   
26  {'nb__alpha': 5.0, 'vectorizer': TfidfVectoriz...             0.857272   

    Mean CV Accuracy  Rank  
14           0.83986     1  
20           0.83941     2  
26           0.83732     3  


In [39]:
from sklearn.metrics import classification_report, accuracy_score


final_model = search.best_estimator_

X_test, y_test = test["content"], test["label"]

y_prediction = final_model.predict(X_test)

final_accuracy = accuracy_score(y_test, y_prediction)

final_report = classification_report(y_test, y_prediction)

print("Final Model Evaluation on External Test Dataset (test.csv) ---")
print(f"Total Test Reviews: {len(X_test)}")
print(f"Final Test Accuracy: {final_accuracy:.4f}")
print("\nClassification Report (Labels 1 and 2):")
print(final_report)

Final Model Evaluation on External Test Dataset (test.csv) ---
Total Test Reviews: 400000
Final Test Accuracy: 0.8387

Classification Report (Labels 1 and 2):
              precision    recall  f1-score   support

           1       0.85      0.83      0.84    200000
           2       0.83      0.85      0.84    200000

    accuracy                           0.84    400000
   macro avg       0.84      0.84      0.84    400000
weighted avg       0.84      0.84      0.84    400000

