In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('/content/pca_transformed_data_25.csv')

In [3]:
df_train, df_temp = train_test_split(df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

In [4]:
print(f"Original dataframe size: {len(df)}")
print(f"Training set size: {len(df_train)}")
print(f"Validation set size: {len(df_val)}")
print(f"Testing set size: {len(df_test)}")

Original dataframe size: 50000
Training set size: 35000
Validation set size: 7500
Testing set size: 7500


In [5]:
# Split into features and target
X_train = df_train.drop(columns=['Test Results'])
y_train = df_train['Test Results']

X_val = df_val.drop(columns=['Test Results'])
y_val = df_val['Test Results']

X_test = df_test.drop(columns=['Test Results'])
y_test = df_test['Test Results']

In [6]:
# Define the model
gnb = GaussianNB()

# Hyperparameter grid
param_grid = {
    'var_smoothing': np.logspace(-12, -6, 7)
}

In [7]:
# GridSearch with 5-fold CV
grid_search = GridSearchCV(gnb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [8]:
# Best model
best_gnb = grid_search.best_estimator_
print("Best var_smoothing:", grid_search.best_params_)

Best var_smoothing: {'var_smoothing': np.float64(1e-12)}


In [9]:
# Evaluate on validation set
val_preds = best_gnb.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, val_preds))
print("\nValidation Classification Report:\n", classification_report(y_val, val_preds))


Validation Accuracy: 0.7452

Validation Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.76      0.77      2464
           1       0.68      0.71      0.70      2509
           2       0.79      0.76      0.77      2527

    accuracy                           0.75      7500
   macro avg       0.75      0.75      0.75      7500
weighted avg       0.75      0.75      0.75      7500



In [10]:
# Final test performance
test_preds = best_gnb.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, test_preds))
print("\nTest Classification Report:\n", classification_report(y_test, test_preds))


Test Accuracy: 0.7584

Test Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.78      0.78      2547
           1       0.69      0.71      0.70      2440
           2       0.80      0.78      0.79      2513

    accuracy                           0.76      7500
   macro avg       0.76      0.76      0.76      7500
weighted avg       0.76      0.76      0.76      7500



In [11]:
# The accuracy
accuracy = accuracy_score(y_test, test_preds)
print(f"Accuracy: {accuracy * 100:.2f}%")
#

Accuracy: 75.84%
