In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

In [17]:
df = pd.read_csv('/content/Training_Set_Preprocessed_Final.csv')

In [18]:
df_train, df_temp = train_test_split(df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

In [19]:
print(f"Original dataframe size: {len(df)}")
print(f"Training set size: {len(df_train)}")
print(f"Validation set size: {len(df_val)}")
print(f"Testing set size: {len(df_test)}")

Original dataframe size: 44167
Training set size: 30916
Validation set size: 6625
Testing set size: 6626


In [20]:
class HybridNaiveBayes(BaseEstimator, ClassifierMixin):
    def __init__(self, categorical_features, numerical_features, alpha=1.0, var_smoothing=1e-9):
        self.categorical_features = categorical_features
        self.numerical_features = numerical_features
        self.alpha = alpha
        self.var_smoothing = var_smoothing
        self.multi_nb = MultinomialNB(alpha=self.alpha)
        self.gauss_nb = GaussianNB(var_smoothing=self.var_smoothing)
        self.classes_ = None

    def fit(self, X, y):
        X_cat = X[self.categorical_features].values
        X_num = X[self.numerical_features].values

        self.multi_nb = MultinomialNB(alpha=self.alpha)
        self.gauss_nb = GaussianNB(var_smoothing=self.var_smoothing)

        self.multi_nb.fit(X_cat, y)
        self.gauss_nb.fit(X_num, y)

        self.classes_ = self.multi_nb.classes_
        return self

    def predict_log_proba(self, X):
        X_cat = X[self.categorical_features].values
        X_num = X[self.numerical_features].values

        log_prob_cat = self.multi_nb.predict_log_proba(X_cat)
        log_prob_num = self.gauss_nb.predict_log_proba(X_num)

        return log_prob_cat + log_prob_num

    def predict(self, X):
        combined_log_proba = self.predict_log_proba(X)
        return self.classes_[np.argmax(combined_log_proba, axis=1)]

In [21]:
# === Define your dataset
numerical_features = ['Age', 'Billing Amount']
categorical_features = [col for col in df_train.columns if col not in numerical_features + ['Test Results']]
X = df_train[categorical_features + numerical_features]
y = df_train['Test Results']

# === Define parameter grid
param_grid = {
    'alpha': [0.01, 0.1, 1.0],
    'var_smoothing': [1e-11, 1e-9, 1e-7]
}

# === Wrap in GridSearchCV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
hybrid_model = HybridNaiveBayes(categorical_features, numerical_features)

grid = GridSearchCV(hybrid_model, param_grid, cv=skf, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X, y)

# === Best model
print("Best parameters:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters: {'alpha': 0.01, 'var_smoothing': 1e-11}
Best CV accuracy: 0.7758764433380432


In [22]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X)

In [23]:
X_val = df_val[categorical_features + numerical_features]
y_val = df_val['Test Results']

X_test = df_test[categorical_features + numerical_features]
y_test = df_test['Test Results']

# === Predict on validation set
y_val_pred = best_model.predict(X_val)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))

# === Predict on test set
y_test_pred = best_model.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      2238
           1       0.75      0.73      0.74      2221
           2       0.82      0.82      0.82      2166

    accuracy                           0.78      6625
   macro avg       0.78      0.78      0.78      6625
weighted avg       0.78      0.78      0.78      6625

Test Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      2250
           1       0.75      0.74      0.75      2233
           2       0.82      0.82      0.82      2143

    accuracy                           0.78      6626
   macro avg       0.78      0.78      0.78      6626
weighted avg       0.78      0.78      0.78      6626

