In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from joblib import dump

In [2]:
# All prepared CSV files sit in one directory; only the file names differ.
prepared_dir = os.path.join('..', 'datasets', 'prepared_data')

# File locations
X_train_resampled_path = os.path.join(prepared_dir, 'X_train_resampled.csv')  # training features, SMOTE applied
X_train_reduced_path = os.path.join(prepared_dir, 'X_train_reduced.csv')      # training features, SMOTE then PCA
X_test_path = os.path.join(prepared_dir, 'X_test_transformed.csv')            # test features for the non-PCA model
X_test_reduced_path = os.path.join(prepared_dir, 'X_test_reduced.csv')        # test features, PCA applied
y_train_path = os.path.join(prepared_dir, 'y_train_resampled.csv')            # training labels, SMOTE applied
y_test_path = os.path.join(prepared_dir, 'y_test.csv')                        # test labels

# Feature matrices stay as DataFrames
X_train_resampled = pd.read_csv(X_train_resampled_path)  # SMOTE-resampled (not PCA-reduced)
X_train_reduced = pd.read_csv(X_train_reduced_path)      # SMOTE-resampled, then PCA-reduced
X_test_reduced = pd.read_csv(X_test_reduced_path)        # PCA-reduced
X_test = pd.read_csv(X_test_path)

# Labels are read and flattened straight away so that they are 1D arrays
y_train = pd.read_csv(y_train_path).values.ravel()
y_test = pd.read_csv(y_test_path).values.ravel()

In [3]:
# Estimator to tune: Gaussian Naive Bayes
gnb = GaussianNB()

# Search space: five candidate values for the variance-smoothing term
param_grid = dict(var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5])

# Grid search ranked by recall over 10 cross-validation folds.
# n_jobs=-1 uses all available CPU cores; verbose=1 prints progress details.
# Swap `scoring` for 'roc_auc', 'f1', etc. if a different target metric is wanted.
# NOTE(review): X_train_resampled is annotated as SMOTE-resampled where it is loaded;
# if so, the folds are cut from already-resampled data and the CV recall may be
# optimistic -- confirm against the held-out test report.
grid_search = GridSearchCV(gnb, param_grid, scoring='recall', cv=10, n_jobs=-1, verbose=1)
grid_search.fit(X_train_resampled, y_train)

# Keep the winning estimator for prediction and saving in later cells
best_model = grid_search.best_estimator_

# Report the selected hyperparameters and their cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best parameters: {'var_smoothing': 1e-09}
Best score: 0.8398779355584981


In [4]:
# Predict test labels with the model tuned on the non-PCA training features
y_pred = best_model.predict(X=X_test)

In [5]:
# Per-class precision / recall / F1 on the test set (classification_report returns text, hence print)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.91      0.67      0.77      1552
           1       0.47      0.82      0.60       561

    accuracy                           0.71      2113
   macro avg       0.69      0.75      0.69      2113
weighted avg       0.80      0.71      0.73      2113



In [6]:
# Save the trained model.
# joblib's dump() writes into an existing directory only, so create the target
# folder first; exist_ok=True makes re-running this cell harmless.
model_dir = os.path.join('..', 'trained_models')
os.makedirs(model_dir, exist_ok=True)
# dump() stays the last expression so the cell still displays the written file name(s)
dump(best_model, os.path.join(model_dir, 'naive_bayes.joblib'))

['..\\trained_models\\naive_bayes.joblib']

### Training and Evaluating on the PCA-Transformed Dataset

In [7]:
 # Define the base model
gnb = GaussianNB()

# Search space: the same five variance-smoothing candidates as for the non-PCA model
param_grid = dict(var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5])

# Grid search ranked by recall over 10 cross-validation folds.
# n_jobs=-1 uses all available CPU cores; verbose=1 prints progress details.
# Swap `scoring` for 'roc_auc', 'f1', etc. if a different target metric is wanted.
# NOTE(review): X_train_reduced is annotated as resampled before PCA where it is
# loaded; if so, the CV recall may be optimistic -- confirm against the test report.
# NOTE: gnb, param_grid and grid_search are rebound here, replacing the objects
# created for the non-PCA search earlier in the notebook.
grid_search = GridSearchCV(gnb, param_grid, scoring='recall', cv=10, n_jobs=-1, verbose=1)
grid_search.fit(X_train_reduced, y_train)

# Keep the winning estimator under its own name so the non-PCA best_model survives
best_model_w_pca = grid_search.best_estimator_

# Report the selected hyperparameters and their cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best parameters: {'var_smoothing': 1e-09}
Best score: 0.7766601220644415


In [8]:
# Predict test labels with the PCA model.
# NOTE: this rebinds y_pred, replacing the non-PCA model's predictions from the earlier cell.
y_pred = best_model_w_pca.predict(X=X_test_reduced)

In [9]:
# Per-class precision / recall / F1 for the current y_pred (the PCA model's predictions when run in order)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.89      0.74      0.81      1552
           1       0.51      0.74      0.60       561

    accuracy                           0.74      2113
   macro avg       0.70      0.74      0.71      2113
weighted avg       0.79      0.74      0.75      2113

