In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, roc_auc_score

# --- Cross-validation configuration ---------------------------------------
# Shuffled folds with a fixed seed so the same splits are produced each run.
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# --- Data -------------------------------------------------------------------
# The first row of the file is skipped and no header is parsed, so columns
# are addressed purely by position.
data = pd.read_csv(
    r'C:\Users\grguo\Dropbox\000000000000000learning\COMP4730\Assignment 1\higgs10k.csv',
    header=None,
    skiprows=1,
)

# Column 0 holds the label; every remaining column is a feature.
X, y = data.iloc[:, 1:], data.iloc[:, 0]

# --- Model and hyperparameter search ----------------------------------------
# Gaussian Naive Bayes, tuned over candidate var_smoothing values.
nb = GaussianNB()
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}

# The search scores candidates by accuracy using the folds defined above.
grid_search = GridSearchCV(nb, param_grid, cv=kf, scoring='accuracy')

# Per-fold results of the outer cross-validation loop.
cross_val_accuracies = []
cross_val_aucs = []
# GridSearchCV overwrites best_params_ on every fit, so the setting selected
# in each outer fold is recorded here instead of being read once after the
# loop (which would only show the last fold's choice).
fold_best_params = []

# Outer loop: tune on each training split, then score the tuned model on the
# held-out split that played no part in the tuning.
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Inner hyperparameter search, restricted to this fold's training data.
    grid_search.fit(X_train, y_train)
    fold_best_params.append(grid_search.best_params_)

    # Model carrying the hyperparameters selected for this fold.
    best_nb = grid_search.best_estimator_

    # Accuracy on the held-out split.
    y_pred = best_nb.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cross_val_accuracies.append(accuracy)

    # AUC needs a score per sample; column 1 of predict_proba is used.
    # Estimators without predict_proba contribute no AUC value.
    if hasattr(best_nb, 'predict_proba'):
        y_prob = best_nb.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_prob)
        cross_val_aucs.append(auc)

# Average the per-fold metrics; AUC is NaN when no fold produced one.
average_accuracy = np.mean(cross_val_accuracies)
average_auc = np.mean(cross_val_aucs) if cross_val_aucs else np.nan

# Report the averaged metrics, labelled with the configured fold count.
print(f"Gaussian Naive Bayes {num_folds}-Fold Cross-Validation Accuracy:", average_accuracy)
print(f"Gaussian Naive Bayes {num_folds}-Fold Cross-Validation AUC:", average_auc)

# Report the hyperparameters selected in every outer fold, since each fold is
# tuned independently and may settle on a different value.
for fold_number, fold_params in enumerate(fold_best_params, start=1):
    print(f"Best Hyperparameters (fold {fold_number}):", fold_params)
