In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, recall_score, roc_auc_score

# Load the Higgs dataset (replace 'higgs.csv.gz' with your dataset file)
data = pd.read_csv(r'C:\Users\grguo\Dropbox\000000000000000learning\COMP4730\Assignment 1\higgs10k.csv', skiprows=1, header=None)

# Extract the label and features from the dataset
X = data.iloc[:, 1:]  # Features: All columns except the first
y = data.iloc[:, 0]   # Label: First column

# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Define a range of hyperparameters to search
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly']  
}

# Initialize the SVM classifier
svm = SVC(random_state=42)

# Create a GridSearchCV instance to search for the best hyperparameters
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=kf, scoring='accuracy')

# Lists to store accuracy, recall, and AUC for each fold
accuracies = []
recalls = []
auc_scores = []

# Perform 10-fold cross-validation with hyperparameter tuning
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Perform hyperparameter tuning using GridSearchCV on the training data
    grid_search.fit(X_train, y_train)
    
    # Get the best model with tuned hyperparameters
    best_svm = grid_search.best_estimator_
    
    # Calculate predicted probability scores for AUC calculation
    y_scores = best_svm.decision_function(X_test)
    
    # Make predictions on the test data
    y_pred = best_svm.predict(X_test)
    
    # Calculate accuracy, recall, and AUC for this fold
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_scores)  # Use probability scores
    
    # Append accuracy, recall, and AUC to the respective lists
    accuracies.append(accuracy)
    recalls.append(recall)
    auc_scores.append(auc)

# Print average accuracy, recall, and AUC across all folds
print("Average Accuracy:", np.mean(accuracies))
print("Average Recall:", np.mean(recalls))
print("Average AUC:", np.mean(auc_scores))

# Print the best hyperparameters found by GridSearchCV
print("Best Hyperparameters:", grid_search.best_params_)


Average Accuracy: 0.65
Average Recall: 0.7886219318520691
Average AUC: 0.6835902695948125
Best Hyperparameters: {'C': 10, 'kernel': 'linear'}
