In [18]:
import os
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from utils import save_predictions_to_csv, standardize_data, calculate_auc_score, compare_auc_scores

In [19]:
# Load every dataset folder found under DATA_ROOT.
# Each folder is expected to contain X_train.csv, y_train.csv and X_test.csv.
# The four lists below are index-aligned: position k in each list refers to
# the same dataset folder.
DATA_ROOT = "./Competition_data"

dataset_names=[]
X_trains=[]
y_trains=[]
X_tests=[]
# os.listdir() returns entries in arbitrary order, so sort the listing to keep
# the dataset order (and everything indexed by it) stable between runs and
# machines. Plain lexicographic order: Dataset_1, Dataset_10, Dataset_11, ...
for folder_name in sorted(os.listdir(DATA_ROOT)):
    folder_path = os.path.join(DATA_ROOT, folder_name)
    # Skip stray files and hidden entries (e.g. .DS_Store, .ipynb_checkpoints)
    # that are not dataset folders and would make read_csv fail.
    if folder_name.startswith(".") or not os.path.isdir(folder_path):
        continue
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(os.path.join(folder_path, "X_train.csv"),header=0))
    y_trains.append(pd.read_csv(os.path.join(folder_path, "y_train.csv"),header=0))
    X_tests.append(pd.read_csv(os.path.join(folder_path, "X_test.csv"),header=0))


# Sanity check: show the shapes of (at most) the first five datasets.
for i in range(min(5, len(dataset_names))):
    print(f"Dataset: {dataset_names[i]}")
    print(f"X_train shape: {X_trains[i].shape}")
    print(f"y_train shape: {y_trains[i].shape}")
    print(f"X_test shape: {X_tests[i].shape}")
    print("-" * 30)

Dataset: Dataset_1
X_train shape: (444, 20)
y_train shape: (444, 1)
X_test shape: (296, 20)
------------------------------
Dataset: Dataset_10
X_train shape: (467, 11)
y_train shape: (467, 1)
X_test shape: (312, 11)
------------------------------
Dataset: Dataset_11
X_train shape: (58, 62)
y_train shape: (58, 1)
X_test shape: (39, 62)
------------------------------
Dataset: Dataset_12
X_train shape: (154, 5)
y_train shape: (154, 1)
X_test shape: (104, 5)
------------------------------
Dataset: Dataset_13
X_train shape: (181, 54)
y_train shape: (181, 1)
X_test shape: (122, 54)
------------------------------


## Put your code below

In [24]:
# Random Forest baseline: fit one model per dataset and score it on a held-out split.
from models import train_random_forest

rf_models = []
rf_auc_scores = []

# dataset_names, X_trains and y_trains are index-aligned, so walk them together.
for dataset_name, features, labels in zip(dataset_names, X_trains, y_trains):
    # Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
    )

    # Run both parts through the standardize_data helper from utils
    # (presumably feature scaling -- confirm against utils).
    tmp_X_train, tmp_X_test = standardize_data(tmp_X_train, tmp_X_test)

    # Fit a 100-tree forest on the training part and keep the fitted model.
    rf_model = train_random_forest(tmp_X_train, tmp_y_train, n_estimators=100)
    rf_models.append(rf_model)

    # Score on the held-out part and record the result for this dataset.
    auc_score = calculate_auc_score(rf_model, tmp_X_test, tmp_y_test)
    rf_auc_scores.append(auc_score)

    # Uncomment to inspect per-dataset scores:
    #print(f"Dataset: {dataset_name} - Random Forest AUC Score: {auc_score:.4f}")

In [25]:
# SVM baseline (degree-2 polynomial kernel): one model per dataset, scored on a held-out split.
from models import train_svm

# Fitted models and their scores, in the same order as dataset_names.
svm_models = []
svm_auc_scores = []

# dataset_names, X_trains and y_trains are index-aligned, so walk them together.
for dataset_name, features, labels in zip(dataset_names, X_trains, y_trains):
    # Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
    )

    # Run both parts through the standardize_data helper from utils
    # (presumably feature scaling -- confirm against utils).
    tmp_X_train, tmp_X_test = standardize_data(tmp_X_train, tmp_X_test)

    # Fit the SVM on the training part and keep the fitted model.
    svm_model = train_svm(tmp_X_train, tmp_y_train, kernel='poly', degree=2)
    svm_models.append(svm_model)

    # Score on the held-out part and record the result for this dataset.
    auc_score = calculate_auc_score(svm_model, tmp_X_test, tmp_y_test)
    svm_auc_scores.append(auc_score)

    # Uncomment to inspect per-dataset scores:
    #print(f"Dataset: {dataset_name} - SVM AUC Score: {auc_score:.4f}")

In [27]:
# Per-dataset AUC comparison: SVM is passed first (Model 1), Random Forest second (Model 2).
comparison_header = "\nComparing AUC Scores between SVM(Model 1) and Random Forest(Model 2):"
print(comparison_header)
auc_differences = compare_auc_scores(
    svm_auc_scores,
    rf_auc_scores,
    dataset_names,
)


Comparing AUC Scores between SVM(Model 1) and Random Forest(Model 2):
Dataset: Dataset_1 - AUC Difference: 0.0204 (Model 1 - Model 2)
Dataset: Dataset_10 - AUC Difference: -0.0765 (Model 1 - Model 2)
Dataset: Dataset_11 - AUC Difference: -0.0286 (Model 1 - Model 2)
Dataset: Dataset_12 - AUC Difference: -0.1381 (Model 1 - Model 2)
Dataset: Dataset_13 - AUC Difference: -0.1867 (Model 1 - Model 2)
Dataset: Dataset_14 - AUC Difference: -0.1733 (Model 1 - Model 2)
Dataset: Dataset_15 - AUC Difference: -0.0887 (Model 1 - Model 2)
Dataset: Dataset_16 - AUC Difference: -0.0024 (Model 1 - Model 2)
Dataset: Dataset_17 - AUC Difference: -0.2519 (Model 1 - Model 2)
Dataset: Dataset_18 - AUC Difference: 0.0000 (Model 1 - Model 2)
Dataset: Dataset_19 - AUC Difference: -0.1918 (Model 1 - Model 2)
Dataset: Dataset_2 - AUC Difference: -0.0024 (Model 1 - Model 2)
Dataset: Dataset_20 - AUC Difference: -0.0606 (Model 1 - Model 2)
Dataset: Dataset_21 - AUC Difference: -0.3304 (Model 1 - Model 2)
Dataset: 