In [1]:
import os
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from utils import save_predictions_to_csv, standardize_data, calculate_auc_score, compare_auc_scores

In [2]:
#Load datasets
dataset_names=[]
X_trains=[]
y_trains=[]
X_tests=[]
for folder_name in os.listdir("./Competition_data"):
    # print(folder_name)
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/X_train.csv",header=0))
    y_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/y_train.csv",header=0))
    X_tests.append(pd.read_csv(f"./Competition_data/{folder_name}/X_test.csv",header=0))


for i in range(min(5, len(dataset_names))):
    print(f"Dataset: {dataset_names[i]}")
    print(f"X_train shape: {X_trains[i].shape}")
    print(f"y_train shape: {y_trains[i].shape}")
    print(f"X_test shape: {X_tests[i].shape}")
    print("-" * 30)

Dataset: Dataset_1
X_train shape: (444, 20)
y_train shape: (444, 1)
X_test shape: (296, 20)
------------------------------
Dataset: Dataset_10
X_train shape: (467, 11)
y_train shape: (467, 1)
X_test shape: (312, 11)
------------------------------
Dataset: Dataset_11
X_train shape: (58, 62)
y_train shape: (58, 1)
X_test shape: (39, 62)
------------------------------
Dataset: Dataset_12
X_train shape: (154, 5)
y_train shape: (154, 1)
X_test shape: (104, 5)
------------------------------
Dataset: Dataset_13
X_train shape: (181, 54)
y_train shape: (181, 1)
X_test shape: (122, 54)
------------------------------


## Put your code below

In [8]:
def select_important_features(X_train, y_train, n_features=10):
    """
    Select the most important features using RandomForest feature importances.
    
    Parameters:
    X_train : pd.DataFrame
        The training features.
    y_train : pd.Series or array-like
        The training labels.
    n_features : int, optional
        The number of top important features to select (default is 10).
    
    Returns:
    pd.DataFrame
        A DataFrame containing only the selected important features.
    list
        The list of selected feature names.
    """
    # Train a RandomForest model to get feature importances
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Get feature importances
    importances = model.feature_importances_
    feature_indices = importances.argsort()[-n_features:]  # Select top n features
    
    # Return the DataFrame with selected important features
    selected_features = X_train.columns[feature_indices]
    return X_train[selected_features], selected_features

In [9]:
#RF test
from models import train_random_forest

for i in range(len(dataset_names)):
    print(f"Processing dataset: {dataset_names[i]}")
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_trains[i])
    X_test_scaled = scaler.transform(X_tests[i])
    
    X_train_df = pd.DataFrame(X_train_scaled, columns=X_trains[i].columns)
    X_test_df = pd.DataFrame(X_test_scaled, columns=X_tests[i].columns)
    
    X_train_selected, selected_features = select_important_features(X_train_df, y_trains[i].values.ravel(), n_features=10)
    
    X_test_selected = X_test_df[selected_features]
    
    model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    model_rf.fit(X_train_selected, y_trains[i].values.ravel())
    
    y_prob_train = model_rf.predict_proba(X_train_selected)[:, 1]
    auc_score = roc_auc_score(y_trains[i], y_prob_train)
    print(f"Dataset: {dataset_names[i]} - AUC after Feature Selection: {auc_score:.4f}")
    
    y_prob_test = model_rf.predict_proba(X_test_selected)[:, 1]
    
    save_predictions_to_csv(y_prob_test, dataset_names[i], folder_path='./Competition_data/')

Processing dataset: Dataset_1
Dataset: Dataset_1 - AUC after Feature Selection: 1.0000
Processing dataset: Dataset_10
Dataset: Dataset_10 - AUC after Feature Selection: 1.0000
Processing dataset: Dataset_11
Dataset: Dataset_11 - AUC after Feature Selection: 1.0000
Processing dataset: Dataset_12
Dataset: Dataset_12 - AUC after Feature Selection: 1.0000
Processing dataset: Dataset_13
Dataset: Dataset_13 - AUC after Feature Selection: 1.0000
Processing dataset: Dataset_14
Dataset: Dataset_14 - AUC after Feature Selection: 1.0000
Processing dataset: Dataset_15
Dataset: Dataset_15 - AUC after Feature Selection: 1.0000
Processing dataset: Dataset_16
Dataset: Dataset_16 - AUC after Feature Selection: 1.0000
Processing dataset: Dataset_17
Dataset: Dataset_17 - AUC after Feature Selection: 0.9999
Processing dataset: Dataset_18
Dataset: Dataset_18 - AUC after Feature Selection: 1.0000
Processing dataset: Dataset_19
Dataset: Dataset_19 - AUC after Feature Selection: 1.0000
Processing dataset: Dat

In [7]:
from sklearn.svm import SVC

for i in range(len(dataset_names)):
    print(f"Processing dataset: {dataset_names[i]}")
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_trains[i])
    X_test_scaled = scaler.transform(X_tests[i])
    
    X_train_df = pd.DataFrame(X_train_scaled, columns=X_trains[i].columns)
    X_test_df = pd.DataFrame(X_test_scaled, columns=X_tests[i].columns)
    
    X_train_selected, selected_features = select_important_features(X_train_df, y_trains[i].values.ravel(), n_features=10)
    
    X_test_selected = X_test_df[selected_features]

    model_svm = SVC(kernel='rbf', probability=True, random_state=42)
    model_svm.fit(X_train_selected, y_trains[i].values.ravel())
    
    y_prob_train = model_svm.predict_proba(X_train_selected)[:, 1]
    auc_score = roc_auc_score(y_trains[i], y_prob_train)
    print(f"Dataset: {dataset_names[i]} - SVM AUC after Feature Selection: {auc_score:.4f}")
    
    y_prob_test = model_svm.predict_proba(X_test_selected)[:, 1]
    
    save_predictions_to_csv(y_prob_test, dataset_names[i], folder_path='./Competition_data/')

Processing dataset: Dataset_1
Dataset: Dataset_1 - SVM AUC after Feature Selection: 0.9012
Processing dataset: Dataset_10
Dataset: Dataset_10 - SVM AUC after Feature Selection: 0.8750
Processing dataset: Dataset_11
Dataset: Dataset_11 - SVM AUC after Feature Selection: 1.0000
Processing dataset: Dataset_12
Dataset: Dataset_12 - SVM AUC after Feature Selection: 0.9420
Processing dataset: Dataset_13
Dataset: Dataset_13 - SVM AUC after Feature Selection: 0.9677
Processing dataset: Dataset_14
Dataset: Dataset_14 - SVM AUC after Feature Selection: 1.0000
Processing dataset: Dataset_15
Dataset: Dataset_15 - SVM AUC after Feature Selection: 0.8764
Processing dataset: Dataset_16
Dataset: Dataset_16 - SVM AUC after Feature Selection: 0.9978
Processing dataset: Dataset_17
Dataset: Dataset_17 - SVM AUC after Feature Selection: 0.9779
Processing dataset: Dataset_18
Dataset: Dataset_18 - SVM AUC after Feature Selection: 1.0000
Processing dataset: Dataset_19
Dataset: Dataset_19 - SVM AUC after Featu

In [10]:
# Loop through each dataset and save the results as binary (0/1)
for i, dataset_name in enumerate(dataset_names):
    # Standardize the test set using the previously fitted scaler
    X_test_standardized = standardize_data(X_trains[i], X_tests[i])[1]
    
    # Make predictions on the test set and convert to binary (0 or 1)
    y_prob = svm_models[i].predict_proba(X_test_standardized)[:, 1] 
    
    # Save predictions to CSV
    save_predictions_to_csv(y_prob, dataset_name, folder_path='./Competition_data/')

In [6]:
# Compare AUC scores between SVM and Random Forest
print("\nComparing AUC Scores between SVM(Model 1) and Random Forest(Model 2):")
auc_differences = compare_auc_scores(svm_auc_scores, rf_auc_scores, dataset_names)


Comparing AUC Scores between SVM(Model 1) and Random Forest(Model 2):
Dataset: Dataset_1 - AUC Difference: 0.0204 (Model 1 - Model 2)
Dataset: Dataset_10 - AUC Difference: -0.0765 (Model 1 - Model 2)
Dataset: Dataset_11 - AUC Difference: 0.0000 (Model 1 - Model 2)
Dataset: Dataset_12 - AUC Difference: -0.1381 (Model 1 - Model 2)
Dataset: Dataset_13 - AUC Difference: -0.1867 (Model 1 - Model 2)
Dataset: Dataset_14 - AUC Difference: -0.1733 (Model 1 - Model 2)
Dataset: Dataset_15 - AUC Difference: -0.0887 (Model 1 - Model 2)
Dataset: Dataset_16 - AUC Difference: -0.0024 (Model 1 - Model 2)
Dataset: Dataset_17 - AUC Difference: -0.2519 (Model 1 - Model 2)
Dataset: Dataset_18 - AUC Difference: 0.0000 (Model 1 - Model 2)
Dataset: Dataset_19 - AUC Difference: -0.1914 (Model 1 - Model 2)
Dataset: Dataset_2 - AUC Difference: -0.0024 (Model 1 - Model 2)
Dataset: Dataset_20 - AUC Difference: -0.0606 (Model 1 - Model 2)
Dataset: Dataset_21 - AUC Difference: -0.3304 (Model 1 - Model 2)
Dataset: D

In [12]:
# MLP (Multi-Layer Perceptron)
from sklearn.neural_network import MLPClassifier
from utils import standardize_data, calculate_auc_score, save_predictions_to_csv

# Initialize lists to store results
mlp_models = []
mlp_auc_scores = []

# Loop through each dataset and train an MLP model
for i, dataset_name in enumerate(dataset_names):
    # Split the dataset into training and testing sets
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(X_trains[i], y_trains[i], test_size=0.2, random_state=42)
    
    # Standardize the data (important for MLP)
    tmp_X_train, tmp_X_test = standardize_data(tmp_X_train, tmp_X_test)
    
    # Train MLP model
    mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
    mlp_model.fit(tmp_X_train, tmp_y_train.squeeze())
    mlp_models.append(mlp_model)
    
    # Calculate the AUC score for the current dataset
    auc_score = calculate_auc_score(mlp_model, tmp_X_test, tmp_y_test)
    mlp_auc_scores.append(auc_score)
    
    # Print the AUC score for the current dataset
    print(f"Dataset: {dataset_name} - MLP AUC Score: {auc_score:.4f}")
    
    # Predict probabilities on the test set and save results as binary (0/1)
    y_prob = mlp_model.predict_proba(tmp_X_test)[:, 1]
    save_predictions_to_csv(y_prob, dataset_name, folder_path='./Competition_data/')



Dataset: Dataset_1 - MLP AUC Score: 0.7286




Dataset: Dataset_10 - MLP AUC Score: 0.7506
Dataset: Dataset_11 - MLP AUC Score: 0.3714
Dataset: Dataset_12 - MLP AUC Score: 0.9762
Dataset: Dataset_13 - MLP AUC Score: 0.9100
Dataset: Dataset_14 - MLP AUC Score: 0.9867




Dataset: Dataset_15 - MLP AUC Score: 0.6733
Dataset: Dataset_16 - MLP AUC Score: 0.9995
Dataset: Dataset_17 - MLP AUC Score: 0.9111
Dataset: Dataset_18 - MLP AUC Score: 1.0000
Dataset: Dataset_19 - MLP AUC Score: 0.9912
Dataset: Dataset_2 - MLP AUC Score: 0.9995
Dataset: Dataset_20 - MLP AUC Score: 0.9212
Dataset: Dataset_21 - MLP AUC Score: 0.9604
Dataset: Dataset_22 - MLP AUC Score: 0.7908
Dataset: Dataset_23 - MLP AUC Score: 0.9140




Dataset: Dataset_24 - MLP AUC Score: 0.5964
Dataset: Dataset_25 - MLP AUC Score: 0.9074
Dataset: Dataset_26 - MLP AUC Score: 0.7769
Dataset: Dataset_27 - MLP AUC Score: 1.0000
Dataset: Dataset_28 - MLP AUC Score: 0.8296
Dataset: Dataset_29 - MLP AUC Score: 0.9211
Dataset: Dataset_3 - MLP AUC Score: 0.8125
Dataset: Dataset_30 - MLP AUC Score: 0.6699
Dataset: Dataset_31 - MLP AUC Score: 0.7879
Dataset: Dataset_32 - MLP AUC Score: 0.8099
Dataset: Dataset_33 - MLP AUC Score: 1.0000
Dataset: Dataset_34 - MLP AUC Score: 0.8371




Dataset: Dataset_35 - MLP AUC Score: 0.8220
Dataset: Dataset_36 - MLP AUC Score: 0.9121
Dataset: Dataset_37 - MLP AUC Score: 0.8117
Dataset: Dataset_38 - MLP AUC Score: 0.7510
Dataset: Dataset_39 - MLP AUC Score: 0.9916
Dataset: Dataset_4 - MLP AUC Score: 0.6684




Dataset: Dataset_40 - MLP AUC Score: 0.8220
Dataset: Dataset_41 - MLP AUC Score: 0.9435
Dataset: Dataset_42 - MLP AUC Score: 0.9371
Dataset: Dataset_43 - MLP AUC Score: 0.8099
Dataset: Dataset_44 - MLP AUC Score: 1.0000
Dataset: Dataset_45 - MLP AUC Score: 1.0000
Dataset: Dataset_46 - MLP AUC Score: 0.9079
Dataset: Dataset_47 - MLP AUC Score: 0.6071
Dataset: Dataset_48 - MLP AUC Score: 0.9469
Dataset: Dataset_49 - MLP AUC Score: 0.9651
Dataset: Dataset_5 - MLP AUC Score: 0.9300
Dataset: Dataset_6 - MLP AUC Score: 0.9694
Dataset: Dataset_7 - MLP AUC Score: 0.9638
Dataset: Dataset_8 - MLP AUC Score: 0.8036
Dataset: Dataset_9 - MLP AUC Score: 0.8798
