# Read All Dataset CSVs

In [2]:
import os
import csv
import pandas as pd
import numpy as np

In [3]:
# Root folder that holds one sub-folder per dataset
# (each with X_train.csv, y_train.csv and X_test.csv).
data_root = "./Competition_data"

dataset_names = []
X_trains = []
y_trains = []
X_tests = []

# os.listdir returns entries in arbitrary order and also lists stray files or
# hidden folders (e.g. .DS_Store, .ipynb_checkpoints). Sort for a reproducible
# dataset order and skip anything that is not a visible directory, otherwise
# read_csv would fail on a path that does not exist.
for folder_name in sorted(os.listdir(data_root)):
    folder_path = os.path.join(data_root, folder_name)
    if folder_name.startswith(".") or not os.path.isdir(folder_path):
        continue
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(os.path.join(folder_path, "X_train.csv"), header=0))
    y_trains.append(pd.read_csv(os.path.join(folder_path, "y_train.csv"), header=0))
    X_tests.append(pd.read_csv(os.path.join(folder_path, "X_test.csv"), header=0))


# Preview the shapes of (at most) the first five datasets as a sanity check.
for i in range(min(5, len(dataset_names))):
    print(f"Dataset: {dataset_names[i]}")
    print(f"X_train shape: {X_trains[i].shape}")
    print(f"y_train shape: {y_trains[i].shape}")
    print(f"X_test shape: {X_tests[i].shape}")
    print("-" * 30)

Dataset: Dataset_1
X_train shape: (444, 20)
y_train shape: (444, 1)
X_test shape: (296, 20)
------------------------------
Dataset: Dataset_10
X_train shape: (467, 11)
y_train shape: (467, 1)
X_test shape: (312, 11)
------------------------------
Dataset: Dataset_11
X_train shape: (58, 62)
y_train shape: (58, 1)
X_test shape: (39, 62)
------------------------------
Dataset: Dataset_12
X_train shape: (154, 5)
y_train shape: (154, 1)
X_test shape: (104, 5)
------------------------------
Dataset: Dataset_13
X_train shape: (181, 54)
y_train shape: (181, 1)
X_test shape: (122, 54)
------------------------------


## Data Preprocessing & Feature Engineering

In [4]:
## your code here
# Summary of datasets loaded
def summarize_datasets(dataset_names, X_trains, y_trains, X_tests, verbose=False):
    """
    Optionally print a summary of the loaded datasets.

    By default nothing is printed (the summary output was previously
    disabled); pass ``verbose=True`` to print the dataset count followed by
    the shapes of every dataset.

    Parameters:
    dataset_names: List of dataset names
    X_trains: List of training feature matrices (each must expose ``.shape``)
    y_trains: List of training label arrays (each must expose ``.shape``)
    X_tests: List of testing feature matrices (each must expose ``.shape``)
    verbose: When True, print the summary; when False (default) stay silent

    Returns:
    None
    """
    if not verbose:
        return

    print(f"Total number of datasets loaded: {len(dataset_names)}\n")
    for i in range(len(dataset_names)):
        print(f"Dataset: {dataset_names[i]}")
        print(f"X_train shape: {X_trains[i].shape}")
        print(f"y_train shape: {y_trains[i].shape}")
        print(f"X_test shape: {X_tests[i].shape}")
        print("-" * 30)

# Run the dataset summary helper over everything loaded above.
summarize_datasets(dataset_names, X_trains, y_trains, X_tests)

## Compare AUC score function

In [45]:
def compare_auc(model1_auc_scores, model2_auc_scores, dataset_names):
    """
    Report, per dataset, how much the first model's AUC exceeds the second's.

    One line is printed for every dataset and the same numbers are returned.

    Parameters:
    model1_auc_scores (list): AUC of the first model, one entry per dataset.
    model2_auc_scores (list): AUC of the second model, one entry per dataset.
    dataset_names (list): Dataset names, in the same order as the scores.

    Returns:
    differences (dict): Maps each dataset name to
        ``model1_auc_scores[i] - model2_auc_scores[i]``.
    """
    differences = {}

    for idx, name in enumerate(dataset_names):
        # Positive delta: model 1 scored higher on this dataset.
        delta = model1_auc_scores[idx] - model2_auc_scores[idx]
        differences[name] = delta
        print(f"Dataset: {name} - AUC Difference: {delta:.4f} (Model 1 - Model 2)")

    return differences

## Train/test split & model building
You can select an appropriate model and perform corresponding hyperparameter tuning.

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score

In [43]:
# KNN baseline: one 3-nearest-neighbour classifier per dataset

models = []
auc_scores = []  # Hold-out AUC score for each dataset, in dataset_names order

for idx in range(len(dataset_names)):
    # Keep 20% of the labelled rows aside to score the classifier
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_trains[idx], y_trains[idx], test_size=0.2, random_state=42
    )

    # y is loaded as a single-column frame; squeeze() hands sklearn a 1-D target
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_fit, y_fit.squeeze())

    # Column 1 of predict_proba is the probability of the model's second class
    val_prob = knn.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, val_prob)

    models.append(knn)
    auc_scores.append(val_auc)

# Optional: per-dataset overview of the hold-out AUC scores
print("\nSummary of AUC Scores:")
for name, score in zip(dataset_names, auc_scores):
    print(f"{name}: {score:.4f}")


Summary of AUC Scores:
Dataset_1: 0.6343
Dataset_10: 0.7294
Dataset_11: 0.4571
Dataset_12: 0.8500
Dataset_13: 0.9167
Dataset_14: 0.9483
Dataset_15: 0.6258
Dataset_16: 0.9696
Dataset_17: 0.9000
Dataset_18: 0.7500
Dataset_19: 0.9766
Dataset_2: 0.9696
Dataset_20: 0.9273
Dataset_21: 0.8397
Dataset_22: 0.7974
Dataset_23: 0.9020
Dataset_24: 0.6349
Dataset_25: 0.7963
Dataset_26: 0.7769
Dataset_27: 1.0000
Dataset_28: 0.8194
Dataset_29: 0.8735
Dataset_3: 0.5625
Dataset_30: 0.7291
Dataset_31: 0.6061
Dataset_32: 0.6807
Dataset_33: 0.9834
Dataset_34: 0.7557
Dataset_35: 0.8054
Dataset_36: 0.9176
Dataset_37: 0.8704
Dataset_38: 0.6620
Dataset_39: 0.9547
Dataset_4: 0.5694
Dataset_40: 0.8054
Dataset_41: 0.9235
Dataset_42: 1.0000
Dataset_43: 0.6807
Dataset_44: 1.0000
Dataset_45: 1.0000
Dataset_46: 0.9000
Dataset_47: 0.4464
Dataset_48: 0.8937
Dataset_49: 0.9809
Dataset_5: 0.7467
Dataset_6: 0.9546
Dataset_7: 0.7751
Dataset_8: 0.6909
Dataset_9: 0.8822


## Inference Model

In [22]:
# Score every dataset's competition test features with its fitted model and
# wrap the second-class probabilities in a single-column DataFrame.
y_predicts = [
    pd.DataFrame(
        models[idx].predict_proba(X_tests[idx])[:, 1],
        columns=['y_predict_proba'],
    )
    for idx in range(len(dataset_names))
]
    

## Save result

In [23]:
# Write each dataset's prediction frame into that dataset's own folder.
for idx, dataset_name in enumerate(dataset_names):
    out_path = f'./Competition_data/{dataset_name}/y_predict.csv'
    y_predicts[idx].to_csv(out_path, index=False, header=True)

## Try other different models here

In [33]:
# Logistic regression

from sklearn.linear_model import LogisticRegression

# Per-dataset fitted models, hold-out AUC scores, and competition predictions
logistic_models = []
logistic_auc_scores = []
logistic_y_predicts = []  # One frame of X_test probabilities per dataset

# Loop through each dataset and train a Logistic Regression model
for i in range(len(dataset_names)):
    # Hold out 20% of the labelled rows to estimate the AUC
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
        X_trains[i], y_trains[i], test_size=0.2, random_state=42
    )

    # max_iter is raised to 1000 to give the solver room to converge
    logistic_model = LogisticRegression(max_iter=1000)
    # y is loaded as a single-column frame; squeeze() hands sklearn a 1-D target
    logistic_model.fit(tmp_X_train, tmp_y_train.squeeze())

    # Hold-out probabilities (column 1 = the model's second class) -> AUC
    tmp_y_prob = logistic_model.predict_proba(tmp_X_test)[:, 1]
    auc = roc_auc_score(tmp_y_test, tmp_y_prob)

    # Store the model and AUC score
    logistic_models.append(logistic_model)
    logistic_auc_scores.append(auc)

    # The frames stored here are later written to y_predict.csv, so they must
    # hold predictions for the competition test features (X_tests[i]), not for
    # the hold-out slice of the training data.
    test_prob = logistic_model.predict_proba(X_tests[i])[:, 1]
    logistic_y_predicts.append(pd.DataFrame(test_prob, columns=['y_predict_proba']))

# Optional: Print a summary of all Logistic Regression AUC scores
print("\nSummary of Logistic Regression AUC Scores:")
for i, auc in enumerate(logistic_auc_scores):
    print(f"{dataset_names[i]}: {auc:.4f}")



Summary of Logistic Regression AUC Scores:
Dataset_1: 0.7313
Dataset_10: 0.8033
Dataset_11: 0.3714
Dataset_12: 0.6667
Dataset_13: 0.9300
Dataset_14: 0.9900
Dataset_15: 0.6608
Dataset_16: 1.0000
Dataset_17: 0.9556
Dataset_18: 1.0000
Dataset_19: 0.9938
Dataset_2: 1.0000
Dataset_20: 0.9455
Dataset_21: 0.9476
Dataset_22: 0.7059
Dataset_23: 0.9096
Dataset_24: 0.6365
Dataset_25: 0.5062
Dataset_26: 0.7885
Dataset_27: 1.0000
Dataset_28: 0.8409
Dataset_29: 0.8865
Dataset_3: 0.8125
Dataset_30: 0.8195
Dataset_31: 0.7727
Dataset_32: 0.8295
Dataset_33: 0.9980
Dataset_34: 0.8614
Dataset_35: 0.8306
Dataset_36: 1.0000
Dataset_37: 0.8951
Dataset_38: 0.8550
Dataset_39: 0.9919
Dataset_4: 0.3684
Dataset_40: 0.8306
Dataset_41: 0.9342
Dataset_42: 1.0000
Dataset_43: 0.8295
Dataset_44: 1.0000
Dataset_45: 1.0000
Dataset_46: 0.9365
Dataset_47: 0.5000
Dataset_48: 0.9943
Dataset_49: 0.9766
Dataset_5: 0.9233
Dataset_6: 0.9327
Dataset_7: 0.9808
Dataset_8: 0.9382
Dataset_9: 0.9038


In [28]:
# Write the frames held in logistic_y_predicts to each dataset's y_predict.csv.
for idx, dataset_name in enumerate(dataset_names):
    out_path = f'./Competition_data/{dataset_name}/y_predict.csv'
    logistic_y_predicts[idx].to_csv(out_path, index=False, header=True)

In [37]:
# SVM (degree-2 polynomial kernel)
from sklearn.svm import SVC

# Per-dataset fitted models, hold-out AUC scores, and competition predictions
svm_models = []
svm_auc_scores = []
svm_y_predicts = []  # One frame of X_test probabilities per dataset

# Loop through each dataset and train an SVM model
for i in range(len(dataset_names)):
    # Hold out 20% of the labelled rows to estimate the AUC
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
        X_trains[i], y_trains[i], test_size=0.2, random_state=42
    )

    # probability=True enables predict_proba; its calibration step shuffles
    # the data internally, so random_state is fixed to keep runs reproducible.
    svm_model = SVC(kernel='poly', degree=2, probability=True, random_state=42)
    # y is loaded as a single-column frame; squeeze() hands sklearn a 1-D target
    svm_model.fit(tmp_X_train, tmp_y_train.squeeze())

    # Hold-out probabilities (column 1 = the model's second class) -> AUC
    tmp_y_prob = svm_model.predict_proba(tmp_X_test)[:, 1]
    auc = roc_auc_score(tmp_y_test, tmp_y_prob)

    # Store the model and AUC score
    svm_models.append(svm_model)
    svm_auc_scores.append(auc)

    # Store predictions for the competition test features (X_tests[i]) rather
    # than for the hold-out slice, so the frames are usable as a submission.
    test_prob = svm_model.predict_proba(X_tests[i])[:, 1]
    svm_y_predicts.append(pd.DataFrame(test_prob, columns=['y_predict_proba']))

# Optional: Print a summary of all SVM AUC scores
print("\nSummary of SVM AUC Scores:")
for i, auc in enumerate(svm_auc_scores):
    print(f"{dataset_names[i]}: {auc:.4f}")


Summary of SVM AUC Scores:
Dataset_1: 0.6235
Dataset_10: 0.7534
Dataset_11: 0.4857
Dataset_12: 0.8667
Dataset_13: 0.9433
Dataset_14: 0.9800
Dataset_15: 0.6608
Dataset_16: 0.9938
Dataset_17: 0.9037
Dataset_18: 1.0000
Dataset_19: 0.8236
Dataset_2: 0.9938
Dataset_20: 0.8485
Dataset_21: 0.9289
Dataset_22: 0.7582
Dataset_23: 0.9167
Dataset_24: 0.6139
Dataset_25: 0.8889
Dataset_26: 0.7808
Dataset_27: 1.0000
Dataset_28: 0.5876
Dataset_29: 0.8108
Dataset_3: 0.3125
Dataset_30: 0.8026
Dataset_31: 0.6667
Dataset_32: 0.7286
Dataset_33: 0.9766
Dataset_34: 0.8114
Dataset_35: 0.7043
Dataset_36: 0.9927
Dataset_37: 0.8395
Dataset_38: 0.7460
Dataset_39: 0.9806
Dataset_4: 0.5337
Dataset_40: 0.7043
Dataset_41: 0.8943
Dataset_42: 0.6434
Dataset_43: 0.7286
Dataset_44: 1.0000
Dataset_45: 0.9333
Dataset_46: 0.9333
Dataset_47: 0.5357
Dataset_48: 0.9867
Dataset_49: 0.9604
Dataset_5: 0.8100
Dataset_6: 0.9551
Dataset_7: 0.9659
Dataset_8: 0.8836
Dataset_9: 0.7885


In [40]:
# MLP
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Per-dataset fitted models, hold-out AUC scores, and competition predictions
nn_models = []
nn_auc_scores = []
nn_y_predicts = []  # One frame of X_test probabilities per dataset

# Loop through each dataset and train a Neural Network model
for i in range(len(dataset_names)):
    # Hold out 20% of the labelled rows to estimate the AUC
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
        X_trains[i], y_trains[i], test_size=0.2, random_state=42
    )

    # Bundle the scaler with the network: the scaler is still fitted on the
    # training split only, and the stored model can now be applied directly to
    # raw (unscaled) features such as X_tests[i].
    nn_model = make_pipeline(
        StandardScaler(),
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, learning_rate_init=0.01, random_state=42),
    )
    # y is loaded as a single-column frame; squeeze() hands sklearn a 1-D target
    nn_model.fit(tmp_X_train, tmp_y_train.squeeze())

    # Hold-out probabilities (column 1 = the model's second class) -> AUC
    tmp_y_prob = nn_model.predict_proba(tmp_X_test)[:, 1]
    auc = roc_auc_score(tmp_y_test, tmp_y_prob)

    # Store the model and AUC score
    nn_models.append(nn_model)
    nn_auc_scores.append(auc)

    # Store predictions for the competition test features (X_tests[i]) rather
    # than for the hold-out slice, so the frames are usable as a submission.
    test_prob = nn_model.predict_proba(X_tests[i])[:, 1]
    nn_y_predicts.append(pd.DataFrame(test_prob, columns=['y_predict_proba']))

# Optional: Print a summary of all Neural Network AUC scores
print("\nSummary of Neural Network AUC Scores:")
for i, auc in enumerate(nn_auc_scores):
    print(f"{dataset_names[i]}: {auc:.4f}")


Summary of Neural Network AUC Scores:
Dataset_1: 0.7449
Dataset_10: 0.7436
Dataset_11: 0.3143
Dataset_12: 0.9762
Dataset_13: 0.9500
Dataset_14: 0.9900
Dataset_15: 0.6925
Dataset_16: 0.9995
Dataset_17: 0.9407
Dataset_18: 1.0000
Dataset_19: 0.9956
Dataset_2: 0.9995
Dataset_20: 0.9212
Dataset_21: 0.9382
Dataset_22: 0.7386
Dataset_23: 0.9184
Dataset_24: 0.6016
Dataset_25: 0.9383
Dataset_26: 0.7731
Dataset_27: 1.0000
Dataset_28: 0.7866
Dataset_29: 0.9200
Dataset_3: 0.7917
Dataset_30: 0.6937
Dataset_31: 0.8030
Dataset_32: 0.8012
Dataset_33: 1.0000
Dataset_34: 0.8429
Dataset_35: 0.8371
Dataset_36: 0.9194
Dataset_37: 0.8210
Dataset_38: 0.7480
Dataset_39: 0.9913
Dataset_4: 0.6276
Dataset_40: 0.8371
Dataset_41: 0.9398
Dataset_42: 0.9650
Dataset_43: 0.8012
Dataset_44: 1.0000
Dataset_45: 1.0000
Dataset_46: 0.9111
Dataset_47: 0.6071
Dataset_48: 0.9412
Dataset_49: 0.9693
Dataset_5: 0.8900
Dataset_6: 0.9510
Dataset_7: 0.9787
Dataset_8: 0.7891
Dataset_9: 0.8702


In [42]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# Per-dataset fitted models, hold-out AUC scores, and competition predictions
rf_models = []
rf_auc_scores = []
rf_y_predicts = []  # One frame of X_test probabilities per dataset

# Loop through each dataset and train a Random Forest model
for i in range(len(dataset_names)):
    # Hold out 20% of the labelled rows to estimate the AUC
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
        X_trains[i], y_trains[i], test_size=0.2, random_state=42
    )

    # Fixed random_state keeps the forest reproducible between runs
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    # y is loaded as a single-column frame; squeeze() hands sklearn a 1-D target
    rf_model.fit(tmp_X_train, tmp_y_train.squeeze())

    # Hold-out probabilities (column 1 = the model's second class) -> AUC
    tmp_y_prob = rf_model.predict_proba(tmp_X_test)[:, 1]
    auc = roc_auc_score(tmp_y_test, tmp_y_prob)

    # Store the model and AUC score
    rf_models.append(rf_model)
    rf_auc_scores.append(auc)

    # The frames stored here are later written to y_predict.csv, so they must
    # hold predictions for the competition test features (X_tests[i]), not for
    # the hold-out slice of the training data.
    test_prob = rf_model.predict_proba(X_tests[i])[:, 1]
    rf_y_predicts.append(pd.DataFrame(test_prob, columns=['y_predict_proba']))

# Optional: Print a summary of all Random Forest AUC scores
print("\nSummary of Random Forest AUC Scores:")
for i, auc in enumerate(rf_auc_scores):
    print(f"{dataset_names[i]}: {auc:.4f}")


Summary of Random Forest AUC Scores:
Dataset_1: 0.8284
Dataset_10: 0.7744
Dataset_11: 0.5143
Dataset_12: 1.0000
Dataset_13: 0.9367
Dataset_14: 1.0000
Dataset_15: 0.7412
Dataset_16: 0.9995
Dataset_17: 0.9185
Dataset_18: 1.0000
Dataset_19: 0.9907
Dataset_2: 0.9995
Dataset_20: 0.9273
Dataset_21: 0.9878
Dataset_22: 0.8627
Dataset_23: 0.9375
Dataset_24: 0.6088
Dataset_25: 0.9938
Dataset_26: 0.8038
Dataset_27: 1.0000
Dataset_28: 0.8519
Dataset_29: 0.9514
Dataset_3: 0.8750
Dataset_30: 0.8192
Dataset_31: 0.8106
Dataset_32: 0.7224
Dataset_33: 1.0000
Dataset_34: 0.8657
Dataset_35: 0.8500
Dataset_36: 0.9780
Dataset_37: 0.8765
Dataset_38: 0.7865
Dataset_39: 0.9951
Dataset_4: 0.6255
Dataset_40: 0.8500
Dataset_41: 0.9972
Dataset_42: 1.0000
Dataset_43: 0.7224
Dataset_44: 1.0000
Dataset_45: 1.0000
Dataset_46: 0.9683
Dataset_47: 0.5000
Dataset_48: 1.0000
Dataset_49: 0.9858
Dataset_5: 0.5800
Dataset_6: 0.9689
Dataset_7: 0.9606
Dataset_8: 0.9509
Dataset_9: 0.8702


In [52]:
# Write the frames held in rf_y_predicts to each dataset's y_predict.csv.
for idx, dataset_name in enumerate(dataset_names):
    out_path = f'./Competition_data/{dataset_name}/y_predict.csv'
    rf_y_predicts[idx].to_csv(out_path, index=False, header=True)

In [51]:
# Per-dataset AUC gap between Random Forest (model 1) and Logistic Regression
# (model 2). The helper defined in this notebook is named compare_auc; the old
# name compare_auc_scores is not defined and raises NameError on a fresh kernel.
a = compare_auc(rf_auc_scores, logistic_auc_scores, dataset_names)
print(a)

Dataset: Dataset_1 - AUC Difference: 0.0970 (Model 1 - Model 2)
Dataset: Dataset_10 - AUC Difference: -0.0289 (Model 1 - Model 2)
Dataset: Dataset_11 - AUC Difference: 0.1429 (Model 1 - Model 2)
Dataset: Dataset_12 - AUC Difference: 0.3333 (Model 1 - Model 2)
Dataset: Dataset_13 - AUC Difference: 0.0067 (Model 1 - Model 2)
Dataset: Dataset_14 - AUC Difference: 0.0100 (Model 1 - Model 2)
Dataset: Dataset_15 - AUC Difference: 0.0804 (Model 1 - Model 2)
Dataset: Dataset_16 - AUC Difference: -0.0005 (Model 1 - Model 2)
Dataset: Dataset_17 - AUC Difference: -0.0370 (Model 1 - Model 2)
Dataset: Dataset_18 - AUC Difference: 0.0000 (Model 1 - Model 2)
Dataset: Dataset_19 - AUC Difference: -0.0031 (Model 1 - Model 2)
Dataset: Dataset_2 - AUC Difference: -0.0005 (Model 1 - Model 2)
Dataset: Dataset_20 - AUC Difference: -0.0182 (Model 1 - Model 2)
Dataset: Dataset_21 - AUC Difference: 0.0402 (Model 1 - Model 2)
Dataset: Dataset_22 - AUC Difference: 0.1569 (Model 1 - Model 2)
Dataset: Dataset_23 -