# Read All Dataset CSVs

In [2]:
import os
import csv
import pandas as pd
import numpy as np

In [4]:
# Load every competition dataset into four parallel lists: index i of
# dataset_names, X_trains, y_trains and X_tests all refer to the same folder.
dataset_names=[]
X_trains=[]
y_trains=[]
X_tests=[]
# os.listdir returns entries in arbitrary order, so sort them to make the
# dataset order (and therefore every index-based step below) reproducible.
for folder_name in sorted(os.listdir("./Competition_data")):
    # Skip hidden entries (e.g. .ipynb_checkpoints, .DS_Store) and plain files;
    # only real dataset folders contain the three expected CSVs.
    if folder_name.startswith(".") or not os.path.isdir(f"./Competition_data/{folder_name}"):
        continue
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/X_train.csv",header=0))
    y_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/y_train.csv",header=0))
    X_tests.append(pd.read_csv(f"./Competition_data/{folder_name}/X_test.csv",header=0))


# Sanity check: show the shapes of (at most) the first five datasets.
for i in range(min(5, len(dataset_names))):
    print(f"Dataset: {dataset_names[i]}")
    print(f"X_train shape: {X_trains[i].shape}")
    print(f"y_train shape: {y_trains[i].shape}")
    print(f"X_test shape: {X_tests[i].shape}")
    print("-" * 30)

Dataset: Dataset_1
X_train shape: (444, 20)
y_train shape: (444, 1)
X_test shape: (296, 20)
------------------------------
Dataset: Dataset_10
X_train shape: (467, 11)
y_train shape: (467, 1)
X_test shape: (312, 11)
------------------------------
Dataset: Dataset_11
X_train shape: (58, 62)
y_train shape: (58, 1)
X_test shape: (39, 62)
------------------------------
Dataset: Dataset_12
X_train shape: (154, 5)
y_train shape: (154, 1)
X_test shape: (104, 5)
------------------------------
Dataset: Dataset_13
X_train shape: (181, 54)
y_train shape: (181, 1)
X_test shape: (122, 54)
------------------------------


## Data Preprocessing & Feature Engineering

In [6]:
## your code here
# Summary of datasets loaded
def summarize_datasets(dataset_names, X_trains, y_trains, X_tests):
    """
    Report how many datasets were loaded and the shape of each split.

    Parameters:
    dataset_names: List of dataset names; its length drives the loop
    X_trains: List of training feature matrices (anything exposing `.shape`)
    y_trains: List of training label arrays (anything exposing `.shape`)
    X_tests: List of testing feature matrices (anything exposing `.shape`)

    Returns:
    None; everything is written to standard output.
    """
    separator = "-" * 30
    print(f"Total number of datasets loaded: {len(dataset_names)}\n")
    # The four lists are parallel, so the position of a name is also the
    # position of its train/test tables.
    for position, name in enumerate(dataset_names):
        print(f"Dataset: {name}")
        print(f"X_train shape: {X_trains[position].shape}")
        print(f"y_train shape: {y_trains[position].shape}")
        print(f"X_test shape: {X_tests[position].shape}")
        print(separator)

# Print the dataset count and the train/test shapes of every loaded dataset.
summarize_datasets(dataset_names, X_trains, y_trains, X_tests)

Total number of datasets loaded: 49

Dataset: Dataset_1
X_train shape: (444, 20)
y_train shape: (444, 1)
X_test shape: (296, 20)
------------------------------
Dataset: Dataset_10
X_train shape: (467, 11)
y_train shape: (467, 1)
X_test shape: (312, 11)
------------------------------
Dataset: Dataset_11
X_train shape: (58, 62)
y_train shape: (58, 1)
X_test shape: (39, 62)
------------------------------
Dataset: Dataset_12
X_train shape: (154, 5)
y_train shape: (154, 1)
X_test shape: (104, 5)
------------------------------
Dataset: Dataset_13
X_train shape: (181, 54)
y_train shape: (181, 1)
X_test shape: (122, 54)
------------------------------
Dataset: Dataset_14
X_train shape: (181, 57)
y_train shape: (181, 1)
X_test shape: (122, 57)
------------------------------
Dataset: Dataset_15
X_train shape: (368, 11)
y_train shape: (368, 1)
X_test shape: (246, 11)
------------------------------
Dataset: Dataset_16
X_train shape: (465, 24)
y_train shape: (465, 1)
X_test shape: (311, 24)
--------

## Train/Test Split & Build Model
Select an appropriate model and tune its hyperparameters.

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score

In [19]:
def KNN(X, y, test_size=0.3, n_neighbors=5):
    """
    Train a K-Nearest Neighbors (KNN) classifier on a random hold-out split
    and evaluate it using the ROC AUC score.

    Parameters:
    X: Feature matrix (numpy array or pandas DataFrame)
    y: Target labels. Either 1-D (numpy array or pandas Series) or a
       single-column 2-D array/DataFrame, which is flattened to 1-D.
    test_size: Proportion of the dataset to include in the test split (default is 0.3)
    n_neighbors: Number of neighbors to use for KNN (default is 5)

    Returns:
    auc_score: ROC AUC score for the held-out test split
    """
    # A label table read straight from a one-column CSV has shape (n, 1).
    # Flatten it so the classifier gets the 1-D target it expects instead of
    # emitting a column-vector DataConversionWarning.
    labels = np.asarray(y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        y = labels.ravel()

    # Split the data into training and testing sets (fixed seed => repeatable)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Initialize and train the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second entry of
    # knn.classes_. NOTE(review): this assumes binary labels with the
    # positive class sorting last (e.g. 0/1) -- confirm for every dataset.
    y_pred_proba = knn.predict_proba(X_test)[:, 1]

    # Calculate the ROC AUC score on the held-out split
    auc_score = roc_auc_score(y_test, y_pred_proba)

    return auc_score

In [20]:
# Smoke-test the KNN helper on the first dataset only.
testx = X_trains[0]
# Flatten the single-column label frame into the 1-D array the helper expects.
testy = y_trains[0].to_numpy().ravel()

KNN(testx, testy)

0.6910430839002267

In [21]:
# Fit one KNN model per dataset on an 80/20 hold-out split. models[i] belongs
# to dataset_names[i]; val_aucs[i] is that model's hold-out ROC AUC, kept so
# the validation split actually tells us something (it used to be discarded).
# NOTE(review): the stored models are trained on only 80% of the training
# rows; consider refitting on the full training set before inference.
models=[]
val_aucs=[]
for i in range(len(dataset_names)):
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(X_trains[i], y_trains[i], test_size=0.2, random_state=42)
    model = KNeighborsClassifier(n_neighbors=3)
    # ravel() always yields a 1-D label array; squeeze() would collapse a
    # one-row frame to a scalar.
    model.fit(tmp_X_train, tmp_y_train.to_numpy().ravel())
    # Column 1 = probability of the second class in model.classes_
    # (assumes binary labels -- TODO confirm).
    tmp_y_prob = model.predict_proba(tmp_X_test)[:, 1]
    auc = roc_auc_score(tmp_y_test, tmp_y_prob)
    val_aucs.append(auc)
    models.append(model)

if val_aucs:
    print(f"Mean hold-out ROC AUC over {len(val_aucs)} datasets: {np.mean(val_aucs):.4f}")



## Model Inference

In [22]:
# Score each dataset's test features with its own fitted model and keep the
# result as a one-column DataFrame, in the same order as dataset_names.
y_predicts=[]
for i, _ in enumerate(dataset_names):
    # Column 1 of predict_proba = probability of the second class in classes_.
    second_class_proba = models[i].predict_proba(X_tests[i])[:, 1]
    y_predicts.append(pd.DataFrame({'y_predict_proba': second_class_proba}))
    

## Save Results

In [23]:
# Write each prediction table next to its dataset's input CSVs.
for idx in range(len(dataset_names)):
    output_path = f'./Competition_data/{dataset_names[idx]}/y_predict.csv'
    # Keep the header row, drop the index column.
    y_predicts[idx].to_csv(output_path, index=False, header=True)