In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

from utils import save_predictions_to_csv, standardize_data, calculate_auc_score, compare_auc_scores
from natsort import natsorted
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

In [2]:
#Load datasets
dataset_names=[]
X_trains=[]
y_trains=[]
X_tests=[]

for folder_name in natsorted(os.listdir("./Competition_data")):
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/X_train.csv",header=0))
    y_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/y_train.csv",header=0))
    X_tests.append(pd.read_csv(f"./Competition_data/{folder_name}/X_test.csv",header=0))

In [None]:
# 用於存儲每次訓練的驗證 AUC
validation_auc_scores = []

# 假設數據包含數值型和類別型特徵
for i in range(len(X_trains)):

    # 數據加載
    X_train = X_trains[i]
    y_train = y_trains[i].values.ravel()
    X_test = X_tests[i]

    # 區分數值型和類別型特徵
    numeric_features = X_train.select_dtypes(include=['float']).columns
    categorical_features = X_train.select_dtypes(include=['int']).columns

    # 定義數值型和類別型的處理器
    numeric_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # 使用 ColumnTransformer 合併處理器
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    # 平衡數據
    smote = BorderlineSMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # 對數據進行預處理
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    # K-fold 交叉驗證
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    test_predictions_all_folds = []
    fold_auc_scores = []

    # 使用 5 折 K-fold
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):

        X_train_fold, y_train_fold = X_train[train_idx], y_train[train_idx]
        X_val_fold, y_val_fold = X_train[val_idx], y_train[val_idx]

        # 定義基模型
        rf = RandomForestClassifier(n_estimators=500, max_depth=50, random_state=43)
        mlp = MLPClassifier(hidden_layer_sizes=(100,50), max_iter=400, random_state=44)
        svm = SVC(kernel='rbf', probability=True, random_state=45)
        xgb = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=46)
        lgbm = LGBMClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=47)

        # 堆疊模型
        stacking_model = StackingClassifier(
            estimators=[
                ('rf', rf),
                ('mlp',mlp),
                ('svm', svm),
                ('xgb', xgb),
                ('lgbm',lgbm),
            ],
            final_estimator=LogisticRegression(),
            stack_method='predict_proba',
            cv=3,
            n_jobs=-1
        )

        # 訓練堆疊分類器
        stacking_model.fit(X_train_fold, y_train_fold)

        # 驗證集預測
        y_val_pred = stacking_model.predict_proba(X_val_fold)[:, 1]
        val_auc = roc_auc_score(y_val_fold, y_val_pred)
        print(f"Fold {fold + 1} AUC: {val_auc:.4f}")
        fold_auc_scores.append(val_auc)

        # 測試集預測
        y_test_pred = stacking_model.predict_proba(X_test)[:, 1]
        test_predictions_all_folds.append(y_test_pred)

    # 計算加權測試集預測
    total_auc = sum(fold_auc_scores)
    fold_weights = [auc / total_auc for auc in fold_auc_scores]
    test_predictions_final = np.average(test_predictions_all_folds, axis=0, weights=fold_weights)

    # 計算平均驗證 AUC
    avg_val_auc = np.mean(fold_auc_scores)
    print(f"Average AUC for {i+1}'th Dataset: {avg_val_auc:.4f}")
    validation_auc_scores.append(avg_val_auc)

    # 儲存測試結果 CSV
    df = pd.DataFrame(test_predictions_final, columns=['y_predict_proba'])
    df.to_csv(f'./Competition_data/{dataset_names[i]}/y_predict.csv', index=False, header=True)

# 儲存 AUC 分數為 CSV 文件
auc_list = pd.DataFrame(validation_auc_scores, columns=["Validation AUC"])
auc_list.to_csv('./validation_auc_scores.csv', index_label='Dataset_Index', header=True)

Fold 1 AUC: 0.9330
Fold 2 AUC: 0.9094
Fold 3 AUC: 0.9045
Fold 4 AUC: 0.9194
Fold 5 AUC: 0.9324
Average AUC for 1'th Dataset: 0.9197
Fold 1 AUC: 1.0000
Fold 2 AUC: 1.0000
Fold 3 AUC: 1.0000
Fold 4 AUC: 1.0000
Fold 5 AUC: 1.0000
Average AUC for 2'th Dataset: 1.0000
Fold 1 AUC: 0.6531
Fold 2 AUC: 0.8571
Fold 3 AUC: 0.7551
Fold 4 AUC: 1.0000
Fold 5 AUC: 0.8980
Average AUC for 3'th Dataset: 0.8327
Fold 1 AUC: 0.9913
Fold 2 AUC: 0.9938
Fold 3 AUC: 0.9969
Fold 4 AUC: 0.9984
Fold 5 AUC: 0.9993
Average AUC for 4'th Dataset: 0.9959
Fold 1 AUC: 0.9997
Fold 2 AUC: 1.0000
Fold 3 AUC: 1.0000
Fold 4 AUC: 1.0000
Fold 5 AUC: 0.9842
Average AUC for 5'th Dataset: 0.9968
Fold 1 AUC: 0.9947
Fold 2 AUC: 1.0000
Fold 3 AUC: 0.9968
Fold 4 AUC: 0.9957
Fold 5 AUC: 1.0000
Average AUC for 6'th Dataset: 0.9974
Fold 1 AUC: 0.9988
Fold 2 AUC: 0.9988
Fold 3 AUC: 1.0000
Fold 4 AUC: 1.0000
Fold 5 AUC: 0.9985
Average AUC for 7'th Dataset: 0.9992
Fold 1 AUC: 0.9483
Fold 2 AUC: 0.8800
Fold 3 AUC: 0.9514
Fold 4 AUC: 0.9497
