In [103]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from utils import save_predictions_to_csv, standardize_data, calculate_auc_score, compare_auc_scores
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Add, BatchNormalization, Activation, Input
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, LearningRateScheduler
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [104]:
#Load datasets
dataset_names=[]
X_trains=[]
y_trains=[]
X_tests=[]
for folder_name in os.listdir("./Competition_data"):
    # print(folder_name)
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/X_train.csv",header=0))
    y_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/y_train.csv",header=0))
    X_tests.append(pd.read_csv(f"./Competition_data/{folder_name}/X_test.csv",header=0))


for i in range(min(5, len(dataset_names))):
    print(f"Dataset: {dataset_names[i]}")
    print(f"X_train shape: {X_trains[i].shape}")
    print(f"y_train shape: {y_trains[i].shape}")
    print(f"X_test shape: {X_tests[i].shape}")
    print("-" * 30)

Dataset: Dataset_1
X_train shape: (444, 20)
y_train shape: (444, 1)
X_test shape: (296, 20)
------------------------------
Dataset: Dataset_10
X_train shape: (467, 11)
y_train shape: (467, 1)
X_test shape: (312, 11)
------------------------------
Dataset: Dataset_11
X_train shape: (58, 62)
y_train shape: (58, 1)
X_test shape: (39, 62)
------------------------------
Dataset: Dataset_12
X_train shape: (154, 5)
y_train shape: (154, 1)
X_test shape: (104, 5)
------------------------------
Dataset: Dataset_13
X_train shape: (181, 54)
y_train shape: (181, 1)
X_test shape: (122, 54)
------------------------------


In [105]:
# 建立資料夾來儲存模型和結果
if not os.path.exists('./training_plots'):
    os.makedirs('./training_plots')

# 數據增強：加入隨機噪聲
def add_noise(data, noise_level=0.5):
    noise = noise_level * np.random.randn(*data.shape)
    return data + noise

# 設置 5 折分層交叉驗證
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 定義殘差塊
def residual_block(x, units):
    shortcut = x  # 保存輸入以進行殘差連接
    x = Dense(units, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dense(units)(x)
    x = BatchNormalization()(x)
    x = Add()([x, shortcut])  # 殘差連接
    x = Activation('relu')(x)
    return x

def adjust_learning_rate(epoch, lr):
    # 初始學習率
    if epoch == 100:
        return lr * 0.1  # 在第 100 個 epoch 將學習率降低到原來的 10%
    elif epoch == 150:
        return lr * 0.1  # 在第 150 個 epoch 再次將學習率降低到原來的 10%
    return lr  # 其他 epoch 保持原來的學習率

# 初始化 LearningRateScheduler 回調
lr_scheduler = LearningRateScheduler(adjust_learning_rate)

In [None]:
# 使用 Residual Network 進行交叉驗證
for i in range(len(X_trains)):
    # Get current dataset
    X_train = X_trains[i].values
    y_train = y_trains[i].values.ravel()
    X_test = X_tests[i].values

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # 數據增強
    X_train = add_noise(X_train)

    train_predictions = []
    test_predictions = []

    # 分層交叉驗證
    for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train), 1):
        print(f"Training fold {fold} for dataset {dataset_names[i]}...")

        # 分割數據
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        # 建立 ResNet 模型
        input_dim = X_train_fold.shape[1]
        inputs = Input(shape=(input_dim,))
        x = Dense(64, activation='relu')(inputs)
        x = BatchNormalization()(x)
        x = Dropout(0.5) (x)

        x = residual_block(x,64)

        x = Dense(32, activation='relu')(x)
        x = BatchNormalization() (x)
        x = Dropout(0.5) (x)

        outputs = Dense(1, activation='sigmoid')(x)

        # 定義模型
        model = Model(inputs, outputs)

        # 編譯模型
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        # 訓練模型
        history = model.fit(X_train_fold, y_train_fold, validation_data=(X_val_fold, y_val_fold),
                            epochs=200, batch_size=16, verbose=1, callbacks=[lr_scheduler])

        # 對整個 X_train 進行預測，並轉換為二進制分類（0 或 1）
        y_train_pred_binary = (model.predict(X_train) > 0.5).astype(int)
        train_predictions.append(y_train_pred_binary)

        y_test_pred_binary = (model.predict(X_test) > 0.5).astype(int)
        test_predictions.append(y_test_pred_binary)

        # 驗證模型
        val_loss, val_accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # 儲存每一個 fold 的訓練和驗證圖表
        plt.figure(figsize=(12, 5))

        # 子圖 1：Loss 圖表
        plt.subplot(1, 2, 1)
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title(f'Loss for {dataset_names[i]} - Fold {fold}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend(loc='upper right')

        # 子圖 2：Accuracy 圖表
        plt.subplot(1, 2, 2)
        plt.plot(history.history['accuracy'], label='Training Accuracy')
        plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
        plt.title(f'Accuracy for {dataset_names[i]} - Fold {fold}')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend(loc='upper right')

        # 儲存圖表
        plt.suptitle(f'Training and Validation History for {dataset_names[i]} - Fold {fold}')
        plt.savefig(f'./training_plots/training_plot_{dataset_names[i]}_fold_{fold}.png')
        plt.close()

    # 使用多數決來決定最終的預測結果
    train_predictions = np.array(train_predictions)  # 轉換為 numpy array 形狀 (5, X_train樣本數, 1)
    final_prediction = (np.sum(train_predictions, axis=0) >= 3).astype(int)  # 多數決策略

    test_predictions = np.array(test_predictions)
    final_prediction_test = (np.sum(test_predictions, axis=0) >= 3).astype(int)

    # Create DataFrame and save to CSV
    df = pd.DataFrame(final_prediction_test, columns=['y_predict_proba'])
    df.to_csv(f'./Competition_data/{dataset_names[i]}/y_predict.csv', index=False, header=True)

    # 計算 AUC 分數
    try:
        auc_score = roc_auc_score(y_train, final_prediction)
        print(f"AUC score for training dataset {dataset_names[i]}: {auc_score:.4f}")
    except ValueError:
        auc_score = 'N/A'
        print(f"Cannot calculate AUC for dataset {dataset_names[i]} (possibly only one class present)")

Training fold 1 for dataset Dataset_1...
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 