In [1]:
import csv
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils import save_predictions_to_csv, standardize_data, calculate_auc_score, compare_auc_scores

In [2]:
# Load every dataset found under ./Competition_data.
# Each sub-folder is expected to hold X_train.csv, y_train.csv and X_test.csv.
# The four lists below are index-aligned: position k in each list belongs to
# the dataset named dataset_names[k].
DATA_ROOT = "./Competition_data"

dataset_names = []
X_trains = []
y_trains = []
X_tests = []

# os.listdir() returns entries in arbitrary order, so sort for a stable,
# repeatable dataset order across machines and runs.
for folder_name in sorted(os.listdir(DATA_ROOT)):
    dataset_dir = os.path.join(DATA_ROOT, folder_name)
    # Skip stray files (.DS_Store, notebooks checkpoints, zip archives, ...)
    # that would otherwise make the read_csv calls below fail.
    if not os.path.isdir(dataset_dir):
        continue
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(os.path.join(dataset_dir, "X_train.csv"), header=0))
    y_trains.append(pd.read_csv(os.path.join(dataset_dir, "y_train.csv"), header=0))
    X_tests.append(pd.read_csv(os.path.join(dataset_dir, "X_test.csv"), header=0))

# Quick sanity check: show the shapes of (at most) the first five datasets.
for preview_name, preview_X_train, preview_y_train, preview_X_test in zip(
    dataset_names[:5], X_trains, y_trains, X_tests
):
    print(f"Dataset: {preview_name}")
    print(f"X_train shape: {preview_X_train.shape}")
    print(f"y_train shape: {preview_y_train.shape}")
    print(f"X_test shape: {preview_X_test.shape}")
    print("-" * 30)

Dataset: Dataset_1
X_train shape: (444, 20)
y_train shape: (444, 1)
X_test shape: (296, 20)
------------------------------
Dataset: Dataset_10
X_train shape: (467, 11)
y_train shape: (467, 1)
X_test shape: (312, 11)
------------------------------
Dataset: Dataset_11
X_train shape: (58, 62)
y_train shape: (58, 1)
X_test shape: (39, 62)
------------------------------
Dataset: Dataset_12
X_train shape: (154, 5)
y_train shape: (154, 1)
X_test shape: (104, 5)
------------------------------
Dataset: Dataset_13
X_train shape: (181, 54)
y_train shape: (181, 1)
X_test shape: (122, 54)
------------------------------


In [3]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import matplotlib.pyplot as plt



In [4]:
# Create the folders that will hold saved models and training plots.
# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs('./saved_models', exist_ok=True)
os.makedirs('./training_plots', exist_ok=True)

# Dictionary that collects the performance metrics of each trained model,
# keyed by dataset name.
model_performance = {}

In [7]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import StandardScaler
from utils import save_predictions_to_csv

# Create the folders that will hold saved models and training plots.
# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs('./saved_models', exist_ok=True)
os.makedirs('./training_plots', exist_ok=True)

# Dictionary that collects the performance metrics of each trained model,
# keyed by dataset name. Reset here so re-running this cell starts clean.
model_performance = {}

# Build, train, evaluate and persist one ANN per loaded dataset.
#
# For every dataset this loop:
#   1. standardizes the features (scaler fitted on the training features),
#   2. trains a small dense network with early stopping / LR reduction,
#   3. records loss, accuracy and AUC measured on the training data,
#   4. saves the model, the test-set predictions and a training-history plot.

# Fixed seed so weight initialisation, dropout and the validation split are
# repeatable after "Restart Kernel -> Run All".
SEED = 42

for i, dataset_name in enumerate(dataset_names):
    # Release the Keras global state left over from the previous dataset's
    # model, then re-seed Python, NumPy and TensorFlow so every dataset is
    # trained from the same deterministic starting point.
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(SEED)

    # Get current dataset
    X_train = X_trains[i].values
    y_train = y_trains[i].values.ravel()
    X_test = X_tests[i].values

    # Keras takes `validation_split` from the LAST rows of the arrays as
    # given, before any shuffling. Shuffle the rows once (seeded) so the
    # validation subset does not depend on how the CSV happens to be ordered.
    shuffled_rows = np.random.default_rng(SEED).permutation(len(X_train))
    X_train = X_train[shuffled_rows]
    y_train = y_train[shuffled_rows]

    # Standardize the data (test set reuses the statistics of the training set)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Build the ANN model: 128 -> 64 -> 32 -> 1 with dropout after the
    # second and third hidden layers, sigmoid output for binary labels.
    input_dim = X_train.shape[1]
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Set callbacks for adaptive learning rate and early stopping
    lr_reduction = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1)

    # Train the model with validation split
    print(f"Training on dataset {dataset_name}...")
    history = model.fit(
        X_train, y_train,
        validation_split=0.2,  # 20% of the (shuffled) training data used for validation
        epochs=100,
        batch_size=32,
        verbose=1,
        callbacks=[lr_reduction, early_stopping]
    )

    # Calculate final loss and accuracy on the full training data
    # (this includes the rows that were held out for validation).
    final_loss, final_accuracy = model.evaluate(X_train, y_train, verbose=0)

    # Calculate AUC score on training data
    try:
        y_train_pred_proba = model.predict(X_train)
        auc_score = roc_auc_score(y_train, y_train_pred_proba)
        print(f"AUC score for training dataset {dataset_name}: {auc_score:.4f}")
    except ValueError:
        # roc_auc_score raises ValueError when AUC is undefined for the labels.
        auc_score = 'N/A'
        print(f"Cannot calculate AUC for dataset {dataset_name} (possibly only one class present)")

    # Save performance metrics to the dictionary
    model_performance[dataset_name] = {
        'loss': final_loss,
        'accuracy': final_accuracy,
        'auc': auc_score
    }

    print(f"Final Loss for dataset {dataset_name}: {final_loss:.4f}")
    print(f"Final Accuracy for dataset {dataset_name}: {final_accuracy:.4f}")

    # Save the model
    model.save(f'./saved_models/model_{dataset_name}.h5')
    print(f"Model for dataset {dataset_name} saved!")

    # Predict on X_test
    y_test_pred_proba = model.predict(X_test)

    # Save predictions to CSV using the project helper from utils
    save_predictions_to_csv(y_test_pred_proba, dataset_name, folder_path='./Competition_data/')
    print(f"Test predictions for dataset {dataset_name} saved as CSV!")

    # Plot training history with validation data. An explicit figure handle
    # is used so exactly this figure is saved and closed on each iteration.
    fig, ax = plt.subplots()
    ax.plot(history.history['loss'], label='Training Loss')
    ax.plot(history.history['val_loss'], label='Validation Loss')
    ax.plot(history.history['accuracy'], label='Training Accuracy')
    ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax.set_title(f'Training and Validation History for {dataset_name}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Value')
    ax.legend(loc='upper right')
    fig.savefig(f'./training_plots/training_plot_{dataset_name}.png')
    plt.close(fig)
    print(f"Training plot for dataset {dataset_name} saved!")

# Write the collected model results to a CSV file: one row per dataset with
# its loss, accuracy and AUC.
summary_path = './saved_models/model_performance.csv'

results_df = pd.DataFrame([
    dict(
        Dataset=name,
        Loss=scores['loss'],
        Accuracy=scores['accuracy'],
        AUC=scores['auc'],
    )
    for name, scores in model_performance.items()
])

results_df.to_csv(summary_path, index=False)
print(f"Model performance saved to '{summary_path}'")

Training on dataset Dataset_1...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
 1/12 [=>............................] - ETA: 0s - loss: 0.2115 - accuracy: 0.9375
Epoch 36: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
 1/12 [=>............................] - ETA: 0s - loss: 0.3128 - accuracy: 0.8125
Epoch 46: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Restoring model weights from the end of the best epoch: 26