In [1]:
import time
import utils_BiLReLU
import pandas as pd
import os
import torch
from torch import nn

In [None]:
# This script selects the hyper-parameters that yield the best performance across 5-fold cross validation and then evaluates the selected model on the test samples.
# Inputs: NIAvalueUpdatedVersion_scaled_train.pkl and NIAvalueUpdatedVersion_scaled_test.pkl under INPUT_DIR, plus the hyper-parameter search csv files under the 'results' folder.
# Outputs: BiLReLU_1T_FinalResultsWithUpdatedNIAValue.csv under OUTPUT_DIR, which contains the testing $R^2$ for each selected model.

INPUT_DIR = utils_BiLReLU.DATA_DIR / "raw"
OUTPUT_DIR = utils_BiLReLU.RESULTS_DIR / "S3" / "S3.1" / "S3.1.1" / "BiLSTMReLU"

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Reading inputs from: {INPUT_DIR}")
print(f"Saving outputs to: {OUTPUT_DIR}")

# Loading dataset
train_path = INPUT_DIR / "NIAvalueUpdatedVersion_scaled_train.pkl"
test_path = INPUT_DIR / "NIAvalueUpdatedVersion_scaled_test.pkl"

# Fail fast, with the same actionable message, for whichever input is missing
# (both the train and the test pickle are required below).
for required_path in (train_path, test_path):
    if not required_path.exists():
        raise FileNotFoundError(f"Input file not found: {required_path}. Please run S1.1 first.")

# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
scaled_train = pd.read_pickle(train_path)
scaled_test = pd.read_pickle(test_path)


# Different target_columns: every single target, every pair, every triple,
# and finally all four targets together.
target_sets = [
    ['amyloid'],
    ['niareagansc'],
    ['gpath'],
    ['tangles'],

    ['niareagansc', 'tangles'],
    ['gpath', 'tangles'],
    ['amyloid', 'gpath'],
    ['niareagansc', 'gpath'],
    ['amyloid', 'tangles'],
    ['amyloid', 'niareagansc'],

    ['amyloid', 'niareagansc', 'gpath'],
    ['amyloid', 'niareagansc', 'tangles'],
    ['gpath', 'tangles', 'amyloid'],
    ['gpath', 'tangles', 'niareagansc'],
    ['gpath', 'tangles', 'amyloid', 'niareagansc']
]

# Model inputs are all columns that remain after dropping the columns listed
# here, which include the four candidate targets.
feature_columns = scaled_train.drop(columns=['projid', 'study', 'fu_year', 'cogdx',
                                             'amyloid', 'gpath', 'tangles', 'niareagansc']).columns.tolist()

In [None]:
# ---------------------------------------------------------
# 1) Define a helper function to handle a single target
# ---------------------------------------------------------
def run_model_for_target(
    scaled_train,
    scaled_test,
    feature_columns,
    target_columns,
    target,
    model_save_dir="results",
    num_epochs=500,
    patience=10,
    lr_scheduler_patience=5,
    lr_factor=0.5,
    seed=1217,
    temporary=True
):
    """
    Train and evaluate one model using the best hyper-parameters for ``target``.

    For a given set of ``target_columns`` and a specific ``target`` (e.g. 'gpath'
    or 'tangles'), load the CSV of hyper-parameter search results, pick the row
    with the highest value in the ``target`` column, train/evaluate the model via
    ``utils_BiLReLU.train_and_evaluate_model`` and return a dictionary of results.

    Parameters
    ----------
    scaled_train, scaled_test : pandas.DataFrame
        Train / test frames. Both must expose a ``projid`` column; the unique
        ids are passed to ``utils_BiLReLU.create_sequences``.
    feature_columns : list of str
        Input columns; their count is used as ``input_dim``.
    target_columns : list of str
        Columns the model is trained on jointly; their count is used as
        ``output_dim`` and their names build the CSV file name
        ``BiLSTMReLU_results_<col1>_<col2>....csv``.
    target : str
        Column of the search-results CSV used to rank hyper-parameter rows.
    model_save_dir : str or path-like, default "results"
        Directory that holds the hyper-parameter search CSV files.
    num_epochs, patience, lr_scheduler_patience, lr_factor, seed, temporary
        Forwarded unchanged to ``utils_BiLReLU.train_and_evaluate_model``.

    Returns
    -------
    dict
        Keys ``target``, ``train_columns``, the selected hyper-parameters, and
        one entry per pathology name ('gpath', 'tangles', 'amyloid',
        'niareagansc'). Entries for names in ``target_columns`` hold the
        matching element of the returned test scores; the rest stay ``None``.

    Raises
    ------
    FileNotFoundError
        If the search-results CSV does not exist in ``model_save_dir``.
    KeyError
        If ``target`` is not a column of the search-results CSV.
    """
    start_time = time.time()  # Timing starts

    # Create the sequences
    train_ids = scaled_train.projid.unique()
    train_sequences = utils_BiLReLU.create_sequences(
        scaled_train, train_ids, feature_columns, target_columns
    )

    test_ids = scaled_test.projid.unique()
    test_sequences = utils_BiLReLU.create_sequences(
        scaled_test, test_ids, feature_columns, target_columns
    )

    # Build the CSV filename for this set of targets
    filename = f"BiLSTMReLU_results_{'_'.join(target_columns)}.csv"

    # Read the hyperparameter tuning results from the configured directory
    # (previously hard-coded to 'results', which silently ignored model_save_dir).
    cv_results = pd.read_csv(os.path.join(model_save_dir, filename))
    best_row = cv_results.sort_values(by=target, ascending=False).iloc[0]

    # Extract the best hyperparameters. Values taken from a row may come back
    # as floats / numpy scalars, so cast them to plain Python numbers.
    hidden_size = int(best_row['hidden_size'])
    num_layers = int(best_row['num_layers'])
    batch_size = int(best_row['batch_size'])
    learning_rate = float(best_row['learning_rate'])
    dropout_rate = float(best_row['dropout_rate'])

    # Train/Evaluate the model
    test_r2, train_loss, val_loss, lr_history = utils_BiLReLU.train_and_evaluate_model(
        train_data=train_sequences,
        test_data=test_sequences,
        input_dim=len(feature_columns),
        output_dim=len(target_columns),
        num_epochs=num_epochs,
        patience=patience,
        lr_scheduler_patience=lr_scheduler_patience,
        lr_factor=lr_factor,
        hidden_size=hidden_size,
        num_layers=num_layers,
        batch_size=batch_size,
        learning_rate=learning_rate,
        seed=seed,
        dropout_rate=dropout_rate,
        temporary=temporary
    )

    result_dict = {
        'target': target,
        'train_columns': ', '.join(target_columns),
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'dropout_rate': dropout_rate,
        'gpath': None,
        'tangles': None,
        'amyloid': None,
        'niareagansc': None
    }

    # Scores are matched to target_columns by position; zip stops at the
    # shorter sequence, so columns without a score keep their None default.
    for tgt, score in zip(target_columns, test_r2):
        result_dict[tgt] = score

    end_time = time.time()  # Timing ends
    elapsed_time = end_time - start_time
    print(f"Time elapsed for target {target} with columns {target_columns}: {elapsed_time:.2f} seconds")

    return result_dict


In [None]:
# ---------------------------------------------------------
# 2) Main loop: run once for each set of targets
# ---------------------------------------------------------
overall_start = time.time()  # Timing starts
results = []

# Fixed order in which the targets of a set are processed; it determines the
# order of the rows appended to `results`.
TARGET_RUN_ORDER = ('amyloid', 'niareagansc', 'gpath', 'tangles')

for target_columns in target_sets:
    for target in TARGET_RUN_ORDER:
        # Only targets that are part of this set get a run: hyper-parameters
        # are selected by that target's column in the set's search-results CSV.
        if target not in target_columns:
            continue
        results.append(
            run_model_for_target(
                scaled_train,
                scaled_test,
                feature_columns,
                target_columns,
                target=target
            )
        )


overall_end = time.time()  # Timing ends
total_elapsed = overall_end - overall_start
print(f"Total elapsed time for all target sets: {(total_elapsed/60):.2f} mins")

  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target amyloid with columns ['amyloid']: 7.87 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target niareagansc with columns ['niareagansc']: 7.46 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target gpath with columns ['gpath']: 5.20 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target tangles with columns ['tangles']: 6.61 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target niareagansc with columns ['niareagansc', 'tangles']: 7.57 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target tangles with columns ['niareagansc', 'tangles']: 8.96 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target gpath with columns ['gpath', 'tangles']: 4.42 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target tangles with columns ['gpath', 'tangles']: 6.54 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target amyloid with columns ['amyloid', 'gpath']: 15.67 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target gpath with columns ['amyloid', 'gpath']: 7.95 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target niareagansc with columns ['niareagansc', 'gpath']: 6.91 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target gpath with columns ['niareagansc', 'gpath']: 8.59 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target amyloid with columns ['amyloid', 'tangles']: 8.57 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target tangles with columns ['amyloid', 'tangles']: 4.71 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target amyloid with columns ['amyloid', 'niareagansc']: 7.52 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target niareagansc with columns ['amyloid', 'niareagansc']: 4.28 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target amyloid with columns ['amyloid', 'niareagansc', 'gpath']: 12.43 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target niareagansc with columns ['amyloid', 'niareagansc', 'gpath']: 9.70 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target gpath with columns ['amyloid', 'niareagansc', 'gpath']: 7.65 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target amyloid with columns ['amyloid', 'niareagansc', 'tangles']: 18.87 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target niareagansc with columns ['amyloid', 'niareagansc', 'tangles']: 9.92 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target tangles with columns ['amyloid', 'niareagansc', 'tangles']: 3.01 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target amyloid with columns ['gpath', 'tangles', 'amyloid']: 16.61 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target gpath with columns ['gpath', 'tangles', 'amyloid']: 9.83 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target tangles with columns ['gpath', 'tangles', 'amyloid']: 10.65 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target niareagansc with columns ['gpath', 'tangles', 'niareagansc']: 9.62 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target gpath with columns ['gpath', 'tangles', 'niareagansc']: 21.36 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target tangles with columns ['gpath', 'tangles', 'niareagansc']: 5.74 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target amyloid with columns ['gpath', 'tangles', 'amyloid', 'niareagansc']: 20.59 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target niareagansc with columns ['gpath', 'tangles', 'amyloid', 'niareagansc']: 8.37 seconds


  model.load_state_dict(torch.load(temp_model_path))


Time elapsed for target gpath with columns ['gpath', 'tangles', 'amyloid', 'niareagansc']: 16.26 seconds
Time elapsed for target tangles with columns ['gpath', 'tangles', 'amyloid', 'niareagansc']: 8.63 seconds
Total elapsed time for all target sets: 5.14 mins


  model.load_state_dict(torch.load(temp_model_path))


In [None]:
# ---------------------------------------------------------
# 3) Convert the list of results to a DataFrame and save
# ---------------------------------------------------------

save_path = OUTPUT_DIR / "BiLReLU_1T_FinalResultsWithUpdatedNIAValue.csv"

# Build the DataFrame from the collected result dictionaries, then write it
# to the chosen path ordered by target and without the index column.
results_df = pd.DataFrame(results)
results_sorted = results_df.sort_values(by=['target'])
results_sorted.to_csv(save_path, index=False)