In [4]:
import os
from typing import List, Optional, Union, Dict, Any, Tuple
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch


from transformers import PatchTSTConfig, PatchTSTForClassification, Trainer, TrainingArguments

from utils import ForecastDFDataset

In [3]:
import os
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch


# Import PatchTST classes from Hugging Face Transformers
from transformers import PatchTSTConfig, PatchTSTForClassification, Trainer, TrainingArguments

def load_and_merge_csvs(data_dir, label_dict, debug=False, max_patients=10):
    """
    Load per-patient CSV files from data_dir.
    Each CSV file is read, the patient ID is extracted from the filename,
    and the AKI label (from label_dict) is attached to each row.
    Returns a merged DataFrame.
    """
    all_dfs = []
    count = 0
    for fname in os.listdir(data_dir):
        if fname.endswith(".csv"):
            csv_path = os.path.join(data_dir, fname)
            # Extract patient ID from filename: e.g., "R94565_combined.csv" -> "R94565"
            patient_id = fname.split('_')[0]
            df_ts = pd.read_csv(csv_path)
            # Create a time index (assumed one row per second)
            df_ts["time_idx"] = range(len(df_ts))
            # Add patient ID and AKI label (default to 0 if not found)
            df_ts["ID"] = patient_id
            df_ts["Acute_kidney_injury"] = label_dict.get(patient_id, 0)
            all_dfs.append(df_ts)
            count += 1
            if debug and count >= max_patients:
                break
    df_merged = pd.concat(all_dfs, ignore_index=True)
    return df_merged

def truncate_pad_series(df, fixed_length, pad_value=0):
    """
    For one patient's DataFrame df (assumed sorted by time_idx), truncate if length > fixed_length;
    if length < fixed_length, pad with pad_value.
    Returns a DataFrame with exactly fixed_length rows.
    """
    current_length = len(df)
    if current_length >= fixed_length:
        return df.iloc[:fixed_length].copy()
    else:
        pad_df = pd.DataFrame(pad_value, index=range(fixed_length - current_length), columns=df.columns)
        # Keep constant columns for 'ID' and 'Acute_kidney_injury'
        for col in ["ID", "Acute_kidney_injury"]:
            if col in df.columns:
                pad_df[col] = df.iloc[0][col]
        # Create a continuing time_idx
        pad_df["time_idx"] = range(current_length, fixed_length)
        df_out = pd.concat([df, pad_df], ignore_index=True)
        return df_out

def pool_time_series(df, window_size=60, pooling_method='average'):
    """
    Pool a single patient's time series DataFrame over non-overlapping windows of size `window_size`.
    Each window is aggregated per column using the specified pooling method:
       'average' -> np.nanmean, 'max' -> np.nanmax, 'median' -> np.nanmedian.
    Returns a new DataFrame with ceil(len(df)/window_size) rows.
    Non-numeric columns ('ID', 'Acute_kidney_injury', 'time_idx') are preserved.
    """
    exclude_cols = {"ID", "Acute_kidney_injury", "time_idx"}
    feature_cols = [col for col in df.columns if col not in exclude_cols and np.issubdtype(df[col].dtype, np.number)]
    pooled_data = []
    n = len(df)
    num_windows = int(np.ceil(n / window_size))
    for i in range(num_windows):
        start = i * window_size
        end = min((i + 1) * window_size, n)
        window = df.iloc[start:end]
        pooled_row = {}
        pooled_row["ID"] = window.iloc[0]["ID"]
        pooled_row["Acute_kidney_injury"] = window.iloc[0]["Acute_kidney_injury"]
        pooled_row["time_idx"] = window["time_idx"].mean()
        for col in feature_cols:
            if pooling_method == 'average':
                pooled_row[col] = np.nanmean(window[col])
            elif pooling_method == 'max':
                pooled_row[col] = np.nanmax(window[col])
            elif pooling_method == 'median':
                pooled_row[col] = np.nanmedian(window[col])
            else:
                raise ValueError(f"Unknown pooling method: {pooling_method}")
        pooled_data.append(pooled_row)
    return pd.DataFrame(pooled_data)

# def main(args):
#     # Step 1: Load the Excel with AKI labels.
#     df_labels = pd.read_excel("imputed_demo_data.xlsx")
#     df_labels = df_labels[["ID", "Acute_kidney_injury"]].drop_duplicates()
#     label_dict = dict(zip(df_labels["ID"], df_labels["Acute_kidney_injury"]))
    
#     # Step 2: Load per-patient CSVs and merge them.
#     merged_df = load_and_merge_csvs(data_dir=args.data_dir, label_dict=label_dict, debug=args.debug, max_patients=args.max_patients)
    
#     # Step 3: Process each patient’s data to ensure a fixed length of 3 hours.
#     # For 1-second resolution, fixed length = 10800; or use pooling to get fewer points.
#     processed_dfs = []
#     for patient_id, group in merged_df.groupby("ID"):
#         group = group.sort_values("time_idx")
#         if args.process_mode == "truncate":
#             processed = truncate_pad_series(group, fixed_length=args.fixed_length, pad_value=0)
#         elif args.process_mode == "pool":
#             processed = pool_time_series(group, window_size=args.pool_window, pooling_method=args.pool_method)
#         else:
#             processed = group
#         processed_dfs.append(processed)
#     processed_df = pd.concat(processed_dfs, ignore_index=True)
    
#     # Option: if using pooling, all patients should now have the same number of rows.
#     # Split the data by patient ID to get train/val split.
#     unique_ids = processed_df["ID"].unique()
#     train_ids, val_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
#     train_df = processed_df[processed_df["ID"].isin(train_ids)]
#     val_df = processed_df[processed_df["ID"].isin(val_ids)]
    
#     # Determine the feature columns (time-varying unknowns) for the dataset.
#     feature_cols = [col for col in processed_df.columns if col not in {"ID", "Acute_kidney_injury", "time_idx"}]
    
#     # Create ForecastDFDataset objects.
#     # Here, history_length is set to the length of each patient's series (assuming uniform length after processing).
#     history_length = train_df.groupby("ID").size().max()
#     train_dataset = ForecastDFDataset(
#         df=train_df,
#         id_col="ID",
#         time_col="time_idx",
#         target_col="Acute_kidney_injury",
#         history_length=history_length,
#         forecast_length=1,
#         time_varying_unknown_cols=feature_cols,
#         static_reals_cols=[],
#     )
#     val_dataset = ForecastDFDataset(
#         df=val_df,
#         id_col="ID",
#         time_col="time_idx",
#         target_col="Acute_kidney_injury",
#         history_length=history_length,
#         forecast_length=1,
#         time_varying_unknown_cols=feature_cols,
#         static_reals_cols=[],
#     )
    
#     # For simplicity, we assume the entire dataset fits into memory.
#     # Create Hugging Face style datasets (if ForecastDFDataset provides a .to_dataloader() method).
#     train_loader = train_dataset.to_dataloader(batch_size=args.batch_size, shuffle=True, mode="train")
#     val_loader = val_dataset.to_dataloader(batch_size=args.batch_size, shuffle=False, mode="valid")
    
#     # Create PatchTST configuration and model for classification.
#     config = PatchTSTConfig(
#         num_input_channels=len(feature_cols),
#         context_length=history_length,  # input sequence length (after processing)
#         prediction_length=1,
#         num_targets=2,  # binary classification (0 and 1)
#         patch_length=args.patch_length,
#         patch_stride=args.patch_stride,
#         d_model=args.d_model,
#         num_hidden_layers=args.num_hidden_layers,
#         num_attention_heads=args.num_attention_heads,
#     )
#     model = PatchTSTForClassification(config)
    
#     # Setup training arguments
#     training_args = TrainingArguments(
#         output_dir=args.output_dir,
#         evaluation_strategy="steps",
#         eval_steps=args.eval_steps,
#         logging_steps=args.logging_steps,
#         per_device_train_batch_size=args.batch_size,
#         per_device_eval_batch_size=args.batch_size,
#         num_train_epochs=args.epochs,
#         save_steps=args.save_steps,
#         save_total_limit=2,
#         load_best_model_at_end=True,
#         metric_for_best_model="loss",
#     )
    
#     # Define a compute_metrics function for evaluation (using accuracy)
#     def compute_metrics(eval_pred):
#         logits, labels = eval_pred
#         preds = np.argmax(logits, axis=-1)
#         accuracy = (preds == labels).mean()
#         return {"accuracy": accuracy}
    
#     # Instantiate Trainer
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_dataset,
#         eval_dataset=val_dataset,
#         compute_metrics=compute_metrics,
#     )
    
#     # Train the model and evaluate
#     trainer.train()
#     trainer.evaluate()


# if __name__ == "__main__":
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--data_dir", type=str, default="time_series_data_LSTM_10_29_2024",
#                         help="Folder with per-patient CSV files.")
#     parser.add_argument("--debug", action="store_true", help="Debug mode: load only a few patients.")
#     parser.add_argument("--max_patients", type=int, default=10, help="Max patients to load in debug mode.")
#     parser.add_argument("--process_mode", type=str, choices=["truncate", "pool", "none"], default="pool",
#                         help="Preprocessing mode: 'truncate' (pad/truncate to fixed length), 'pool' (aggregate over windows), or 'none'.")
#     parser.add_argument("--fixed_length", type=int, default=10800,
#                         help="Fixed length (number of rows) if using truncate mode (e.g., 10800 for 3 hours at 1 sec resolution).")
#     parser.add_argument("--pool_window", type=int, default=60,
#                         help="Window size for pooling (e.g., 60 seconds).")
#     parser.add_argument("--pool_method", type=str, choices=["average", "max", "median"], default="average",
#                         help="Pooling method if using pool mode.")
#     parser.add_argument("--batch_size", type=int, default=32)
#     parser.add_argument("--patch_length", type=int, default=16)
#     parser.add_argument("--patch_stride", type=int, default=8)
#     parser.add_argument("--d_model", type=int, default=64)
#     parser.add_argument("--num_hidden_layers", type=int, default=2)
#     parser.add_argument("--num_attention_heads", type=int, default=8)
#     parser.add_argument("--output_dir", type=str, default="./patchtst_checkpoints")
#     parser.add_argument("--eval_steps", type=int, default=100)
#     parser.add_argument("--logging_steps", type=int, default=50)
#     parser.add_argument("--save_steps", type=int, default=100)
#     parser.add_argument("--epochs", type=int, default=10)
    
#     args = parser.parse_args()
#     main(args)


In [5]:
# Step 1: Load the Excel with AKI labels.
df_labels = pd.read_excel("imputed_demo_data.xlsx")
df_labels = df_labels[["ID", "Acute_kidney_injury"]].drop_duplicates()
label_dict = dict(zip(df_labels["ID"], df_labels["Acute_kidney_injury"]))

In [6]:
# Step 2: Load per-patient CSVs and merge them.
merged_df = load_and_merge_csvs(data_dir='time_series_data_LSTM_10_29_2024',
                                label_dict=label_dict,
                                debug=False,
                                max_patients=100,
                                # max_patients=df_labels.shape[0]
                               )

In [7]:
# Step 3: Process each patient’s data to ensure a fixed length of 3 hours.
# For 1-second resolution, fixed length = 10800; or use pooling to get fewer points.
processed_dfs = []
for patient_id, group in merged_df.groupby("ID"):
    processed = pool_time_series(group, window_size=60, pooling_method='average')
    # group = group.sort_values("time_idx")
    # if args.process_mode == "truncate":
    #     processed = truncate_pad_series(group, fixed_length=args.fixed_length, pad_value=0)
    # elif args.process_mode == "pool":
    #     processed = pool_time_series(group, window_size=args.pool_window, pooling_method=args.pool_method)
    # else:
    #     processed = group
    processed_dfs.append(processed)
processed_df = pd.concat(processed_dfs, ignore_index=True)

  pooled_row[col] = np.nanmean(window[col])


KeyboardInterrupt: 

In [None]:
processed_df

In [None]:
# Option: if using pooling, all patients should now have the same number of rows.
# Split the data by patient ID to get train/val split.
unique_ids = processed_df["ID"].unique()
train_ids, val_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
train_df = processed_df[processed_df["ID"].isin(train_ids)]
val_df = processed_df[processed_df["ID"].isin(val_ids)]

# Determine the feature columns (time-varying unknowns) for the dataset.
feature_cols = [col for col in processed_df.columns if col not in {"ID", "Acute_kidney_injury", "time_idx"}]

In [None]:
# Create ForecastDFDataset objects.
# Here, history_length is set to the length of each patient's series (assuming uniform length after processing).
history_length = train_df.groupby("ID").size().max()
train_dataset = ForecastDFDataset(
    df=train_df,
    id_col="ID",
    time_col="time_idx",
    target_col="Acute_kidney_injury",
    history_length=history_length,
    forecast_length=1,
    time_varying_unknown_cols=feature_cols,
    static_reals_cols=[],
)
val_dataset = ForecastDFDataset(
    df=val_df,
    id_col="ID",
    time_col="time_idx",
    target_col="Acute_kidney_injury",
    history_length=history_length,
    forecast_length=1,
    time_varying_unknown_cols=feature_cols,
    static_reals_cols=[],
)

# For simplicity, we assume the entire dataset fits into memory.
# Create Hugging Face style datasets (if ForecastDFDataset provides a .to_dataloader() method).
train_loader = train_dataset.to_dataloader(batch_size=args.batch_size, shuffle=True, mode="train")
val_loader = val_dataset.to_dataloader(batch_size=args.batch_size, shuffle=False, mode="valid")

# Create PatchTST configuration and model for classification.
config = PatchTSTConfig(
    num_input_channels=len(feature_cols),
    context_length=history_length,  # input sequence length (after processing)
    prediction_length=1,
    num_targets=2,  # binary classification (0 and 1)
    patch_length=args.patch_length,
    patch_stride=args.patch_stride,
    d_model=args.d_model,
    num_hidden_layers=args.num_hidden_layers,
    num_attention_heads=args.num_attention_heads,
)
model = PatchTSTForClassification(config)

# Setup training arguments
training_args = TrainingArguments(
    output_dir=args.output_dir,
    evaluation_strategy="steps",
    eval_steps=args.eval_steps,
    logging_steps=args.logging_steps,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    num_train_epochs=args.epochs,
    save_steps=args.save_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

# Define a compute_metrics function for evaluation (using accuracy)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

# Instantiate Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model and evaluate
trainer.train()
trainer.evaluate()

In [16]:


def load_and_merge_csvs(data_dir, label_dict, debug=False, max_patients=10):
    """
    Load per-patient CSV files from data_dir.
    Each CSV file is read, the patient ID is extracted from the filename,
    and the AKI label (from label_dict) is attached to each row.
    Returns a merged DataFrame.
    """
    all_dfs = []
    count = 0
    for fname in os.listdir(data_dir):
        if fname.endswith(".csv"):
            csv_path = os.path.join(data_dir, fname)
            # Extract patient ID from filename: e.g., "R94565_combined.csv" -> "R94565"
            patient_id = fname.split('_')[0]
            df_ts = pd.read_csv(csv_path)
            # Create a time index (assumed one row per second)
            df_ts["time_idx"] = range(len(df_ts))
            # Add patient ID and AKI label (default to 0 if not found)
            df_ts["ID"] = patient_id
            df_ts["Acute_kidney_injury"] = label_dict.get(patient_id, 0)
            all_dfs.append(df_ts)
            count += 1
            if debug and count >= max_patients:
                break
    df_merged = pd.concat(all_dfs, ignore_index=True)
    return df_merged

def truncate_pad_series(df, fixed_length, pad_value=0):
    """
    For one patient's DataFrame df (assumed sorted by time_idx), truncate if length > fixed_length;
    if length < fixed_length, pad with pad_value.
    Returns a DataFrame with exactly fixed_length rows.
    """
    current_length = len(df)
    if current_length >= fixed_length:
        return df.iloc[:fixed_length].copy()
    else:
        pad_df = pd.DataFrame(pad_value, index=range(fixed_length - current_length), columns=df.columns)
        # Keep constant columns for 'ID' and 'Acute_kidney_injury'
        for col in ["ID", "Acute_kidney_injury"]:
            if col in df.columns:
                pad_df[col] = df.iloc[0][col]
        # Create a continuing time_idx
        pad_df["time_idx"] = range(current_length, fixed_length)
        df_out = pd.concat([df, pad_df], ignore_index=True)
        return df_out

def pool_time_series(df, window_size=60, pooling_method='average'):
    """
    Pool a single patient's time series DataFrame over non-overlapping windows of size `window_size`.
    Each window is aggregated per column using the specified pooling method:
       'average' -> np.nanmean, 'max' -> np.nanmax, 'median' -> np.nanmedian.
    Returns a new DataFrame with ceil(len(df)/window_size) rows.
    Non-numeric columns ('ID', 'Acute_kidney_injury', 'time_idx') are preserved.
    """
    exclude_cols = {"ID", "Acute_kidney_injury", "time_idx"}
    feature_cols = [col for col in df.columns if col not in exclude_cols and np.issubdtype(df[col].dtype, np.number)]
    pooled_data = []
    n = len(df)
    num_windows = int(np.ceil(n / window_size))
    for i in range(num_windows):
        start = i * window_size
        end = min((i + 1) * window_size, n)
        window = df.iloc[start:end]
        pooled_row = {}
        pooled_row["ID"] = window.iloc[0]["ID"]
        pooled_row["Acute_kidney_injury"] = window.iloc[0]["Acute_kidney_injury"]
        pooled_row["time_idx"] = window["time_idx"].mean()
        for col in feature_cols:
            if pooling_method == 'average':
                pooled_row[col] = np.nanmean(window[col])
            elif pooling_method == 'max':
                pooled_row[col] = np.nanmax(window[col])
            elif pooling_method == 'median':
                pooled_row[col] = np.nanmedian(window[col])
            else:
                raise ValueError(f"Unknown pooling method: {pooling_method}")
        pooled_data.append(pooled_row)
    return pd.DataFrame(pooled_data)

def main(args):
    # Step 1: Load the Excel with AKI labels.
    df_labels = pd.read_excel("imputed_demo_data.xlsx")
    df_labels = df_labels[["ID", "Acute_kidney_injury"]].drop_duplicates()
    label_dict = dict(zip(df_labels["ID"], df_labels["Acute_kidney_injury"]))
    
    # Step 2: Load per-patient CSVs and merge them.
    merged_df = load_and_merge_csvs(data_dir=args.data_dir, label_dict=label_dict, debug=args.debug, max_patients=args.max_patients)
    
    # Step 3: Process each patient’s data to ensure a fixed length of 3 hours.
    # For 1-second resolution, fixed length = 10800; or use pooling to get fewer points.
    processed_dfs = []
    for patient_id, group in merged_df.groupby("ID"):
        group = group.sort_values("time_idx")
        if args.process_mode == "truncate":
            processed = truncate_pad_series(group, fixed_length=args.fixed_length, pad_value=0)
        elif args.process_mode == "pool":
            processed = pool_time_series(group, window_size=args.pool_window, pooling_method=args.pool_method)
        else:
            processed = group
        processed_dfs.append(processed)
    processed_df = pd.concat(processed_dfs, ignore_index=True)
    
    # Option: if using pooling, all patients should now have the same number of rows.
    # Split the data by patient ID to get train/val split.
    unique_ids = processed_df["ID"].unique()
    train_ids, val_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
    train_df = processed_df[processed_df["ID"].isin(train_ids)]
    val_df = processed_df[processed_df["ID"].isin(val_ids)]
    
    # Determine the feature columns (time-varying unknowns) for the dataset.
    feature_cols = [col for col in processed_df.columns if col not in {"ID", "Acute_kidney_injury", "time_idx"}]
    
    # Create ForecastDFDataset objects.
    # Here, history_length is set to the length of each patient's series (assuming uniform length after processing).
    history_length = train_df.groupby("ID").size().max()
    train_dataset = ForecastDFDataset(
        df=train_df,
        id_col="ID",
        time_col="time_idx",
        target_col="Acute_kidney_injury",
        history_length=history_length,
        forecast_length=1,
        time_varying_unknown_cols=feature_cols,
        static_reals_cols=[],
    )
    val_dataset = ForecastDFDataset(
        df=val_df,
        id_col="ID",
        time_col="time_idx",
        target_col="Acute_kidney_injury",
        history_length=history_length,
        forecast_length=1,
        time_varying_unknown_cols=feature_cols,
        static_reals_cols=[],
    )
    
    # For simplicity, we assume the entire dataset fits into memory.
    # Create Hugging Face style datasets (if ForecastDFDataset provides a .to_dataloader() method).
    train_loader = train_dataset.to_dataloader(batch_size=args.batch_size, shuffle=True, mode="train")
    val_loader = val_dataset.to_dataloader(batch_size=args.batch_size, shuffle=False, mode="valid")
    
    # Create PatchTST configuration and model for classification.
    config = PatchTSTConfig(
        num_input_channels=len(feature_cols),
        context_length=history_length,  # input sequence length (after processing)
        prediction_length=1,
        num_targets=2,  # binary classification (0 and 1)
        patch_length=args.patch_length,
        patch_stride=args.patch_stride,
        d_model=args.d_model,
        num_hidden_layers=args.num_hidden_layers,
        num_attention_heads=args.num_attention_heads,
    )
    model = PatchTSTForClassification(config)
    
    # Setup training arguments
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        evaluation_strategy="steps",
        eval_steps=args.eval_steps,
        logging_steps=args.logging_steps,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        save_steps=args.save_steps,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
    )
    
    # Define a compute_metrics function for evaluation (using accuracy)
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        accuracy = (preds == labels).mean()
        return {"accuracy": accuracy}
    
    # Instantiate Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    
    # Train the model and evaluate
    trainer.train()
    trainer.evaluate()

# if __name__ == "__main__":
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--data_dir", type=str, default="time_series_data_LSTM_10_29_2024",
#                         help="Folder with per-patient CSV files.")
#     parser.add_argument("--debug", action="store_true", help="Debug mode: load only a few patients.")
#     parser.add_argument("--max_patients", type=int, default=10, help="Max patients to load in debug mode.")
#     parser.add_argument("--process_mode", type=str, choices=["truncate", "pool", "none"], default="pool",
#                         help="Preprocessing mode: 'truncate' (pad/truncate to fixed length), 'pool' (aggregate over windows), or 'none'.")
#     parser.add_argument("--fixed_length", type=int, default=10800,
#                         help="Fixed length (number of rows) if using truncate mode (e.g., 10800 for 3 hours at 1 sec resolution).")
#     parser.add_argument("--pool_window", type=int, default=60,
#                         help="Window size for pooling (e.g., 60 seconds).")
#     parser.add_argument("--pool_method", type=str, choices=["average", "max", "median"], default="average",
#                         help="Pooling method if using pool mode.")
#     parser.add_argument("--batch_size", type=int, default=32)
#     parser.add_argument("--patch_length", type=int, default=16)
#     parser.add_argument("--patch_stride", type=int, default=8)
#     parser.add_argument("--d_model", type=int, default=64)
#     parser.add_argument("--num_hidden_layers", type=int, default=2)
#     parser.add_argument("--num_attention_heads", type=int, default=8)
#     parser.add_argument("--output_dir", type=str, default="./patchtst_checkpoints")
#     parser.add_argument("--eval_steps", type=int, default=100)
#     parser.add_argument("--logging_steps", type=int, default=50)
#     parser.add_argument("--save_steps", type=int, default=100)
#     parser.add_argument("--epochs", type=int, default=10)
    
#     args = parser.parse_args()
#     main(args)

In [17]:
# Step 1: Load the Excel with AKI labels.
df_labels = pd.read_excel("imputed_demo_data.xlsx")
df_labels = df_labels[["ID", "Acute_kidney_injury"]].drop_duplicates()
label_dict = dict(zip(df_labels["ID"], df_labels["Acute_kidney_injury"]))

In [21]:
# Step 2: Load per-patient CSVs and merge them
merged_df = load_and_merge_csvs(data_dir='time_series_data_LSTM_10_29_2024', 
                                label_dict=label_dict, 
                                debug=False, 
                                max_patients=df_labels.shape[0])

In [None]:
################################################################################
# 1. Pooling over 60-second windows.
# For a 3-hour period, 3*3600/60 = 180 windows.
# pool_method = mean
################################################################################

In [23]:
merged_df

Unnamed: 0,TVexp,MVexp,RRtotal,Circuit_O2,Pmean,TVinsp,MVinsp,PEEPe_i,CO,CI,...,rSO2_Ch1,rSO2_Ch2,rSO2_Ch3,SpO2,ET_CO2,TEMP,FiO2,time_idx,ID,Acute_kidney_injury
0,,,,,,,,,,,...,,,,100.0,0.0,,,0,R94565,0.0
1,,,,,,,,,,,...,,,,100.0,0.0,,,1,R94565,0.0
2,,,,,,,,,,,...,,,,100.0,0.0,,,2,R94565,0.0
3,,,,,,,,,,,...,,,,100.0,0.0,,,3,R94565,0.0
4,,,,,,,,,,,...,,,,100.0,0.0,,,4,R94565,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27881047,316.0,3.88,12.0,81.0,5.0,348.0,4.18,2.7,5.36,3.22,...,71.0,68.0,80.0,100.0,,,81.0,13240,S25956,0.0
27881048,316.0,3.88,12.0,81.0,5.0,348.0,4.18,2.7,6.15,3.70,...,71.0,68.0,80.0,100.0,,,81.0,13241,S25956,0.0
27881049,316.0,3.88,12.0,81.0,5.0,348.0,4.18,2.7,5.92,3.56,...,71.0,68.0,80.0,100.0,,,81.0,13242,S25956,0.0
27881050,316.0,3.88,12.0,81.0,5.0,348.0,4.18,2.7,5.96,3.59,...,71.0,68.0,80.0,100.0,,,81.0,13243,S25956,0.0


In [22]:
# Step 3: Process each patient’s data to ensure a fixed length of 3 hours,
# then apply pooling if requested.
processed_dfs = []
for patient_id, group in merged_df.groupby("ID"):
    group = group.sort_values("time_idx")
    # First, truncate/pad to fixed length (3 hours = 10800 seconds)
    group_fixed = truncate_pad_series(group, fixed_length=10800, pad_value=0)
    # if args.process_mode == "truncate":
    #     processed = group_fixed
    # elif args.process_mode == "pool":
    #     # Now pool over non-overlapping windows (e.g., window_size=60 seconds)
    #     processed = pool_time_series(group_fixed, window_size=args.pool_window, pooling_method=args.pool_method)
    # else:
    #     processed = group_fixed  # default to fixed-length series without further pooling
    process_mode = 'pool'
    processed = pool_time_series(group_fixed, window_size=60, pooling_method='average')
    processed_dfs.append(processed)
processed_df = pd.concat(processed_dfs, ignore_index=True)


  pooled_row[col] = np.nanmean(window[col])


KeyboardInterrupt: 

In [None]:
# Option: if using pooling, all patients should now have the same number of rows.
# Split the data by patient ID to get train/val split.
unique_ids = processed_df["ID"].unique()
train_ids, val_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
train_df = processed_df[processed_df["ID"].isin(train_ids)]
val_df = processed_df[processed_df["ID"].isin(val_ids)]

# Determine the feature columns (time-varying unknowns) for the dataset.
feature_cols = [col for col in processed_df.columns if col not in {"ID", "Acute_kidney_injury", "time_idx"}]

# Create ForecastDFDataset objects.
# Here, history_length is set to the length of each patient's series (assuming uniform length after processing).
history_length = train_df.groupby("ID").size().max()
train_dataset = ForecastDFDataset(
    df=train_df,
    id_col="ID",
    time_col="time_idx",
    target_col="Acute_kidney_injury",
    history_length=history_length,
    forecast_length=1,
    time_varying_unknown_cols=feature_cols,
    static_reals_cols=[],
)
val_dataset = ForecastDFDataset(
    df=val_df,
    id_col="ID",
    time_col="time_idx",
    target_col="Acute_kidney_injury",
    history_length=history_length,
    forecast_length=1,
    time_varying_unknown_cols=feature_cols,
    static_reals_cols=[],
)

# For simplicity, we assume the entire dataset fits into memory.
# Create Hugging Face style datasets (if ForecastDFDataset provides a .to_dataloader() method).
train_loader = train_dataset.to_dataloader(batch_size=args.batch_size, shuffle=True, mode="train")
val_loader = val_dataset.to_dataloader(batch_size=args.batch_size, shuffle=False, mode="valid")

# Create PatchTST configuration and model for classification.
config = PatchTSTConfig(
    num_input_channels=len(feature_cols),
    context_length=history_length,  # input sequence length (after processing)
    prediction_length=1,
    num_targets=2,  # binary classification (0 and 1)
    patch_length=args.patch_length,
    patch_stride=args.patch_stride,
    d_model=args.d_model,
    num_hidden_layers=args.num_hidden_layers,
    num_attention_heads=args.num_attention_heads,
)
model = PatchTSTForClassification(config)

# Setup training arguments
training_args = TrainingArguments(
    output_dir=args.output_dir,
    evaluation_strategy="steps",
    eval_steps=args.eval_steps,
    logging_steps=args.logging_steps,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    num_train_epochs=args.epochs,
    save_steps=args.save_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

# Define a compute_metrics function for evaluation (using accuracy)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

# Instantiate Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model and evaluate
trainer.train()
trainer.evaluate()

