In [1]:
import os
import pandas as pd

# Look up the AKI label in imputed_demo_data.xlsx for that patient’s ID.

# Load the Excel that has 'ID' and 'Acute_kidney_injury'
df_labels = pd.read_excel("imputed_demo_data.xlsx")
df_labels = df_labels[["ID", "Acute_kidney_injury"]].drop_duplicates()

# Make a dictionary for quick lookup of the label
label_dict = dict(zip(df_labels["ID"], df_labels["Acute_kidney_injury"]))


In [2]:
label_dict

{'R00118': 0.0,
 'R00627': 0.0,
 'R04519': 0.0,
 'R06096': 0.0,
 'R06923': 1.0,
 'R07008': 0.0,
 'R07366': 0.0,
 'R07653': 0.0,
 'R08046': 0.0,
 'R08594': 0.0,
 'R11664': 0.0,
 'R14110': 0.0,
 'R14244': 0.0,
 'R15971': 0.0,
 'R17156': 0.0,
 'R18283': 0.0,
 'R24195': 0.0,
 'R25239': 0.0,
 'R26666': 0.0,
 'R27034': 0.0,
 'R27338': 0.0,
 'R28251': 0.0,
 'R28536': 0.0,
 'R29821': 0.0,
 'R31132': 1.0,
 'R35378': 0.0,
 'R36397': 1.0,
 'R37681': 0.0,
 'R38174': 0.0,
 'R38966': 0.0,
 'R40555': 0.0,
 'R40759': 0.0,
 'R41199': 0.0,
 'R44253': 0.0,
 'R47647': 0.0,
 'R49153': 0.0,
 'R50211': 0.0,
 'R52548': 0.0,
 'R52884': 0.0,
 'R53601': 0.0,
 'R55544': 0.0,
 'R56879': 0.0,
 'R57506': 0.0,
 'R59147': 0.0,
 'R60191': 0.0,
 'R60712': 0.0,
 'R60755': 0.0,
 'R60882': 0.0,
 'R64659': 0.0,
 'R65241': 0.0,
 'R65313': 1.0,
 'R66662': 0.0,
 'R67677': 0.0,
 'R68199': 0.0,
 'R68685': 0.0,
 'R71310': 0.0,
 'R72890': 0.0,
 'R74993': 1.0,
 'R75465': 0.0,
 'R75753': 0.0,
 'R77330': 0.0,
 'R80255': 0.0,
 'R80815

In [19]:
fname.split('_')[0]

'S02709'

In [23]:
# data_dir = "time_series_data_LSTM_10_29_2024"  # the folder with patient CSVs
# all_dfs = []  # we will collect each patient’s time series into this list

# a = 0
# for fname in os.listdir(data_dir):
#     if fname.endswith(".csv"):
#         # e.g. fname = "R94565_combined.csv"
#         csv_path = os.path.join(data_dir, fname)
        
#         # parse out the ID from the filename
#         patient_id = fname.split('_')[0]   # "R94565"

#         # read the time series CSV for this patient
#         df_ts = pd.read_csv(csv_path)

#         # create an integer time index from row order
#         df_ts["time_idx"] = range(len(df_ts))
        
#         # add the patient ID as a column
#         df_ts["ID"] = patient_id
        
#         # look up AKI label from the dictionary
#         label_aki = label_dict[patient_id]
        
#         # add the label to each row
#         df_ts["Acute_kidney_injury"] = label_aki
        
#         # store for later
#         all_dfs.append(df_ts)

#         a = a+1
#         if a == 10:
#             break
# # after reading all CSVs, we can combine them
# df_merged = pd.concat(all_dfs, ignore_index=True)


In [39]:
import os
import pandas as pd
import numpy as np

def load_and_merge_csvs(data_dir, label_dict, debug=False, max_patients=10):
    """
    Load per-patient CSV files from data_dir, add the patient ID (parsed from filename),
    and attach the AKI label from label_dict.
    Returns a merged DataFrame.
    """
    all_dfs = []
    count = 0
    for fname in os.listdir(data_dir):
        if fname.endswith(".csv"):
            csv_path = os.path.join(data_dir, fname)
            # Assume ID is the first part of the filename separated by '_'
            patient_id = fname.split('_')[0]
            df_ts = pd.read_csv(csv_path)
            # Create a time index assuming rows are in order (one row per second)
            df_ts["time_idx"] = range(len(df_ts))
            # Add patient ID and label to every row
            df_ts["ID"] = patient_id
            df_ts["Acute_kidney_injury"] = label_dict.get(patient_id, 0)
            all_dfs.append(df_ts)
            count += 1
            if debug and count >= max_patients:
                break
    df_merged = pd.concat(all_dfs, ignore_index=True)
    return df_merged

def truncate_pad_series(df, fixed_length, pad_value=0):
    """
    For one patient's DataFrame df, truncate if length > fixed_length;
    if length < fixed_length, pad with pad_value.
    Assumes df is sorted by time_idx.
    Returns a DataFrame with exactly fixed_length rows.
    """
    current_length = len(df)
    if current_length >= fixed_length:
        # Truncate to fixed_length rows
        return df.iloc[:fixed_length].copy()
    else:
        # Create a DataFrame for padding with the same columns
        pad_df = pd.DataFrame(pad_value, index=range(fixed_length - current_length), columns=df.columns)
        # You might want to keep 'ID' and 'Acute_kidney_injury' constant:
        for col in ["ID", "Acute_kidney_injury"]:
            if col in df.columns:
                pad_df[col] = df.iloc[0][col]
        # Create a time_idx that continues the series
        pad_df["time_idx"] = range(current_length, fixed_length)
        # Append the pad_df to original df
        df_out = pd.concat([df, pad_df], ignore_index=True)
        return df_out


def pool_time_series(df, window_size=60, pooling_method='average'):
    """
    Pool a single patient's time series DataFrame over non-overlapping windows of size `window_size`.
    Each window is aggregated per column using the specified pooling method:
       'average' -> np.nanmean, 'max' -> np.nanmax, 'median' -> np.nanmedian.
    Returns a new DataFrame where the number of rows is ceil(original_length/window_size).
    Non-numeric columns (like 'ID', 'Acute_kidney_injury', and 'time_idx') are preserved.
    """
    # Determine which columns are time-series numeric features.
    exclude_cols = {"ID", "Acute_kidney_injury", "time_idx"}
    feature_cols = [col for col in df.columns if col not in exclude_cols and np.issubdtype(df[col].dtype, np.number)]
    
    pooled_data = []
    n = len(df)
    num_windows = int(np.ceil(n / window_size))
    
    for i in range(num_windows):
        start = i * window_size
        end = min((i + 1) * window_size, n)
        window = df.iloc[start:end]
        
        pooled_row = {}
        # Retain the patient ID and AKI label (assumed constant across rows)
        pooled_row["ID"] = window.iloc[0]["ID"]
        pooled_row["Acute_kidney_injury"] = window.iloc[0]["Acute_kidney_injury"]
        # For time_idx, we take the mean as a representative time.
        pooled_row["time_idx"] = window["time_idx"].mean()
        
        for col in feature_cols:
            if pooling_method == 'average':
                pooled_row[col] = np.nanmean(window[col])
            elif pooling_method == 'max':
                pooled_row[col] = np.nanmax(window[col])
            elif pooling_method == 'median':
                pooled_row[col] = np.nanmedian(window[col])
            else:
                raise ValueError(f"Unknown pooling method: {pooling_method}")
                
        pooled_data.append(pooled_row)
    
    return pd.DataFrame(pooled_data)


In [24]:
# Assuming you've already loaded your labels dictionary and merged your CSVs:
data_dir = "time_series_data_LSTM_10_29_2024"
# label_dict was created earlier from imputed_demo_data.xlsx
df_merged = load_and_merge_csvs(data_dir, label_dict, debug=True, max_patients=100)

In [25]:
df_merged

Unnamed: 0,TVexp,MVexp,RRtotal,Circuit_O2,Pmean,TVinsp,MVinsp,PEEPe_i,CO,CI,...,rSO2_Ch1,rSO2_Ch2,rSO2_Ch3,SpO2,ET_CO2,TEMP,FiO2,time_idx,ID,Acute_kidney_injury
0,,,,,,,,,,,...,,,,100.0,0.0,,,0,R94565,0.0
1,,,,,,,,,,,...,,,,100.0,0.0,,,1,R94565,0.0
2,,,,,,,,,,,...,,,,100.0,0.0,,,2,R94565,0.0
3,,,,,,,,,,,...,,,,100.0,0.0,,,3,R94565,0.0
4,,,,,,,,,,,...,,,,100.0,0.0,,,4,R94565,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1415705,405.0,4.88,12.0,100.0,9.0,449.0,5.42,5.9,5.67,3.21,...,74.0,74.0,81.0,100.0,30.0,,100.0,12025,S04861,0.0
1415706,405.0,4.88,12.0,100.0,9.0,449.0,5.42,5.9,5.67,3.21,...,74.0,74.0,81.0,100.0,30.0,,100.0,12026,S04861,0.0
1415707,396.0,4.86,12.0,100.0,9.0,451.0,5.42,5.9,5.67,3.21,...,74.0,74.0,81.0,100.0,30.0,,100.0,12027,S04861,0.0
1415708,396.0,4.86,12.0,100.0,9.0,451.0,5.42,5.9,5.67,3.21,...,74.0,74.0,81.0,100.0,30.0,,100.0,12028,S04861,0.0


In [37]:
# Option 1: Truncate/pad each patient's series to a fixed length.
# Let's say we want exactly 3 hours of data at 1-second sampling -> 3 * 3600 = 10800 points.
fixed_length = 10800

# Process each patient group (by ID)
dfs_fixed = []
for patient_id, group in df_merged.groupby("ID"):
    # Make sure group is sorted by time_idx
    group_sorted = group.sort_values("time_idx")
    processed = truncate_pad_series(group_sorted, fixed_length=fixed_length, pad_value=0)
    dfs_fixed.append(processed)
    
df_fixed = pd.concat(dfs_fixed, ignore_index=True)
print("Fixed-length data shape:", df_fixed.shape)


Fixed-length data shape: (1080000, 29)


In [27]:
df_fixed

Unnamed: 0,TVexp,MVexp,RRtotal,Circuit_O2,Pmean,TVinsp,MVinsp,PEEPe_i,CO,CI,...,rSO2_Ch1,rSO2_Ch2,rSO2_Ch3,SpO2,ET_CO2,TEMP,FiO2,time_idx,ID,Acute_kidney_injury
0,,,,,,,,,,,...,76.0,81.0,80.0,100.0,0.0,,,0,R00118,0.0
1,,,,,,,,,,,...,76.0,81.0,80.0,100.0,0.0,,,1,R00118,0.0
2,,,,,,,,,,,...,76.0,81.0,80.0,100.0,0.0,,,2,R00118,0.0
3,,,,93.0,0.0,,,0.0,,,...,76.0,81.0,80.0,100.0,0.0,,93.0,3,R00118,0.0
4,,,,93.0,0.0,,,0.0,,,...,76.0,81.0,80.0,100.0,0.0,,93.0,4,R00118,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1079995,372.0,4.79,13.0,58.0,6.0,404.0,5.21,2.3,5.18,2.97,...,72.0,74.0,74.0,100.0,28.0,35.11,58.0,10795,T83969,0.0
1079996,374.0,4.81,13.0,58.0,6.0,404.0,5.22,2.2,5.15,2.95,...,72.0,74.0,74.0,100.0,28.0,35.11,58.0,10796,T83969,0.0
1079997,374.0,4.81,13.0,58.0,6.0,404.0,5.22,2.2,5.43,3.11,...,72.0,74.0,74.0,100.0,27.0,35.11,58.0,10797,T83969,0.0
1079998,374.0,4.81,13.0,58.0,6.0,404.0,5.22,2.2,5.42,3.11,...,72.0,74.0,74.0,100.0,27.0,35.11,58.0,10798,T83969,0.0


In [40]:
# Option 2: Pooling over 60-second windows.
# For a 3-hour period, 3*3600/60 = 180 windows.
# Process each patient group
dfs_pooled = []
for patient_id, group in df_merged.groupby("ID"):
    group_sorted = group.sort_values("time_idx")
    # If you need exactly 3 hours, first truncate/pad as needed:
    group_fixed = truncate_pad_series(group_sorted, fixed_length=10800, pad_value=0)
    pooled = pool_time_series(group_fixed, window_size=60, pooling_method='average')
    dfs_pooled.append(pooled)
    
df_pooled = pd.concat(dfs_pooled, ignore_index=True)
print("Pooled data shape:", df_pooled.shape)

  pooled_row[col] = np.nanmean(window[col])


Pooled data shape: (18000, 29)


In [41]:
df_pooled

Unnamed: 0,ID,Acute_kidney_injury,time_idx,TVexp,MVexp,RRtotal,Circuit_O2,Pmean,TVinsp,MVinsp,...,SYS,DIA,HRV,rSO2_Ch1,rSO2_Ch2,rSO2_Ch3,SpO2,ET_CO2,TEMP,FiO2
0,R00118,0.0,29.5,,,,93.000000,0.0,,,...,102.000000,102.000000,,76.150000,81.116667,78.716667,100.000000,0.000000,,93.000000
1,R00118,0.0,89.5,,,,93.200000,0.0,,,...,102.000000,102.000000,,75.916667,81.316667,75.216667,100.000000,0.000000,,93.200000
2,R00118,0.0,149.5,,,,93.666667,0.0,,,...,102.000000,102.000000,,76.166667,81.233333,76.450000,100.000000,0.000000,,93.666667
3,R00118,0.0,209.5,,,,93.816667,0.0,,,...,102.000000,102.000000,,75.566667,80.916667,78.000000,100.000000,0.000000,,93.816667
4,R00118,0.0,269.5,,,,94.000000,0.0,,,...,102.000000,102.000000,,75.250000,80.150000,79.000000,99.686667,0.000000,,94.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,T83969,0.0,10529.5,369.933333,4.795833,13.0,58.000000,6.0,401.233333,5.205333,...,108.933333,49.133333,0.538483,71.850000,74.683333,74.533333,100.000000,27.722222,35.118491,58.000000
17996,T83969,0.0,10589.5,368.116667,4.805000,13.0,58.000000,6.0,399.433333,5.210167,...,106.600000,51.200000,0.572383,72.000000,74.900000,76.516667,100.000000,27.620690,35.115763,58.000000
17997,T83969,0.0,10649.5,368.800000,4.776000,13.0,58.000000,6.0,400.533333,5.185333,...,106.933333,54.133333,1.898917,71.833333,74.516667,73.900000,100.000000,28.067797,35.111379,58.000000
17998,T83969,0.0,10709.5,368.233333,4.789333,13.0,58.000000,6.0,399.116667,5.196333,...,106.033333,53.183333,0.465167,71.883333,74.516667,73.550000,100.000000,28.053571,35.111404,58.000000


In [42]:
np.nanmean([1,  1,1,  1,1,  1,1,  1,1,  1, np.nan])

np.float64(1.0)