# PEDAP

In [8]:
import pandas as pd
import numpy as np

def split_nan_pedap(pd_dir, output_dir="raw_data"):
    df = pd.read_csv(pd_dir)
    
    unique_pats = df['pat_id'].unique()
    np.random.shuffle(unique_pats)
    
    total_pat = len(unique_pats)
    trn_end = int(0.7 * total_pat)
    val_end = int(0.8 * total_pat)
    
    trn_ids = unique_pats[:trn_end]
    val_ids = unique_pats[trn_end:val_end]
    tst_ids = unique_pats[val_end:]
    
    splits = {
        'train': trn_ids,
        'val': val_ids,
        'test': tst_ids
    }
    
    for name, ids in splits.items():
        split_df = df[df['pat_id'].isin(ids)]
        split_df.to_csv(f"{output_dir}/{name}.csv", index=False)
        print(f"Saved {name} set with {len(ids)} patients and {len(split_df)} rows.")

split_nan_pedap('timeseries.csv')

Saved train set with 69 patients and 4631147 rows.
Saved val set with 10 patients and 534180 rows.
Saved test set with 20 patients and 1453922 rows.


# TCR Simulation

In [11]:
import pandas as pd
import numpy as np
import os

tcr_path = '/project/shakeri-lab/Amir/Data_Processing/Sim/sim/UVA-T1D-Simulator/TestAID/New_TCR_Data/'

tcr_time_lag_min = 2.5 * 60  # 2.5 hours lag in minutes
tcr_duration_min = 4.0 * 60  # 4 hours duration
num_days = 56

meal_times_matrix = np.array([
    [8, 13, 19], # Day type 1
    [11, 13, 20], # Day type 2
    [6, 11, 18]   # Day type 3
]) * 60

meal_pattern = [0, 1, 2]
os.makedirs(os.path.join(tcr_path, 'FLAG_TCR'), exist_ok=True)
for i in range(1, 101):
    file_name = f'ctrl_{i:03d}.csv'
    file_full_path = os.path.join(tcr_path, file_name)
    
    if os.path.exists(file_full_path):
        df = pd.read_csv(file_full_path)
        
        df['tcr_flag'] = 0        
        for day in range(num_days):
            day_type_idx = day % 3             
            target_meal_idx = meal_pattern[day % len(meal_pattern)]            
            meal_time_of_day = meal_times_matrix[day_type_idx][target_meal_idx]
            
            start_min = (day * 1440) + meal_time_of_day + tcr_time_lag_min
            end_min = start_min + tcr_duration_min
            
            start_idx = int(start_min / 5)
            end_idx = int(end_min / 5)
            
            df.loc[start_idx : end_idx, 'tcr_flag'] = 1
        
        df.to_csv(os.path.join(tcr_path, 'FLAG_TCR', file_name), index=False)
        if i % 20 == 0:
            print(f"Flagged TCR moments for {i} files...")

print("Successfully added 'tcr_flag' to all 100 patient files.")

Successfully added 'tcr_flag' to all 100 patient files.


In [13]:
import pandas as pd
import random
import os

tcr_path = '/project/shakeri-lab/Amir/Data_Processing/Sim/sim/UVA-T1D-Simulator/TestAID/TCR_Data/FLAG_TCR/'
all_files = [f'ctrl_{i:03d}.csv' for i in range(1, 101)]

random.seed(42)
random.shuffle(all_files)

train_files = all_files[:70]
val_files   = all_files[70:80]
test_files  = all_files[80:]

def combine_sets(file_list):
    combined_list = []
    for f in file_list:
        file_full_path = os.path.join(tcr_path, f)
        if os.path.exists(file_full_path):
            df = pd.read_csv(file_full_path)
            combined_list.append(df)
    return pd.concat(combined_list, ignore_index=True)

train_df = combine_sets(train_files)
train_df.to_csv(os.path.join(tcr_path, 'train.csv'), index=False)

val_df = combine_sets(val_files)
val_df.to_csv(os.path.join(tcr_path, 'val.csv'), index=False)

test_df = combine_sets(test_files)
test_df.to_csv(os.path.join(tcr_path, 'test.csv'), index=False) 

In [12]:
import pandas as pd
import random
import os, glob


tcr_path = '/project/shakeri-lab/Amir/Data_Processing/Sim/sim/UVA-T1D-Simulator/TestAID/New_TCR_Data/FLAG_TCR/'
file_list = glob.glob(os.path.join(tcr_path, "ctrl_*.csv"))
def combine_sets(file_list):
    combined_list = []
    for f in file_list:
        file_full_path = os.path.join(tcr_path, f)
        if os.path.exists(file_full_path):
            df = pd.read_csv(file_full_path)
            combined_list.append(df)
    return pd.concat(combined_list, ignore_index=True)

test_df = combine_sets(file_list)
test_df.to_csv(os.path.join(tcr_path, 'test.csv'), index=False) 

##### add time(date)
dec_path = "./tcr_data/test.csv"
df_src = pd.read_csv(dec_path)
test_df['date'] = df_src['date']
test_df.to_csv('./tcr_data/test.csv')

In [14]:
import pandas as pd 

for splt in ['train', 'val', 'test']:
    df_src = pd.read_csv(f'/project/shakeri-lab/Amir/Data_Processing/Sim/sim/UVA-T1D-Simulator/TestAID/ImputationModerateConfig/{splt}.csv')
    df_dst = pd.read_csv(f'/project/shakeri-lab/Amir/Data_Processing/Sim/sim/UVA-T1D-Simulator/TestAID/TCR_Data/FLAG_TCR/{splt}.csv')

    df_dst['date'] = df_src['date']
    df_dst.to_csv(f'./tcr_data/{splt}.csv')