In [1]:
import os
import pandas as pd
import CGM_TAML as taml
from tqdm import tqdm
import pickle

import warnings
warnings.filterwarnings("ignore")
from concurrent.futures import ThreadPoolExecutor

# Create Training Dataset using Shah_healthy_169 and Aleppo_T1D_226

### Import Healthy and T1D Data

In [2]:
# Read the healthy (Shah) and T1D (Aleppo) CGM tables from their CSV files.
CGM_healthy = pd.read_csv("../Public Datasets/Shah_Healthy_169/NonDiabDeviceCGM_processed.csv")
CGM_T1D = pd.read_csv("../Public Datasets/Aleppo_T1D_226/HDeviceCGM_processed.csv")

# Prefix each subject id with a cohort tag so ids stay unique once both tables are combined.
CGM_healthy["id"] = "Shah_healthy_" + CGM_healthy["id"].astype(str)
CGM_T1D["id"] = "Ale_T1D_" + CGM_T1D["id"].astype(str)

In [3]:
# Stack both cohorts into one table and switch to the ID / Glucose / Time column names.
all_subjects_table = pd.concat([CGM_healthy, CGM_T1D]).rename(
    columns={"id": "ID", "gl": "Glucose", "time": "Time"}
)
# Convert the Time column to pandas datetimes.
all_subjects_table["Time"] = pd.to_datetime(all_subjects_table["Time"])

### Compute glycemic features for all segmented days

In [6]:
import importlib

# Re-import the module bound to `taml` so that edits made to its source file
# are picked up without restarting the kernel.
# NOTE(review): the recorded output of this cell names a different module file
# than the `CGM_TAML` imported at the top of the notebook — confirm which
# module the saved results were produced with.

importlib.reload(taml)  # Reload the module

<module 'CGM_TAML_PA180_2' from '/Users/stuartsong/Library/CloudStorage/OneDrive-DukeUniversity/CGM/CIBM Journal/CGM_TAML_PA180_2.py'>

In [7]:
# Group by the column label itself rather than a one-element list: with a
# scalar key the group names are plain ID values, which is what get_group()
# and the feature extractor below are handed. (A list key makes newer pandas
# treat group names as 1-tuples, and the warning about it is silenced by the
# global warnings filter.)
all_subjects_table_data_grouped = all_subjects_table.groupby('ID')


# Define a function to process each group
def process_group(id):
    """Build the feature table for a single subject.

    Parameters
    ----------
    id
        One of the keys of ``all_subjects_table_data_grouped`` (a subject ID).

    Returns
    -------
    The table returned by ``taml.feature_extraction_fixed_hour_window_0oclock``
    (called with ``hour=24``) for that subject's rows sorted by ``Time``, or
    ``None`` when that table has at most one column.
    """
    current_id_data = (
        all_subjects_table_data_grouped.get_group(id)
        .sort_values(by='Time')
        .reset_index(drop=True)
    )
    feature_table = taml.feature_extraction_fixed_hour_window_0oclock(current_id_data, id, hour=24)
    return feature_table if feature_table.shape[1] > 1 else None


# Materialise the keys once so the work list and the progress-bar total agree.
subject_ids = list(all_subjects_table_data_grouped.groups.keys())

# Use ThreadPoolExecutor to process groups in parallel
with ThreadPoolExecutor() as executor:
    # executor.map preserves input order; tqdm only wraps it for a progress bar
    results = list(tqdm(executor.map(process_group, subject_ids), total=len(subject_ids)))

# Concatenate non-None results to get the final DataFrame
all_feature_table = pd.concat([res for res in results if res is not None], ignore_index=True)


100%|█████████████████████████████████████████| 395/395 [02:42<00:00,  2.42it/s]


In [8]:
# Discard every row that has a missing value, then renumber the rows from 0.
all_feature_table = all_feature_table.dropna().reset_index(drop=True)

### Generate Label

In [6]:
# Binary class label per row: 1 when the row's id contains the literal
# substring "T1D", otherwise 0. A vectorised string match replaces the
# row-by-row Python loop; `label` is still bound as a plain list of ints.
label = all_feature_table["id"].str.contains("T1D", regex=False).astype(int).tolist()
all_feature_table["label"] = label

In [9]:
# Display the column names of the feature table.
all_feature_table.columns

Index(['id', 'mean', 'median', 'min', 'max', 'fq', 'tq', 'interdaysd',
       'interdaycv', 'TOR', 'TIR', 'MGE', 'MGN', 'J_index', 'LBGI', 'HBGI',
       'ADRR', 'TA140', 'TA200', 'TIR_70_180', 'TA180', 'TA250', 'TB70',
       'TB54', 'TITR', 'GRI', 'PA140', 'PA180', 'PA200'],
      dtype='object')

In [10]:
# Write the feature table to CSV, leaving out the row index.
all_feature_table.to_csv("CIBM_Journal_Training_Unbalanced.csv",index=False)

### Make a balanced dataset by keeping the entire minority group and randomly downsampling the majority group

In [8]:
from imblearn.under_sampling import TomekLinks

# Under-sample with TomekLinks, restricted to the majority class.
tl = TomekLinks(sampling_strategy='majority')
# X is a positional slice (second column up to, but excluding, the last three
# columns); y is the last column.
# NOTE(review): if the table ends in ..., PA140, PA180, PA200, label, the
# slice 1:-3 leaves PA180 and PA200 out of X but keeps PA140 — confirm that
# this is the intended feature set.
# NOTE(review): fit_resample rejects NaN input (see the recorded ValueError),
# so this cell must run after rows with missing values have been dropped.
X_res, y_res = tl.fit_resample(all_feature_table.iloc[:,1:-3], all_feature_table.iloc[:,-1])

# Select the retained rows from the full table (all columns, including id)
# by the positional indices the sampler kept, and renumber them from 0.
downsampled_all_feature_table = all_feature_table.iloc[tl.sample_indices_].reset_index(drop=True)

ValueError: Input X contains NaN.
TomekLinks does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Count the rows whose id contains 'healthy' ...
is_healthy_row = downsampled_all_feature_table['id'].str.contains('healthy')
healthy_n = len(downsampled_all_feature_table.loc[is_healthy_row])

# ... and keep the rows whose id contains 'T1D' as a separate table.
is_t1d_row = downsampled_all_feature_table['id'].str.contains('T1D')
T1D_downsampled_all_feature_table = downsampled_all_feature_table.loc[is_t1d_row]

In [None]:
# Keep every row whose id contains 'healthy'.
all_healthy = downsampled_all_feature_table.loc[downsampled_all_feature_table['id'].str.contains('healthy')]
# Randomly draw the same number of T1D rows. The fixed random_state makes the
# draw — and therefore the balanced dataset — identical on every run.
all_T1D = T1D_downsampled_all_feature_table.sample(n=healthy_n, random_state=42)
# Stack both groups and renumber the rows from 0.
balanced_data = pd.concat([all_healthy, all_T1D]).reset_index(drop=True)

In [None]:
# Write the balanced table to CSV, leaving out the row index.
# NOTE(review): the other CSV outputs in this notebook use a "CIBM_Journal_"
# prefix — confirm that "S_Journal_" is the intended file name here.
balanced_data.to_csv("S_Journal_Training_Balanced.csv",index=False)

# Create Validation Dataset Using Colas_healthy_208 and Tam_T1D_451

### Import Healthy and T1D Data

In [11]:
# Healthy cohort (Colas): read the CSV and switch to the ID / Glucose / Time column names.
CGM_healthy = pd.read_csv("../Public Datasets/Colas_Healthy_208/Colas_healthy_processed.csv").rename(
    columns={"id": "ID", "gl": "Glucose", "time": "Time"}
)
# Cast the ids to strings and prefix them with the cohort tag.
CGM_healthy = CGM_healthy.assign(ID="Colas_" + CGM_healthy["ID"].astype('str'))

# T1D cohort (Tamborlane): read, rename the same way, and drop rows with any missing value.
CGM_T1D = (
    pd.read_csv("../Public Datasets/Tamborlane_T1D_451/Processed/tblADataRTCGM_processed.csv")
    .rename(columns={"id": "ID", "gl": "Glucose", "time": "Time"})
    .dropna()
)
# Ids go through int before str — presumably they load as floats; confirm against the CSV.
CGM_T1D = CGM_T1D.assign(ID="Tam_T1D_" + CGM_T1D["ID"].astype('int').astype('str'))

In [12]:
# Stack both validation cohorts; the rename maps id / gl / time to ID / Glucose / Time
# for any column still carrying the lowercase name.
all_subjects_table = pd.concat([CGM_healthy, CGM_T1D]).rename(
    columns={"id": "ID", "gl": "Glucose", "time": "Time"}
)
# Convert the Time column to pandas datetimes.
all_subjects_table["Time"] = pd.to_datetime(all_subjects_table["Time"])

### Compute glycemic features for all segmented days

In [13]:
# Group by the column label itself rather than a one-element list: with a
# scalar key the group names are plain ID values, which is what get_group()
# and the feature extractor below are handed. (A list key makes newer pandas
# treat group names as 1-tuples, and the warning about it is silenced by the
# global warnings filter.)
all_subjects_table_data_grouped = all_subjects_table.groupby('ID')


# Define a function to process each group
def process_group(id):
    """Build the feature table for a single subject.

    Parameters
    ----------
    id
        One of the keys of ``all_subjects_table_data_grouped`` (a subject ID).

    Returns
    -------
    The table returned by ``taml.feature_extraction_fixed_hour_window_0oclock``
    (called with ``hour=24``) for that subject's rows sorted by ``Time``, or
    ``None`` when that table has at most one column.
    """
    current_id_data = (
        all_subjects_table_data_grouped.get_group(id)
        .sort_values(by='Time')
        .reset_index(drop=True)
    )
    feature_table = taml.feature_extraction_fixed_hour_window_0oclock(current_id_data, id, hour=24)
    return feature_table if feature_table.shape[1] > 1 else None


# Materialise the keys once so the work list and the progress-bar total agree.
subject_ids = list(all_subjects_table_data_grouped.groups.keys())

# Use ThreadPoolExecutor to process groups in parallel
with ThreadPoolExecutor() as executor:
    # executor.map preserves input order; tqdm only wraps it for a progress bar
    results = list(tqdm(executor.map(process_group, subject_ids), total=len(subject_ids)))

# Concatenate non-None results to get the final DataFrame
all_feature_table = pd.concat([res for res in results if res is not None], ignore_index=True)

100%|████████████████████████████████████████████████████████████████████████████████| 642/642 [06:14<00:00,  1.72it/s]


In [14]:
# Discard every row that has a missing value, then renumber the rows from 0.
all_feature_table = all_feature_table.dropna().reset_index(drop=True)

### Generate Label

In [15]:
# Binary class label per row: 1 when the row's id contains the literal
# substring "T1D", otherwise 0. A vectorised string match replaces the
# row-by-row Python loop; `label` is still bound as a plain list of ints.
label = all_feature_table["id"].str.contains("T1D", regex=False).astype(int).tolist()
all_feature_table["label"] = label

In [16]:
# Display the labelled feature table (pandas truncates the rendering of large frames).
all_feature_table

Unnamed: 0,id,mean,median,min,max,fq,tq,interdaysd,interdaycv,TOR,...,TIR_70_180,TA180,TA250,TB70,TB54,TITR,GRI,PA140,PA200,label
0,Colas_Healthy_1_win1,98.47743055555556,92.0,75.5,141.5,85.0,108.75,17.477326909397654,17.747545616087034,325,...,1440,0,0,0,0,1420,0.0,1,0,0
1,Colas_Healthy_1_win2,98.47743055555556,92.0,75.5,141.5,85.0,108.75,17.477326909397654,17.747545616087034,325,...,1440,0,0,0,0,1420,0.0,1,0,0
2,Colas_Healthy_101_win1,94.03993055555556,93.5,85.0,103.0,91.0,97.5,4.001211100773771,4.254800144083468,480,...,1440,0,0,0,0,1440,0.0,0,0,0
3,Colas_Healthy_101_win2,94.03993055555556,93.5,85.0,103.0,91.0,97.5,4.001211100773771,4.254800144083468,480,...,1440,0,0,0,0,1440,0.0,0,0,0
4,Colas_Healthy_108_win1,112.13194444444444,103.0,74.5,239.0,86.0,125.0,34.66313514219105,30.912810184402744,230,...,1365,75,0,0,0,1235,60.0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22292,Tam_T1D_98_win127,134.2972027972028,118.0,56.0,259.0,100.0,161.0,47.78725207319548,35.583207302803785,310,...,1200,215,85,15,0,890,344.0,3,1,1
22293,Tam_T1D_98_win128,122.91756272401433,107.0,39.0,248.0,72.0,171.5,62.42469596610881,50.785823101838105,560,...,755,315,0,335,245,590,1791.0,2,1,1
22294,Tam_T1D_98_win139,133.39208633093526,122.0,65.0,278.0,87.0,168.25,52.8223663178488,39.59932539536166,425,...,1070,295,55,35,0,850,408.0,2,3,1
22295,Tam_T1D_98_win156,126.01459854014598,127.0,39.0,235.0,102.0,148.0,41.40444727364334,32.85686559597508,395,...,1090,150,0,145,90,705,738.0,5,1,1


In [18]:
# Write the validation feature table to CSV, leaving out the row index.
all_feature_table.to_csv("CIBM_Journal_Validation_Unbalanced.csv",index=False)

In [20]:
# Row masks for the two cohorts, computed once from the id column.
healthy_rows = all_feature_table['id'].str.contains('Healthy')
t1d_rows = all_feature_table['id'].str.contains('T1D')

# Number of healthy rows, and the T1D rows to draw from.
healthy_n = all_feature_table.loc[healthy_rows].shape[0]
T1D_all_feature_table = all_feature_table.loc[t1d_rows]

# Keep every healthy row and randomly draw the same number of T1D rows.
# The fixed random_state makes the draw — and therefore the balanced
# dataset — identical on every run.
all_healthy = all_feature_table.loc[healthy_rows]
all_T1D = T1D_all_feature_table.sample(n=healthy_n, random_state=42)
# Stack both groups and renumber the rows from 0.
balanced_data = pd.concat([all_healthy, all_T1D]).reset_index(drop=True)