In [15]:
import pandas as pd
import os
from edc_egfr.calculators import EgfrCkdEpi
import numpy as np
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch.optim as to
import matplotlib.pyplot as plt
import seaborn as sns

# Read data path to join all  entries using subject_id (patient ID):
This returns the complete MIMICIV dataset with all components (lab tests, EHR data, IoT measurements, labels etc...)

In [16]:
# Run throw directory to concatenate csv files by subject_id column
csv_dir = 'data/'
csv_fdir = [csv_dir+f for f in os.listdir(csv_dir) if f.endswith('.csv')]
fullcsv=pd.read_csv(csv_fdir[0], )
for file in csv_fdir[1:]:
    temp= pd.read_csv(file)
    fullcsv=pd.merge(fullcsv,temp,on='subject_id', how='inner')
# The final csv should contain the following features/ labels
column_list=['subject_id','heart_rate','dbp','sbp','mbp','BMI','height','weight','Serum_Creatinine','anchor_age'
,'Glucose','Creatinine','Urea_Nitrogen','Albumin','ethnicity','gender','Diabetes','CKD' ,'HbA1c_Dia','Hematocrit_CKD' ]

df=fullcsv[column_list]
print('dataset complete')



dataset complete


In [None]:
# Define data processing & cleansing methods: 
categorical variable conversions, outlier removal and the addition of another feature ('eGFR')

In [17]:
def ethnic_mod(ethn):
    ethn = ethn.strip().upper()
    if "WHITE" in ethn:
        return 0
    elif "BLACK" in ethn or "AFRICAN" in ethn:
        return 1
    elif "HISPANIC" in ethn or "LATINO" in ethn:
        return 2
    elif "ASIAN" in ethn:
        return 3
    elif "NATIVE" in ethn or "INDIAN" in ethn or "AMERICAN INDIAN" in ethn:
        return 4
    elif "PACIFIC" in ethn or "HAWAIIAN" in ethn:
        return 5
    elif "UNKNOWN" in ethn or "DECLINED" in ethn:
        return 6
    else:
        return "7"

def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3* IQR
    upper_bound = Q3 + 3* IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)& (df[column] != 0)]

def calculate_egfr(row):
    age = row['age']
    sex = row['gender']
    scr = row['serum_creatinine']
    eth = row['ethnicity']
    
    if sex == 'female':
        if scr <= 0.7:
            egfr = 144 * (scr / 0.7) ** -0.329 * 0.993 ** age
        else:
            egfr = 144 * (scr / 0.7) ** -1.209 * 0.993 ** age
    else:
        if scr <= 0.9:
            egfr = 141 * (scr / 0.9) ** -0.411 * 0.993 ** age
        else:
            egfr = 141 * (scr / 0.9) ** -1.209 * 0.993 ** age

    if eth==1:
        egfr *= 1.159

    return round(egfr)

In [18]:
# simplify column names
df.columns=df.columns.str.lower()
df.rename(columns={'anchor_age': 'age'}, inplace=True)

#convert non-numerical categorical into numerical
df['ethnicity'] = df['ethnicity'].apply(ethnic_mod)

# 
columns_to_fill = ['serum_creatinine','hematocrit_ckd', 'glucose', 'creatinine', 'urea_nitrogen', 'albumin']
# Permutate Null values based on type
col_median = ['bmi', 'serum_creatinine', 'creatinine', 'glucose', 'urea_nitrogen', 'albumin']
col_mean = ['heart_rate', 'dbp', 'sbp', 'mbp', 'height', 'weight', 'hematocrit_ckd', 'age']

for col in col_mean:
    df[col] = df[col].fillna(df[col].mean())

for col in col_median:
    df[col] = df[col].fillna(df[col].median())
# Removing outliers based on normal ranges for all numerical fields except 'HbA1c_Dia'

#  
for col in columns_to_fill:
    df = remove_outliers(df, col)
df['gender'] = df['gender'].map({'M': 0, 'F': 1})
df_ensemble=df[df['hba1c_dia'].notnull()]
df_ensemble['eGFR'] = df_ensemble.apply(calculate_egfr, axis=1)

# Dataset for ensemble stacking
df_ensemble.to_csv('total.csv')
print(df_ensemble)


       subject_id  heart_rate    dbp    sbp    mbp   bmi  height  weight  \
5        11318942        73.0   31.0  109.0   48.0  25.0   160.0    64.0   
6        12543373        98.0  103.0  109.0  106.0  26.0   180.0    84.2   
7        12853724        82.0  101.0  129.0  107.0  25.0   170.0    72.0   
8        13717952        87.0   99.0  173.0  117.0  48.0   178.0   150.8   
9        14775807        74.0   37.0  106.0   61.0  30.0   163.0    81.0   
...           ...         ...    ...    ...    ...   ...     ...     ...   
18837    19985545       104.0   91.0  163.0  113.0  22.0   178.0    70.0   
18840    14890100        80.0   91.0  140.0  102.0  26.0   185.0    89.4   
18842    18255949       101.0   91.0  143.0  106.0  37.0   183.0   124.7   
18843    18255949       101.0   91.0  143.0  106.0  37.0   183.0   124.7   
18845    14348068        91.0   91.0  152.0  104.0  41.0   165.0   112.6   

       serum_creatinine  age  ...  creatinine  urea_nitrogen  albumin  \
5             

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ensemble['eGFR'] = df_ensemble.apply(calculate_egfr, axis=1)


In [None]:
# Assemble Client Training Datasets for each Diagnosis and Server Evaluation Dataset.
A minimum ratio of 1:3 for label ones and zeroes respectively.
The 'diabetes' label has more ones (approx. 3500) than ckd (approx. 2500), therefore , more balanced distribution can be afforded for the former.

In [22]:
#Select first 2000 entries for ckd, the remaining 4000 are 0s =6000 total
ckd_ones = df[df['ckd'] == 1].iloc[0:2000]
ckd_zeroes=df[df['ckd'] == 0].iloc[0:4000]
ckd_df=pd.concat([ckd_ones,ckd_zeroes],axis=0,ignore_index=True)
ckd_df=ckd_df.sample(frac=1).reset_index(drop=True)

#Select first 2500 entries for ckd, the remaining 4000 are 0s =6000 total
rest=df[~df['subject_id'].isin(ckd_df['subject_id'])]
dia_ones=df[df['diabetes'] == 1].iloc[0:2500]
dia_zeroes=rest[rest['diabetes'] == 0].iloc[0:3500]
dia_df=pd.concat([dia_ones,dia_zeroes],axis=0,ignore_index=True)
dia_df=dia_df.sample(frac=1).reset_index(drop=True)

# Assign remaining entries in the full dataset for server-side multi-task evaluation, 
test_df = df[~df['subject_id'].isin(ckd_df['subject_id']) & ~df['subject_id'].isin(dia_df['subject_id'])]
test_df=test_df.drop( test_df[(test_df['diabetes'] == 0) & (test_df['ckd'] == 0)].head(4000).index )
test_df=test_df.sample(frac=1).reset_index(drop=True)


ckd_df=ckd_df.drop(columns=['subject_id','hematocrit_ckd','hba1c_dia','diabetes','height','weight','sbp'])
dia_df=dia_df.drop(columns=['subject_id','hematocrit_ckd','hba1c_dia','ckd','height','weight','sbp'])
test_df=test_df.drop(columns=['subject_id','hematocrit_ckd','hba1c_dia','height','weight','sbp'])

print(ckd_df)
print(dia_df)
print(test_df)

ckd_df.to_csv('fl/ckd.csv')
dia_df.to_csv('fl/dia.csv')
test_df.to_csv('fl/test.csv')

6178
2178
      heart_rate   dbp   mbp   bmi  serum_creatinine  age  glucose  \
0           61.0  56.0  80.0  24.0               1.3   91     96.0   
1           49.0  18.0  21.0  25.0               2.7   65     87.0   
2           97.0  53.0  81.0  35.0               1.9   65     84.0   
3           74.0  58.0  68.0  20.0               0.6   49     79.0   
4          103.0  38.0  49.0  20.0               1.7   70     96.0   
...          ...   ...   ...   ...               ...  ...      ...   
5995       148.0  34.0  39.0  19.0               0.7   84    204.0   
5996        74.0  56.0  68.0  26.0               1.3   82     69.0   
5997        74.0  45.0  55.0  29.0               1.9   71    117.0   
5998        64.0  56.0  69.0  18.0               0.6   58     93.0   
5999       145.0  64.0  69.0  26.0               1.2   74     93.0   

      creatinine  urea_nitrogen  albumin ethnicity  gender  ckd  
0            2.1           58.0      3.0         0       1    1  
1            2.7 

In [24]:
# Label to Feature correlation coefficients
selected_columns = test_df.columns.difference(['ckd', 'diabetes'])

# Calculate correlation with 'ckd'
correlation_with_ckd = test_df[selected_columns].corrwith(test_df['ckd'])

# Calculate correlation with 'diabetes'
correlation_with_diabetes = test_df[selected_columns].corrwith(test_df['diabetes'])

# Combine the results into a single DataFrame
correlation_df = pd.DataFrame({
    'Correlation with CKD': correlation_with_ckd,
    'Correlation with Diabetes': correlation_with_diabetes
}).sort_values(by='Correlation with CKD', ascending=False)

# Display the DataFrame
print(correlation_df)


                  Correlation with CKD  Correlation with Diabetes
creatinine                    0.550757                  -0.088527
serum_creatinine              0.541085                  -0.094944
urea_nitrogen                 0.486485                  -0.038351
age                           0.334408                   0.030569
glucose                      -0.045933                   0.276677
heart_rate                   -0.050132                  -0.050001
gender                       -0.058681                   0.038548
ethnicity                    -0.062028                  -0.028706
mbp                          -0.076294                  -0.319115
bmi                          -0.088135                   0.153957
albumin                      -0.124240                   0.017540
dbp                          -0.141469                  -0.353433


In [None]:
# Perform Multitask training using TabNet Multi-Task Classifier
Scores can be compared to FMTL scores

In [None]:
# Shuffle Constructed Dataset before training
datac_shuff = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Remove Non-Null values for 
select_features=[ 'heart_rate', 'dbp', 'mbp', 'bmi', 'serum_creatinine', 'age', 'glucose', 'creatinine', 'urea_nitrogen',
        'albumin', 'ethnicity', 'gender']
scallable_feat = [ 'heart_rate', 'dbp', 'mbp', 'bmi',
         'serum_creatinine', 'age', 'glucose', 'creatinine', 'urea_nitrogen',
        'albumin']
# # Features
X =datac_shuff[select_features]
# Two different labels for multi-task classification
y1 = datac_shuff['diabetes']
y2 = datac_shuff['ckd']
# # Combine labels
y = np.vstack((y1, y2)).T

# Split into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train[scallable_feat] = scaler.fit_transform(X_train[scallable_feat])
X_valid[scallable_feat]= scaler.transform(X_valid[scallable_feat])

X_train=X_train.values
X_valid = X_valid.values

# Initialize the TabNetMultiTaskClassifier
clf = TabNetMultiTaskClassifier(
    cat_idxs=[10,11],  # For Ethnicity, Gender
    cat_dims=[8,2],  # 8 Values for Ethnicity, 2 for Gender
    cat_emb_dim=1,  # 1 value for each categorical feature
    optimizer_fn=to.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size": 50, "gamma": 0.9},
    scheduler_fn=to.lr_scheduler.StepLR,
    mask_type='entmax'
)

# # Fit the model
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_valid, y_valid)],  # Validation data
    eval_name=['valid'],
    eval_metric=['accuracy', 'auc', 'balanced_accuracy'],  # Metric
    max_epochs=50,  # Number of epochs
    patience=10,  # Early stopping
    batch_size=128,
    virtual_batch_size=32,
    num_workers=1,
    drop_last=False
)

# # Make predictions
preds = clf.predict(X_valid)



epoch 0  | loss: 0.45914 | valid_accuracy: 0.80744 | valid_auc: 0.78347 | valid_balanced_accuracy: 0.56863 |  0:00:11s
epoch 1  | loss: 0.40773 | valid_accuracy: 0.80985 | valid_auc: 0.79721 | valid_balanced_accuracy: 0.61123 |  0:00:22s
epoch 2  | loss: 0.40051 | valid_accuracy: 0.81212 | valid_auc: 0.80242 | valid_balanced_accuracy: 0.61311 |  0:00:34s
epoch 3  | loss: 0.39688 | valid_accuracy: 0.81046 | valid_auc: 0.80698 | valid_balanced_accuracy: 0.62116 |  0:00:45s
epoch 4  | loss: 0.39216 | valid_accuracy: 0.81439 | valid_auc: 0.81289 | valid_balanced_accuracy: 0.60854 |  0:00:56s
epoch 5  | loss: 0.38965 | valid_accuracy: 0.81212 | valid_auc: 0.81111 | valid_balanced_accuracy: 0.59071 |  0:01:08s
epoch 6  | loss: 0.38968 | valid_accuracy: 0.81106 | valid_auc: 0.81421 | valid_balanced_accuracy: 0.6258  |  0:01:19s
epoch 7  | loss: 0.38885 | valid_accuracy: 0.81469 | valid_auc: 0.81543 | valid_balanced_accuracy: 0.62092 |  0:01:31s
epoch 8  | loss: 0.3881  | valid_accuracy: 0.815