In [None]:
import os
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from datetime import datetime
pd.options.mode.chained_assignment = None

In [None]:
# Load the DXA export.
data = pd.read_csv('data/DXA.csv', low_memory=False)
# Shorten every header to the text that precedes 'first reported'. Headers
# without that phrase are unchanged; shortened ones keep whatever preceded the
# phrase, including a trailing space (e.g. the 'Date E11 ' style names used
# later in this notebook).
new_cols = {header: header.split('first reported')[0] for header in data.columns}
data = data.rename(columns=new_cols)

Attach the image paths (`path_11`, `path_12`) to each participant

In [None]:
# Table holding the image paths for each participant.
png_df = pd.read_csv('data/png_df.csv')
image_cols = ['Participant ID', 'path_11', 'path_12']
# Inner join: participants missing from either table are dropped.
data = pd.merge(data, png_df[image_cols], on=['Participant ID'], how='inner')
print(len(data))

Select Caucasian ethnic group

In [None]:
# Show how many participants fall in each genetic ethnic group, then keep the
# rows whose group is exactly 'Caucasian'.
ethnic_group = data['Genetic ethnic grouping']
print(ethnic_group.value_counts())
s_data = data.loc[ethnic_group == 'Caucasian']
len(s_data)

Keep only participants whose reported sex matches their genetic sex (mismatches are dropped)

In [None]:
# Keep rows where the recorded sex equals the genetic sex.
sex_agrees = s_data['Sex'].eq(s_data['Genetic sex'])
s_data = s_data.loc[sex_agrees]
len(s_data)

Count the participants with non-missing VAT, lean mass, total mass and BMD values (the inputs for the VAT and lean percentages)

In [None]:
# For each measurement, print how many rows of s_data have a non-missing value.
for tag, column in [
    ('VAT:', 'VAT (visceral adipose tissue) mass | Instance 2'),
    ('lean mass:', 'Total lean mass | Instance 2'),
    ('total mass:', 'Total tissue mass | Instance 2'),
    ('BMD:', 'Total BMD (bone mineral density) T-score | Instance 2'),
]:
    print(tag, s_data[column].notna().sum())

From here on there are two data frames:

`new_data`: used to build the healthy-control (HC) reference group

`s_data`: used to build the combined HC + disease group

In [None]:
# Columns carried into the HC reference table.
hc_columns = [
    'Participant ID',
    'Date of attending assessment centre | Instance 2',
    'Sex',
    'Age when attended assessment centre | Instance 2',
    'VAT (visceral adipose tissue) mass | Instance 2',
    'Total lean mass | Instance 2',
    'Total tissue mass | Instance 2',
    'Total BMD (bone mineral density) T-score | Instance 2',
    'path_11',
    'path_12',
]
total_tissue_mass = s_data['Total tissue mass | Instance 2']
# VAT mass and lean mass expressed as a percentage of total tissue mass.
# .assign returns a new frame, so nothing is written onto a slice of s_data.
new_data = s_data[hc_columns].assign(
    VAT_Rate=s_data['VAT (visceral adipose tissue) mass | Instance 2'] / total_tissue_mass * 100,
    lean_Rate=s_data['Total lean mass | Instance 2'] / total_tissue_mass * 100,
)

Remove participants with cancer (self-reported first, then from the cancer records)

In [None]:
# self report cancers
# Participant IDs listed in the self-reported cancer file are removed from
# both tables.
self_cancer = pd.read_csv('data/self_cancer.csv')
self_cancer_remove = set(self_cancer['Participant ID'])
print(len(new_data))
# HC reference group
hc_not_listed = ~new_data['Participant ID'].isin(self_cancer_remove)
new_data = new_data.loc[hc_not_listed]
print(len(new_data))
# combined HC + disease group
all_not_listed = ~s_data['Participant ID'].isin(self_cancer_remove)
s_data = s_data.loc[all_not_listed]
print(len(s_data))

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load a file you trust.
with open('data_processing/label/cancer_record.pkl', 'rb') as f_read:
    cancer_record = pickle.load(f_read)

# Participant ID -> date of the Instance 2 assessment visit.
date_dic = {
    pid: datetime.strptime(visit, '%Y-%m-%d')
    for pid, visit in zip(
        new_data['Participant ID'],
        new_data['Date of attending assessment centre | Instance 2'],
    )
}

official_list = []   # participants to exclude
disase_record = []   # [participant id, ICD key, days from visit to disease date]
for pid, records in cancer_record.items():
    # Only participants still present in new_data are considered.
    if pid not in date_dic:
        continue
    visit_date = date_dic[pid]
    n_excluding = 0
    # records maps an ICD key to a disease date.
    for icd, disease_date in records.items():
        if type(disease_date) != str:
            # A non-string date also counts towards exclusion.
            # NOTE(review): confirm this is intended for missing dates.
            n_excluding += 1
            continue
        days_after_visit = (datetime.strptime(disease_date, '%Y-%m-%d') - visit_date).days
        if days_after_visit >= 0:
            # Disease date on or after the visit: recorded, not excluded.
            # NOTE(review): the original comment here said "after 180 days",
            # but the condition is >= 0 days; confirm which is intended.
            disase_record.append([pid, icd, days_after_visit])
        else:
            # Disease date before the visit: participant is excluded.
            n_excluding += 1
    if n_excluding > 0:
        official_list.append(pid)

print(len(new_data))
hc_keep = ~new_data['Participant ID'].isin(official_list)
new_data = new_data.loc[hc_keep]
all_keep = ~s_data['Participant ID'].isin(official_list)
s_data = s_data.loc[all_keep]

# Sizes of both tables after the exclusion.
print('HC reference group:', len(new_data))
print('all group:',len(s_data))


# Disease labelling (ICD-10 codes per condition)
T2 Diabetes: "E11", "E12"

MACE: "G45", "I21", "I22", "I23", "I24", "I25", "I63", "I64"

Hypertension: 	I10, I11, I12, I13, I15

ASCVD:	I71, I20-I25, I63, I65, I66, I70-I79

In [None]:
# Names of the disease-date columns for each condition. After the header
# clean-up at load time these columns are called 'Date <ICD code> ' (with a
# trailing space), which is the form built here.
T2D = ['Date '+i+' ' for i in ["E11", "E12"]]
MACE = ['Date '+i+' ' for i in ["G45", "I21", "I22", "I23", "I24", "I25", "I63", "I64"]]
Hypertension = ['Date '+i+' ' for i in ["I10", "I11", "I12", "I13", "I15"]]
# 'I78' used to appear twice in this list, which made the same column be
# selected twice downstream; the duplicate has been removed.
# NOTE(review): the ASCVD definition written in the notes (I71, I20-I25, I63,
# I65, I66, I70-I79) also covers I66, I75, I76 and I79, none of which are
# listed here. The repeated 'I78' may have been meant as 'I79'. Confirm the
# intended code set and that the matching 'Date ... ' columns exist before
# adding any.
ASCVD = ['Date '+i+' ' for i in ["I20", 'I21', 'I22', 'I23', 'I24', 'I25', 'I63', 'I65', 'I70', "I71",'I72', "I73", 'I74', 'I77', 'I78']]

In [None]:
def disease_finder(s_data =s_data, dil=MACE, censor_date=None):
    """Label every participant relative to the earliest date found in `dil`.

    Parameters
    ----------
    s_data : pandas.DataFrame
        Must contain 'Participant ID', 'Sex', the Instance 2 age and
        assessment-date columns, 'path_11', 'path_12' and every column named
        in `dil`. Dates are parsed as '%Y-%m-%d' strings.
        NOTE: the default is the global `s_data` as it was when this cell was
        executed, not its current value; pass the frame explicitly.
    dil : list of str
        Names of the disease-date columns to inspect.
    censor_date : datetime.datetime, optional
        Reference date used for participants with no disease date. Defaults
        to `datetime.today()` (the original behaviour), which makes the 'HC'
        durations depend on the day the notebook is run; pass a fixed date
        for reproducible output.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected columns with two extra columns:
        'durations' -- days from the assessment visit to the earliest disease
        date, or to `censor_date` when there is none;
        'label' -- "After" (earliest date on/after the visit), "Before"
        (earliest date before the visit) or 'HC' (no date in any `dil` column).

    Raises
    ------
    KeyError
        If a required column is missing from `s_data`.
    ValueError / TypeError
        From `datetime.strptime` when a date is not a '%Y-%m-%d' string.
    """
    new_col = [*['Participant ID','Sex' ,'Age when attended assessment centre | Instance 2','Date of attending assessment centre | Instance 2','path_11','path_12'], *dil]
    # Explicit copy: the two result columns are added to this frame, and it
    # must not be a view onto the caller's data.
    t2d_df = s_data[new_col].copy()
    if censor_date is None:
        censor_date = datetime.today()

    # Select these once. Re-selecting t2d_df[dil] inside the loop copied the
    # whole sub-frame on every iteration.
    visit_dates = t2d_df['Date of attending assessment centre | Instance 2'].tolist()
    disease_dates = t2d_df[dil]

    durations = []
    labels = []
    for i in tqdm(range(len(t2d_df))):
        p_date = datetime.strptime(visit_dates[i], '%Y-%m-%d')
        recorded = disease_dates.iloc[i].dropna().tolist()
        if recorded:
            # With several dates only the earliest counts; '%Y-%m-%d' strings
            # order chronologically, so min() on the strings is enough.
            first_date = datetime.strptime(min(recorded), '%Y-%m-%d')
            duration = (first_date - p_date).days
            labels.append("After" if duration >= 0 else "Before")
        else:
            duration = (censor_date - p_date).days
            labels.append('HC')
        durations.append(duration)
    t2d_df['durations'] = durations
    t2d_df['label'] = labels
    return t2d_df

In [None]:
# T2D: participant IDs split by whether the first recorded date falls before
# the assessment visit ('Before') or on/after it ('After').
t2d_df = disease_finder(s_data=s_data, dil=T2D)
t2d_label = t2d_df['label']
t2d_eid = t2d_df.loc[t2d_label == 'Before', 'Participant ID']
print(len(t2d_eid))
t2d_eid_after = t2d_df.loc[t2d_label == 'After', 'Participant ID']
print(len(t2d_eid_after))


In [None]:
# MACE: IDs labelled 'Before' (first date precedes the visit) and 'After'.
mace_df = disease_finder(s_data=s_data, dil=MACE)
mace_before_rows = mace_df['label'].eq('Before')
mace_eid = mace_df.loc[mace_before_rows, 'Participant ID']
print(len(mace_eid))
mace_after_rows = mace_df['label'].eq('After')
mace_eid_after = mace_df.loc[mace_after_rows, 'Participant ID']
print(len(mace_eid_after))

In [None]:
# Hypertension: IDs labelled 'Before' (first date precedes the visit) and 'After'.
hyper_df = disease_finder(s_data=s_data, dil=Hypertension)
hyper_ids = hyper_df['Participant ID']
hyper_eid = hyper_ids[hyper_df['label'] == 'Before']
print(len(hyper_eid))
hyper_eid_after = hyper_ids[hyper_df['label'] == 'After']
print(len(hyper_eid_after))


In [None]:
# ASCVD: IDs labelled 'Before' (first date precedes the visit) and 'After'.
ascvd_df = disease_finder(s_data=s_data, dil=ASCVD)
ascvd_label = ascvd_df['label']
ascvd_eid = ascvd_df.loc[ascvd_label == 'Before', 'Participant ID']
print(len(ascvd_eid))
ascvd_eid_after = ascvd_df.loc[ascvd_label == 'After', 'Participant ID']
print(len(ascvd_eid_after))

In [None]:
# Unique IDs with at least one of the four conditions dated before the visit.
illed_eid = list({*ascvd_eid, *t2d_eid, *hyper_eid, *mace_eid})
print(len(illed_eid))
# Unique IDs with at least one of the four conditions dated on/after the visit.
after_illed_eid = list({*ascvd_eid_after, *t2d_eid_after, *hyper_eid_after, *mace_eid_after})
print(len(after_illed_eid))


In [None]:
# Both lists joined; a participant can be in both (one condition before the
# visit, another after), so the first count may exceed the second.
all_ill_list = illed_eid + after_illed_eid
print(len(all_ill_list))
# De-duplicated set of every participant with any of the four conditions.
all_ill = [*set(all_ill_list)]
print(len(all_ill))

save the disease labels

In [None]:
# Give every disease frame its own label / duration column names so the four
# results can sit side by side in one table.
t2d_df = t2d_df.rename(columns={'label':'T2D Label', 'durations':'T2D Duration'})
hyper_df = hyper_df.rename(columns={'label':'Hypertension Label', 'durations':'Hypertension Duration'})
mace_df = mace_df.rename(columns={'label':'MACE Label', 'durations':'MACE Duration'})
ascvd_df = ascvd_df.rename(columns={'label':'ASCVD Label', 'durations':'ASCVD Duration'})

# Columns common to all four frames; they double as the merge key.
shared_cols = ['Participant ID','Sex' ,'Age when attended assessment centre | Instance 2','Date of attending assessment centre | Instance 2','path_11','path_12']
new_t2d = shared_cols + ['T2D Label' ,'T2D Duration']
new_hyp = shared_cols + ['Hypertension Label' ,'Hypertension Duration']
new_mace = shared_cols + ['MACE Label' ,'MACE Duration']
new_ascvd = shared_cols + ['ASCVD Label' ,'ASCVD Duration']

# Chain the inner merges: T2D, then hypertension, MACE and ASCVD.
di_df = t2d_df[new_t2d]
for label_frame, keep_cols in ((hyper_df, new_hyp), (mace_df, new_mace), (ascvd_df, new_ascvd)):
    di_df = di_df.merge(label_frame[keep_cols], on=shared_cols, how='inner')

In [None]:
# Row counts per sex, then write the combined label table to disk.
is_female = di_df['Sex'] == 'Female'
di_df_f = di_df.loc[is_female]
print(len(di_df_f))
is_male = di_df['Sex'] == 'Male'
di_df_m = di_df.loc[is_male]
print(len(di_df_m))
di_df.to_csv('data/disease_label.csv', index=False)


Build the HC reference group (participants with none of the four diseases) and split it by sex

In [None]:
# The HC reference group keeps only participants with none of the four
# conditions, whether dated before or after the visit.
has_condition = new_data['Participant ID'].isin(all_ill)
new_data = new_data.loc[~has_condition]
len(new_data)

In [None]:
# trainF, validateF, testF = np.split(new_data.sample(frac=1, random_state=42), 
#                        [int(.7*len(new_data)), int(.85*len(new_data))])
# print(len(trainF))
# trainF.to_csv('data/All_data_train.csv', index=False)
# validateF.to_csv('data/All_data_val.csv', index=False)
# testF.to_csv('data/All_data_test.csv', index=False)

In [None]:
# Per-sex views of the HC reference group (before incomplete rows are dropped).
F_data = new_data.loc[new_data['Sex'] == 'Female']
print(len(F_data))
M_data = new_data.loc[new_data['Sex'] == 'Male']
print(len(M_data))

In [None]:
# Remember who was in the HC group before incomplete rows are removed, so the
# removed participants can be listed afterwards.
old_hc_patients = new_data['Participant ID'].to_list()
# Drop rows with a missing value in any column, then renumber the rows.
# drop=True discards the old index instead of inserting it as a column:
# previously it ended up as a stray 'index' column in every exported CSV, and
# each re-run of this cell added another one ('level_0', then an error).
new_data = new_data.dropna().reset_index(drop=True)
len(new_data)

In [None]:
# IDs that were in the HC group before dropna() but are no longer in new_data.
remaining_ids = set(new_data['Participant ID'])
na_hc_data = set(old_hc_patients) - remaining_ids
# One ID per row under the column name 'Remove'.
na_hc = pd.DataFrame(na_hc_data, columns=['Remove'])
na_hc.to_csv('data/na_hc.csv', index=False)

In [None]:
# Bucket age into decades. right=False makes every bin [lower, upper), so the
# intervals agree with their labels: '40-49' means 40 <= age < 50.
# With the pd.cut default (right=True) the bins were (40, 50], (50, 60], ...:
# ages 50/60/70/80 landed in the label below and age 40 received no label.
new_data['Age_Label'] = pd.cut(x=new_data['Age when attended assessment centre | Instance 2'],
                               bins=[40, 50, 60, 70, 80, 90], labels=['40-49', '50-59', '60-69', '70-79', '80-89'],
                               right=False)
# Share of non-missing values per column in each age bucket, relative to the whole table.
new_data.groupby('Age_Label').count()/len(new_data)

In [None]:
# Redo the per-sex split now that incomplete rows are gone and Age_Label exists.
female_rows = new_data['Sex'].eq('Female')
F_data = new_data.loc[female_rows]
print(len(F_data))
male_rows = new_data['Sex'].eq('Male')
M_data = new_data.loc[male_rows]
print(len(M_data))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histograms of VAT_Rate and lean_Rate for the female HC group, one panel each,
# with vertical reference lines at mean - std, mean and mean + std.
plot_data = F_data[['VAT_Rate', 'lean_Rate']]
fig, axes = plt.subplots(1, 2, figsize=(20, 5))
quantiles = ['mean-std','mean', 'mean+std', ]
colors = ['green', 'red', 'blue']

for col, ax in zip(plot_data, axes.flat):
    sns.histplot(ax=ax, data=plot_data, x=plot_data[col], multiple='stack')
    centre = plot_data[col].mean()
    spread = plot_data[col].std()
    # One line per entry of `quantiles`, coloured green / red / blue in that order.
    for position, colour in zip((centre - spread, centre, centre + spread), colors):
        ax.axvline(position, color=colour)

In [None]:
def hc_select(F_data):
    """Keep the rows whose VAT_Rate and lean_Rate are both within one
    standard deviation of their respective means.

    The means and standard deviations are computed on the frame passed in, so
    calling this on a subgroup uses that subgroup's own statistics. Both
    bounds are inclusive.

    Side effect: prints the (lower, upper) bound pair for VAT_Rate, then the
    pair for lean_Rate.

    Parameters
    ----------
    F_data : pandas.DataFrame
        Must contain numeric 'VAT_Rate' and 'lean_Rate' columns.

    Returns
    -------
    pandas.DataFrame
        The rows of `F_data` inside both ranges, with the original index.
    """
    vat = F_data['VAT_Rate']
    vat_mean, vat_std = vat.mean(), vat.std()
    vat_low, vat_high = vat_mean - vat_std, vat_mean + vat_std
    print(vat_low, vat_high)

    lean = F_data['lean_Rate']
    lean_mean, lean_std = lean.mean(), lean.std()
    lean_low, lean_high = lean_mean - lean_std, lean_mean + lean_std
    print(lean_low, lean_high)

    # Series.between is inclusive on both ends, matching the <= / >= filters.
    within_both = vat.between(vat_low, vat_high) & lean.between(lean_low, lean_high)
    return F_data[within_both]

In [None]:
# Whole-cohort selection, one sex at a time; hc_select prints its bounds.
# These results are replaced by the age-stratified selection in the next cell.
F_new_data = hc_select(F_data)
print(F_new_data.shape[0])
M_new_data = hc_select(M_data)
print(M_new_data.shape[0])

In [None]:
# Apply hc_select within each age bucket, so the mean +/- std range is
# computed per bucket, then stack the selected rows back together.
F_data_group = F_data.groupby('Age_Label')
female_parts = []
for age_label, age_rows in F_data_group:
    female_parts.append(hc_select(age_rows))
F_new_data = pd.concat(female_parts)
print(len(F_new_data))

# Same procedure for the male group.
M_data_group = M_data.groupby('Age_Label')
male_parts = []
for age_label, age_rows in M_data_group:
    male_parts.append(hc_select(age_rows))
M_new_data = pd.concat(male_parts)
print(len(M_new_data))

In [None]:
# trainF, validateF, testF = np.split(F_data.sample(frac=1, random_state=42), 
#                        [int(.7*len(F_data)), int(.8*len(F_data))])
# print(len(validateF))
# trainF.to_csv('data/F_data_train.csv', index=False)
# validateF.to_csv('data/F_data_val.csv', index=False)
# testF.to_csv('data/F_data_test.csv', index=False)
# trainM, validateM, testM = np.split(M_data.sample(frac=1, random_state=42), 
#                        [int(.7*len(M_data)), int(.8*len(M_data))])
# print(len(trainM))
# trainM.to_csv('data/M_data_train.csv', index=False)
# validateM.to_csv('data/M_data_val.csv', index=False)
# testM.to_csv('data/M_data_test.csv', index=False)

In [None]:
# Shuffle each sex-specific HC set with a fixed seed, then cut it into
# 70% train / 10% validation / 20% test.
# Plain .iloc slices replace np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which is deprecated in recent pandas releases. The cut
# points and the resulting partitions are the same.
F_shuffled = F_new_data.sample(frac=1, random_state=42)
f_train_end, f_val_end = int(.7*len(F_new_data)), int(.8*len(F_new_data))
trainF = F_shuffled.iloc[:f_train_end]
validateF = F_shuffled.iloc[f_train_end:f_val_end]
testF = F_shuffled.iloc[f_val_end:]
print(len(trainF))
trainF.to_csv('data/F_na_data_train.csv', index=False)
validateF.to_csv('data/F_na_data_val.csv', index=False)
testF.to_csv('data/F_na_data_test.csv', index=False)

M_shuffled = M_new_data.sample(frac=1, random_state=42)
m_train_end, m_val_end = int(.7*len(M_new_data)), int(.8*len(M_new_data))
trainM = M_shuffled.iloc[:m_train_end]
validateM = M_shuffled.iloc[m_train_end:m_val_end]
testM = M_shuffled.iloc[m_val_end:]
print(len(trainM))
trainM.to_csv('data/M_na_data_train.csv', index=False)
validateM.to_csv('data/M_na_data_val.csv', index=False)
testM.to_csv('data/M_na_data_test.csv', index=False)

In [None]:
# Read the per-sex splits back from disk (female, male) for each partition.
train_f, train_m = pd.read_csv('data/F_na_data_train.csv'), pd.read_csv('data/M_na_data_train.csv')
val_f, val_m = pd.read_csv('data/F_na_data_val.csv'), pd.read_csv('data/M_na_data_val.csv')
test_f, test_m = pd.read_csv('data/F_na_data_test.csv'), pd.read_csv('data/M_na_data_test.csv')

In [None]:
# Combine the female and male training sets and shuffle the rows.
# Previously the .sample(...) result was never assigned, so the saved table
# was just the female rows followed by the male rows; it also passed
# replace=True, which resamples rows with repetition instead of permuting
# them. The rows are now shuffled without replacement and the result is kept.
train = pd.concat([train_f, train_m], ignore_index=True).sample(
    frac=1, random_state=42, ignore_index=True)
train

In [None]:
# Combine the female and male validation sets and shuffle the rows.
# Previously the .sample(...) result was never assigned, so nothing was
# shuffled, and replace=True would have resampled rows with repetition.
# The rows are now shuffled without replacement and the result is kept.
val = pd.concat([val_f, val_m], ignore_index=True).sample(
    frac=1, random_state=42, ignore_index=True)
val

In [None]:
# Combine the female and male test sets and shuffle the rows.
# Previously the .sample(...) result was never assigned, so nothing was
# shuffled, and replace=True would have resampled rows with repetition.
# The rows are now shuffled without replacement and the result is kept.
test = pd.concat([test_f, test_m], ignore_index=True).sample(
    frac=1, random_state=42, ignore_index=True)
test

In [None]:
# Write the combined (female + male) partitions, without the row index.
for split_frame, out_path in ((train, 'data/train.csv'), (val, 'data/val.csv'), (test, 'data/test.csv')):
    split_frame.to_csv(out_path, index=False)