In [1]:
!nvidia-smi

Mon Nov 15 14:29:46 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.67       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 3090    Off  | 00000000:65:00.0 Off |                  N/A |
| 62%   54C    P8    50W / 370W |     45MiB / 24265MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 3090    Off  | 00000000:B3:00.0 Off |                  N/A |
| 80%   69C    P2   328W / 370W |  23699MiB / 24268MiB |    100%      Defaul

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
import os
from tableone import TableOne, load_dataset
import pandas as pd

 
print(tf.__version__)

warnings.filterwarnings("ignore")

gpus = tf.config.list_physical_devices(device_type='GPU')
tf.config.set_visible_devices(devices=gpus[1], device_type='GPU')

2.5.0


In [9]:
seed = 42
os.environ['PYTHONHASHSEED']=str(seed)
tf.random.set_seed(seed)
np.random.seed(seed)

In [10]:
df_icu = pd.read_csv('ICU_subject.csv')
df_icu.fillna(0, inplace=True)
df_icu

Unnamed: 0,icu_subject,ventilation_status,StudyID,COPD
0,10000032,Oxygen,53911762,1
1,10000032,Oxygen,56699142,1
2,10000032,Oxygen,53189527,1
3,10000032,Oxygen,50414267,1
4,10000980,0,54577367,1
...,...,...,...,...
127148,19999287,Oxygen,52519175,1
127149,19999287,Oxygen,50000173,1
127150,19999287,Oxygen,50574077,1
127151,19999287,Oxygen,58938059,1


In [11]:
df_ed = pd.read_csv('ED_subject.csv')
df_ed.fillna(0, inplace=True)
df_ed

Unnamed: 0,ed_subject,ventilation_status,StudyID,COPD
0,10000032,0.0,53911762,0
1,10000032,0.0,56699142,0
2,10000032,0.0,53189527,0
3,10000032,0.0,50414267,0
4,10000764,0.0,57375967,0
...,...,...,...,...
227296,19999442,0.0,58497551,0
227297,19999733,0.0,57132437,0
227298,19999987,0.0,58621812,0
227299,19999987,0.0,58971208,0


In [12]:
unique_subject_id_icu = list(set(df_icu['icu_subject']))
unique_subject_id_ed = list(set(df_ed['ed_subject']))

In [13]:
demographic = pd.read_csv('demographic (2).csv')
import random
from statistics import mode, StatisticsError

def most_common(l):
    if (len(l) == 1):
        return l[0]
    
    try:
        most_common = mode(l)
    except:
        most_common = random.choice(l)
    
    if (most_common == 3): 
        second_common_list = [x for x in l if x != most_common]
        if (len(second_common_list) > 0):
            most_common = mode(second_common_list)
    return most_common

In [14]:
from IPython import display
new_race_list = []
for i in demographic['race']:
    l = list(i)
    l = [x for x in l if x != ',']
    new_race_list.append(most_common(l))

In [15]:
demographic.drop('race',axis=1, inplace=True)
demographic['race'] = new_race_list

In [16]:
demographic

Unnamed: 0,subject_id,gender,anchor_age,race
0,19940860,1,0,0
1,17727602,1,0,0
2,16427415,0,3,0
3,13197526,1,3,0
4,10071435,1,4,0
...,...,...,...,...
256873,17400716,0,3,4
256874,19454978,0,4,4
256875,18632748,0,3,4
256876,14120635,1,2,4


In [17]:
# create a dataframe containing each patient uniquely
gender_array = []
age_array = []
races_array = []
copd_array = []
vs_array = []
miss = 0
for subject_id in unique_subject_id_icu:    
    try:
        gender = int(demographic.loc[demographic['subject_id'] == subject_id, 'gender'].values[0])
        age = int(demographic.loc[demographic['subject_id'] == subject_id, 'anchor_age'].values[0])
        race = int(demographic.loc[demographic['subject_id'] == subject_id, 'race'].values[0])
    except:
        miss += 0
        gender = np.nan
        age = np.nan
        race = np.nan
    copd_array.append(df_icu.loc[df_icu['icu_subject'] == subject_id, 'COPD'].values[0])
    vs_array.append(df_icu.loc[df_icu['icu_subject'] == subject_id, 'ventilation_status'].values[0])
    gender_array.append(gender)
    age_array.append(age)
    races_array.append(race)

unique_df_icu = pd.DataFrame({'subject_id':unique_subject_id_icu, 'gender': gender_array, 
                              'age':age_array, 'race':races_array, 
                              'COPD':copd_array, 'ventilation_status': vs_array
                             })
print(miss)
unique_df_icu

0


Unnamed: 0,subject_id,gender,age,race,COPD,ventilation_status
0,12025857,0,3,1,0,0
1,14811141,0,4,0,1,Oxygen
2,17465349,1,4,0,1,Oxygen
3,17367047,1,3,0,0,0
4,13139976,1,3,0,0,Oxygen
...,...,...,...,...,...,...
14857,10289146,1,3,3,0,Oxygen
14858,12156923,0,3,0,0,Oxygen
14859,10780669,1,3,0,0,Oxygen
14860,11141118,0,4,0,0,Oxygen


In [18]:
# create a dataframe containing each patient uniquely
gender_array = []
age_array = []
races_array = []
copd_array = []
vs_array = []
miss = 0
for subject_id in unique_subject_id_ed:
    try:
        gender = int(demographic.loc[demographic['subject_id'] == subject_id, 'gender'].values[0])
        age = int(demographic.loc[demographic['subject_id'] == subject_id, 'anchor_age'].values[0])
        race = int(demographic.loc[demographic['subject_id'] == subject_id, 'race'].values[0])
    except:
        miss += 0
        gender = np.nan
        age = np.nan
        race = np.nan
    copd_array.append(df_ed.loc[df_ed['ed_subject'] == subject_id, 'COPD'].values[0])
    vs_array.append(df_ed.loc[df_ed['ed_subject'] == subject_id, 'ventilation_status'].values[0])
    gender_array.append(gender)
    age_array.append(age)
    races_array.append(race)

unique_df_ed = pd.DataFrame({'subject_id':unique_subject_id_ed, 'gender': gender_array, 
                              'age':age_array, 'race':races_array, 
                              'COPD':copd_array, 'ventilation_status': vs_array
                             })
print(miss)
unique_df_ed

0


Unnamed: 0,subject_id,gender,age,race,COPD,ventilation_status
0,17039362,1.0,1.0,1.0,0,0.0
1,14811141,0.0,4.0,0.0,0,0.0
2,13107206,0.0,3.0,0.0,0,0.0
3,13893638,0.0,4.0,0.0,0,0.0
4,13369352,1.0,2.0,1.0,0,0.0
...,...,...,...,...,...,...
65141,11272182,0.0,3.0,4.0,0,0.0
65142,12713976,0.0,1.0,1.0,0,0.0
65143,13500411,0.0,2.0,1.0,0,0.0
65144,11141118,0.0,4.0,0.0,0,0.0


In [19]:
unique_df_icu = unique_df_icu.replace({'ventilation_status': {np.nan: 0, 'Oxygen': 1, 'HighFlow': 2, 'Oxygen,HighFlow': 3}})
unique_df_icu

Unnamed: 0,subject_id,gender,age,race,COPD,ventilation_status
0,12025857,0,3,1,0,0
1,14811141,0,4,0,1,1
2,17465349,1,4,0,1,1
3,17367047,1,3,0,0,0
4,13139976,1,3,0,0,1
...,...,...,...,...,...,...
14857,10289146,1,3,3,0,1
14858,12156923,0,3,0,0,1
14859,10780669,1,3,0,0,1
14860,11141118,0,4,0,0,1


In [20]:
unique_df_ed = unique_df_ed.replace({'ventilation_status': {np.nan: 0, 'Oxygen': 1, 'HighFlow': 2,'Oxygen,HighFlow': 3}})
unique_df_ed

Unnamed: 0,subject_id,gender,age,race,COPD,ventilation_status
0,17039362,1.0,1.0,1.0,0,0.0
1,14811141,0.0,4.0,0.0,0,0.0
2,13107206,0.0,3.0,0.0,0,0.0
3,13893638,0.0,4.0,0.0,0,0.0
4,13369352,1.0,2.0,1.0,0,0.0
...,...,...,...,...,...,...
65141,11272182,0.0,3.0,4.0,0,0.0
65142,12713976,0.0,1.0,1.0,0,0.0
65143,13500411,0.0,2.0,1.0,0,0.0
65144,11141118,0.0,4.0,0.0,0,0.0


In [23]:
df_ed = pd.DataFrame()
df_icu = pd.DataFrame()
miss = 0
filename = ['TFrecords/mimic-tf-record{i}.tfrecords'.format(i=i) for i in range(24)]
raw_dataset = tf.data.TFRecordDataset(filename)

for raw_record in raw_dataset:
    #loads data  from the trecord all info 
    example = tf.train.Example() # subject id (contains several chest Xrays) 
    example.ParseFromString(raw_record.numpy()) #reads the example 
    

    subject_id = example.features.feature['subject_id'].int64_list.value[0] # change subject_id to type int
    try:
        df_ed = df_ed.append(unique_df_ed.loc[unique_df_ed['subject_id'] == subject_id], ignore_index=True)
    except:
        miss+=1

    try:
        df_icu = df_icu.append(unique_df_icu.loc[unique_df_icu['subject_id'] == subject_id], ignore_index=True)
    except:
        miss+=1


In [24]:
df_icu = df_icu.drop_duplicates()
df_icu

Unnamed: 0,subject_id,gender,age,race,COPD,ventilation_status
0,15529160,1,2,4,0,0
5,10617255,1,3,3,0,0
6,16825136,0,4,3,0,0
8,12857550,0,3,4,0,0
11,11220176,0,4,4,0,1
...,...,...,...,...,...,...
106953,19994233,0,4,3,0,1
106958,19994379,1,3,0,1,1
106960,19995127,1,3,1,0,0
106969,19998330,0,3,1,1,0


In [25]:
df_ed = df_ed.drop_duplicates()
df_ed

Unnamed: 0,subject_id,gender,age,race,COPD,ventilation_status
0,15529160,1.0,2.0,4.0,0,0.0
5,18085253,1.0,2.0,4.0,0,0.0
6,10617255,1.0,3.0,3.0,0,0.0
7,19170368,1.0,1.0,4.0,0,0.0
10,11135741,1.0,2.0,4.0,0,0.0
...,...,...,...,...,...,...
228843,19998350,1.0,2.0,1.0,0,0.0
228846,19998843,1.0,2.0,3.0,0,0.0
228851,19999068,1.0,3.0,0.0,0,0.0
228852,19999287,0.0,3.0,1.0,1,0.0


In [26]:
columns = ['gender', 'age', 'race', 'COPD', 'ventilation_status']
categorical = columns
groupby = 'COPD'
mytable = TableOne(df_icu, columns=columns, categorical=categorical,
                   groupby=groupby, pval=True)

In [27]:
mytable

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by COPD,Grouped by COPD,Grouped by COPD,Grouped by COPD,Grouped by COPD
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,0,1,P-Value
n,,,14725,11305,3420,
"gender, n (%)",0.0,0.0,7007 (47.6),5310 (47.0),1697 (49.6),0.007
"gender, n (%)",1.0,,7718 (52.4),5995 (53.0),1723 (50.4),
"age, n (%)",0.0,0.0,54 (0.4),54 (0.5),,<0.001
"age, n (%)",1.0,,1348 (9.2),1285 (11.4),63 (1.8),
"age, n (%)",2.0,,4117 (28.0),3287 (29.1),830 (24.3),
"age, n (%)",3.0,,6103 (41.4),4349 (38.5),1754 (51.3),
"age, n (%)",4.0,,3103 (21.1),2330 (20.6),773 (22.6),
"race, n (%)",0.0,0.0,10224 (69.4),7711 (68.2),2513 (73.5),<0.001
"race, n (%)",1.0,,2096 (14.2),1587 (14.0),509 (14.9),


In [28]:
columns = ['gender', 'age', 'race', 'COPD', 'ventilation_status']
categorical = columns
groupby = 'COPD'
mytable = TableOne(df_ed, columns=columns, categorical=categorical,
                   groupby=groupby, pval=True)
mytable
# 0: Female, 1: Male
# 0: White, 1: Black, 2: Latino, 3: others, 4: Asian
# 0: -20, 1: 20-40, 2:40-60, 3: 60-80, 4: 80-

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by COPD,Grouped by COPD,Grouped by COPD,Grouped by COPD,Grouped by COPD
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,0,1,P-Value
n,,,53051,50946,2105,
"gender, n (%)",0.0,0.0,27519 (51.9),26364 (51.7),1155 (54.9),0.005
"gender, n (%)",1.0,,25532 (48.1),24582 (48.3),950 (45.1),
"age, n (%)",0.0,0.0,434 (0.8),431 (0.8),3 (0.1),<0.001
"age, n (%)",1.0,,7878 (14.8),7802 (15.3),76 (3.6),
"age, n (%)",2.0,,16536 (31.2),15940 (31.3),596 (28.3),
"age, n (%)",3.0,,19483 (36.7),18452 (36.2),1031 (49.0),
"age, n (%)",4.0,,8720 (16.4),8321 (16.3),399 (19.0),
"race, n (%)",0.0,0.0,34213 (64.5),32800 (64.4),1413 (67.1),<0.001
"race, n (%)",1.0,,8978 (16.9),8530 (16.7),448 (21.3),


In [29]:
a = np.array(df_icu['subject_id'])
b = np.array(df_ed['subject_id'])
concat = np.unique(np.concatenate((a, b), axis=0))

In [30]:
train, test = train_test_split(concat, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

In [32]:
np.savetxt('COPD_val_list.csv', val, delimiter=",")
np.savetxt('COPD_test_list.csv', test, delimiter=",")
np.savetxt('COPD_train_list.csv', train, delimiter=",")