In [None]:
# Print NVIDIA GPU status by shelling out to the nvidia-smi command-line tool.
!nvidia-smi

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
import os

 
# Report the TensorFlow version used for this run.
print(tf.__version__)

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Restrict TensorFlow to the first GPU when one is present. Indexing gpus[0]
# unconditionally raises IndexError on a CPU-only machine, and nothing in this
# notebook's data preparation requires a GPU, so fall back to the CPU instead.
gpus = tf.config.list_physical_devices(device_type='GPU')
if gpus:
    tf.config.set_visible_devices(devices=gpus[0], device_type='GPU')
else:
    print("No GPU detected; TensorFlow will run on the CPU.")

In [None]:
# Single seed shared by every random number generator used in this notebook.
seed = 42

# Seed NumPy and TensorFlow so any random draws are repeatable across runs.
np.random.seed(seed)
tf.random.set_seed(seed)

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Subject IDs assigned to the training split (column 'Subject' of the list file).
train_subject = pd.read_csv('Data/COPD_train_list.csv')['Subject'].to_numpy()

In [None]:
# Display the array of training-split subject IDs.
train_subject

In [None]:
# Subject IDs assigned to the validation split (column 'Subject' of the list file).
val_subject = pd.read_csv('Data/COPD_val_list.csv')['Subject'].to_numpy()

In [None]:
# Display the array of validation-split subject IDs.
val_subject

In [None]:
# Subject IDs assigned to the test split (column 'Subject' of the list file).
test_subject = pd.read_csv('Data/COPD_test_list.csv')['Subject'].to_numpy()

In [None]:
# Display the array of test-split subject IDs.
test_subject

In [None]:
# Per-subject demographics table. Later cells look rows up by 'subject_id' and
# read the 'gender', 'anchor_age' and 'race' columns, casting each to int.
patient = pd.read_csv('Data/Processed_demo.csv')
patient.head()

In [None]:
# Read every record of the MIMIC TFRecords, keep those whose subject appears in
# the ICU subject list, and append the image bytes, IDs, labels and
# demographics to the train / val / test lists according to the split files.
df_icu = pd.read_csv('Data/ICU_subject.csv')
subject_ids = np.array(df_icu['icu_subject'])
study_ids = np.array(df_icu['StudyID'])  # not read anywhere in this cell
COPDs = np.array(df_icu['COPD'])
vs = np.array(df_icu['ventilation_status'])

# Study IDs of every ICU sample that was kept (any split).
study_id_array = []

img_train = []
subject_id_train = []
study_id_train = []
copd_train = []
disease_train = []
vs_train = []
age_train = []
races_train = []
gender_train = []

img_test = []
subject_id_test = []
study_id_test = []
copd_test = []
disease_test = []
vs_test = []
age_test = []
races_test = []
gender_test = []

img_val = []
subject_id_val = []
study_id_val = []
copd_val = []
disease_val = []
vs_val = []
age_val = []
races_val = []
gender_val = []

# Finding names, in the order they are stored in each sample's label vector.
finding_names = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
                 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion',
                 'Lung Opacity', 'No Finding', 'Pleural Effusion',
                 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']

# Ventilation status -> integer code; every other status value maps to 2.
vs_codes = {'Oxygen': 0, 'HighFlow': 1}

# Sets give constant-time membership tests; `x in ndarray` rescans the whole
# array for every record.
train_ids = set(train_subject.tolist())
val_ids = set(val_subject.tolist())
test_ids = set(test_subject.tolist())

# Order encodes priority: a subject listed in several splits goes to the first
# match (train, then val, then test).
split_lists = [
    (train_ids, dict(vs=vs_train, img=img_train, subject=subject_id_train,
                     study=study_id_train, copd=copd_train, gender=gender_train,
                     age=age_train, race=races_train, disease=disease_train)),
    (val_ids, dict(vs=vs_val, img=img_val, subject=subject_id_val,
                   study=study_id_val, copd=copd_val, gender=gender_val,
                   age=age_val, race=races_val, disease=disease_val)),
    (test_ids, dict(vs=vs_test, img=img_test, subject=subject_id_test,
                    study=study_id_test, copd=copd_test, gender=gender_test,
                    age=age_test, race=races_test, disease=disease_test)),
]

filename = ['TFrecords/mimic-tf-record{i}.tfrecords'.format(i=i) for i in range(24)]
raw_dataset = tf.data.TFRecordDataset(filename)

for raw_record in raw_dataset:
    # Decode the serialized tf.train.Example held by this record.
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    features = example.features.feature

    subject_id = features['subject_id'].int64_list.value[0]
    study_id = features['study_id'].int64_list.value[0]

    # Skip records whose subject is not in the ICU list. An explicit emptiness
    # check is used instead of a bare `except:`, which would also swallow
    # KeyboardInterrupt and unrelated errors.
    idx = np.where(subject_ids == subject_id)[0]
    if idx.size == 0:
        continue
    # When a subject has several rows, the first one supplies status and label.
    vs_status = vs[idx[0]]
    label = COPDs[idx[0]]

    # Skip subjects that belong to no split before doing any further work.
    target = next((lists for ids, lists in split_lists if subject_id in ids), None)
    if target is None:
        continue

    # Binarize findings: 1 only when the stored value equals 1, otherwise 0.
    sub_y = [1 if features[name].float_list.value[0] == 1 else 0
             for name in finding_names]

    # One scan of the demographics table per record; the first matching row is
    # used (IndexError if the subject is absent from the table).
    demo = patient.loc[patient['subject_id'] == subject_id]
    gender = int(demo['gender'].values[0])
    age = int(demo['anchor_age'].values[0])
    race = int(demo['race'].values[0])

    byte_arr = features['jpg_bytes'].bytes_list.value[0]  # raw JPEG bytes
    vs_value = vs_codes.get(vs_status, 2)

    target['vs'].append(vs_value)
    target['img'].append(byte_arr)
    target['subject'].append(subject_id)
    target['study'].append(study_id)
    target['copd'].append(label)
    target['gender'].append(gender)
    target['age'].append(age)
    target['race'].append(race)
    target['disease'].append(sub_y)

    study_id_array.append(study_id)

In [None]:
# Second pass over the MIMIC TFRecords, this time for the ED subject list.
# Samples are appended to the same train / val / test lists as the ICU pass;
# studies already collected by the ICU pass are skipped.
df_ed = pd.read_csv('Data/ED_subject.csv')
subject_ids = np.array(df_ed['ed_subject'])
study_ids = np.array(df_ed['StudyID'])  # not read anywhere in this cell
COPDs = np.array(df_ed['COPD'])
vs = np.array(df_ed['ventilation_status'])

# Finding names, in the order they are stored in each sample's label vector.
finding_names = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
                 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion',
                 'Lung Opacity', 'No Finding', 'Pleural Effusion',
                 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']

# Ventilation status -> integer code; every other status value maps to 2.
vs_codes = {'Oxygen': 0, 'HighFlow': 1}

# Sets give constant-time membership tests; `x in ndarray` rescans the whole
# array for every record.
train_ids = set(train_subject.tolist())
val_ids = set(val_subject.tolist())
test_ids = set(test_subject.tolist())

# study_id_array is not modified in this cell, so the set of already-stored
# study IDs is built once instead of calling np.unique() for every record.
seen_study_ids = set(study_id_array)

# Order encodes priority: a subject listed in several splits goes to the first
# match (train, then val, then test).
split_lists = [
    (train_ids, dict(vs=vs_train, img=img_train, subject=subject_id_train,
                     study=study_id_train, copd=copd_train, gender=gender_train,
                     age=age_train, race=races_train, disease=disease_train)),
    (val_ids, dict(vs=vs_val, img=img_val, subject=subject_id_val,
                   study=study_id_val, copd=copd_val, gender=gender_val,
                   age=age_val, race=races_val, disease=disease_val)),
    (test_ids, dict(vs=vs_test, img=img_test, subject=subject_id_test,
                    study=study_id_test, copd=copd_test, gender=gender_test,
                    age=age_test, race=races_test, disease=disease_test)),
]

filename = ['TFrecords/mimic-tf-record{i}.tfrecords'.format(i=i) for i in range(24)]
raw_dataset = tf.data.TFRecordDataset(filename)

for raw_record in raw_dataset:
    # Decode the serialized tf.train.Example held by this record.
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    features = example.features.feature

    subject_id = features['subject_id'].int64_list.value[0]
    study_id = features['study_id'].int64_list.value[0]

    # Skip records whose subject is not in the ED list. An explicit emptiness
    # check is used instead of a bare `except:`, which would also swallow
    # KeyboardInterrupt and unrelated errors.
    idx = np.where(subject_ids == subject_id)[0]
    if idx.size == 0:
        continue
    # When a subject has several rows, the first one supplies status and label.
    vs_status = vs[idx[0]]
    label = COPDs[idx[0]]

    # Skip studies that the ICU pass already stored.
    if study_id in seen_study_ids:
        continue

    # Skip subjects that belong to no split before doing any further work.
    target = next((lists for ids, lists in split_lists if subject_id in ids), None)
    if target is None:
        continue

    # Binarize findings: 1 only when the stored value equals 1, otherwise 0.
    sub_y = [1 if features[name].float_list.value[0] == 1 else 0
             for name in finding_names]

    # One scan of the demographics table per record; the first matching row is
    # used (IndexError if the subject is absent from the table).
    demo = patient.loc[patient['subject_id'] == subject_id]
    gender = int(demo['gender'].values[0])
    age = int(demo['anchor_age'].values[0])
    race = int(demo['race'].values[0])

    byte_arr = features['jpg_bytes'].bytes_list.value[0]  # raw JPEG bytes
    vs_value = vs_codes.get(vs_status, 2)

    target['vs'].append(vs_value)
    target['img'].append(byte_arr)
    target['subject'].append(subject_id)
    target['study'].append(study_id)
    target['copd'].append(label)
    target['gender'].append(gender)
    target['age'].append(age)
    target['race'].append(race)
    target['disease'].append(sub_y)

In [None]:
# Store the collected samples in new TFRecord files, one file per split.
np.random.seed(seed)

diseases = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']


def write_copd_split(record_file, images, subjects, studies, copd_labels,
                     vs_values, race_values, age_values, gender_values,
                     finding_labels):
    """Write one split to `record_file` as serialized tf.train.Example records.

    All sequence arguments are parallel: element i of each describes sample i.

    Args:
        record_file: path of the TFRecord file to create.
        images: image byte strings, stored under 'jpg_bytes'.
        subjects: subject IDs, stored under 'subject_id'.
        studies: study IDs, stored under 'study_id'.
        copd_labels: integer labels, stored under 'COPD'.
        vs_values: integer ventilation-status codes, stored under 'VS'.
        race_values: integer codes, stored under 'race'.
        age_values: integer ages, stored under 'age'.
        gender_values: integer codes, stored under 'gender'.
        finding_labels: per-sample lists with one 0/1 entry per name in
            `diseases`, each stored under that finding's name.
    """
    with tf.io.TFRecordWriter(record_file) as writer:
        for i in range(len(images)):
            example = tf.train.Example()
            feats = example.features.feature

            feats['jpg_bytes'].bytes_list.value.append(images[i])
            feats['subject_id'].int64_list.value.append(subjects[i])
            feats['study_id'].int64_list.value.append(studies[i])
            feats['COPD'].int64_list.value.append(copd_labels[i])
            # VS uses int64 in every split so a single parsing schema works for
            # all three files.
            # NOTE(review): the val and test files previously stored VS as
            # float_list; update any reader that parses VS as float there.
            feats['VS'].int64_list.value.append(vs_values[i])
            feats['race'].int64_list.value.append(race_values[i])
            feats['age'].int64_list.value.append(age_values[i])
            feats['gender'].int64_list.value.append(gender_values[i])

            for j, name in enumerate(diseases):
                feats[name].int64_list.value.append(finding_labels[i][j])

            writer.write(example.SerializeToString())


record_file_train = 'copd_train_new.tfrecords'
write_copd_split(record_file_train, img_train, subject_id_train, study_id_train,
                 copd_train, vs_train, races_train, age_train, gender_train,
                 disease_train)

record_file_val = 'copd_val_new.tfrecords'
write_copd_split(record_file_val, img_val, subject_id_val, study_id_val,
                 copd_val, vs_val, races_val, age_val, gender_val,
                 disease_val)

record_file_test = 'copd_test_new.tfrecords'
write_copd_split(record_file_test, img_test, subject_id_test, study_id_test,
                 copd_test, vs_test, races_test, age_test, gender_test,
                 disease_test)

In [None]:
# Re-read the three split lists and merge their subject IDs into one flat
# array, in the order train, test, val.
patient_train = pd.read_csv('Data/COPD_train_list.csv')
patient_test = pd.read_csv('Data/COPD_test_list.csv')
patient_val = pd.read_csv('Data/COPD_val_list.csv')

patient_list = np.concatenate([
    np.array(split_df['Subject'])
    for split_df in (patient_train, patient_test, patient_val)
])

In [None]:
# Collect every record whose subject is NOT in any COPD split list; these
# samples are stored without COPD / ventilation labels.
# Note: subject_ids and study_ids are rebound here to fresh lists, replacing
# the arrays that earlier cells stored under the same names.
labels = []
races = []
subject_ids = []
study_ids = []
genders = []  # never filled in this cell
ages = []     # never filled in this cell
imgs = []

# Finding names, in the order they are stored in each sample's label vector.
finding_names = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
                 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion',
                 'Lung Opacity', 'No Finding', 'Pleural Effusion',
                 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']

# A set gives constant-time membership tests; `x in ndarray` rescans the whole
# array for every record.
split_subject_ids = set(patient_list.tolist())

filename = ['TFrecords/mimic-tf-record{i}.tfrecords'.format(i=i) for i in range(24)]
raw_dataset = tf.data.TFRecordDataset(filename)

for raw_record in raw_dataset:
    # Decode the serialized tf.train.Example held by this record.
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    features = example.features.feature

    subject_id = features['subject_id'].int64_list.value[0]
    study_id = features['study_id'].int64_list.value[0]

    # Subjects that belong to a COPD split are handled by the labelled files.
    if subject_id in split_subject_ids:
        continue

    # Binarize findings: 1 only when the stored value equals 1, otherwise 0.
    sub_y = [1 if features[name].float_list.value[0] == 1 else 0
             for name in finding_names]
    labels.append(sub_y)

    # Race comes from the record's own 'ethnicity' feature here, not from the
    # demographics table.
    races.append(features['ethnicity'].int64_list.value[0])

    imgs.append(features['jpg_bytes'].bytes_list.value[0])  # raw JPEG bytes

    subject_ids.append(subject_id)
    study_ids.append(study_id)

In [None]:
# Write the samples collected outside the COPD splits to their own TFRecord.
record_file_test = 'copd_no_label.tfrecords'
diseases = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']

with tf.io.TFRecordWriter(record_file_test) as writer:
    # The five lists are parallel: position k of each describes sample k.
    for img_bytes, subj, study, race_code, findings in zip(
            imgs, subject_ids, study_ids, races, labels):
        example = tf.train.Example()
        feats = example.features.feature

        feats['jpg_bytes'].bytes_list.value.append(img_bytes)
        feats['subject_id'].int64_list.value.append(subj)
        feats['study_id'].int64_list.value.append(study)
        feats['race'].int64_list.value.append(race_code)

        # One int64 feature per finding, keyed by the finding's name.
        for name, flag in zip(diseases, findings):
            feats[name].int64_list.value.append(flag)

        writer.write(example.SerializeToString())

In [None]:
# Number of entries in the merged train + test + val subject list.
len(patient_list)