In [2]:
# Libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 16)
#pd.set_option('display.width', 2000)
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn import metrics
from sktime.transformations.panel.catch22 import Catch22

In [2]:
# Import dates2 data
dates = pd.read_csv(r"eicu_iv_switch_stayid_dates.csv")

In [5]:
# vitalperiodic - saO2, heartrate, respiration, temperature, bp
# vitalaperiodic - bp
# physicalexam - gcs, richmond
iter_csv = pd.read_csv(r"eicu-collaborative-research-database-2.0/vitalPeriodic.csv", iterator=True, chunksize=10000000)
vitalperiodic = pd.concat([chunk[chunk['patientunitstayid'].isin(stay_id_list)] for chunk in iter_csv])

iter_csv = pd.read_csv(r"eicu-collaborative-research-database-2.0/vitalAperiodic.csv", iterator=True, chunksize=10000000)
vitalaperiodic = pd.concat([chunk[chunk['patientunitstayid'].isin(stay_id_list)] for chunk in iter_csv])

iter_csv = pd.read_csv(r"eicu-collaborative-research-database-2.0/physicalExam.csv", iterator=True, chunksize=10000000)
physicalexam = pd.concat([chunk[chunk['patientunitstayid'].isin(stay_id_list)] for chunk in iter_csv])

In [6]:
# Get unitadmittime24
# Note unit admit date = 02 Sep 2022
iter_csv = pd.read_csv(r"eicu-collaborative-research-database-2.0/patient.csv", iterator=True, chunksize=10000000)
patients = pd.concat([chunk[chunk['patientunitstayid'].isin(stay_id_list)] for chunk in iter_csv])

# Select relevant columns
patients = patients[['patientunitstayid', 'unitadmittime24']]
# Convert unit admit time to day (day 0)
patients['unitadmittime24'] = pd.to_datetime(patients['unitadmittime24'])
# Set start date
patients['unitadmittime24'] = patients['unitadmittime24'].apply(lambda t: t.replace(year=2022, month=9, day=2))


In [7]:
vitalperiodic = pd.merge(patients, vitalperiodic, on=['patientunitstayid'])
vitalaperiodic = pd.merge(patients, vitalaperiodic, on=['patientunitstayid'])
physicalexam = pd.merge(patients, physicalexam, on=['patientunitstayid'])

In [8]:
# Create observationtime
vitalperiodic['observationoffset'] = pd.to_timedelta(vitalperiodic['observationoffset'], unit='min') # Convert to timedelta
vitalperiodic['observationtime'] = vitalperiodic['unitadmittime24'] + vitalperiodic['observationoffset']

vitalaperiodic['observationoffset'] = pd.to_timedelta(vitalaperiodic['observationoffset'], unit='min') # Convert to timedelta
vitalaperiodic['observationtime'] = vitalaperiodic['unitadmittime24'] + vitalaperiodic['observationoffset']

physicalexam['physicalexamoffset'] = pd.to_timedelta(physicalexam['physicalexamoffset'], unit='min') # Convert to timedelta
physicalexam['observationtime'] = physicalexam['unitadmittime24'] + physicalexam['physicalexamoffset']


physicalexam

In [10]:
# Filter for GCS
physicalexam = physicalexam[physicalexam.physicalexampath.str.contains('GCS')]
score = 'Motor Score|Verbal Score|Eyes Score'
# Filter for the three scores
physicalexam = physicalexam[physicalexam.physicalexampath.str.contains(score)]
# Remame
physicalexam.loc[physicalexam['physicalexampath'].str.contains('Eyes Score'), 'physicalexampath'] = 'gcs - eye opening'
physicalexam.loc[physicalexam['physicalexampath'].str.contains('Verbal Score'), 'physicalexampath'] = 'gcs - verbal response'
physicalexam.loc[physicalexam['physicalexampath'].str.contains('Motor Score'), 'physicalexampath'] = 'gcs - motor response'



In [11]:
# Order
physicalexam = physicalexam.sort_values(by=['patientunitstayid', 'observationtime'])

In [12]:
# Create date column 
physicalexam['date'] =  pd.to_datetime(physicalexam['observationtime']).dt.date
# Create hour column
physicalexam['hour'] =  pd.to_datetime(physicalexam['observationtime']).dt.hour
# Pivot
physicalexam_pivoted = pd.pivot_table(physicalexam, index=['patientunitstayid', 'date', 'hour'], columns=['physicalexampath'], values=['physicalexamvalue'])
physicalexam_pivoted.columns = physicalexam_pivoted.columns.droplevel()

In [13]:
physicalexam_pivoted

Unnamed: 0_level_0,Unnamed: 1_level_0,physicalexampath,gcs - eye opening,gcs - motor response,gcs - verbal response
patientunitstayid,date,hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
141392,2022-09-02,7,4.0,6.0,5.0
141470,2022-09-02,7,4.0,6.0,5.0
142405,2022-09-02,9,4.0,6.0,5.0
143068,2022-09-02,6,4.0,6.0,5.0
143157,2022-09-02,14,4.0,6.0,5.0
...,...,...,...,...,...
3335807,2022-09-02,13,4.0,6.0,5.0
3340875,2022-09-02,23,4.0,6.0,4.0
3340875,2022-09-04,6,4.0,6.0,5.0
3341168,2022-09-03,1,4.0,6.0,4.0


vitalperiodic / vitalaperiodic

In [14]:
# Select columns
vitalaperiodic = vitalaperiodic[['patientunitstayid', 'observationtime', 'noninvasivesystolic', 'noninvasivediastolic', 'noninvasivemean']]
vitalperiodic = vitalperiodic[['patientunitstayid', 'observationtime', 'temperature', 'sao2', 'heartrate', 'respiration', 'systemicsystolic', 'systemicdiastolic', 'systemicmean']]

In [15]:
# Merge
vital = pd.merge(vitalperiodic, vitalaperiodic, on=['patientunitstayid', 'observationtime'], how='outer')

In [16]:
# Create combined bp columns
vital['blood pressure systolic'] = vital[['systemicsystolic', 'noninvasivesystolic']].mean(axis=1)
vital['blood pressure diastolic'] = vital[['systemicdiastolic', 'noninvasivediastolic']].mean(axis=1)
vital['blood pressure mean'] = vital[['systemicmean', 'noninvasivemean']].mean(axis=1)

In [17]:
# Order
vital = vital.sort_values(by=['patientunitstayid', 'observationtime'])
# Create date column 
vital['date'] =  pd.to_datetime(vital['observationtime']).dt.date
# Create hour column
vital['hour'] =  pd.to_datetime(vital['observationtime']).dt.hour
# Select columns
final_vital = vital[['patientunitstayid', 'date', 'hour', 'temperature', 'sao2', 'heartrate', 'respiration', 'blood pressure systolic', 'blood pressure diastolic', 'blood pressure mean']]
#final_vital.set_index(['patientunitstayid', 'date', 'hour'], inplace=True)
final_vital = final_vital.groupby(['patientunitstayid', 'date', 'hour']).mean()

In [19]:
# Merge
data = pd.merge(final_vital, physicalexam_pivoted, how='outer', left_index=True, right_index=True)

In [20]:
# rename and reorder
#data.rename(index={'patientunitstayid': 'stay_id'}, inplace=True)
data.rename_axis(['stay_id', 'date', 'hour'], inplace=True)
data.rename(columns={'sao2': 'o2 saturation pulseoxymetry', 'heartrate': 'heart rate', 'respiration': 'respiratory rate'}, inplace=True)
data.sort_index(axis=1, inplace=True) # Sore by alphabetical order like mimic

In [21]:
print('% of missing values (columns):', (data.isnull().sum(axis=0)/len(data)))
print('Average % of missing values (row):', (data.isnull().sum(axis=1)/len(data.columns)).mean())

% of missing values (columns): blood pressure diastolic       0.110858
blood pressure mean            0.110342
blood pressure systolic        0.110850
gcs - eye opening              0.975192
gcs - motor response           0.975185
gcs - verbal response          0.975192
heart rate                     0.006172
o2 saturation pulseoxymetry    0.059008
respiratory rate               0.180110
temperature                    0.948614
dtype: float64
Average % of missing values (row): 0.44515239596860967


c22_function

In [23]:
# Define C22 function
def c22_extra_fun(df):
    c22 = Catch22()
    c22_2 = Catch22()
    master_df = pd.DataFrame()
    overlord_df = pd.DataFrame()
    remove_set = set()
    for x in range(len(df.columns)): # Iterate through columns so not to many nans
        print(x)
        working_df = df.iloc[:, x]
        working_df.dropna(inplace=True) # Drop nans # Note this causes issue with hours being dropped and hence data not being evenly spaced as exspected - could correct with forward filling as before?? But also since doing per feature and per patient likley to be relativly regular anyway 
        working_df = working_df.to_frame()
        for column_name in working_df:
            master_df = pd.DataFrame()
            master_df2 = pd.DataFrame()
            for stay_id, new_df in working_df[[column_name]].groupby(level=0):
                gb = new_df.groupby(level=1)
                group_dict = dict(list(gb))
                n = 0
                for date, new_df2 in new_df.groupby(level=1):
                    n += 1
                    # Create new df that incoperates all data to date for stay
                    group_dict_values_list = list(group_dict.values())[:n]
                    if len(group_dict_values_list) > 1:
                        new_group_dict_values_list = []
                        for y in range(len(group_dict_values_list)):
                            if y == 0:
                                new_group_dict_values_list.append(group_dict_values_list[y])
                            else:
                                sub_df = group_dict_values_list[y].copy()
                                sub_df.reset_index(inplace=True)
                                sub_df['hour'] = sub_df['hour'] + (24*y) # Update hours fo c22 works 
                                sub_df.set_index(['stay_id', 'date', 'hour'], inplace=True)
                                new_group_dict_values_list.append(sub_df)
                        new_df3 = pd.concat(new_group_dict_values_list)
                        new_df3.reset_index(inplace=True)
                        new_df3["date"] = date # Update date so c22 works
                        new_df3.set_index(['stay_id', 'date', 'hour'], inplace=True)
                    else:
                        new_df3 = pd.concat(group_dict_values_list)

                    # C22 for current day
                    if len(new_df2) <= 2: # c22 only works with 3 or more timepoints
                        transformed_data = pd.DataFrame()
                        transformed_data['_mean'] = new_df2.mean().values[0]
                        transformed_data['_std'] = new_df2.std().values[0]
                    else:
                        transformed_data = c22.fit_transform(new_df2)
                        transformed_data['_mean'] = new_df2.mean().values[0]
                        transformed_data['_std'] = new_df2.std().values[0]
                    transformed_data = transformed_data.add_prefix(column_name)           
                    transformed_data.insert(0, 'stay_id', stay_id)
                    transformed_data.insert(1, 'date', date)

                    # C22 for all data to date for stay
                    if len(new_df3) <= 2:
                        transformed_data2 = pd.DataFrame()
                        transformed_data2['_mean'] = new_df3.mean().values[0]
                        transformed_data2['_std'] = new_df3.std().values[0]
                    else:
                        transformed_data2 = c22_2.fit_transform(new_df3)
                        transformed_data2['_mean'] = new_df2.mean().values[0]
                        transformed_data2['_std'] = new_df2.std().values[0]
                    transformed_data2 = transformed_data2.add_prefix(column_name)
                    transformed_data2 = transformed_data2.add_suffix('_current_stay') # Indicate different as temporal over whole of current stay             
                    transformed_data2.insert(0, 'stay_id', stay_id)
                    transformed_data2.insert(1, 'date', date)

                    # Create master df's
                    master_df = pd.concat([master_df, transformed_data])
                    master_df2 = pd.concat([master_df2, transformed_data2])

                    #print('master_df', master_df)
                    #print('master_df2', master_df2)

            master_df.reset_index(inplace=True, drop=True)
            master_df2.reset_index(inplace=True, drop=True)

            master_df = master_df.merge(master_df2, how='left', on=['stay_id', 'date'])

            #print('master_df', master_df)
            #print('master_df.info()', master_df.info())

        if x == 0:
            overlord_df = master_df.copy()
        else:
            overlord_df = overlord_df.merge(master_df, how='outer', on=['stay_id', 'date'])

    return overlord_df

In [24]:
c22_data = c22_extra_fun(data)
print(c22_data)

0
1
2
3
4
5
6
7
8
9
      stay_id        date  blood pressure diastolic_mean  \
0      141196  2022-09-03                      72.500000   
1      141392  2022-09-02                      62.550000   
2      141470  2022-09-02                      56.750000   
3      142405  2022-09-02                      72.200000   
4      143068  2022-09-02                      48.115741   
...       ...         ...                            ...   
6636  3235015  2022-09-06                            NaN   
6637  3335807  2022-09-02                            NaN   
6638  3095016  2022-09-05                            NaN   
6639  3112389  2022-09-02                            NaN   
6640  3201912  2022-09-05                            NaN   

      blood pressure diastolic_std  blood pressure diastolic0  \
0                         5.205766                  74.500000   
1                        12.983575                  59.800003   
2                         4.866981                  55.099998   

In [25]:
# Re order to same as with MIMIC (some mean and std first here)
# Import mimic c22 data
c22_data_method2 = pd.read_csv(r"catch_22_data.csv")
# Filter for columns in eICU
pattern = 'stay_id|date|blood pressure|Temperature|GCS|heart rate|o2 saturation pulseoxymetry|respiratory rate'
c22_data_method2 = c22_data_method2[c22_data_method2.columns[c22_data_method2.columns.str.contains(pattern, case=False, na=False)]]
# Remove rows
c22_data_method2 = c22_data_method2.iloc[0:0]
# Concat
c22_data = pd.concat([c22_data_method2, c22_data])
c22_data

Unnamed: 0,stay_id,date,blood pressure diastolic0,blood pressure diastolic1,blood pressure diastolic2,blood pressure diastolic3,blood pressure diastolic4,blood pressure diastolic5,...,temperature16_current_stay,temperature17_current_stay,temperature18_current_stay,temperature19_current_stay,temperature20_current_stay,temperature21_current_stay,temperature_mean_current_stay,temperature_std_current_stay
0,141196,2022-09-03,74.500000,72.750000,2.0,0.166667,0.0,1.0,...,,,,,,,,
1,141392,2022-09-02,59.800003,57.000000,3.0,0.066667,0.0,1.0,...,,,,,,,,
2,141470,2022-09-02,55.099998,54.250000,3.0,0.111111,0.0,1.0,...,,,,,,,,
3,142405,2022-09-02,71.449997,72.225002,2.0,0.100000,0.0,1.0,...,,,,,,,,
4,143068,2022-09-02,47.583336,47.924999,4.0,0.055556,0.0,1.0,...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6636,3235015,2022-09-06,,,,,,,...,,,,,,,,
6637,3335807,2022-09-02,,,,,,,...,,,,,,,,
6638,3095016,2022-09-05,,,,,,,...,,,,,,,,
6639,3112389,2022-09-02,,,,,,,...,,,,,,,,


In [26]:
# Save df
c22_data.to_csv('eicu_catch_22_data.csv', index=False)