In [1]:
# Libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 16)
#pd.set_option('display.width', 2000)
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn import metrics
from sktime.transformations.panel.catch22 import Catch22

In [62]:
# Import dates2 data
dates2 = pd.read_csv(r"iv_switch_stayid_dates.csv")

In [None]:
# Load filtered chartevents data
# Create list
stay_id_list = dates2.stay_id.unique().tolist()
iter_csv = pd.read_csv(r"mimic-iv-2.0/icu/chartevents.csv", iterator=True, chunksize=10000000)
chartevents = pd.concat([chunk[chunk['stay_id'].isin(stay_id_list)] for chunk in iter_csv])

In [6]:
d_items = pd.read_csv(r"mimic-iv-2.0/icu/d_items.csv")

In [152]:
# Filter data and c22 as before 
# Merge
new_chartevents = chartevents.merge(d_items[['itemid', 'label']], how='left', on=['itemid'])
# Filter for those with numeric values 
new_chartevents = new_chartevents[~new_chartevents['valuenum'].isna()]
# Convert Farenhight to celcius 
new_chartevents.loc[new_chartevents['label'].str.contains('Temperature Fahrenheit'), 'valuenum'] = (new_chartevents.loc[new_chartevents['label'].str.contains('Temperature Fahrenheit'), 'valuenum']-32)*(5/9)
# Drop Temperature Site, Changes in Temperature, Blood Pressure Alarm, Blood Temperature
drop_pattern = 'Temperature Site|Changes in Temperature|Blood Pressure Alarm|Blood Temperature|Cerebral Temperature|Baseline pain level'
new_chartevents = new_chartevents[~new_chartevents['label'].str.contains(drop_pattern, case=False, na=False)]
new_chartevents.label.nunique()

string_list = [
 'Temperature',
 'Blood Pressure systolic',
 'Blood Pressure diastolic',
 'Blood Pressure mean',
 'PH .',
 'Creatinine',
 'Hematocrit',
 'O2 Flow',
 'Pain Level']

#len(string_list)

pd.options.mode.chained_assignment = None

x = 0
for string in string_list:
    x += 1
    #print(x)
    #print(string)
    sub_df = new_chartevents[new_chartevents['label'].str.contains(string, case=False, na=False)]
    sub_df['final_label'] = string.lower() # use filter string as final_label 
    if x == 1:
        new_chartevents2 = sub_df
    else:
        new_chartevents2 = pd.concat([new_chartevents2, sub_df])

print('new_chartevents2:', new_chartevents2.label.nunique())

string_list2 = [
 'Heart Rate',
 'Respiratory Rate',
 #'Level of Consciousness',
 'C Reactive Protein (CRP)',
 #'Febrile last 24 hours',
 'O2 saturation pulseoxymetry',
 'GCS - Eye Opening',
 'GCS - Verbal Response', 
 'GCS - Motor Response',
 'Richmond-RAS Scale', 'Goal Richmond-RAS Scale',
 'Braden Sensory Perception', 'Braden Moisture', 'Braden Activity', 'Braden Mobility', 'Braden Nutrition', 'Braden Friction/Shear',
 'Strength L Arm', 'Strength L Leg', 'Strength R Leg', 'Strength R Arm',
 'Secondary diagnosis',
 'Gait/Transferring',
 'Mental status', 
 'Glucose finger stick (range 70-100)',
 'SpO2 Desat Limit',
 'Inspired O2 Fraction',
 'Magnesium',
 'Minute Volume',
 'WBC',
 'Riker-SAS Scale',
 'Mean Airway Pressure',
 'Tidal Volume (set)', 'Tidal Volume (observed)', 'Tidal Volume (spontaneous)',
 'PSV Level',
 'Pulmonary Artery Pressure systolic', 'Pulmonary Artery Pressure diastolic', 'Pulmonary Artery Pressure mean',
 'Arterial CO2 Pressure',
 'Expiratory Ratio',
 'Plateau Pressure',
 'Flow Rate (L/min)',
 'Troponin-T',
 'Agitation',
 'Pressure Ulcer Stage #1', 'Pressure Ulcer Stage #2', 'Pressure Ulcer Stage #3', 'Pressure Ulcer Stage #4' 'Pressure Ulcer Stage #5', 'Pressure Ulcer Stage #6', 'Pressure Ulcer Stage #7', 'Pressure Ulcer Stage #8', 'Pressure Ulcer Stage #9',
 'PAR-Consciousness',
 ]

new_chartevents3 = new_chartevents[new_chartevents['label'].isin(string_list2)]

# Merge
new_chartevents4 = pd.concat([new_chartevents2, new_chartevents3])

# Fill in final_label for those that are nan
new_chartevents4.loc[new_chartevents4['final_label'].isna(), 'final_label'] = new_chartevents4['label'].str.lower()
# Order
new_chartevents4 = new_chartevents4.sort_values(by=['stay_id'])
# Reset index 
new_chartevents4.reset_index(inplace=True, drop=True)

new_chartevents4
new_chartevents4.final_label.nunique()

# Create date column 
new_chartevents4['date'] =  pd.to_datetime(new_chartevents4['charttime']).dt.date
# Create hour column
new_chartevents4['hour'] =  pd.to_datetime(new_chartevents4['charttime']).dt.hour
# Pivot
catch22_pivoted_method2 = pd.pivot_table(new_chartevents4, index=['stay_id', 'date', 'hour'], columns=['final_label'], values=['valuenum'])
catch22_pivoted_method2.columns = catch22_pivoted_method2.columns.droplevel()

# Rename ph
catch22_pivoted_method2 = catch22_pivoted_method2.rename(columns={'ph .': 'ph'})

from sktime.datatypes import check_raise
check_raise(catch22_pivoted_method2, 'pd_multiindex_hier')

catch22_pivoted_method2
len(catch22_pivoted_method2.columns)

792

new_chartevents2: 24


Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning,label,final_label
0,12207593,22795209,30000646,2194-04-29 20:00:00,2194-04-29 19:48:00,223761,98.6,37.0,°F,0.0,Temperature Fahrenheit,temperature
1,12207593,22795209,30000646,2194-04-30 13:00:00,2194-04-30 14:14:00,220045,94,94.0,bpm,0.0,Heart Rate,heart rate
2,12207593,22795209,30000646,2194-04-30 12:00:00,2194-04-30 14:20:00,224059,Potential Problem,2.0,,0.0,Braden Friction/Shear,braden friction/shear
3,12207593,22795209,30000646,2194-04-30 12:00:00,2194-04-30 14:20:00,224058,Adequate,3.0,,0.0,Braden Nutrition,braden nutrition
4,12207593,22795209,30000646,2194-04-30 12:00:00,2194-04-30 14:20:00,224057,Slight Limitations,3.0,,0.0,Braden Mobility,braden mobility
...,...,...,...,...,...,...,...,...,...,...,...,...
7351852,17526143,29577504,39996073,2175-09-05 10:00:00,2175-09-05 10:02:00,220045,76,76.0,bpm,0.0,Heart Rate,heart rate
7351853,17526143,29577504,39996073,2175-09-05 09:40:00,2175-09-05 09:40:00,220277,99,99.0,%,0.0,O2 saturation pulseoxymetry,o2 saturation pulseoxymetry
7351854,17526143,29577504,39996073,2175-09-05 09:35:00,2175-09-05 09:40:00,220277,100,100.0,%,0.0,O2 saturation pulseoxymetry,o2 saturation pulseoxymetry
7351855,17526143,29577504,39996073,2175-09-05 10:00:00,2175-09-05 10:09:00,224056,Bedfast,1.0,,0.0,Braden Activity,braden activity


59

True

Unnamed: 0_level_0,Unnamed: 1_level_0,final_label,agitation,arterial co2 pressure,blood pressure diastolic,blood pressure mean,blood pressure systolic,braden activity,braden friction/shear,braden mobility,...,strength r arm,strength r leg,temperature,tidal volume (observed),tidal volume (set),tidal volume (spontaneous),troponin-t,wbc
stay_id,date,hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
30000646,2194-04-29,1,,,68.5,77.5,111.0,1.0,2.0,3.0,...,,,37.000000,,,,,
30000646,2194-04-29,2,,,65.0,75.0,111.0,,,,...,,,,,,,,8.5
30000646,2194-04-29,3,,,58.0,67.0,97.0,,,,...,,,,,,,,
30000646,2194-04-29,4,,,57.0,67.0,98.0,,,,...,,,,,,,,
30000646,2194-04-29,5,,,66.0,73.0,98.0,,,,...,,,37.111111,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39996073,2175-09-19,21,,,,,,,,,...,,,,,,,,
39996073,2175-09-20,0,,,86.0,,125.0,,,,...,5.0,5.0,36.666667,,,,,
39996073,2175-09-20,4,,,96.0,,149.0,,,,...,5.0,5.0,36.722222,,,,,5.7
39996073,2175-09-20,8,,,102.0,110.0,124.0,3.0,2.0,3.0,...,5.0,5.0,36.722222,,,,,


59

In [10]:
# Define C22 function
def c22_extra_fun(df):
    c22 = Catch22()
    c22_2 = Catch22()
    master_df = pd.DataFrame()
    overlord_df = pd.DataFrame()
    remove_set = set()
    for x in range(len(df.columns)): # Iterate through columns so not to many nans
        print(x)
        working_df = df.iloc[:, x]
        working_df.dropna(inplace=True) # Drop nans # Note this causes issue with hours being dropped and hence data not being evenly spaced as exspected - could correct with forward filling as before?? But also since doing per feature and per patient likley to be relativly regular anyway 
        working_df = working_df.to_frame()
        for column_name in working_df:
            master_df = pd.DataFrame()
            master_df2 = pd.DataFrame()
            for stay_id, new_df in working_df[[column_name]].groupby(level=0):
                gb = new_df.groupby(level=1)
                group_dict = dict(list(gb))
                n = 0
                for date, new_df2 in new_df.groupby(level=1):
                    n += 1
                    # Create new df that incoperates all data to date for stay
                    group_dict_values_list = list(group_dict.values())[:n]
                    if len(group_dict_values_list) > 1:
                        new_group_dict_values_list = []
                        for y in range(len(group_dict_values_list)):
                            if y == 0:
                                new_group_dict_values_list.append(group_dict_values_list[y])
                            else:
                                sub_df = group_dict_values_list[y].copy()
                                sub_df.reset_index(inplace=True)
                                sub_df['hour'] = sub_df['hour'] + (24*y) # Update hours fo c22 works 
                                sub_df.set_index(['stay_id', 'date', 'hour'], inplace=True)
                                new_group_dict_values_list.append(sub_df)
                        new_df3 = pd.concat(new_group_dict_values_list)
                        new_df3.reset_index(inplace=True)
                        new_df3["date"] = date # Update date so c22 works
                        new_df3.set_index(['stay_id', 'date', 'hour'], inplace=True)
                    else:
                        new_df3 = pd.concat(group_dict_values_list)

                    # C22 for current day
                    if len(new_df2) <= 2: # c22 only works with 3 or more timepoints
                        transformed_data = pd.DataFrame()
                        transformed_data['_mean'] = new_df2.mean().values[0]
                        transformed_data['_std'] = new_df2.std().values[0]
                    else:
                        transformed_data = c22.fit_transform(new_df2)
                        transformed_data['_mean'] = new_df2.mean().values[0]
                        transformed_data['_std'] = new_df2.std().values[0]
                    transformed_data = transformed_data.add_prefix(column_name)           
                    transformed_data.insert(0, 'stay_id', stay_id)
                    transformed_data.insert(1, 'date', date)

                    # C22 for all data to date for stay
                    if len(new_df3) <= 2:
                        transformed_data2 = pd.DataFrame()
                        transformed_data2['_mean'] = new_df3.mean().values[0]
                        transformed_data2['_std'] = new_df3.std().values[0]
                    else:
                        transformed_data2 = c22_2.fit_transform(new_df3)
                        transformed_data2['_mean'] = new_df2.mean().values[0]
                        transformed_data2['_std'] = new_df2.std().values[0]
                    transformed_data2 = transformed_data2.add_prefix(column_name)
                    transformed_data2 = transformed_data2.add_suffix('_current_stay') # Indicate different as temporal over whole of current stay             
                    transformed_data2.insert(0, 'stay_id', stay_id)
                    transformed_data2.insert(1, 'date', date)

                    # Create master df's
                    master_df = pd.concat([master_df, transformed_data])
                    master_df2 = pd.concat([master_df2, transformed_data2])

                    #print('master_df', master_df)
                    #print('master_df2', master_df2)

            master_df.reset_index(inplace=True, drop=True)
            master_df2.reset_index(inplace=True, drop=True)

            master_df = master_df.merge(master_df2, how='left', on=['stay_id', 'date'])

            #print('master_df', master_df)
            #print('master_df.info()', master_df.info())

        if x == 0:
            overlord_df = master_df.copy()
        else:
            overlord_df = overlord_df.merge(master_df, how='outer', on=['stay_id', 'date'])

    return overlord_df

In [154]:
c22_data_method2 = c22_extra_fun(catch22_pivoted_method2)
print(c22_data_method2)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
        stay_id        date  agitation0  agitation1  agitation2  agitation3  \
0      30005085  2136-01-27         2.8        3.00         4.0    0.250000   
1      30005085  2136-01-28         1.3        1.15         2.0    0.125000   
2      30005085  2136-01-29         0.1        0.05         1.0    0.333333   
3      30046600  2169-03-18         NaN         NaN         1.0    0.000000   
4      30100328  2161-12-30         0.9        0.95         1.0    0.333333   
...         ...         ...         ...         ...         ...         ...   
33291  39983109  2110-07-13         NaN         NaN         NaN         NaN   
33292  35094999  2114-10-08         NaN         NaN         NaN         NaN   
33293  35094999  2114-10-09         NaN         NaN         NaN         NaN   
33294  38766778  2152-03-22         NaN    

In [155]:
c22_data_method2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33296 entries, 0 to 33295
Columns: 2834 entries, stay_id to wbc_std_current_stay
dtypes: float64(2832), int64(1), object(1)
memory usage: 720.2+ MB


In [156]:
# Save df
c22_data_method2.to_csv('catch_22_data.csv', index=False)