In [1]:
# Libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sktime.transformations.panel.catch22 import Catch22
import math

In [2]:
# Import
path = r'switch_data/antibiotic_po_flag.csv'
antibiotic_df = pd.read_csv(path)

In [6]:
antibiotic_df.SUBJECT.nunique()
antibiotic_df.SPELL_IDENTIFIER.nunique()

9268

10309

In [4]:
# Save text file to get subjects to filter for sql
np.savetxt("antibiotic_subjects.txt", antibiotic_df.SUBJECT.unique(), fmt='%s', delimiter=",", newline="','")

In [None]:
import json, snowflake.connector

# establish the connection to snowflake
ctx = snowflake.connector.connect( 
    **json.load(open('/opt/ich/python-snowflake-defaults.json')))
    
# verify and test if connection is working
try: 
    cs = ctx.cursor() 
    cs.execute('SELECT current_version(), current_role(), current_warehouse()')
    print(cs.fetchone())
finally: 
    cs.close()

In [None]:
# Import 
query = '''
SELECT * from ICHT_SANDBOX_PROD.COVOAM_22016.SWITCH_VITALS_filtered
'''
cur = ctx.cursor().execute(query)
switch_vitals = pd.DataFrame.from_records(iter(cur), columns=[x[0] for x in cur.description])

In [6]:
# Filter columns
switch_vitals = switch_vitals[['SUBJECT', 'OBSERVATION_CODE', 'OBSERVATION_NAME', 'OBSERVATION_DATETIME', 'OBSERVATION_START_DATETIME', 'OBSERVATION_END_DATETIME', 'OBSERVATION_RESULT', 'OBSERVATION_RESULT_CLEAN', 'OBSERVATION_UNIT']]

In [7]:
# Save
#switch_vitals.to_csv('switch_data/switch_vitals_filtered.csv', index=False)

In [3]:
# Import
path = r'switch_data/switch_vitals_filtered_2.csv'
switch_vitals = pd.read_csv(path)

In [5]:
# Check units
switch_vitals.groupby('OBSERVATION_NAME')['OBSERVATION_UNIT'].nunique()

OBSERVATION_NAME
Diastolic Blood Pressure            1
Diastolic Blood Pressure Cuff       1
Glasgow Coma Score                  0
HR - SpO2 (AN)                      1
Heart Rate                          1
Mean Arterial Pressure, Cuff        1
Mean Arterial Pressure, Invasive    1
NEWS Conscious Level Score          0
NEWS Supplemental Oxygen Calc       0
Respiratory Rate                    2
SaO2%                               1
SpO2                                1
SpO2 (AN)                           1
Systolic Blood Pressure             1
Systolic Blood Pressure Cuff        1
Temperature                         1
Name: OBSERVATION_UNIT, dtype: int64

In [9]:
switch_vitals[switch_vitals['OBSERVATION_NAME'] == 'HR - SpO2 (AN)']['OBSERVATION_UNIT'].unique()

array(['beats/minute'], dtype=object)

In [9]:
switch_vitals[switch_vitals['OBSERVATION_NAME'] == 'Respiratory Rate']['OBSERVATION_UNIT'].unique()

array(['breaths/minute', nan, 'beats/minute'], dtype=object)

In [None]:
switch_vitals[(switch_vitals['OBSERVATION_NAME'] == 'Respiratory Rate')&(switch_vitals['OBSERVATION_UNIT'] == 'breaths/minute')]

In [None]:
switch_vitals[(switch_vitals['OBSERVATION_NAME'] == 'Respiratory Rate')&(switch_vitals['OBSERVATION_UNIT'] == 'beats/minute')]

In [None]:
# Units seems ok 

In [4]:
# Rename some OBSERVATION_NAME
switch_vitals['OBSERVATION_NAME'] = switch_vitals['OBSERVATION_NAME'].replace({'SpO2 (AN)': 'SpO2', 'SaO2%':'SpO2', 'Systolic Blood Pressure Cuff':'Systolic Blood Pressure', 'Diastolic Blood Pressure Cuff':'Diastolic Blood Pressure', 'Mean Arterial Pressure, Cuff': 'Mean Arterial Pressure', 'Mean Arterial Pressure, Invasive': 'Mean Arterial Pressure', 'HR - SpO2 (AN)': 'Heart Rate'})

In [10]:
switch_vitals['OBSERVATION_NAME'].nunique()
switch_vitals['OBSERVATION_NAME'].unique()

10

array(['Temperature', 'Mean Arterial Pressure', 'Systolic Blood Pressure',
       'Diastolic Blood Pressure', 'Heart Rate',
       'NEWS Conscious Level Score', 'NEWS Supplemental Oxygen Calc',
       'SpO2', 'Respiratory Rate', 'Glasgow Coma Score'], dtype=object)

In [5]:
# Create date column for merge
switch_vitals['date'] =  pd.to_datetime(switch_vitals['OBSERVATION_DATETIME']).dt.date

In [6]:
# Conver to datetime 
switch_vitals['date'] =  pd.to_datetime(switch_vitals['date'])
antibiotic_df['ADMINISTRATION_DATETIME'] =  pd.to_datetime(antibiotic_df['ADMINISTRATION_DATETIME'])

In [7]:
# Merge
antibiotic_vitals = pd.merge(antibiotic_df, switch_vitals, left_on=['SUBJECT', 'ADMINISTRATION_DATETIME'], right_on=['SUBJECT', 'date'])

In [14]:
antibiotic_vitals.SUBJECT.nunique()
antibiotic_vitals.SPELL_IDENTIFIER.nunique()

9100

10134

In [28]:
antibiotic_vitals.columns

Index(['SUBJECT', 'SPELL_IDENTIFIER', 'ADMISSION_DATE_TIME', 'DISCHARGE_DATE_TIME', 'ADMINISTRATION_DATETIME', 'ROUTE', 'po_flag', 'iv_treatment_length', 'OBSERVATION_CODE', 'OBSERVATION_NAME', 'OBSERVATION_DATETIME', 'OBSERVATION_START_DATETIME', 'OBSERVATION_END_DATETIME', 'OBSERVATION_RESULT', 'OBSERVATION_RESULT_CLEAN', 'OBSERVATION_UNIT', 'date'], dtype='object')

In [8]:
# Filter columns
antibiotic_vitals = antibiotic_vitals[['SUBJECT', 'SPELL_IDENTIFIER', 'ADMINISTRATION_DATETIME', 'ROUTE', 'po_flag', 'iv_treatment_length', 'OBSERVATION_NAME', 'OBSERVATION_DATETIME', 'OBSERVATION_RESULT_CLEAN', 'date']]

In [9]:
# Order
antibiotic_vitals.sort_values(by=['SUBJECT', 'SPELL_IDENTIFIER', 'ADMINISTRATION_DATETIME', 'OBSERVATION_NAME', 'OBSERVATION_DATETIME'], inplace=True)

In [10]:
# Reset index
antibiotic_vitals.reset_index(drop=True, inplace=True)

In [18]:
# Mean number of observations per stay per observation per day
antibiotic_vitals.groupby(['SPELL_IDENTIFIER', 'OBSERVATION_NAME', 'date']).size().mean()

5.552622195272173

In [23]:
# Mean number of observations per day for each observation type
pd.DataFrame(antibiotic_vitals.groupby(['SPELL_IDENTIFIER', 'OBSERVATION_NAME', 'date']).size().reset_index()).groupby('OBSERVATION_NAME')[0].mean()

OBSERVATION_NAME
Diastolic Blood Pressure         6.331618
Glasgow Coma Score               2.871955
Heart Rate                       6.421748
Mean Arterial Pressure           5.853772
NEWS Conscious Level Score       3.287533
NEWS Supplemental Oxygen Calc    3.284980
Respiratory Rate                 6.152422
SpO2                             6.273301
Systolic Blood Pressure          6.332756
Temperature                      5.544472
Name: 0, dtype: float64

In [None]:
# Create hour column
antibiotic_vitals['hour'] =  pd.to_datetime(antibiotic_vitals['OBSERVATION_DATETIME']).dt.hour

In [25]:
# Mean number of observations per hour
pd.DataFrame(antibiotic_vitals.groupby(['SPELL_IDENTIFIER', 'OBSERVATION_NAME', 'date', 'hour']).size().reset_index()).groupby('hour')[0].mean()

hour
0     1.180377
1     1.122415
2     1.134049
3     1.209125
4     1.224128
5     1.065309
6     1.051953
7     1.128873
8     1.109344
9     1.071106
10    1.081173
11    1.172531
12    1.248718
13    1.194278
14    1.243992
15    1.291801
16    1.196338
17    1.177652
18    1.219734
19    1.320787
20    1.089730
21    1.066465
22    1.130438
23    1.235142
Name: 0, dtype: float64

In [26]:
# Number of observations per hour
pd.DataFrame(antibiotic_vitals.groupby(['OBSERVATION_NAME', 'hour']).size().reset_index()).groupby('hour')[0].mean()

hour
0      7079.9
1      9040.6
2      8120.7
3      4622.0
4      4725.5
5     13990.7
6     24358.7
7      9712.6
8      6088.3
9     16009.5
10    19306.5
11    10317.1
12     8621.9
13    12999.0
14    12692.2
15    10796.1
16    16309.2
17    18206.5
18    15217.4
19     9583.1
20    17261.0
21    21499.5
22     9224.6
23     5411.9
Name: 0, dtype: float64

In [21]:
# Mean for 6 hours grouped -- see less observations overnight
(7626.1+9715.1+8786.4+4954.0+5068.0+15278.4)/6
(26734.4+10552.3+6546.3+17460.1+21116.2+11234.0)/6
(9297.1+14091.4+13742.0+11724.9+17848.6+19837.2)/6
(16548.7+10329.6+18946.5+23524.2+10012.0+5808.1)/6

8571.333333333334

15607.216666666667

14423.533333333333

14194.85

In [None]:
# Will tackle iregularrly spaced timeseries by agregating by 6 hours so 4 readings a day and forward fill if needed 

In [21]:
def hours_grouped_fun(row):
    if row['hour'] < 6:
        return 1
    elif row['hour'] < 12:
        return 2
    elif row['hour'] < 18:
        return 3
    elif row['hour'] < 24:
        return 4

In [None]:
antibiotic_vitals['hours_grouped'] = antibiotic_vitals.apply(hours_grouped_fun, axis=1)

In [24]:
# Pivot
antibiotic_vitals_pivoted = pd.pivot_table(antibiotic_vitals, index=['SPELL_IDENTIFIER', 'date', 'hours_grouped'], columns=['OBSERVATION_NAME'], values=['OBSERVATION_RESULT_CLEAN'])
antibiotic_vitals_pivoted.columns = antibiotic_vitals_pivoted.columns.droplevel()

In [26]:
def add_all_hours_grouped(df):
    final_new_df = pd.DataFrame(columns=df.columns)
    spell_list = df['SPELL_IDENTIFIER'].unique().tolist()
    for spell in spell_list:
        temp_df = df[df['SPELL_IDENTIFIER'] == spell]
        temp_df.reset_index(drop=True, inplace=True)
        # Convert to str
        temp_df['date'] = temp_df['date'].astype(str)
        date_list = temp_df['date'].unique().tolist()
        for x in range(len(date_list)):
            temp_df2 = temp_df[temp_df['date'].isin([date_list[x]])]
            temp_df2.reset_index(drop=True, inplace=True)
            new_df = pd.DataFrame(columns=temp_df2.columns)
            if x == 0:
                for i in range(temp_df2['hours_grouped'].iloc[0],5):
                    day_df = temp_df2[temp_df2['hours_grouped'] == i]
                    if day_df.empty:
                        new_df.loc[i] = np.nan
                        new_df.loc[i, 'hours_grouped'] = i
                        new_df.loc[i, 'SPELL_IDENTIFIER'] = temp_df2.iloc[0]['SPELL_IDENTIFIER']
                        new_df.loc[i, 'date'] = temp_df2.iloc[0]['date']
                    else:
                        new_df = pd.concat([new_df, temp_df2[temp_df2['hours_grouped'] == i]])
                    
            elif x == len(date_list) - 1:
                for i in range(1,temp_df2['hours_grouped'].iloc[-1]+1):
                    day_df = temp_df2[temp_df2['hours_grouped'] == i]
                    if day_df.empty:
                        new_df.loc[i] = np.nan
                        new_df.loc[i, 'hours_grouped'] = i
                        new_df.loc[i, 'SPELL_IDENTIFIER'] = temp_df2.iloc[0]['SPELL_IDENTIFIER']
                        new_df.loc[i, 'date'] = temp_df2.iloc[0]['date']
                    else:
                        new_df = pd.concat([new_df, temp_df2[temp_df2['hours_grouped'] == i]])
            else:
                for i in range(1,5):
                    day_df = temp_df2[temp_df2['hours_grouped'] == i]
                    if day_df.empty:
                        new_df.loc[i] = np.nan
                        new_df.loc[i, 'hours_grouped'] = i
                        new_df.loc[i, 'SPELL_IDENTIFIER'] = temp_df2.iloc[0]['SPELL_IDENTIFIER']
                        new_df.loc[i, 'date'] = temp_df2.iloc[0]['date']
                    else:
                        new_df = pd.concat([new_df, temp_df2[temp_df2['hours_grouped'] == i]])
            final_new_df = pd.concat([final_new_df, new_df], ignore_index=True)
    return final_new_df

In [None]:
antibiotic_vitals_pivoted_2 = add_all_hours_grouped(antibiotic_vitals_pivoted.reset_index())

In [29]:
antibiotic_vitals_pivoted_2.hours_grouped.value_counts()

2    56628
3    55842
1    53745
4    53332
Name: hours_grouped, dtype: int64

In [30]:
antibiotic_vitals_pivoted_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219547 entries, 0 to 219546
Data columns (total 13 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   SPELL_IDENTIFIER               219547 non-null  object 
 1   date                           219547 non-null  object 
 2   hours_grouped                  219547 non-null  object 
 3   Diastolic Blood Pressure       189565 non-null  float64
 4   Glasgow Coma Score             26260 non-null   float64
 5   Heart Rate                     189980 non-null  float64
 6   Mean Arterial Pressure         181730 non-null  float64
 7   NEWS Conscious Level Score     107827 non-null  float64
 8   NEWS Supplemental Oxygen Calc  107714 non-null  float64
 9   Respiratory Rate               188068 non-null  float64
 10  SpO2                           188812 non-null  float64
 11  Systolic Blood Pressure        189579 non-null  float64
 12  Temperature                   

In [47]:
antibiotic_vitals_pivoted_2.columns

Index(['SPELL_IDENTIFIER', 'date', 'hours_grouped', 'Diastolic Blood Pressure', 'Glasgow Coma Score', 'Heart Rate', 'Mean Arterial Pressure', 'NEWS Conscious Level Score', 'NEWS Supplemental Oxygen Calc', 'Respiratory Rate', 'SpO2', 'Systolic Blood Pressure', 'Temperature'], dtype='object', name='OBSERVATION_NAME')

In [None]:
# Forward fill
antibiotic_vitals_pivoted_3 = antibiotic_vitals_pivoted_2.groupby(['SPELL_IDENTIFIER'])['Diastolic Blood Pressure', 'Glasgow Coma Score', 'Heart Rate', 'Mean Arterial Pressure', 'NEWS Conscious Level Score', 'NEWS Supplemental Oxygen Calc', 'Respiratory Rate', 'SpO2', 'Systolic Blood Pressure', 'Temperature'].ffill()
antibiotic_vitals_pivoted_3 = antibiotic_vitals_pivoted_2[['SPELL_IDENTIFIER', 'date', 'hours_grouped']].join(antibiotic_vitals_pivoted_3)

In [33]:
antibiotic_vitals_pivoted_3.drop_duplicates(subset=['SPELL_IDENTIFIER'], keep='last')['hours_grouped'].value_counts()

3.0    3389
2.0    3330
4.0    3287
1.0     128
Name: hours_grouped, dtype: int64

In [None]:
# Do prediction at 24 hours, 48 hours and then daily afterwards with data up to midday (hours grouped 2)
# If midday data not available then just use up to hours groups 1

In [None]:
# Remove those with less than 24 hours of data

In [83]:
len((antibiotic_vitals_pivoted_3.groupby('SPELL_IDENTIFIER').size() > 3).where(lambda x : x == True).dropna().reset_index()['SPELL_IDENTIFIER'].to_list())

10538

In [34]:
antibiotic_vitals_pivoted_4 = antibiotic_vitals_pivoted_3[antibiotic_vitals_pivoted_3['SPELL_IDENTIFIER'].isin((antibiotic_vitals_pivoted_3.groupby('SPELL_IDENTIFIER').size() > 3).where(lambda x : x == True).dropna().reset_index()['SPELL_IDENTIFIER'].to_list())]

In [35]:
antibiotic_vitals_pivoted_4['SPELL_IDENTIFIER'].nunique()

10007

In [36]:
antibiotic_vitals_pivoted_4.reset_index(inplace=True, drop=True)

In [38]:
# Save
#antibiotic_vitals_pivoted_4.to_csv('switch_data/antibiotic_vitals.csv', index=False)

In [2]:
# Import
path = r'switch_data/antibiotic_vitals.csv'
antibiotic_vitals_pivoted_4 = pd.read_csv(path)
antibiotic_vitals_pivoted_4['hours_grouped'] = antibiotic_vitals_pivoted_4['hours_grouped'].astype(int) # Needs to be type int for c22 to work

In [None]:
### 24 hours ###

In [6]:
c22_24 = antibiotic_vitals_pivoted_4.groupby('SPELL_IDENTIFIER').head(4)
c22_24.set_index(['SPELL_IDENTIFIER', 'date', 'hours_grouped'] ,inplace=True, drop=True)

In [None]:
### 48 hours ###

In [7]:
# Define
c22_48 = antibiotic_vitals_pivoted_4.groupby('SPELL_IDENTIFIER').head(8)

In [11]:
c22_48['SPELL_IDENTIFIER'].nunique()

10007

In [8]:
# Remove those without enough data
c22_48 = c22_48[c22_48['SPELL_IDENTIFIER'].isin((c22_48.groupby('SPELL_IDENTIFIER').size() > 7).where(lambda x : x == True).dropna().reset_index()['SPELL_IDENTIFIER'].to_list())]

In [13]:
c22_48['SPELL_IDENTIFIER'].nunique()

8869

In [9]:
# Set index
c22_48.set_index(['SPELL_IDENTIFIER', 'date', 'hours_grouped'] ,inplace=True, drop=True)

In [15]:
(c22_48.groupby('SPELL_IDENTIFIER').size() < 4).where(lambda x : x == True).dropna()
(c22_48.groupby('SPELL_IDENTIFIER').size() > 8).where(lambda x : x == True).dropna()
(c22_48.groupby('SPELL_IDENTIFIER').size().min())
(c22_48.groupby('SPELL_IDENTIFIER').size().max())
len((c22_48.groupby('SPELL_IDENTIFIER').size() > 7).where(lambda x : x == True).dropna())


Series([], dtype: float64)

Series([], dtype: float64)

8

8

8869

In [None]:
# Other days

In [3]:
# Remove those without enough data
c22_other = antibiotic_vitals_pivoted_4[antibiotic_vitals_pivoted_4['SPELL_IDENTIFIER'].isin((antibiotic_vitals_pivoted_4.groupby('SPELL_IDENTIFIER').size() > 8).where(lambda x : x == True).dropna().reset_index()['SPELL_IDENTIFIER'].to_list())]

In [19]:
c22_other['SPELL_IDENTIFIER'].nunique()

8643

In [4]:
# Set index
c22_other.set_index(['SPELL_IDENTIFIER', 'date', 'hours_grouped'] ,inplace=True, drop=True)

In [None]:
# c22

In [23]:
from sktime.datatypes import check_raise
check_raise(c22_24, 'pd_multiindex_hier')

True

In [25]:
c22_24.index.levels[0].nunique()

10007

In [34]:
from sktime.transformations.panel.catch22 import Catch22

In [32]:
import sktime

In [33]:
sktime.__version__

'0.16.1'

In [20]:
# Supress mean of empty slice warning
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

In [37]:
# Define C22 function
def c22_24_fun(df):
    c22 = Catch22(catch24=True) # Add catch24 = True
    master_df = pd.DataFrame()
    overlord_df = pd.DataFrame()
    remove_set = set()
    for x in range(len(df.columns)): # Iterate through columns so not to many nans
        print('column number:', x)
        working_df = df.iloc[:, x]
        working_df = working_df.to_frame()
        for column_name in working_df:
            master_df = pd.DataFrame()
            master_df2 = pd.DataFrame()
            for stay_id, new_df in working_df[[column_name]].groupby(level=0):
                    new_df2 = new_df.reset_index(drop=True).dropna()
                    
                    # C22 for current day
                    if len(new_df2) <= 2: # c22 only works with 3 or more timepoints
                        transformed_data = pd.DataFrame()
                        np_data = new_df2.to_numpy()
                        if not np_data.size == 0:
                            transformed_data['22'] = np.mean(new_df2.to_numpy())
                            transformed_data['23'] = np.std(new_df.to_numpy())
                    else:
                        transformed_data = c22.fit_transform(new_df2)

                    transformed_data = transformed_data.add_prefix(column_name)           
                    transformed_data.insert(0, 'SPELL_IDENTIFIER', stay_id)
                    transformed_data.insert(1, 'date', new_df.reset_index().iloc[-1]['date'])
                    transformed_data.insert(2, '24_hour_flag', 1)

                    # C22 for all data to date for stay - same for first 24 hours
                    transformed_data2 = transformed_data.iloc[:,3:].copy()
                    transformed_data2 = transformed_data2.add_suffix('_current_stay') # Indicate different as temporal over whole of current stay
                    transformed_data2.insert(0, 'SPELL_IDENTIFIER', stay_id)
                    transformed_data2.insert(1, 'date', new_df.reset_index().iloc[-1]['date'])
                    transformed_data2.insert(2, '24_hour_flag', 1)

                    # Create master df's
                    master_df = pd.concat([master_df, transformed_data])
                    master_df2 = pd.concat([master_df2, transformed_data2])


            master_df.reset_index(inplace=True, drop=True)
            master_df2.reset_index(inplace=True, drop=True)

            master_df = master_df.merge(master_df2, how='left', on=['SPELL_IDENTIFIER', 'date', '24_hour_flag'])

        if x == 0:
            overlord_df = master_df.copy()
        else:
            overlord_df = overlord_df.merge(master_df, how='outer', on=['SPELL_IDENTIFIER', 'date', '24_hour_flag'])
        
    return overlord_df

In [40]:
c22_24_hour_df = c22_24_fun(c22_24)

column number: 0
column number: 1
column number: 2
column number: 3
column number: 4
column number: 5
column number: 6
column number: 7
column number: 8
column number: 9


In [42]:
# Save
#c22_24_hour_df.to_csv('switch_data/c22_24_hour.csv', index=False)

In [45]:
# Difference
c22_24_hour_df2 = c22_24_hour_df.drop(columns=['date', '24_hour_flag'])

c22_24_hour_diff_df = pd.DataFrame()
c22_24_hour_df2.set_index('SPELL_IDENTIFIER', inplace=True)
for stay_id, new_df in c22_24_hour_df2.groupby(level=0):
    diff_df = new_df.diff()
    diff_df = diff_df.add_suffix('_difference')
    c22_24_hour_diff_df = pd.concat([c22_24_hour_diff_df, diff_df], ignore_index=True)

c22_24_hour_df2.reset_index(inplace=True)
c22_24_hour_diff_df = pd.concat([c22_24_hour_df2, c22_24_hour_diff_df], axis=1)
c22_24_hour_diff_df = pd.concat([c22_24_hour_df[['date', '24_hour_flag']], c22_24_hour_diff_df], axis=1)
col = c22_24_hour_diff_df.pop("SPELL_IDENTIFIER")
c22_24_hour_diff_df.insert(0, col.name, col)

In [None]:
# Obviously diff does not make much of a difference here as just one day for each patient but work out anyway 

In [49]:
# Save
#c22_24_hour_diff_df.to_csv('switch_data/c22_24_hour_with_diff.csv', index=False)

In [None]:
# 48

In [78]:
# Define C22 function
def c22_48_fun(df):
    c22 = Catch22(catch24=True)
    c22_2 = Catch22(catch24=True)
    master_df = pd.DataFrame()
    overlord_df = pd.DataFrame()
    remove_set = set()
    for x in range(len(df.columns)): # Iterate through columns so not to many nans
        print('column number:', x)
        working_df = df.iloc[:, x]
        working_df = working_df.to_frame()
        for column_name in working_df:
            master_df = pd.DataFrame()
            master_df2 = pd.DataFrame()
            for stay_id, new_df in working_df[[column_name]].groupby(level=0):

                    new_df2 = new_df.tail(4).reset_index(drop=True).dropna()
                    new_df3 = new_df.reset_index(drop=True).dropna()
                    
                    # C22 for latest day
                    if len(new_df2) <= 2: # c22 only works with 3 or more timepoints
                        transformed_data = pd.DataFrame()
                        np_data = new_df2.to_numpy()
                        if not np_data.size == 0:
                            transformed_data['22'] = np.mean(np_data)
                            transformed_data['23'] = np.std(np_data)
                    else: 
                        transformed_data = c22.fit_transform(new_df2)
                    transformed_data = transformed_data.add_prefix(column_name)           
                    transformed_data.insert(0, 'SPELL_IDENTIFIER', stay_id)
                    transformed_data.insert(1, 'date', new_df.reset_index().iloc[-1]['date'])
                    transformed_data.insert(2, '48_hour_flag', 1)

                    # C22 for all data to date for stay
                    if len(new_df3) <= 2: # c22 only works with 3 or more timepoints
                        transformed_data2 = pd.DataFrame()
                        np_data = new_df3.to_numpy()
                        if not np_data.size == 0:
                            transformed_data2['22'] = np.mean(np_data)
                            transformed_data2['23'] = np.std(np_data) 
                    else:
                        transformed_data2 = c22_2.fit_transform(new_df3)
                    transformed_data2 = transformed_data2.add_prefix(column_name)  
                    transformed_data2 = transformed_data2.add_suffix('_current_stay') # Indicate different as temporal over whole of current stay             
                    transformed_data2.insert(0, 'SPELL_IDENTIFIER', stay_id)
                    transformed_data2.insert(1, 'date', new_df.reset_index().iloc[-1]['date'])
                    transformed_data2.insert(2, '48_hour_flag', 1)

                    # Create master df's
                    master_df = pd.concat([master_df, transformed_data])
                    master_df2 = pd.concat([master_df2, transformed_data2])

            master_df.reset_index(inplace=True, drop=True)
            master_df2.reset_index(inplace=True, drop=True)

            master_df = master_df.merge(master_df2, how='left', on=['SPELL_IDENTIFIER', 'date', '48_hour_flag'])

        if x == 0:
            overlord_df = master_df.copy()
        else:
            overlord_df = overlord_df.merge(master_df, how='outer', on=['SPELL_IDENTIFIER', 'date', '48_hour_flag'])

    return overlord_df

In [81]:
c22_48_hour_df = c22_48_fun(c22_48)

column number: 0
column number: 1
column number: 2
column number: 3
column number: 4
column number: 5
column number: 6
column number: 7
column number: 8
column number: 9


In [84]:
c22_48.reset_index()['SPELL_IDENTIFIER'].nunique()

8869

In [85]:
# Save
#c22_48_hour_df.to_csv('switch_data/c22_48_hour.csv', index=False)

In [87]:
# Difference
c22_48_hour_df2 = c22_48_hour_df.drop(columns=['date', '48_hour_flag'])

c22_48_hour_diff_df = pd.DataFrame()
c22_48_hour_df2.set_index('SPELL_IDENTIFIER', inplace=True)
for stay_id, new_df in c22_48_hour_df2.groupby(level=0):
    diff_df = new_df.diff()
    diff_df = diff_df.add_suffix('_difference')
    c22_48_hour_diff_df = pd.concat([c22_48_hour_diff_df, diff_df], ignore_index=True)

c22_48_hour_df2.reset_index(inplace=True)
c22_48_hour_diff_df = pd.concat([c22_48_hour_df2, c22_48_hour_diff_df], axis=1)
c22_48_hour_diff_df = pd.concat([c22_48_hour_df[['date', '48_hour_flag']], c22_48_hour_diff_df], axis=1)
col = c22_48_hour_diff_df.pop("SPELL_IDENTIFIER")
c22_48_hour_diff_df.insert(0, col.name, col)

In [89]:
# Save
#c22_48_hour_diff_df.to_csv('switch_data/c22_48_hour_with_diff.csv', index=False)