In [1]:
# Libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sktime.transformations.panel.catch22 import Catch22
import math

In [2]:
# Supress mean of empty slice warning
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)

In [6]:
# Import
path = r'switch_data/antibiotic_po_flag_2023.csv'
antibiotic_df = pd.read_csv(path)

In [3]:
antibiotic_df.SUBJECT.nunique()
antibiotic_df.SPELL_IDENTIFIER.nunique()

3217

3431

In [5]:
# Save text file to get subjects to filter for sql
np.savetxt("antibiotic_subjects_2023.txt", antibiotic_df.SUBJECT.unique(), fmt='%s', delimiter=",", newline="','")

In [None]:
import json, snowflake.connector

# establish the connection to snowflake
ctx = snowflake.connector.connect( 
    **json.load(open('/opt/ich/python-snowflake-defaults.json')))
    
# verify and test if connection is working
try: 
    cs = ctx.cursor() 
    cs.execute('SELECT current_version(), current_role(), current_warehouse()')
    print(cs.fetchone())
finally: 
    cs.close()

In [7]:
# Import 
query = '''
SELECT * from ICHT_SANDBOX_PROD.COVOAM_22016.SWITCH_VITALS_filtered_2023
'''
cur = ctx.cursor().execute(query)
switch_vitals = pd.DataFrame.from_records(iter(cur), columns=[x[0] for x in cur.description])

In [9]:
# Filter columns
switch_vitals = switch_vitals[['SUBJECT', 'OBSERVATION_CODE', 'OBSERVATION_NAME', 'OBSERVATION_DATETIME', 'OBSERVATION_START_DATETIME', 'OBSERVATION_END_DATETIME', 'OBSERVATION_RESULT', 'OBSERVATION_RESULT_CLEAN', 'OBSERVATION_UNIT']]

In [10]:
# Save
#switch_vitals.to_csv('switch_data/switch_vitals_filtered_2023.csv', index=False)

In [7]:
# Import
path = r'switch_data/switch_vitals_filtered_2023.csv'
switch_vitals = pd.read_csv(path)

  switch_vitals = pd.read_csv(path)


In [11]:
# Chech units 
switch_vitals.groupby('OBSERVATION_NAME')['OBSERVATION_UNIT'].nunique()

OBSERVATION_NAME
Diastolic Blood Pressure            1
Diastolic Blood Pressure Cuff       1
Glasgow Coma Score                  0
HR - SpO2 (AN)                      1
Heart Rate                          1
Mean Arterial Pressure, Cuff        1
Mean Arterial Pressure, Invasive    1
NEWS Conscious Level Score          0
NEWS Supplemental Oxygen Calc       0
Respiratory Rate                    1
SaO2%                               1
SpO2                                1
SpO2 (AN)                           1
Systolic Blood Pressure             1
Systolic Blood Pressure Cuff        1
Temperature                         1
Name: OBSERVATION_UNIT, dtype: int64

In [9]:
# Rename some OBSERVATION_NAME
switch_vitals['OBSERVATION_NAME'] = switch_vitals['OBSERVATION_NAME'].replace({'SpO2 (AN)': 'SpO2', 'SaO2%':'SpO2', 'Systolic Blood Pressure Cuff':'Systolic Blood Pressure', 'Diastolic Blood Pressure Cuff':'Diastolic Blood Pressure', 'Mean Arterial Pressure, Cuff': 'Mean Arterial Pressure', 'Mean Arterial Pressure, Invasive': 'Mean Arterial Pressure', 'HR - SpO2 (AN)': 'Heart Rate'})

In [10]:
switch_vitals['OBSERVATION_NAME'].nunique()
switch_vitals['OBSERVATION_NAME'].unique()

10

array(['Temperature', 'Mean Arterial Pressure', 'Systolic Blood Pressure',
       'Diastolic Blood Pressure', 'Heart Rate',
       'NEWS Conscious Level Score', 'NEWS Supplemental Oxygen Calc',
       'Respiratory Rate', 'Glasgow Coma Score', 'SpO2'], dtype=object)

In [11]:
# Create date column for merge
switch_vitals['date'] =  pd.to_datetime(switch_vitals['OBSERVATION_DATETIME']).dt.date

In [12]:
# Conver to datetime 
switch_vitals['date'] =  pd.to_datetime(switch_vitals['date'])
antibiotic_df['ADMINISTRATION_DATETIME'] =  pd.to_datetime(antibiotic_df['ADMINISTRATION_DATETIME'])

In [13]:
# Merge
antibiotic_vitals = pd.merge(antibiotic_df, switch_vitals, left_on=['SUBJECT', 'ADMINISTRATION_DATETIME'], right_on=['SUBJECT', 'date'])

In [18]:
antibiotic_vitals.SUBJECT.nunique()
antibiotic_vitals.SPELL_IDENTIFIER.nunique()

3217

3431

In [14]:
# Filter columns
antibiotic_vitals = antibiotic_vitals[['SUBJECT', 'SPELL_IDENTIFIER', 'ADMINISTRATION_DATETIME', 'ROUTE', 'po_flag', 'iv_treatment_length', 'OBSERVATION_NAME', 'OBSERVATION_DATETIME', 'OBSERVATION_RESULT_CLEAN', 'date']]

In [15]:
# Order
antibiotic_vitals.sort_values(by=['SUBJECT', 'SPELL_IDENTIFIER', 'ADMINISTRATION_DATETIME', 'OBSERVATION_NAME', 'OBSERVATION_DATETIME'], inplace=True)

In [16]:
# Reset index
antibiotic_vitals.reset_index(drop=True, inplace=True)

In [23]:
# Mean number of observations per stay per observation per day
antibiotic_vitals.groupby(['SPELL_IDENTIFIER', 'OBSERVATION_NAME', 'date']).size().mean()

5.386282532634344

In [None]:
# Create hour column
antibiotic_vitals['hour'] =  pd.to_datetime(antibiotic_vitals['OBSERVATION_DATETIME']).dt.hour

In [25]:
def hours_grouped_fun(row):
    if row['hour'] < 6:
        return 1
    elif row['hour'] < 12:
        return 2
    elif row['hour'] < 18:
        return 3
    elif row['hour'] < 24:
        return 4

In [None]:
antibiotic_vitals['hours_grouped'] = antibiotic_vitals.apply(hours_grouped_fun, axis=1)

In [27]:
# Pivot
antibiotic_vitals_pivoted = pd.pivot_table(antibiotic_vitals, index=['SPELL_IDENTIFIER', 'date', 'hours_grouped'], columns=['OBSERVATION_NAME'], values=['OBSERVATION_RESULT_CLEAN'])
antibiotic_vitals_pivoted.columns = antibiotic_vitals_pivoted.columns.droplevel()

In [29]:
def add_all_hours_grouped(df):
    final_new_df = pd.DataFrame(columns=df.columns)
    spell_list = df['SPELL_IDENTIFIER'].unique().tolist()
    for spell in spell_list:
        temp_df = df[df['SPELL_IDENTIFIER'] == spell]
        temp_df.reset_index(drop=True, inplace=True)
        # Convert to str
        temp_df['date'] = temp_df['date'].astype(str)
        date_list = temp_df['date'].unique().tolist()
        for x in range(len(date_list)):
            temp_df2 = temp_df[temp_df['date'].isin([date_list[x]])]
            temp_df2.reset_index(drop=True, inplace=True)
            new_df = pd.DataFrame(columns=temp_df2.columns)

            if x == 0:
                for i in range(temp_df2['hours_grouped'].iloc[0],5):
                    day_df = temp_df2[temp_df2['hours_grouped'] == i]
                    if day_df.empty:
                        new_df.loc[i] = np.nan
                        new_df.loc[i, 'hours_grouped'] = i
                        new_df.loc[i, 'SPELL_IDENTIFIER'] = temp_df2.iloc[0]['SPELL_IDENTIFIER']
                        new_df.loc[i, 'date'] = temp_df2.iloc[0]['date']
                    else:
                        new_df = pd.concat([new_df, temp_df2[temp_df2['hours_grouped'] == i]])
                    
            elif x == len(date_list) - 1:
                for i in range(1,temp_df2['hours_grouped'].iloc[-1]+1):
                    day_df = temp_df2[temp_df2['hours_grouped'] == i]
                    if day_df.empty:
                        new_df.loc[i] = np.nan
                        new_df.loc[i, 'hours_grouped'] = i
                        new_df.loc[i, 'SPELL_IDENTIFIER'] = temp_df2.iloc[0]['SPELL_IDENTIFIER']
                        new_df.loc[i, 'date'] = temp_df2.iloc[0]['date']
                    else:
                        new_df = pd.concat([new_df, temp_df2[temp_df2['hours_grouped'] == i]])
            else:
                for i in range(1,5):
                    day_df = temp_df2[temp_df2['hours_grouped'] == i]
                    if day_df.empty:
                        new_df.loc[i] = np.nan
                        new_df.loc[i, 'hours_grouped'] = i
                        new_df.loc[i, 'SPELL_IDENTIFIER'] = temp_df2.iloc[0]['SPELL_IDENTIFIER']
                        new_df.loc[i, 'date'] = temp_df2.iloc[0]['date']
                    else:
                        new_df = pd.concat([new_df, temp_df2[temp_df2['hours_grouped'] == i]])
            final_new_df = pd.concat([final_new_df, new_df], ignore_index=True)
    return final_new_df

In [None]:
antibiotic_vitals_pivoted_2 = add_all_hours_grouped(antibiotic_vitals_pivoted.reset_index())

In [32]:
antibiotic_vitals_pivoted_2.hours_grouped.value_counts()

2    18056
3    17698
1    17079
4    16715
Name: hours_grouped, dtype: int64

In [None]:
# Forward fill
antibiotic_vitals_pivoted_3 = antibiotic_vitals_pivoted_2.groupby(['SPELL_IDENTIFIER'])['Diastolic Blood Pressure', 'Glasgow Coma Score', 'Heart Rate', 'Mean Arterial Pressure', 'NEWS Conscious Level Score', 'NEWS Supplemental Oxygen Calc', 'Respiratory Rate', 'SpO2', 'Systolic Blood Pressure', 'Temperature'].ffill()
antibiotic_vitals_pivoted_3 = antibiotic_vitals_pivoted_2[['SPELL_IDENTIFIER', 'date', 'hours_grouped']].join(antibiotic_vitals_pivoted_3)

In [None]:
# Remove those with less than 24 hours of data

In [34]:
antibiotic_vitals_pivoted_4 = antibiotic_vitals_pivoted_3[antibiotic_vitals_pivoted_3['SPELL_IDENTIFIER'].isin((antibiotic_vitals_pivoted_3.groupby('SPELL_IDENTIFIER').size() > 3).where(lambda x : x == True).dropna().reset_index()['SPELL_IDENTIFIER'].to_list())]
antibiotic_vitals_pivoted_4.reset_index(inplace=True, drop=True)

In [35]:
# Save
#antibiotic_vitals_pivoted_4.to_csv('switch_data/antibiotic_vitals_2023.csv', index=False)

In [37]:
antibiotic_vitals_pivoted_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69536 entries, 0 to 69535
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   SPELL_IDENTIFIER               69536 non-null  object 
 1   date                           69536 non-null  object 
 2   hours_grouped                  69536 non-null  object 
 3   Diastolic Blood Pressure       69468 non-null  float64
 4   Glasgow Coma Score             52354 non-null  float64
 5   Heart Rate                     69500 non-null  float64
 6   Mean Arterial Pressure         69228 non-null  float64
 7   NEWS Conscious Level Score     60009 non-null  float64
 8   NEWS Supplemental Oxygen Calc  59710 non-null  float64
 9   Respiratory Rate               69383 non-null  float64
 10  SpO2                           69434 non-null  float64
 11  Systolic Blood Pressure        69470 non-null  float64
 12  Temperature                    69434 non-null 

In [38]:
# Needs to be type int for c22 to work
antibiotic_vitals_pivoted_4['hours_grouped'] = antibiotic_vitals_pivoted_4['hours_grouped'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  antibiotic_vitals_pivoted_4['hours_grouped'] = antibiotic_vitals_pivoted_4['hours_grouped'].astype(int)


In [None]:
### 24 hours ###

In [39]:
c22_24 = antibiotic_vitals_pivoted_4.groupby('SPELL_IDENTIFIER').head(4)
c22_24.set_index(['SPELL_IDENTIFIER', 'date', 'hours_grouped'] ,inplace=True, drop=True)

In [44]:
from sktime.datatypes import check_raise
check_raise(c22_24, 'pd_multiindex_hier')

True

In [47]:
# Define C22 function
def c22_24_fun(df):
    c22 = Catch22(catch24=True) # Add catch24 = True
    master_df = pd.DataFrame()
    overlord_df = pd.DataFrame()
    remove_set = set()
    for x in range(len(df.columns)): # Iterate through columns so not to many nans
        print('column number:', x)
        working_df = df.iloc[:, x]
        working_df = working_df.to_frame()
        for column_name in working_df:
            master_df = pd.DataFrame()
            master_df2 = pd.DataFrame()
            for stay_id, new_df in working_df[[column_name]].groupby(level=0):
                    new_df2 = new_df.reset_index(drop=True).dropna()
                    
                    # C22 for current day
                    if len(new_df2) <= 2: # c22 only works with 3 or more timepoints
                        transformed_data = pd.DataFrame()
                        np_data = new_df2.to_numpy()
                        if not np_data.size == 0:
                            transformed_data['22'] = np.mean(new_df2.to_numpy())
                            transformed_data['23'] = np.std(new_df.to_numpy())
                    else:
                        transformed_data = c22.fit_transform(new_df2)

                    transformed_data = transformed_data.add_prefix(column_name)           
                    transformed_data.insert(0, 'SPELL_IDENTIFIER', stay_id)
                    transformed_data.insert(1, 'date', new_df.reset_index().iloc[-1]['date'])
                    transformed_data.insert(2, '24_hour_flag', 1)

                    # C22 for all data to date for stay - same for first 24 hours
                    transformed_data2 = transformed_data.iloc[:,3:].copy()
                    transformed_data2 = transformed_data2.add_suffix('_current_stay') # Indicate different as temporal over whole of current stay
                    transformed_data2.insert(0, 'SPELL_IDENTIFIER', stay_id)
                    transformed_data2.insert(1, 'date', new_df.reset_index().iloc[-1]['date'])
                    transformed_data2.insert(2, '24_hour_flag', 1)

                    # Create master df's
                    master_df = pd.concat([master_df, transformed_data])
                    master_df2 = pd.concat([master_df2, transformed_data2])


            master_df.reset_index(inplace=True, drop=True)
            master_df2.reset_index(inplace=True, drop=True)

            master_df = master_df.merge(master_df2, how='left', on=['SPELL_IDENTIFIER', 'date', '24_hour_flag'])

        if x == 0:
            overlord_df = master_df.copy()
        else:
            overlord_df = overlord_df.merge(master_df, how='outer', on=['SPELL_IDENTIFIER', 'date', '24_hour_flag'])
        
    return overlord_df

In [None]:
c22_24_hour_df = c22_24_fun(c22_24)

In [60]:
# Save
#c22_24_hour_df.to_csv('switch_data/c22_24_hour_2023.csv', index=False)

In [None]:
### 48 hours ###

In [62]:
# Define C22 function
def c22_48_fun(df):
    c22 = Catch22(catch24=True)
    c22_2 = Catch22(catch24=True)
    master_df = pd.DataFrame()
    overlord_df = pd.DataFrame()
    remove_set = set()
    for x in range(len(df.columns)): # Iterate through columns so not to many nans
        print('column number:', x)
        working_df = df.iloc[:, x]
        working_df = working_df.to_frame()
        for column_name in working_df:
            master_df = pd.DataFrame()
            master_df2 = pd.DataFrame()
            for stay_id, new_df in working_df[[column_name]].groupby(level=0):

                    new_df2 = new_df.tail(4).reset_index(drop=True).dropna()
                    new_df3 = new_df.reset_index(drop=True).dropna()
                    
                    # C22 for latest day
                    if len(new_df2) <= 2: # c22 only works with 3 or more timepoints
                        transformed_data = pd.DataFrame()
                        np_data = new_df2.to_numpy()
                        if not np_data.size == 0:
                            transformed_data['22'] = np.mean(np_data)
                            transformed_data['23'] = np.std(np_data)
                    else: 
                        transformed_data = c22.fit_transform(new_df2)
                    transformed_data = transformed_data.add_prefix(column_name)           
                    transformed_data.insert(0, 'SPELL_IDENTIFIER', stay_id)
                    transformed_data.insert(1, 'date', new_df.reset_index().iloc[-1]['date'])
                    transformed_data.insert(2, '48_hour_flag', 1)

                    # C22 for all data to date for stay
                    if len(new_df3) <= 2: # c22 only works with 3 or more timepoints
                        transformed_data2 = pd.DataFrame()
                        np_data = new_df3.to_numpy()
                        if not np_data.size == 0:
                            transformed_data2['22'] = np.mean(np_data)
                            transformed_data2['23'] = np.std(np_data) 
                    else:
                        transformed_data2 = c22_2.fit_transform(new_df3)
                    transformed_data2 = transformed_data2.add_prefix(column_name)  
                    transformed_data2 = transformed_data2.add_suffix('_current_stay') # Indicate different as temporal over whole of current stay             
                    transformed_data2.insert(0, 'SPELL_IDENTIFIER', stay_id)
                    transformed_data2.insert(1, 'date', new_df.reset_index().iloc[-1]['date'])
                    transformed_data2.insert(2, '48_hour_flag', 1)

                    # Create master df's
                    master_df = pd.concat([master_df, transformed_data])
                    master_df2 = pd.concat([master_df2, transformed_data2])

            master_df.reset_index(inplace=True, drop=True)
            master_df2.reset_index(inplace=True, drop=True)

            master_df = master_df.merge(master_df2, how='left', on=['SPELL_IDENTIFIER', 'date', '48_hour_flag'])

        if x == 0:
            overlord_df = master_df.copy()
        else:
            overlord_df = overlord_df.merge(master_df, how='outer', on=['SPELL_IDENTIFIER', 'date', '48_hour_flag'])

    return overlord_df

In [41]:
# Define
c22_48 = antibiotic_vitals_pivoted_4.groupby('SPELL_IDENTIFIER').head(8)

In [42]:
# Remove those without enough data
c22_48 = c22_48[c22_48['SPELL_IDENTIFIER'].isin((c22_48.groupby('SPELL_IDENTIFIER').size() > 7).where(lambda x : x == True).dropna().reset_index()['SPELL_IDENTIFIER'].to_list())]

In [43]:
# Set index
c22_48.set_index(['SPELL_IDENTIFIER', 'date', 'hours_grouped'] ,inplace=True, drop=True)

In [None]:
c22_48_hour_df = c22_48_fun(c22_48)

In [64]:
# Save
c22_48_hour_df.to_csv('switch_data/c22_48_hour_2023.csv', index=False)

In [None]:
### Other ###

In [66]:
# Remove those without enough data
c22_other = antibiotic_vitals_pivoted_4[antibiotic_vitals_pivoted_4['SPELL_IDENTIFIER'].isin((antibiotic_vitals_pivoted_4.groupby('SPELL_IDENTIFIER').size() > 8).where(lambda x : x == True).dropna().reset_index()['SPELL_IDENTIFIER'].to_list())]
# Set index
c22_other.set_index(['SPELL_IDENTIFIER', 'date', 'hours_grouped'] ,inplace=True, drop=True)

In [67]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [68]:
def c22_other_fun(df):
    c22 = Catch22(catch24=True)
    master_df = pd.DataFrame()
    overlord_df = pd.DataFrame()
    for x in range(len(df.columns)): # Iterate through columns sas this c22 does not work on multiple columns
        print('column number:', x)
        working_df = df.iloc[:, x]
        working_df = working_df.to_frame()
        for column_name in working_df:
            master_df = pd.DataFrame()
            master_df2 = pd.DataFrame()
            for stay_id, new_df in working_df[[column_name]].groupby(level=0, dropna=False): # Iterate through stays

                n_list = list(range(1, math.ceil(len(new_df)/4)+1))
                for n in range(1, math.ceil(len(new_df)/4)+1): # Do this to understand how many 'days' / sets of data we will have for each stay beyond 24 and 48 hours
                    # Ignore first 48 hours as calculated seperatly
                    if n == 1:
                        continue
                    elif n == 2:
                        continue
                    # This is for exceptional cases where we want hours grouped 1 for last day and the spell starts on hours grouped 2
                    elif (new_df.reset_index().iloc[0]['hours_grouped'] == 2) and ((n == (len(new_df)/4)) and (n == n_list[-1])):
                        exit_flag = True
                        flag_2 = False
                        flag_1 = False
                        # Get index of nth 2 hours grouped value
                        try:
                            index_value = new_df.reset_index().iloc[(new_df.reset_index().groupby('hours_grouped').cumcount() == n-1)[new_df.reset_index().groupby('hours_grouped').cumcount() == n-1].index][new_df.reset_index().iloc[(new_df.reset_index().groupby('hours_grouped').cumcount() == n-1)[new_df.reset_index().groupby('hours_grouped').cumcount() == n-1].index]['hours_grouped'] == 2].index.values[0]
                            new_df2 = new_df.iloc[:index_value+1]
                            exit_flag = False
                            flag_2 = True
                        #  Look at nth 1 hours grouped  
                        except:
                            pass

                        try:
                            index_value = new_df.reset_index().iloc[(new_df.reset_index().groupby('hours_grouped').cumcount() == n-1)[new_df.reset_index().groupby('hours_grouped').cumcount() == n-1].index][new_df.reset_index().iloc[(new_df.reset_index().groupby('hours_grouped').cumcount() == n-1)[new_df.reset_index().groupby('hours_grouped').cumcount() == n-1].index]['hours_grouped'] == 1].index.values[0]
                            new_df3 = new_df.iloc[:index_value+1]
                            exit_flag = False
                            flag_1 = True
                        except:
                            pass
                        
                        if exit_flag == True:
                            continue
                        
                        if flag_2 == True:
                            new_df2_2 = new_df2.tail(4).reset_index(drop=True).dropna()
                            new_df2_3 = new_df2.reset_index(drop=True).dropna()
                    
                            # C22 for latest day
                            if len(new_df2_2) <= 2: # c22 only works with 3 or more timepoints
                                transformed_data = pd.DataFrame()
                                np_data = new_df2_2.to_numpy()
                                if not np_data.size == 0:
                                    transformed_data['22'] = np.mean(np_data)
                                    transformed_data['23'] = np.std(np_data) 
                            else: 
                                transformed_data = c22.fit_transform(new_df2_2)

                            transformed_data = transformed_data.add_prefix(column_name)           
                            transformed_data.insert(0, 'SPELL_IDENTIFIER', stay_id)
                            transformed_data.insert(1, 'date', new_df2.reset_index().iloc[-1]['date'])

                            # C22 for all data to date for stay
                            if len(new_df2_3) <= 2: # c22 only works with 3 or more timepoints
                                transformed_data2 = pd.DataFrame()
                                np_data = new_df2_3.to_numpy()
                                if not np_data.size == 0:
                                    transformed_data2['22'] = np.mean(np_data)
                                    transformed_data2['23'] = np.std(np_data)
                            else: 
                                transformed_data2 = c22.fit_transform(new_df2_3)

                            transformed_data2 = transformed_data2.add_prefix(column_name)  
                            transformed_data2 = transformed_data2.add_suffix('_current_stay') # Indicate different as temporal over whole of current stay             
                            transformed_data2.insert(0, 'SPELL_IDENTIFIER', stay_id)
                            transformed_data2.insert(1, 'date', new_df2.reset_index().iloc[-1]['date'])

                        else:
                            transformed_data = pd.DataFrame()
                            transformed_data2 = pd.DataFrame()
                        
                        if flag_1 == True:
                            new_df3_2 = new_df3.tail(4).reset_index(drop=True).dropna()
                            new_df3_3 = new_df3.reset_index(drop=True).dropna()

                            # C22 for latest day
                            if len(new_df3_2) <= 2: # c22 only works with 3 or more timepoints
                                transformed_data3 = pd.DataFrame()
                                np_data = new_df3_2.to_numpy()
                                if not np_data.size == 0:
                                    transformed_data3['22'] = np.mean(np_data)
                                    transformed_data3['23'] = np.std(np_data)
                            else: 
                                transformed_data3 = c22.fit_transform(new_df3_2)

                            transformed_data3 = transformed_data3.add_prefix(column_name)           
                            transformed_data3.insert(0, 'SPELL_IDENTIFIER', stay_id)
                            transformed_data3.insert(1, 'date', new_df.reset_index().iloc[-1]['date'])

                            # C22 for all data to date for stay
                            if len(new_df3_3) <= 2: # c22 only works with 3 or more timepoints
                                transformed_data4 = pd.DataFrame()
                                np_data = new_df3_3.to_numpy()
                                if not np_data.size == 0:
                                    transformed_data4['22'] = np.mean(np_data)
                                    transformed_data4['23'] = np.std(np_data) 
                            else: 
                                transformed_data4 = c22.fit_transform(new_df3_3)
                                transformed_data4 = transformed_data4.add_prefix(column_name)  
                                transformed_data4 = transformed_data4.add_suffix('_current_stay') # Indicate different as temporal over whole of current stay             
                                transformed_data4.insert(0, 'SPELL_IDENTIFIER', stay_id)
                                transformed_data4.insert(1, 'date', new_df3.reset_index().iloc[-1]['date'])
                        else:
                            transformed_data3 = pd.DataFrame()
                            transformed_data4 = pd.DataFrame()
                        
                        # Combine
                        transformed_data = pd.concat([transformed_data, transformed_data3], ignore_index=True)
                        transformed_data2 = pd.concat([transformed_data2, transformed_data4], ignore_index=True)

                        # Create master df's
                        master_df = pd.concat([master_df, transformed_data])
                        master_df2 = pd.concat([master_df2, transformed_data2])


                    # This is for most cases  
                    else:
                        # Get index of nth 2 hours grouped value
                        try:
                            index_value = new_df.reset_index().iloc[(new_df.reset_index().groupby('hours_grouped').cumcount() == n-1)[new_df.reset_index().groupby('hours_grouped').cumcount() == n-1].index][new_df.reset_index().iloc[(new_df.reset_index().groupby('hours_grouped').cumcount() == n-1)[new_df.reset_index().groupby('hours_grouped').cumcount() == n-1].index]['hours_grouped'] == 2].index.values[0]
                            new_df2 = new_df.iloc[:index_value+1]
                        except:
                            # If fails look at nth 1 hours grouped  
                            try:
                                index_value = new_df.reset_index().iloc[(new_df.reset_index().groupby('hours_grouped').cumcount() == n-1)[new_df.reset_index().groupby('hours_grouped').cumcount() == n-1].index][new_df.reset_index().iloc[(new_df.reset_index().groupby('hours_grouped').cumcount() == n-1)[new_df.reset_index().groupby('hours_grouped').cumcount() == n-1].index]['hours_grouped'] == 1].index.values[0]
                                new_df2 = new_df.iloc[:index_value+1]
                            except:
                                continue

                        new_df2_2 = new_df2.tail(4).reset_index(drop=True).dropna()
                        new_df2_3 = new_df2.reset_index(drop=True).dropna()
                    
                        # C22 for latest day
                        if len(new_df2_2) <= 2: # c22 only works with 3 or more timepoints
                            transformed_data = pd.DataFrame()
                            np_data = new_df2_2.to_numpy()
                            if not np_data.size == 0:
                                transformed_data['22'] = np.mean(np_data)
                                transformed_data['23'] = np.std(np_data) 
                        else: 
                            transformed_data = c22.fit_transform(new_df2_2)                        
                        
                        transformed_data = transformed_data.add_prefix(column_name)           
                        transformed_data.insert(0, 'SPELL_IDENTIFIER', stay_id)
                        transformed_data.insert(1, 'date', new_df2.reset_index().iloc[-1]['date'])

                        # C22 for all data to date for stay
                        if len(new_df2_3) <= 2: # c22 only works with 3 or more timepoints
                            transformed_data2 = pd.DataFrame()
                            np_data = new_df2_3.to_numpy()
                            if not np_data.size == 0:
                                transformed_data2['22'] = np.mean(np_data)
                                transformed_data2['23'] = np.std(np_data)
                        else: 
                            transformed_data2 = c22.fit_transform(new_df2_3)                        

                        transformed_data2 = transformed_data2.add_prefix(column_name)  
                        transformed_data2 = transformed_data2.add_suffix('_current_stay') # Indicate different as temporal over whole of current stay             
                        transformed_data2.insert(0, 'SPELL_IDENTIFIER', stay_id)
                        transformed_data2.insert(1, 'date', new_df2.reset_index().iloc[-1]['date'])

                        # Create master df's
                        master_df = pd.concat([master_df, transformed_data])
                        master_df2 = pd.concat([master_df2, transformed_data2])

            master_df.reset_index(inplace=True, drop=True)
            master_df2.reset_index(inplace=True, drop=True)

            if master_df.empty:
                if master_df2.empty:
                    continue
                else:
                    master_df = master_df2
            else:
                master_df = master_df.merge(master_df2, how='left', on=['SPELL_IDENTIFIER', 'date'])

        if x == 0:
            overlord_df = master_df.copy()
        elif master_df.empty:
            continue
        else:
            overlord_df = overlord_df.merge(master_df, how='outer', on=['SPELL_IDENTIFIER', 'date'])
            overlord_df.dropna(axis=0, how='all', inplace=True)
        
        # Save as go through
        overlord_df.to_csv('switch_data/new_c22_other_days.csv', index=False)

    return overlord_df

In [69]:
c22_other_days_df = c22_other_fun(c22_other)

column number: 0
column number: 1
column number: 2
column number: 3
column number: 4
column number: 5
column number: 6
column number: 7
column number: 8
column number: 9


In [70]:
# Save
c22_other_days_df.to_csv('switch_data/c22_other_days_2023.csv', index=False)

In [77]:
c22_other_days_df.drop(columns=['22', '23'], inplace=True)

In [78]:
# Save
c22_other_days_df.to_csv('switch_data/c22_other_days_final_2023.csv', index=False)

# Combine and work out difference 

In [21]:
# Import
path = r'switch_data/c22_24_hour_2023.csv'
c22_24_hour = pd.read_csv(path)

In [22]:
# Import
path = r'switch_data/c22_48_hour_2023.csv'
c22_48_hour = pd.read_csv(path)

In [23]:
# Import
path = r'switch_data/c22_other_days_final_2023.csv'
c22_other_days_df = pd.read_csv(path)

In [44]:
# Combine
c22_combined = pd.concat([c22_24_hour, c22_48_hour, c22_other_days_df])

In [45]:
# Move column
col = c22_combined.pop("48_hour_flag")
c22_combined.insert(3, col.name, col)

  c22_combined.insert(3, col.name, col)


In [46]:
# Reset index
c22_combined2 = c22_combined.reset_index(drop=True)

In [47]:
# Order
c22_combined2 = c22_combined2.sort_values(by=['SPELL_IDENTIFIER', 'date'])

In [50]:
# Reset index
c22_combined2 = c22_combined2.reset_index(drop=True)

In [53]:
# Difference
c22_combined3 = c22_combined2.drop(columns=['date', '24_hour_flag', '48_hour_flag'])

c22_combined_diff = pd.DataFrame()
c22_combined3.set_index('SPELL_IDENTIFIER', inplace=True)
for stay_id, new_df in c22_combined3.groupby(level=0):
    diff_df = new_df.diff()
    diff_df = diff_df.add_suffix('_difference')
    c22_combined_diff = pd.concat([c22_combined_diff, diff_df], ignore_index=True)

c22_combined3.reset_index(inplace=True)
c22_combined_diff = pd.concat([c22_combined3, c22_combined_diff], axis=1)
c22_combined_diff = pd.concat([c22_combined2[['date', '24_hour_flag', '48_hour_flag']], c22_combined_diff], axis=1)
col = c22_combined_diff.pop("SPELL_IDENTIFIER")
c22_combined_diff.insert(0, col.name, col)

In [56]:
c22_combined_diff['SPELL_IDENTIFIER'].nunique()

3427

In [57]:
# Save
#c22_combined_diff.to_csv('switch_data/c22_all_with_diff_2023.csv', index=False)