In [1]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
pd.reset_option('all')
pd.set_option('display.max_columns', None)

  pd.reset_option('all')


In [3]:
DATA_PATH = 'pickle_vars/all_data_cleaned.pkl.xz'

#HOURS_BEFORE_RFD = 24    ## sample window
#HOURS_SINCE_ADM = 4

## variables for which to only use single feature column:
single_feature_variables = ['k', 'na', 'bun', 'creatinine', 'hco3', 'haemoglobin', 'fio2', 'airway', 'pco2', 'po2', 'pain']

### Functions

In [4]:
def f(x):
    d={}
    x=x.sort_values(by='hrs_bRFD',ascending=False)
    
    #Bristol paper features
    xo = x[x.init==0]
    d['MAX'] = xo['VALUENUM'].max()
    d['MIN'] = xo['VALUENUM'].min()
    values = list(xo['VALUENUM'])
    d['VALUES'] = values
    d['LAST_VALUE'] = values[-1] if (len(values)>0) else np.nan
    d['COUNT'] = xo['VALUENUM'].count()
    
    # Initial data
    xi = x[x.init==1]
    
    d['MEAN_GRAD'] = xo['VALUENUM'].mean() - xi['VALUENUM'].mean()/np.random.choice(x['LOS_AC'])
    d['MIN_MAX_GRAD'] = (xo['VALUENUM'].min() - xi['VALUENUM'].max())/np.random.choice(x['LOS_AC'])
    d['MAX_MIN_GRAD'] = (xo['VALUENUM'].max() - xi['VALUENUM'].min())/np.random.choice(x['LOS_AC'])
    # Gradient features
    #grads = x['VALUENUM'].diff()/(x['hrs_bRFD'].diff())
    #grads.replace([np.inf, -np.inf], np.nan, inplace=True) # get rid of inf grads
    #d['MAX_GRAD'] = grads.max()
    #d['MIN_GRAD'] = grads.min()
    #d['MEAN_GRAD'] = grads.mean()
    #d['VAR_GRAD'] = grads.var()
    #d['FL_GRAD'] = (x['VALUENUM'].iloc[[0,-1]].diff()/x['hrs_bRFD'].iloc[[0,-1]].diff()).values[1]
    d['cohort'] = np.random.choice(x['cohort'])
    d['outcome'] = np.random.choice(x['outcome'])
    
    #return pd.Series(d, index=['MIN', 'MAX', 'VALUES', 'COUNT', 'MAX_GRAD', 'MIN_GRAD', 'MEAN_GRAD', 'VAR_GRAD', 'FL_GRAD', 'cohort', 'outcome'])
    return pd.Series(d, index=['MIN', 'MAX', 'VALUES', 'LAST_VALUE', 'COUNT', 'MEAN_GRAD', 'MIN_MAX_GRAD', 'MAX_MIN_GRAD', 'cohort', 'outcome'])

In [5]:
def _split_features_to_columns2(features, single_feature_vars=single_feature_variables):
    fts = ['MIN','MAX','MEAN_GRAD','MIN_MAX_GRAD','MAX_MIN_GRAD']
    vitals = features[~features.VARIABLE.isin(single_feature_variables)].VARIABLE.unique()
    labs = features[features.VARIABLE.isin(single_feature_variables)].VARIABLE.unique()
    #ids = features.ICUSTAY_ID.unique()
    F = features.copy()
    
    for vital in vitals:
        for ft in fts:
            F[vital+'_'+ft] = F.apply(lambda row: np.nan if (row['VARIABLE']!=vital) else row[ft], axis = 1)
    for lab in labs:
        F[lab] = F.apply(lambda row: np.nan if (row['VARIABLE']!=lab) else row['LAST_VALUE'], axis = 1)
    
    cols = [(vital+'_'+ft) for vital in vitals for ft in fts]
    cols = ['ICUSTAY_ID']+cols+single_feature_variables
    F = F[cols]
    FM = F.groupby(by=['ICUSTAY_ID'], as_index=False).first()
    return FM
    

In [6]:
def create_FM(df, days=0, ID_start = 0, HOURS_BEFORE_RFD=4, PREDICT_HOURS=0, HOURS_SINCE_ADM=4):    
    print('days = '+str(days))
    
    df = pd.read_pickle(DATA_PATH)
    df['LOS_AC'] = df['LOS_AC'] - (PREDICT_HOURS / 24) - days #subtract PREDICT_HOURS and days from LOS
    df = df[df.LOS_AC>=1]

    # Prepare table for before callout sample window 
    dfl = df.copy()
    dfl = dfl[~dfl.VARIABLE.isin(single_feature_variables)] # remove lab results
    dfl = dfl[dfl['hrs_bRFD']>= 24*days + PREDICT_HOURS]
    dfl = dfl[dfl['hrs_bRFD']<= 24*days + PREDICT_HOURS + HOURS_BEFORE_RFD]
    dfl['init'] = 0

    #print("There are %d icu stays with variables recorded in the final %d hour window, %d hours before CALLOUT." %(len(dfl['ICUSTAY_ID'].unique()),HOURS_BEFORE_RFD,PREDICT_HOURS))

    # Prepare table for initial data window
    dfi = df.copy()
    dfi = dfi[~dfi.VARIABLE.isin(single_feature_variables)] # remove lab results for initial data features
    dfi = dfi[dfi['hrs_SA']>= (dfi['LOS_AC']*24 - HOURS_BEFORE_RFD) % 24]
    dfi = dfi[dfi['hrs_SA']<= (dfi['LOS_AC']*24 - HOURS_BEFORE_RFD) % 24 + HOURS_SINCE_ADM]
    dfi['init'] = 1

    #dfi['VARIABLE'] = 'init_' + dfi['VARIABLE'] # add initial tag to separate from other measurements
    #print("There are %d icu stays with variables recorded in the initial %d hour window" %(len(dfi['ICUSTAY_ID'].unique()),HOURS_SINCE_ADM))

    # Prepare table for lab test results over lookback window
    dfs = df.copy()
    dfs = dfs[dfs['hrs_bRFD']>=0]
    dfs = dfs[dfs['hrs_bRFD']<=48]
    dfs = dfs[dfs.VARIABLE.isin(single_feature_variables)] # Last result gets chosen
    dfs['init'] = 0

    dfils = pd.concat([dfi,dfl,dfs])
    #dfils = pd.concat([dfl,dfs]) # ignore initial data
    dfils = dfils[dfils.LOS_AC>=1]
    
    summary = dfils.groupby(['ICUSTAY_ID'],as_index=False).agg(outcome=('outcome',np.random.choice),
                                                        cohort=('cohort',np.random.choice),
                                                        readmit=('readmit',np.random.choice),
                                                        in_h_death=('in_h_death',np.random.choice),
                                                        in_icu_death=('in_icu_death',np.random.choice),
                                                        LOS_AC=('LOS_AC',np.random.choice),
                                                        OUTTIME=('OUTTIME',np.random.choice),
                                                        INTIME=('INTIME',np.random.choice),
                                                        age=('age',np.random.choice),
                                                        sex=('sex',np.random.choice),
                                                        weight=('weight',np.random.choice),
                                                        height=('height',np.random.choice))
    
    print("stays: %d" %len(summary))
    print("stays in cohort: %d" %sum(summary['cohort']==1))
    #cohort_summary = summary[summary['cohort']==1]

    features = dfils.groupby(['ICUSTAY_ID','VARIABLE'],as_index=False).apply(f)
    
    FM = _split_features_to_columns2(features, single_feature_variables)
    
    feats = FM.columns.values.tolist()
    ftr = [i for i in feats if ('init' in i) & ('grad' in i)]
    FM = FM.drop(columns=ftr)
    
    FM = FM.fillna(value={'airway':0.0})
    
    nrows = len(FM)

    missing_data = pd.DataFrame()
    vname = []
    miss_freq = []

    for column in FM:
        vname.append(column) 
        miss_freq.append(sum(FM[column].isna())/nrows)

    missing_data['variable'] = vname
    missing_data['fraction_missing'] = miss_freq

    missing_data = missing_data.sort_values('fraction_missing', ascending=True)
    #print(missing_data.to_string())
    
    FM['missing'] = FM.isna().sum(axis=1)/(FM.shape[1]-1)
    
    _sum_sub = summary[['ICUSTAY_ID', 'weight', 'height', 'LOS_AC', 'age', 'sex', 'cohort', 'outcome']]
        
    FM = FM.merge(_sum_sub, on='ICUSTAY_ID')
    
    FM['new_ID'] = np.arange(ID_start, len(FM) + ID_start)
    
    FM.replace([np.inf, -np.inf], np.nan, inplace=True) # get rid of inf grads
    
    if days>0:
        FM.outcome = 0
    
    return FM, missing_data

In [7]:
def create_bal_FM(PREDICT_HOURS, HOURS_BEFORE_RFD, HOURS_SINCE_ADM, resample_days):
    df = pd.read_pickle(DATA_PATH)
    FM_bal = pd.DataFrame()
    missing_data_summary = pd.DataFrame()
    ID_start = 0

    for day in resample_days:
        data = df.copy()
        FM_day, missing_data = create_FM(data, days = day, ID_start=ID_start, HOURS_BEFORE_RFD=HOURS_BEFORE_RFD, PREDICT_HOURS=PREDICT_HOURS, HOURS_SINCE_ADM=HOURS_SINCE_ADM)
        FM_bal = pd.concat([FM_bal,FM_day])
        ID_start = FM_bal.new_ID.max() + 1
        if day == 0:
            print('True stays')
            n_true_stays = len(FM_bal[FM_bal.cohort==1])
        else:
            n_true_stays = 0
    return FM_bal, n_true_stays

### Usage

In [8]:
orig_feats = ['bp_MIN', 'bp_MAX', 
              'gcs_MIN', 'gcs_MAX', 
              'hr_MIN', 'hr_MAX', 
              'resp_MIN', 'resp_MAX', 
              'spo2_MIN', 'spo2_MAX',
              'temp_MIN', 'temp_MAX', 
              'k', 'na', 'bun', 'creatinine', 'hco3', 'haemoglobin', 'fio2', 'airway', 'pco2', 'po2', 'pain', 'weight', 'height']

grad_feats = ['bp_MEAN_GRAD', 'bp_MIN_MAX_GRAD', 'bp_MAX_MIN_GRAD',
              'gcs_MEAN_GRAD', 'gcs_MIN_MAX_GRAD', 'gcs_MAX_MIN_GRAD',
              'hr_MEAN_GRAD', 'hr_MIN_MAX_GRAD', 'hr_MAX_MIN_GRAD', 
              'resp_MEAN_GRAD', 'resp_MIN_MAX_GRAD', 'resp_MAX_MIN_GRAD',
              'spo2_MEAN_GRAD', 'spo2_MIN_MAX_GRAD', 'spo2_MAX_MIN_GRAD',
              'temp_MEAN_GRAD', 'temp_MIN_MAX_GRAD', 'temp_MAX_MIN_GRAD',]


cohort_feats = ['LOS_AC','age','sex','outcome']

In [9]:
%%time
orig_summary = dict()
grad_summary = dict()
cohort_summary = dict()

HOURS_BEFORE_RFD = 4
HOURS_SINCE_ADM = 4
PREDICT_WHEN = [0, 24, 48, 72, 96]
resample_days = [0,4,5,6,7,8]
for when in PREDICT_WHEN:
    print('FM for Predict RFD '+str(when)+' hours from sample window')
    FM_bal, n_true_stays = create_bal_FM(when, HOURS_BEFORE_RFD, HOURS_SINCE_ADM, resample_days)
    FM_bal = FM_bal[FM_bal.cohort==1]
    FM_bal.to_pickle('pickle_vars/FM_bal_'+str(when)+'_resample_days'+str(resample_days[1])+'-'+str(resample_days[-1])+'window'+str(HOURS_BEFORE_RFD)+'-'+str(HOURS_SINCE_ADM)+'.pkl.xz')
    print(FM_bal.outcome.value_counts())
    
    FM_orig_feats = FM_bal[orig_feats]
    orig_summary[str(when)+'hrs'] = FM_orig_feats.isna().sum()/len(FM_orig_feats)

    FM_grad_feats = FM_bal[grad_feats]
    grad_summary[str(when)+'hrs'] = FM_grad_feats.isna().sum()/len(FM_grad_feats)
    
    FM_cohort_feats = FM_bal[cohort_feats]
    x = pd.Series()
    x['Total ICU admissions'] = int(len(FM_cohort_feats))
    x['Number of True stays'] = int(n_true_stays)
    x['Age, median years'] = int(FM_cohort_feats['age'].median())
    x['Sex, \% female'] = round((FM_cohort_feats['sex'].sum()/len(FM_cohort_feats))*100,3)
    x['Length of stay, median days'] = round(FM_cohort_feats['LOS_AC'].median(),3)
    x['Positive outcome, \%'] = round((FM_cohort_feats['outcome'].sum()/len(FM_cohort_feats))*100,3)
    cohort_summary[str(when)+'hrs'] = x

cohort_table = pd.DataFrame.from_dict(cohort_summary)
cohort_table.to_latex('tex_files/cohort_table.tex')

orig_miss_table = pd.DataFrame.from_dict(orig_summary)
omt = pd.DataFrame(orig_miss_table, index=pd.Index(orig_feats, name='Variable'))
omt.rename(index = lambda s: s.replace('_',' '), inplace=True)
omt = omt.round(3)
omt.to_latex('tex_files/orig_miss_table.tex')

grad_miss_table = pd.DataFrame.from_dict(grad_summary)
gmt = pd.DataFrame(grad_miss_table, index=pd.Index(grad_feats, name='Variable'))
gmt.rename(index = lambda s: s.replace('_',' '), inplace=True)
gmt = gmt.round(3)
gmt.to_latex('tex_files/grad_miss_table.tex')

FM for Predict RFD 0 hours from sample window
days = 0
stays: 7480
stays in cohort: 7034
True stays
days = 4
stays: 1359
stays in cohort: 1248
days = 5
stays: 1070
stays in cohort: 986
days = 6
stays: 867
stays in cohort: 801
days = 7
stays: 694
stays in cohort: 641
days = 8
stays: 546
stays in cohort: 501
outcome
1    6299
0    4912
Name: count, dtype: int64
FM for Predict RFD 24 hours from sample window
days = 0
stays: 4185
stays in cohort: 3884
True stays
days = 4
stays: 1070
stays in cohort: 986
days = 5
stays: 867
stays in cohort: 801
days = 6
stays: 694
stays in cohort: 641
days = 7
stays: 546
stays in cohort: 501
days = 8
stays: 445
stays in cohort: 407
outcome
0    3847
1    3373
Name: count, dtype: int64
FM for Predict RFD 48 hours from sample window
days = 0
stays: 2662
stays in cohort: 2458
True stays
days = 4
stays: 867
stays in cohort: 801
days = 5
stays: 694
stays in cohort: 641
days = 6
stays: 546
stays in cohort: 501
days = 7
stays: 445
stays in cohort: 407
days = 8
sta

In [10]:
cohort_table

Unnamed: 0,0hrs,24hrs,48hrs,72hrs,96hrs
Total ICU admissions,11211.0,7220.0,5141.0,3873.0,3006.0
Number of True stays,0.0,0.0,0.0,0.0,0.0
"Age, median years",62.0,62.0,61.0,61.0,60.0
"Sex, \% female",46.767,46.607,45.828,44.952,44.311
"Length of stay, median days",2.752,3.398,3.766,4.31,4.607
"Positive outcome, \%",56.186,46.717,40.362,36.509,33.899


In [11]:
omt

Unnamed: 0_level_0,0hrs,24hrs,48hrs,72hrs,96hrs
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bp MIN,0.009,0.007,0.005,0.006,0.008
bp MAX,0.009,0.007,0.005,0.006,0.008
gcs MIN,0.099,0.084,0.074,0.073,0.07
gcs MAX,0.099,0.084,0.074,0.073,0.07
hr MIN,0.004,0.004,0.003,0.004,0.005
hr MAX,0.004,0.004,0.003,0.004,0.005
resp MIN,0.008,0.008,0.005,0.006,0.008
resp MAX,0.008,0.008,0.005,0.006,0.008
spo2 MIN,0.013,0.008,0.007,0.006,0.008
spo2 MAX,0.013,0.008,0.007,0.006,0.008


In [12]:
gmt

Unnamed: 0_level_0,0hrs,24hrs,48hrs,72hrs,96hrs
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bp MEAN GRAD,0.016,0.016,0.016,0.018,0.02
bp MIN MAX GRAD,0.016,0.016,0.016,0.018,0.02
bp MAX MIN GRAD,0.016,0.016,0.016,0.018,0.02
gcs MEAN GRAD,0.167,0.156,0.15,0.149,0.151
gcs MIN MAX GRAD,0.167,0.156,0.15,0.149,0.151
gcs MAX MIN GRAD,0.167,0.156,0.15,0.149,0.151
hr MEAN GRAD,0.011,0.013,0.013,0.015,0.017
hr MIN MAX GRAD,0.011,0.013,0.013,0.015,0.017
hr MAX MIN GRAD,0.011,0.013,0.013,0.015,0.017
resp MEAN GRAD,0.015,0.016,0.016,0.017,0.019


In [9]:
%%time
class_summary = dict()

HOURS_BEFORE_RFD = 4
HOURS_SINCE_ADM = 4
PREDICT_WHEN = [0, 24, 48, 72, 96]
resample_days = [0]
for when in PREDICT_WHEN:
    print('FM for Predict RFD '+str(when)+' hours from sample window')
    FM, n_true_stays = create_bal_FM(when, HOURS_BEFORE_RFD, HOURS_SINCE_ADM, resample_days)
    FM = FM[FM.cohort==1]
    #FM.to_pickle('pickle_vars/FM_'+str(when)+'_resample_days'+str(resample_days[1])+'-'+str(resample_days[-1])+'window'+str(HOURS_BEFORE_RFD)+'-'+str(HOURS_SINCE_ADM)+'.pkl.xz')
    print(FM.outcome.value_counts())
    
    x = pd.Series()
    x['Total ICU admissions'] = int(len(FM))
    x['Positive outcomes (class = 1)'] = int(len(FM[FM.outcome==1]))
    x['Negative outcomes (class = 0)'] = int(len(FM[FM.outcome==0]))

    class_summary[str(when)+'hrs'] = x

class_table = pd.DataFrame.from_dict(class_summary)
class_table.to_latex('tex_files/class_table.tex')

FM for Predict RFD 0 hours from sample window
days = 0
stays: 7480
stays in cohort: 7034
True stays
outcome
1    6300
0     734
Name: count, dtype: int64
FM for Predict RFD 24 hours from sample window
days = 0
stays: 4185
stays in cohort: 3884
True stays
outcome
1    3373
0     511
Name: count, dtype: int64
FM for Predict RFD 48 hours from sample window
days = 0
stays: 2662
stays in cohort: 2458
True stays
outcome
1    2075
0     383
Name: count, dtype: int64
FM for Predict RFD 72 hours from sample window
days = 0
stays: 1856
stays in cohort: 1711
True stays
outcome
1    1414
0     297
Name: count, dtype: int64
FM for Predict RFD 96 hours from sample window
days = 0
stays: 1359
stays in cohort: 1248
True stays
outcome
1    1019
0     229
Name: count, dtype: int64
CPU times: user 8min 22s, sys: 21.9 s, total: 8min 43s
Wall time: 8min 41s


In [10]:
class_table

Unnamed: 0,0hrs,24hrs,48hrs,72hrs,96hrs
Total ICU admissions,7034,3884,2458,1711,1248
Positive outcomes (class = 1),6300,3373,2075,1414,1019
Negative outcomes (class = 0),734,511,383,297,229


In [None]:
PREDICT_HOURS = 0
days = 0
HOURS_SINCE_ADM = 4
HOURS_BEFORE_RFD = 4

df = pd.read_pickle(DATA_PATH)
df['LOS_AC'] = df['LOS_AC'] - (PREDICT_HOURS / 24) - days #subtract PREDICT_HOURS and days from LOS
df = df[df.LOS_AC>=1]

# Prepare table for before callout sample window 
dfl = df.copy()
dfl = dfl[~dfl.VARIABLE.isin(single_feature_variables)] # remove lab results
dfl = dfl[dfl['hrs_bRFD']>= 24*days + PREDICT_HOURS]
dfl = dfl[dfl['hrs_bRFD']<= 24*days + PREDICT_HOURS + HOURS_BEFORE_RFD]
dfl['init'] = 0

#print("There are %d icu stays with variables recorded in the final %d hour window, %d hours before CALLOUT." %(len(dfl['ICUSTAY_ID'].unique()),HOURS_BEFORE_RFD,PREDICT_HOURS))

# Prepare table for initial data window
dfi = df.copy()
dfi = dfi[~dfi.VARIABLE.isin(single_feature_variables)] # remove lab results for initial data features
dfi = dfi[dfi['hrs_SA']>= (dfi['LOS_AC']*24 - 4) % 24]
dfi = dfi[dfi['hrs_SA']<= (dfi['LOS_AC']*24 - 4) % 24 + HOURS_SINCE_ADM]
dfi['init'] = 1

#dfi['VARIABLE'] = 'init_' + dfi['VARIABLE'] # add initial tag to separate from other measurements
#print("There are %d icu stays with variables recorded in the initial %d hour window" %(len(dfi['ICUSTAY_ID'].unique()),HOURS_SINCE_ADM))

# Prepare table for lab test results over lookback window
dfs = df.copy()
dfs = dfs[dfs['hrs_bRFD']>=0]
dfs = dfs[dfs['hrs_bRFD']<=48]
dfs = dfs[dfs.VARIABLE.isin(single_feature_variables)] # Last result gets chosen
dfs['init'] = 0

In [17]:
%%time
bal_class_summary = dict()

HOURS_BEFORE_RFD = 4
HOURS_SINCE_ADM = 4
PREDICT_WHEN = [0, 24, 48, 72, 96]
resample_days = [0, 4, 5, 6, 7, 8]
for when in PREDICT_WHEN:
    print('FM for Predict RFD '+str(when)+' hours from sample window')
    FM, n_true_stays = create_bal_FM(when, HOURS_BEFORE_RFD, HOURS_SINCE_ADM, resample_days)
    FM = FM[FM.cohort==1]
    #FM.to_pickle('pickle_vars/FM_'+str(when)+'_resample_days'+str(resample_days[1])+'-'+str(resample_days[-1])+'window'+str(HOURS_BEFORE_RFD)+'-'+str(HOURS_SINCE_ADM)+'.pkl.xz')
    print(FM.outcome.value_counts())
    
    x = pd.Series()
    x['Total ICU admissions'] = int(len(FM))
    x['Positive outcomes (class = 1)'] = int(len(FM[FM.outcome==1]))
    x['Negative outcomes (class = 0)'] = int(len(FM[FM.outcome==0]))

    bal_class_summary[str(when)+'hrs'] = x

bal_class_table = pd.DataFrame.from_dict(bal_class_summary)
bal_class_table.to_latex('tex_files/class_table.tex')

FM for Predict RFD 0 hours from sample window
days = 0
stays: 11283
stays in cohort: 10666
True stays
days = 4
stays: 1778
stays in cohort: 1636
days = 5
stays: 1322
stays in cohort: 1216
days = 6
stays: 1037
stays in cohort: 956
days = 7
stays: 846
stays in cohort: 785
days = 8
stays: 675
stays in cohort: 623
outcome
1    9656
0    6226
Name: count, dtype: int64
FM for Predict RFD 24 hours from sample window
days = 0
stays: 6986
stays in cohort: 6556
True stays
days = 4
stays: 1322
stays in cohort: 1216
days = 5
stays: 1037
stays in cohort: 956
days = 6
stays: 846
stays in cohort: 785
days = 7
stays: 675
stays in cohort: 623
days = 8
stays: 538
stays in cohort: 494
outcome
1    5848
0    4782
Name: count, dtype: int64
FM for Predict RFD 48 hours from sample window
days = 0
stays: 3944
stays in cohort: 3662
True stays
days = 4
stays: 1037
stays in cohort: 956
days = 5
stays: 846
stays in cohort: 785
days = 6
stays: 675
stays in cohort: 623
days = 7
stays: 538
stays in cohort: 494
days 

In [11]:
a0 = pd.Series()
a0['day 4'] = 1636
a0['day 5'] = 1216
a0['day 6'] = 956
a0['day 7'] = 785
a0['day 8'] = 623

a24 = pd.Series()
a24['day 4'] = 1216
a24['day 5'] = 956
a24['day 6'] = 785
a24['day 7'] = 623
a24['day 8'] = 494

a48 = pd.Series()
a48['day 4'] = 956
a48['day 5'] = 785
a48['day 6'] = 623
a48['day 7'] = 494
a48['day 8'] = 403

bal_cohort = dict()
bal_cohort[str(0)] = a0
bal_cohort[str(24)] = a24
bal_cohort[str(48)] = a48

bal_cohort_table = pd.DataFrame.from_dict(bal_cohort)

In [10]:
%%time
HOURS_BEFORE_RFD = 4
HOURS_SINCE_ADM = 4
PREDICT_WHEN = [0, 24, 48, 72, 96]
resample_days = [[0],[4],[5],[6],[7],[8]]
for when in PREDICT_WHEN:
    for day in resample_days:

        print('FM for Predict RFD '+str(when)+' hours from sample window')
        FM, n_true_stays = create_bal_FM(when, HOURS_BEFORE_RFD, HOURS_SINCE_ADM, day)
        FM = FM[FM.cohort==1]
        #FM.to_pickle('pickle_vars/FM_'+str(when)+'_resample_days'+str(resample_days[1])+'-'+str(resample_days[-1])+'window'+str(HOURS_BEFORE_RFD)+'-'+str(HOURS_SINCE_ADM)+'.pkl.xz')
        print(FM.outcome.value_counts())

FM for Predict RFD 0 hours from sample window
days = 0
stays: 11283
stays in cohort: 10666
True stays
outcome
1    9656
0    1010
Name: count, dtype: int64
FM for Predict RFD 0 hours from sample window
days = 4
stays: 1778
stays in cohort: 1636
outcome
0    1636
Name: count, dtype: int64
FM for Predict RFD 0 hours from sample window
days = 5
stays: 1322
stays in cohort: 1216
outcome
0    1216
Name: count, dtype: int64
FM for Predict RFD 0 hours from sample window
days = 6
stays: 1037
stays in cohort: 956
outcome
0    956
Name: count, dtype: int64
FM for Predict RFD 0 hours from sample window
days = 7
stays: 846
stays in cohort: 785
outcome
0    785
Name: count, dtype: int64
FM for Predict RFD 0 hours from sample window
days = 8
stays: 675
stays in cohort: 623
outcome
0    623
Name: count, dtype: int64
FM for Predict RFD 24 hours from sample window
days = 0
stays: 6986
stays in cohort: 6556
True stays
outcome
1    5848
0     708
Name: count, dtype: int64
FM for Predict RFD 24 hours from

In [None]:
%%time
FM_summary = dict()
HOURS_BEFORE_RFD = 8
HOURS_SINCE_ADM = 8
PREDICT_WHEN = [0]
resample_days = [0]
for when in PREDICT_WHEN:
    print('FM for Predict RFD '+str(when)+' hours from sample window')
    FM_bal = create_bal_FM(when, HOURS_BEFORE_RFD, HOURS_SINCE_ADM, resample_days)
    FM_bal.to_pickle('pickle_vars/FM_bal_'+str(when)+'_resample_days_none.pkl.xz')
    FM_summary[str(when)+'hrs'] = FM_bal.isna().sum()/len(FM_bal)
    print(FM_bal.outcome.value_counts())

In [12]:
%%time
FM_summary = dict()
HOURS_BEFORE_RFD = 4
HOURS_SINCE_ADM = 4
PREDICT_WHEN = [0, 24, 48, 72, 96]
resample_days = [0,4,5,6,7,8]
for when in PREDICT_WHEN:
    print('FM for Predict RFD '+str(when)+' hours from sample window')
    FM_bal = create_bal_FM(when, HOURS_BEFORE_RFD, HOURS_SINCE_ADM, resample_days)
    FM_bal.to_pickle('pickle_vars/FM_bal_'+str(when)+'_resample_days'+str(resample_days[1])+'-'+str(resample_days[-1])+'window'+str(HOURS_BEFORE_RFD)+'-'+str(HOURS_SINCE_ADM)+'.pkl.xz')
    FM_summary[str(when)+'hrs'] = FM_bal.isna().sum()/len(FM_bal)
    print(FM_bal.outcome.value_counts())

FM for Predict RFD 0 hours from sample window
days = 0
stays: 11283
stays in cohort: 10666
days = 4
stays: 1778
stays in cohort: 1636
days = 5
stays: 1322
stays in cohort: 1216
days = 6
stays: 1037
stays in cohort: 956
days = 7
stays: 846
stays in cohort: 785
days = 8
stays: 675
stays in cohort: 623
outcome
1    9655
0    7286
Name: count, dtype: int64
FM for Predict RFD 24 hours from sample window
days = 0
stays: 6986
stays in cohort: 6556
days = 4
stays: 1322
stays in cohort: 1216
days = 5
stays: 1037
stays in cohort: 956
days = 6
stays: 846
stays in cohort: 785
days = 7
stays: 675
stays in cohort: 623
days = 8
stays: 538
stays in cohort: 494
outcome
1    5848
0    5556
Name: count, dtype: int64
FM for Predict RFD 48 hours from sample window
days = 0
stays: 3944
stays in cohort: 3662
days = 4
stays: 1037
stays in cohort: 956
days = 5
stays: 846
stays in cohort: 785
days = 6
stays: 675
stays in cohort: 623
days = 7
stays: 538
stays in cohort: 494
days = 8
stays: 440
stays in cohort: 4

In [8]:
%%time
FM_summary = dict()
HOURS_BEFORE_RFD = 8
HOURS_SINCE_ADM = 8
PREDICT_WHEN = [0, 24, 48, 72, 96]
resample_days = [0,3,4,5,6,7,8]
for when in PREDICT_WHEN:
    print('FM for Predict RFD '+str(when)+' hours from sample window')
    FM_bal = create_bal_FM(when, HOURS_BEFORE_RFD, HOURS_SINCE_ADM, resample_days)
    FM_bal.to_pickle('pickle_vars/FM_bal_'+str(when)+'_resample_days'+str(resample_days[1])+'-'+str(resample_days[-1])+'.pkl.xz')
    FM_summary[str(when)+'hrs'] = FM_bal.isna().sum()/len(FM_bal)
    print(FM_bal.outcome.value_counts())

FM for Predict RFD 0 hours from sample window
days = 0
stays: 11208
stays in cohort: 10597
days = 3
stays: 2427
stays in cohort: 2243
days = 4
stays: 1708
stays in cohort: 1574
days = 5
stays: 1287
stays in cohort: 1182
days = 6
stays: 1018
stays in cohort: 938
days = 7
stays: 820
stays in cohort: 759
days = 8
stays: 658
stays in cohort: 608
outcome
1    9603
0    9523
Name: count, dtype: int64
FM for Predict RFD 24 hours from sample window
days = 0
stays: 6545
stays in cohort: 6131
days = 3
stays: 1708
stays in cohort: 1574
days = 4
stays: 1287
stays in cohort: 1182
days = 5
stays: 1018
stays in cohort: 938
days = 6
stays: 820
stays in cohort: 759
days = 7
stays: 658
stays in cohort: 608
days = 8
stays: 523
stays in cohort: 479
outcome
0    7111
1    5448
Name: count, dtype: int64
FM for Predict RFD 48 hours from sample window
days = 0
stays: 3772
stays in cohort: 3501
days = 3
stays: 1287
stays in cohort: 1182
days = 4
stays: 1018
stays in cohort: 938
days = 5
stays: 820
stays in coh

In [29]:
resample_days = [0]
FM_bal = create_bal_FM(0, HOURS_BEFORE_RFD, HOURS_SINCE_ADM, resample_days)
print(FM_bal.outcome.value_counts())

days = 0
stays: 9352
stays in cohort: 8823
outcome
1    7961
0    1391
Name: count, dtype: int64
