In [41]:
import pandas as pd
import numpy as np
import sqlite3
import tsfresh
import plotly.express as px

In [62]:
# Location of the processed SQLite database (absolute path on the shared workspace).
path_in = "/home/idies/workspace/Storage/zmurphy3/PCM Team Emerald/Data/Processed/Merged_norehab.db"
con = sqlite3.connect(path_in)
try:
    # Pull each table fully into memory; nothing below needs the database afterwards.
    insheet = pd.read_sql_query("SELECT * FROM timeseries_instantaneous", con)
    startstop = pd.read_sql_query("SELECT * FROM timeseries_startstop", con)
    static = pd.read_sql_query("SELECT * FROM static_predictors", con)
    outcomes = pd.read_sql_query("SELECT * FROM outcomes", con)
finally:
    # The connection is not released automatically; close it even if a query fails
    # so the database file is not left open for the lifetime of the kernel.
    con.close()

In [68]:
# Total GCS readings charted in the outcome window 1440 <= timestamp < 10080.
# NOTE(review): timestamps look like minutes since admission (1440 = 24 h,
# 10080 = 7 days) -- confirm the unit.
is_gcs_total = insheet['measure'] == 'glasgow_score'
at_or_after_start = insheet['timestamp'] >= 1440
before_end = insheet['timestamp'] < 10080
scores = insheet[is_gcs_total & at_or_after_start & before_end]

In [69]:
# Lowest GCS reading per encounter inside the outcome window.
# Select 'value' before aggregating so only that column is reduced; taking the
# min of every column first is wasted work and breaks on columns whose values
# cannot be ordered.
min_gcs = scores.groupby('mrn_csn_pair')['value'].min()

In [70]:
# Preview the per-encounter minimum GCS (a Series indexed by mrn_csn_pair).
min_gcs

mrn_csn_pair
(10, 2913)      12.0
(1001, 2708)    12.0
(1002, 3238)    14.0
(1004, 1204)    11.0
(1005, 2109)    11.0
                ... 
(992, 1997)      4.0
(993, 8)         4.0
(995, 1464)     13.0
(996, 1653)     14.0
(998, 3015)      3.0
Name: value, Length: 1727, dtype: float64

In [71]:
# Keep only encounters that have a minimum GCS, i.e. whose id appears in
# min_gcs.index; the same filter is applied to every table.
insheet, startstop, static, outcomes = [
    table[table['mrn_csn_pair'].isin(min_gcs.index)]
    for table in (insheet, startstop, static, outcomes)
]

In [72]:
# Measures removed from the predictor set so that they cannot influence the
# prediction of the GCS-based label.
gcs = [
    'glasgow_eye_opening',
    'glasgow_motor_response',
    'glasgow_score',
    'glasgow_verbal_response',
    'orientation',
    'consciousness',
]
is_gcs_measure = insheet['measure'].isin(gcs)
insheet = insheet[~is_gcs_measure]

In [73]:
# Stack the start/stop series under the instantaneous series and order by encounter.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0;
# the defaults (ignore_index=False, sort=False) give the same result.
# NOTE(review): re-running this cell stacks startstop a second time because
# `insheet` is overwritten -- re-run from the load cell instead.
insheet = pd.concat([insheet, startstop]).sort_values('mrn_csn_pair')

In [74]:
# Display min_gcs again; none of the cells since its creation reassign it.
min_gcs

mrn_csn_pair
(10, 2913)      12.0
(1001, 2708)    12.0
(1002, 3238)    14.0
(1004, 1204)    11.0
(1005, 2109)    11.0
                ... 
(992, 1997)      4.0
(993, 8)         4.0
(995, 1464)     13.0
(996, 1653)     14.0
(998, 3015)      3.0
Name: value, Length: 1727, dtype: float64

In [75]:
# Ids of every encounter that still has rows in `insheet` (any timestamp).
pats = insheet.sort_values('mrn_csn_pair')['mrn_csn_pair'].unique()

# Predictor window: observations with 0 <= timestamp < 1440.
first = insheet[(insheet['timestamp'] < 1440) & (insheet['timestamp'] >= 0)]
first = first.sort_values('mrn_csn_pair').reset_index(drop=True)

extracted_flowsheet = tsfresh.extract_features(first, column_id='mrn_csn_pair', column_sort='timestamp', column_kind='measure', column_value='value', n_jobs=8)
# Drop features that are only NaN
extracted_flowsheet = extracted_flowsheet.dropna(axis=1, how='all')

# impute() replaces the remaining NaN/inf values in place, so its return value is not needed.
tsfresh.utilities.dataframe_functions.impute(extracted_flowsheet)

# Encounters with no observation inside the predictor window produce no feature
# row. Fail with a clear message instead of the length-mismatch error (or silent
# mislabelling) that attaching `pats` by position would cause.
missing_pats = pd.Index(pats).difference(extracted_flowsheet.index)
if len(missing_pats) > 0:
    raise ValueError(
        f"{len(missing_pats)} encounter(s) have no data in the predictor window "
        "and therefore no extracted features"
    )

# Add back the mrn_csn_pair, taken from the feature table's own index so every
# id is guaranteed to sit next to the features computed for it.
extracted_flowsheet.insert(0, 'mrn_csn_pair', extracted_flowsheet.index.to_numpy())
extracted_flowsheet = extracted_flowsheet.reset_index(drop=True)

Feature Extraction: 100%|██████████| 40/40 [07:36<00:00, 11.40s/it]
 'cvc_line__sample_entropy' 'line__sample_entropy'
 'diuretic__sample_entropy' 'drain__sample_entropy'
 'antiarrhythmic__sample_entropy' 'Floor__sample_entropy'] did not have any finite values. Filling with zeros.


In [76]:
# Working copy of the extracted features with a fresh 0..n-1 row index.
flowsheet = extracted_flowsheet.reset_index(drop=True)

In [77]:
# Put the labels in the same row order as `flowsheet`.
# Aligning on mrn_csn_pair (instead of trusting two independent sorts to agree)
# guarantees each label is paired with the features of the same encounter.
labels = pd.Series(min_gcs)
if labels.index.name == 'mrn_csn_pair':
    flowsheet_ids = pd.Index(flowsheet['mrn_csn_pair'])
    unlabeled = flowsheet_ids.difference(labels.index)
    if len(unlabeled) > 0:
        raise ValueError(f"{len(unlabeled)} encounter(s) in flowsheet have no minimum GCS")
    labels = labels.reindex(flowsheet_ids)
# If the index is no longer keyed by encounter, this cell already ran and the
# labels are already positional; the reset below is then a no-op.
min_gcs = labels.reset_index(drop=True)

# Binary outcome: 1 when the minimum GCS in the window is 13 or lower, else 0.
bin_min_gcs = (min_gcs <= 13).astype(int)

In [79]:
# Remove columns that are zero in every row.
flowsheet = flowsheet.loc[:, (flowsheet != 0).any(axis=0)]

# Do tsfresh feature filtering to dramatically reduce feature space
# NOTE(review): relevance is computed against the labels of every row; if a
# train/test split is made downstream, this selection has already seen the
# held-out labels -- confirm this is acceptable.
feature_table = tsfresh.feature_selection.relevance.calculate_relevance_table(flowsheet.drop('mrn_csn_pair', axis=1), bin_min_gcs,
                                                                              n_jobs=8)

# Keep the encounter id next to the relevant features so the static predictors
# can be joined by key rather than by row position.
relevant_features = pd.concat(
    [flowsheet[['mrn_csn_pair']],
     flowsheet.drop('mrn_csn_pair', axis=1).loc[:, feature_table['relevant']]],
    axis=1,
)

# Left merge keeps the flowsheet row order (which bin_min_gcs follows);
# validate raises if either side has more than one row per encounter instead of
# silently shifting rows against each other.
complete = relevant_features.merge(static, on='mrn_csn_pair', how='left', validate='one_to_one')

# Final layout: label first, encounter id second, then features and static predictors.
complete.insert(0, 'bin_min_gcs', bin_min_gcs)
pairs = complete['mrn_csn_pair']

In [80]:
# Share of encounters labelled 1 (the mean of a 0/1 series is its positive fraction).
bin_min_gcs.mean()

0.35089751013317894

In [81]:
# Sanity check: (rows, columns) of the assembled modelling table.
complete.shape

(1727, 9256)

In [82]:
# Write the assembled table to CSV (the row index is written as the first column).
# NOTE(review): the file name says "24h_72h" while the label window used above is
# 1440 <= timestamp < 10080 -- confirm which one is intended.
path_out = "/home/idies/workspace/Storage/zmurphy3/PCM Team Emerald/Data/Processed/complete_24h_72h_min_gcs.csv"
complete.to_csv(path_out)