In [134]:
import numpy as np
import pandas as pd
import sqlite3
import tsfresh
import dask.dataframe as dd

In [135]:
# Locations of the two SQLite databases and a connection to each.
# Raw strings keep the Windows backslashes literal; the plain-string form only
# worked because Python leaves unrecognised escapes (backslash + D, T, W, P, M)
# untouched, which it already warns about and which would break silently if a
# folder name ever started with a letter such as "t" or "n".
path_in = r"S:\Dehydration_stroke\Team Emerald\Working Data\Preprocessed\Working\Processed.db"
path_out = r"S:\Dehydration_stroke\Team Emerald\Working Data\Preprocessed\Working\Models.db"
con = sqlite3.connect(path_in)
con_out = sqlite3.connect(path_out)

In [136]:
# Read the extracted time-series table and discard its 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call -- confirm).
df = pd.read_csv('extracted_flowsheet_first24h.csv').drop(columns='Unnamed: 0')

In [262]:
# Pull every table needed from the two databases.
# The first six queries run against the input connection `con`, in the same
# order as the names they are unpacked into.
processed_queries = (
    "SELECT * FROM ADT",
    "SELECT * FROM HX",
    "SELECT * FROM DX",
    "SELECT * FROM DEMOGRAPHICS",
    "SELECT * FROM LDA",
    "SELECT * FROM mrn_csn_pairs",
)
adt, hx, dx, dem, lda, patients = (
    pd.read_sql_query(query, con) for query in processed_queries
)
# The outcome table is read through the second connection, `con_out`.
outcome = pd.read_sql_query("SELECT * FROM primary_outcome", con_out)

In [263]:
# Keep only rows whose mrn_csn_pair appears in the `patients` table.
cohort_ids = patients['mrn_csn_pair']
df = df.loc[df['mrn_csn_pair'].isin(cohort_ids)]
adt = adt.loc[adt['mrn_csn_pair'].isin(cohort_ids)]
hx = hx.loc[hx['mrn_csn_pair'].isin(cohort_ids)]
dx = dx.loc[dx['mrn_csn_pair'].isin(cohort_ids)]
dem = dem.loc[dem['mrn_csn_pair'].isin(cohort_ids)]
lda = lda.loc[lda['mrn_csn_pair'].isin(cohort_ids)]
# For the outcome table also keep just the key and LOS, with a fresh 0..n-1 index.
outcome = (
    outcome.loc[outcome['mrn_csn_pair'].isin(cohort_ids), ['mrn_csn_pair', 'LOS']]
    .reset_index(drop=True)
)

In [141]:
# Remove the stored 'index' column and order the ADT rows by patient key,
# renumbering the index 0..n-1.
adt = (
    adt.drop(columns='index')
    .sort_values(by='mrn_csn_pair', ignore_index=True)
)

In [142]:
# Keep the first DX row per patient so the table has one row per mrn_csn_pair.
# A cumulative count of rows per patient may be worth using in future, as it
# might have clinical significance.
dx = (
    dx.drop_duplicates(subset='mrn_csn_pair')
    .drop(columns='index')
    .sort_values(by='mrn_csn_pair', ignore_index=True)
)

In [143]:
# Patients in the cohort that have no row in the hx table.
temp = pd.DataFrame({'mrn_csn_pair': patients[~(patients['mrn_csn_pair'].isin(hx['mrn_csn_pair']))]['mrn_csn_pair']})
# Add them to hx with every other column set to 0.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; with
# default arguments it builds the same frame (original row labels kept, columns
# missing from `temp` filled with NaN and then replaced by fillna(0)).
hx = (
    pd.concat([hx, temp])
    .fillna(0)
    .drop(columns='index')
    .sort_values('mrn_csn_pair')
)
# Flag patients with no recorded condition: 'None' is 1 when all other columns sum to 0.
hx['None'] = hx.drop(columns='mrn_csn_pair').sum(axis=1).eq(0).astype(int)
hx = hx.reset_index(drop=True)

In [145]:
# Some patients appear more than once in the demographics table (possibly a
# re-admittance or a data error); keep only the first row per patient for now.
dem = (
    dem.drop(columns=['admission_datetime', 'discharge_datetime', 'time_in_hospital_minutes'])
    .drop_duplicates(subset='mrn_csn_pair')
    .drop(columns='index')
    .sort_values(by='mrn_csn_pair', ignore_index=True)
)

In [146]:
# Patients in the cohort that have no row in the lda table.
temp = pd.DataFrame({'mrn_csn_pair': patients[~(patients['mrn_csn_pair'].isin(lda['mrn_csn_pair']))]['mrn_csn_pair']})
# Add them to lda with every other column set to 0.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; with
# default arguments it builds the same frame (original row labels kept, columns
# missing from `temp` filled with NaN and then replaced by fillna(0)).
lda = (
    pd.concat([lda, temp])
    .fillna(0)
    .drop(columns='index')
    .sort_values('mrn_csn_pair')
)
# Flag patients with nothing recorded: 'None' is 1 when all other columns sum to 0.
lda['None'] = lda.drop(columns='mrn_csn_pair').sum(axis=1).eq(0).astype(int)
lda = lda.reset_index(drop=True)

In [147]:
# Remove every column that holds nothing but NaN.
df = df.dropna(axis='columns', how='all')

In [148]:
# Place the five per-patient tables side by side (rows are matched on index)
# and remove the 'mrn_csn_pair' columns they each contributed.
df2 = pd.concat([adt, hx, dx, dem, lda], axis='columns').drop(columns='mrn_csn_pair')
# The individual tables are no longer needed; release their names.
del adt, hx, dx, dem, lda

In [149]:
# Wrap both pandas frames as dask DataFrames (8 partitions for the large
# time-series frame, 1 for the combined per-patient frame), then release the
# pandas originals.
ddf, ddf2 = (
    dd.from_pandas(df, npartitions=8),
    dd.from_pandas(df2, npartitions=1),
)
del df
del df2

In [150]:
# Join through dask rather than pandas to avoid running out of memory.
# NOTE(review): this is an index-on-index left join; it presumably relies on
# the row labels of both frames lining up per patient -- confirm.
ddf = ddf.join(ddf2, how='left')

In [151]:
# Materialise the joined dask frame as a pandas DataFrame, then release the
# dask objects.
df = ddf.compute()
del ddf
del ddf2

In [152]:
# Checkpoint the joined table to disk so it survives a crash later on.
df.to_csv(path_or_buf='complete.csv')

In [268]:
# Somewhat arbitrarily drop sparse data so that no N/A values remain; these
# thresholds will need refining.
dropped = (
    df.dropna(axis='columns', thresh=1200)   # keep columns with at least 1200 non-NA values
    .dropna(axis='index', thresh=3000)       # then keep rows with at least 3000 non-NA values
    .dropna(axis='columns')                  # finally drop any column that still has an NA
    .reset_index(drop=True)
)

In [269]:
# Keep only the leading columns (positions 0..2039); the last ~50 columns are
# left out because they are not part of the flowsheet data.
flowsheet_cols = slice(0, 2040)
dropped_temp = dropped.iloc[:, flowsheet_cols]

In [270]:
# Match outcome rows to the rows that survived the NA filtering.
# An isin() filter would keep `outcome` in its own row order, leaving the
# labels only positionally aligned with `dropped` by coincidence. Merging on
# the key instead yields one outcome row per row of `dropped`, in `dropped`'s
# order (an inner merge preserves the order of the left keys).
# validate='many_to_one' raises if `outcome` has more than one row for a
# patient, rather than silently duplicating rows.
outcome_var = dropped[['mrn_csn_pair']].merge(
    outcome,
    on='mrn_csn_pair',
    how='inner',
    validate='many_to_one',
)

In [271]:
# Reduce the outcome to the LOS series with a fresh 0..n-1 index, and remove
# the patient key from the feature matrix.
outcome_var = outcome_var['LOS'].reset_index(drop=True)
dropped_temp = dropped_temp.drop(columns='mrn_csn_pair')

In [272]:
# Run tsfresh's relevance test for every feature column against the LOS
# outcome, using 6 worker processes.
feature_table = tsfresh.feature_selection.relevance.calculate_relevance_table(
    X=dropped_temp,
    y=outcome_var,
    n_jobs=6,
)

In [280]:
# Final table: the feature columns tsfresh marked as relevant, followed by the
# remaining columns of `dropped`.
# dropped_temp was cut as dropped.iloc[:, :2040], i.e. positions 0..2039, so
# the remainder must start at position 2040. The earlier start of 2041 silently
# discarded the column at position 2040.
# NOTE(review): if that column was excluded on purpose, drop it by name instead.
complete = pd.concat(
    [dropped_temp.loc[:, feature_table['relevant']], dropped.iloc[:, 2040:]],
    axis=1,
)

In [281]:
# Write the final table next to the databases. The raw string keeps the
# Windows backslashes literal instead of relying on Python leaving
# unrecognised escape sequences untouched.
complete.to_csv(r"S:\Dehydration_stroke\Team Emerald\Working Data\Preprocessed\Working\Complete.csv")

In [None]:
# Close both database connections, input first and then output.
for connection in (con, con_out):
    connection.close()