In [1]:
# File to create the desired complete database to use in the models later
# Make extra table with mrn_csn_pairs of all patients we are currently including
import numpy as np
import pandas as pd
import sqlite3
import tsfresh
import dask.dataframe as dd

In [2]:
# Locations of the source (preprocessed) and destination (model-ready) SQLite databases.
# Raw strings keep the Windows backslashes literal: sequences such as "\D" or "\T" are
# invalid escape sequences in a normal string literal, which newer Python versions warn
# about. The resulting path values are unchanged.
path_in = r"S:\Dehydration_stroke\Team Emerald\Working Data\Preprocessed\Working\Processed.db"
path_out = r"S:\Dehydration_stroke\Team Emerald\Working Data\Preprocessed\Working\Models.db"
# One connection per database: `con` is read from, `con_out` is read from and written to.
con = sqlite3.connect(path_in)
con_out = sqlite3.connect(path_out)

In [None]:
# Load the FLOWSHEET and NEURO tables from the input database, strip the separate
# mrn / csn identifier columns, and discard every row that still has a missing value.
id_columns = ['mrn', 'csn']
flowsheet = (
    pd.read_sql_query("SELECT * FROM FLOWSHEET", con)
    .drop(columns=id_columns)
    .dropna()
)
neuro = (
    pd.read_sql_query('SELECT * FROM NEURO', con)
    .drop(columns=id_columns)
    .dropna()
)

In [None]:
# Stack the neuro rows underneath the flowsheet rows so all time series live in one frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in
# pandas 2.0; ignore_index=True renumbers the combined rows 0..n-1 as before.
flowsheet = pd.concat([flowsheet, neuro], ignore_index=True)

In [None]:
# Convert the timestamp column to real datetimes, then order rows by patient key and time.
# Plain column assignment is used on purpose: since pandas 2.0, `df.loc[:, col] = values`
# writes into the existing column in place and can leave it as object dtype, whereas
# `df[col] = values` replaces the column and therefore yields datetime64 dtype.
flowsheet['recorded_datetime'] = pd.to_datetime(flowsheet['recorded_datetime'])
flowsheet = flowsheet.sort_values(['mrn_csn_pair', 'recorded_datetime'])

In [None]:
# Collect every distinct patient key, in sorted order, so the first 24 hours of data
# can be pulled for each patient.
pats = flowsheet['mrn_csn_pair'].sort_values().unique()

In [None]:
# Keep only the observations recorded within 24 hours of each patient's first reading.
#
# Vectorised replacement for the former per-patient loop, which re-scanned the whole
# frame for every patient and grew `first` with DataFrame.append (quadratic, and removed
# in pandas 2.0). Using the per-patient minimum also removes the dependence on the frame
# being pre-sorted to find the first timestamp.
WINDOW = pd.Timedelta(hours=24)

# Idempotent conversion, so the mask works even if the column is still strings/objects.
recorded = pd.to_datetime(flowsheet['recorded_datetime'])
# Earliest timestamp per patient, broadcast back onto every row of that patient.
window_start = recorded.groupby(flowsheet['mrn_csn_pair']).transform('min')

# Every row is >= its own patient's minimum, so only the upper bound needs checking.
# Row order follows `flowsheet` (sorted by patient key, then time, by the earlier cell);
# the index is renumbered 0..n-1, matching the previous ignore_index=True behaviour.
first = flowsheet[recorded < window_start + WINDOW].reset_index(drop=True)

In [3]:
# Shortcut: rather than recomputing the 24-hour window above, load the copy that was
# previously saved to the output database and rebuild the patient-key list from it.
saved_window_query = "SELECT * FROM flowsheet_first24h"
first = pd.read_sql_query(saved_window_query, con_out)
pats = first.loc[:, 'mrn_csn_pair'].unique()

In [4]:
# Extract tsfresh features one chunk of patients at a time to avoid memory issues
# (takes ~5 min to run).
#
# Changes from the previous version of this cell:
#   * Chunk boundaries are taken by slicing `pats`, so nothing is indexed past the end.
#     The old `pats[split2]` lookup raised IndexError on the last chunk whenever the
#     number of patients was an exact multiple of the chunk count.
#   * Rows are selected by patient key (`isin`) instead of by index-label ranges, so the
#     cell no longer requires `first` to have a contiguous 0..n-1 index grouped by patient.
#   * Each chunk's result is put in `pats` order before the id index is discarded, so a
#     later positional re-attachment of `pats` lines up row for row.
#     NOTE(review): assumes tsfresh indexes its result by the `column_id` values — confirm.
#   * Results are collected in a list and concatenated once; DataFrame.append was removed
#     in pandas 2.0.
N_CHUNKS = 8   # number of pieces the patient list is split into
N_JOBS = 6     # worker processes handed to tsfresh

n_pats = len(pats)
chunk_size = n_pats // N_CHUNKS
# Same split as before: equal-sized chunks, with the last one also taking the remainder.
bounds = [chunk_size * k for k in range(N_CHUNKS)] + [n_pats]

extracted_parts = []
for lo, hi in zip(bounds[:-1], bounds[1:]):
    chunk_ids = pats[lo:hi]
    if len(chunk_ids) == 0:
        # Happens when there are fewer patients than chunks.
        continue
    chunk_rows = first[first['mrn_csn_pair'].isin(chunk_ids)]
    chunk_features = tsfresh.extract_features(
        chunk_rows,
        column_id='mrn_csn_pair',
        column_sort='recorded_datetime',
        column_kind='Name',
        column_value='value',
        n_jobs=N_JOBS,
    )
    # .loc raises KeyError if a patient is missing from the result, rather than
    # silently misaligning rows.
    extracted_parts.append(chunk_features.loc[chunk_ids])

# ignore_index=True drops the id index, exactly as the old append(..., ignore_index=True) did.
extracted_flowsheet = (
    pd.concat(extracted_parts, ignore_index=True) if extracted_parts else pd.DataFrame()
)
    

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:50<00:00,  1.69s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:44<00:00,  1.48s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:47<00:00,  1.58s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:40<00:00,  1.36s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:48<00:00,  1.62s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:54<00:00,  1.81s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:45<00:00,  1.51s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:46<00:00,  1.54s/it]


In [6]:
# Re-attach the patient key as the first column of the feature table.
# NOTE(review): the assignment is positional, so `pats` must be in the same order as the
# rows of `extracted_flowsheet` — confirm against the extraction cell.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when the column
# already exists.
if 'mrn_csn_pair' not in extracted_flowsheet.columns:
    extracted_flowsheet.insert(0, 'mrn_csn_pair', pats)

In [None]:
# Persist the first-24-hours table to the output database, replacing any previous copy.
# index=False keeps the meaningless 0..n-1 row index out of the table; with the default
# (index=True) every load-then-save round trip adds another index column
# ('index', then 'level_0', ...).
first.to_sql('flowsheet_first24h', con_out, if_exists='replace', index=False)

In [7]:
# The feature table is written to a CSV file rather than a database table, because SQL
# does not cope well with this many columns.
features_csv = 'extracted_flowsheet_first24h.csv'
extracted_flowsheet.to_csv(features_csv)

In [1]:
# Release both database handles (input first, then output) now that all work is done.
for connection in (con, con_out):
    connection.close()

NameError: name 'con' is not defined