In [101]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json

import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

<center><h2><b>Leer DB</b></h2></center>

In [104]:
#
# medication: Columns (11) have mixed types. Specify dtype option on import or set low_memory=False-
# TODO: Feature idea - 
#
def read_csvs(csv_dir='../db/csv_clean/', datasets=None):
    """Load a set of CSV tables into a dict of DataFrames.

    Parameters
    ----------
    csv_dir : str, optional
        Directory holding one ``<name>.csv`` file per table. Defaults to the
        cleaned-CSV directory this notebook reads from.
    datasets : iterable of str, optional
        Table names (file names without the ``.csv`` suffix) to load.
        Defaults to the full list of tables below.

    Returns
    -------
    dict
        Maps the lower-cased table name to its ``pandas.DataFrame``.
    """
    import os

    if datasets is None:
        datasets = [ 'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar', 'apachePatientResult', 'apachePredVar', 'carePlanCareProvider', 'carePlanEOL', 'carePlanGeneral', 'carePlanGoal',
                     'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital', 'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab', 'note', 'nurseAssessment', 'nurseCare',
                     'nurseCharting', 'pastHistory', 'patient', 'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment', 'vitalAperiodic', 'vitalPeriodic']

    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, which avoids the "mixed types" warning.
    return {
        ds_name.lower(): pd.read_csv(os.path.join(csv_dir, ds_name + '.csv'), low_memory=False)
        for ds_name in datasets
    }

def sql_query(q, db_path='../db/sqlite/eicu_v2_0_1_clean.sqlite3'):
    """Run a SQL query against a SQLite database and return the rows as a DataFrame.

    Parameters
    ----------
    q : str
        SQL statement to execute.
    db_path : str, optional
        Path of the SQLite database file. Defaults to the cleaned database
        this notebook has always queried.

    Returns
    -------
    pandas.DataFrame
        The query result.

    Raises
    ------
    Exception
        Whatever ``pandas.read_sql_query`` raises for an invalid query; the
        connection is closed before the error propagates.
    """
    conn = db.connect(db_path)
    try:
        return pd.read_sql_query(q, conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()

def make_relationships(dfs, keys_path='keys.json'):
    """Build the list of table relationships described in a keys JSON file.

    The JSON file maps each table name to an object whose ``'fk'`` entry is
    either falsy (no foreign key) or a 3-item list
    ``[fk_column, target_table, target_column]``.

    Parameters
    ----------
    dfs : dict
        Loaded tables. Not used by this function; kept so existing calls
        keep working.
    keys_path : str, optional
        Path of the JSON file with the key definitions.

    Returns
    -------
    list of tuple
        ``(target_table, target_column, table, fk_column)`` tuples, one per
        foreign key, followed by the hospital -> patient relationship.
    """
    # Context manager so the file handle is closed deterministically.
    with open(keys_path) as fh:
        pk_fk = json.load(fh)

    # Foreign keys declared for these tables are ignored; hospital is linked
    # to patient explicitly below.
    # NOTE(review): the reason for skipping 'medication' is not visible here.
    skipped = ('hospital', 'medication')

    relationships = []
    for ds_name, keys in pk_fk.items():
        fk = keys['fk']
        if not fk or ds_name in skipped:
            continue

        fk_atr, target_table, target_atr = fk
        relationships.append((target_table, target_atr, ds_name, fk_atr))

    relationships.append(('hospital', 'hospitalid', 'patient', 'hospitalid'))

    return relationships

#------------------------------------------------------------------------------------------------------

# Load every CSV table into memory, then derive the table relationships.
# NOTE(review): neither `dfs` nor `relationships` is referenced by the later
# cells visible in this notebook — confirm they are still needed.
dfs = read_csvs()
relationships = make_relationships(dfs)


<center><h2><b>Transformación de columnas</b></h2></center>

In [109]:
#X = dfs['patient']
# Inner join: keeps only patient rows that have a matching apacheApsVar row;
# SELECT * returns the columns of both tables.
X = sql_query("""
    SELECT *
    FROM patient P INNER JOIN apacheApsVar A ON P.patientunitstayid = A.patientunitstayid
""")

# Regression target.
y = X['unitdischargeoffset']

# Columns are selected by POSITION in the joined result; the first element of
# each tuple is only a label for the transformer.
# NOTE(review): the labels presumably name the column found at each index —
# verify against the actual column order of the cleaned database.
# NOTE(review): no `remainder=` is passed, so columns beyond index 12
# (presumably including everything joined in from apacheApsVar) fall under
# ColumnTransformer's default handling — confirm that is intended.
X_T = ColumnTransformer(transformers=[
    ('patientunitstayid',       'drop',           [0]),
    ('gender',                  OneHotEncoder(),  [1]),
    ('age',                     'passthrough',    [2]), # TODO: try treating it as categorical
    ('ethnicity',               OneHotEncoder(),  [3]),
    ('hospitalid',              'passthrough',    [4]), # TODO: same as above
    ('apacheadmissiondx',       'drop',           [5]), # JK
    ('admissionheight',         'passthrough',    [6]),
    ('hospitaladmitoffset',     'passthrough',    [7]),
    ('hospitaladmitsource',      OneHotEncoder(), [8]),
    ('hospitaldischargeoffset', 'passthrough',    [9]), # NOTE(review): possibly only known after discharge — check for target leakage
    ('unitvisitnumber',         'passthrough',    [10]),    
    ('admissionweight',         'passthrough',    [11]),
    ('unitdischargeoffset',     'drop',           [12]), # obviously: this is the target (y)
    #('unabridgedunitlos', 'passthrough', [13])
]).fit_transform(X)

<center><h2><b>Entrenamiento y cálculo del error</b></h2></center>

In [110]:
import time
import statistics as stat

def cv_avg_std(reg, X, y, scoring, cv=5):
    """Cross-validate ``reg`` and summarise the per-fold scores.

    Parameters
    ----------
    reg : estimator
        Regressor passed on to ``cross_val_score``.
    X, y
        Feature matrix and target passed on to ``cross_val_score``.
    scoring : str
        Scoring name passed on to ``cross_val_score``.
    cv : int, optional
        Cross-validation setting passed on to ``cross_val_score`` (default 5).

    Returns
    -------
    tuple
        ``(scores, avg, std_dev)``: the per-fold scores, their mean, and their
        sample standard deviation.

    Raises
    ------
    statistics.StatisticsError
        If fewer than two scores are produced.
    """
    scores = cross_val_score(reg, X, y, cv=cv, scoring=scoring)

    return scores, stat.mean(scores), stat.stdev(scores)

def make_df(datos_reg):
    """Tabulate the per-regressor summaries.

    Parameters
    ----------
    datos_reg : dict
        Maps a regressor name to a dict with the keys ``'avg'``,
        ``'std_dev'``, ``'avg_r2'`` and ``'time'``.

    Returns
    -------
    pandas.DataFrame
        One row per regressor with the columns ``Regresor``, ``Average MAE``
        (absolute value of ``'avg'``), ``Standard Deviation MAE``,
        ``Average R2`` and ``time``.
    """
    # (column title, key in the per-regressor dict, transform applied to the value)
    column_specs = (
        ('Average MAE',            'avg',     abs),
        ('Standard Deviation MAE', 'std_dev', lambda value: value),
        ('Average R2',             'avg_r2',  lambda value: value),
        ('time',                   'time',    lambda value: value),
    )

    table = pd.DataFrame()
    table['Regresor'] = datos_reg.keys()

    for title, key, transform in column_specs:
        table[title] = [transform(summary[key]) for summary in datos_reg.values()]

    return table

# Fixed seed so the scores are reproducible across runs and so both
# cross-validation passes below fit the same forests.
SEED = 42

datos_reg = {}
regressors = [
    ('Random Forest', RandomForestRegressor(random_state=SEED))
]

# Time the cross-validation of each regressor and collect its summary metrics.
for reg_name, reg in regressors:
    # perf_counter is a monotonic clock, the right tool for measuring intervals.
    start_time = time.perf_counter()

    maes, avg, std_dev = cv_avg_std(reg, X_T, y, 'neg_mean_absolute_error')
    r2_scores, r2, _ = cv_avg_std(reg, X_T, y, 'r2')

    # 'time' covers both cross-validation passes.
    datos_reg[reg_name] = {
        'avg': avg,
        'std_dev': std_dev,
        'avg_r2': r2,
        'time': time.perf_counter() - start_time,
    }

make_df(datos_reg)

Unnamed: 0,Regresor,Average MAE,Standard Deviation MAE,Average R2,time
0,Random Forest,2059.738024,276.458815,0.385292,6.656238


<center><h2>JOIN test</h2></center>

In [None]:
# Scratch check: join patient with apachepatientresult in the local test.db file.
q = """
    SELECT *
    FROM patient P INNER JOIN apachepatientresult A ON (P.patientunitstayid = A.patientunitstayid)
"""

conn = db.connect('test.db')
try:
    df = pd.read_sql_query(q, conn)
finally:
    # Release the connection even when the query fails (e.g. a missing table).
    conn.close()
df