In [62]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json
import time
import statistics as stat

import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

<center><h2><b>Leer DB</b></h2></center>

In [63]:
#
# medication: pandas reported "Columns (11) have mixed types. Specify dtype option on import
# or set low_memory=False". read_csvs() passes low_memory=False so every column's dtype is
# inferred from the whole file instead of chunk by chunk.
# TODO: Feature idea - 
#
def read_csvs():
    """Load every cleaned CSV table into memory.

    Each table listed below is read from ``../db/csv_clean/<table>.csv``.

    Returns:
        dict: maps the lower-cased table name (e.g. ``'admissiondx'``) to the
        ``pandas.DataFrame`` read from the matching file.
    """
    datasets = [ 'admissiondrug', 'admissionDx', '_allergyDrug', '_allergyNonDrug', 'apacheApsVar', 'apachePatientResult', 'apachePredVar', 'carePlanCareProvider', 'carePlanEOL',
                 'carePlanGeneral', 'carePlanGoal','carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital', 'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab',
                 'note', 'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient', 'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment', 'vitalAperiodic', 'vitalPeriodic']

    # low_memory=False avoids the mixed-dtype warning noted above by parsing
    # each file in one pass rather than in internally-sized chunks.
    return {
        ds_name.lower(): pd.read_csv('../db/csv_clean/' + ds_name + '.csv', low_memory=False)
        for ds_name in datasets
    }

def sql_query(q, db_path='../db/sqlite/eicu_v2_0_1_clean.sqlite3'):
    """Run a SQL query against the SQLite database and return the rows as a DataFrame.

    Args:
        q: SQL text to execute.
        db_path: Path of the SQLite file to open. Defaults to the cleaned
            database this notebook has always queried.

    Returns:
        pandas.DataFrame holding the query result.

    Raises:
        Whatever ``pd.read_sql_query`` raises for a failing query; the
        connection is closed before the exception propagates.
    """
    conn = db.connect(db_path)
    try:
        # try/finally: a failing query must not leak the open connection.
        return pd.read_sql_query(q, conn)
    finally:
        conn.close()

def make_relationships(dfs, keys_path='keys.json'):
    """Build the list of table relationships described in the keys file.

    The keys file is a JSON object mapping each table name to a dict whose
    ``'fk'`` entry is either a false/empty value (no foreign key) or a
    3-item sequence ``[fk_column, target_table, target_column]``.

    Args:
        dfs: Loaded tables. Not used by this function; kept so existing
            callers keep working.
        keys_path: Path of the JSON keys file. Defaults to ``'keys.json'``
            in the current working directory.

    Returns:
        list of ``(target_table, target_column, table, fk_column)`` tuples,
        one per declared foreign key, followed by the fixed
        hospital -> patient relationship.
    """
    # Context manager so the file handle is closed deterministically.
    with open(keys_path, encoding='utf-8') as keys_file:
        pk_fk = json.load(keys_file)

    relationships = []
    for ds_name, keys in pk_fk.items():
        fk = keys['fk']
        # Tables without a foreign key (false, null or empty) are skipped, as are
        # 'hospital' and 'medication', which are never linked from the keys file.
        if not fk or ds_name in ('hospital', 'medication'):
            continue

        fk_atr, target_table, target_atr = fk
        relationships.append((target_table, target_atr, ds_name, fk_atr))

    # The hospital/patient link is always added explicitly.
    relationships.append(('hospital', 'hospitalid', 'patient', 'hospitalid'))

    return relationships

#------------------------------------------------------------------------------------------------------

# Load every table once, then build the relationship tuples from keys.json.
# Note: make_relationships() receives dfs but reads only the keys file.
dfs = read_csvs()
relationships = make_relationships(dfs)


<center><h2><b>Transformación de columnas</b></h2></center>

In [67]:
# Join each patient row with its diagnosis, admission-drug and respiratory-care rows
# and with the APACHE result. Because the WHERE clause filters on A.apacheversion,
# patients without an APACHE "IV" row are discarded even though the joins are LEFT JOINs.
#
# The literal is written 'IV' (single quotes): SQLite parses a double-quoted token as
# an identifier first and only falls back to a string literal on builds that keep
# that legacy behaviour enabled.
X_query = """
    SELECT *
    FROM patient P LEFT JOIN diagnosis        D ON P.patientunitstayid = D.patientunitstayid
                   LEFT JOIN admissiondrug   AD ON P.patientunitstayid = AD.patientunitstayid
                   LEFT JOIN respiratoryCare RC ON P.patientunitstayid = RC.patientunitstayid
                   LEFT JOIN ( SELECT patientunitstayid, unabridgedunitlos, apacheversion
                                FROM apachePatientResult
                              ) A ON P.patientunitstayid = A.patientunitstayid
    WHERE A.apacheversion = 'IV'
"""

# Drop the join key and the constant filter column; neither is a feature.
X = sql_query(X_query).drop(columns=['patientunitstayid', 'apacheversion'])
# Regression target.
y = X['unitdischargeoffset']

# Replace NaN with a per-column default value
nan_defaults = {
    'ASPIRIN': 0.0,
    'LISINOPRIL': 0.0,
    'LASIX': 0.0,
    'currenthistoryseqnum': 'None',
}
for column, default in nan_defaults.items():
    X[column] = X[column].fillna(default)

In [68]:
# Display the assembled feature frame for a visual check.
X

Unnamed: 0,gender,age,ethnicity,hospitalid,apacheadmissiondx,admissionheight,hospitaladmitoffset,hospitaladmitsource,hospitaldischargeoffset,unitvisitnumber,...,avg_hospital_stay,admission_bmi,last1,last2,last3,ASPIRIN,LISINOPRIL,LASIX,currenthistoryseqnum,unabridgedunitlos
0,Female,87,Caucasian,59,"Rhythm disturbance (atrial, supraventricular)",157.5,-8,Emergency Department,2616,1,...,366,3.387097,,,,0.0,0.0,0.0,,1.5625
1,Male,76,Caucasian,68,"Endarterectomy, carotid",167.0,-1,Operating Room,1218,1,...,0,2.154839,s/p cartoid endarterectomy,bradycardia,coronary artery disease,0.0,0.0,0.0,,0.5506
2,Female,34,Caucasian,56,"Overdose, other toxin, poison or drug",172.7,-23,Emergency Department,1138,1,...,0,2.864013,,,,0.0,0.0,0.0,,0.7784
3,Male,61,Caucasian,68,"GI perforation/rupture, surgery for",177.8,-10,Emergency Department,5263,1,...,0,1.938931,sepsis,sepsis,diverticulitis of colon,0.0,0.0,0.0,,0.9506
4,Female,55,Caucasian,63,"Endarterectomy, carotid",157.5,-19,Operating Room,1086,1,...,610,2.172414,,,,0.0,0.0,0.0,,0.3305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1833,Male,52,African American,458,"Angina, stable (asymp or stable pattern of sym...",159.0,0,Direct Admit,15845,1,...,12911,2.056921,chest pain,,,0.0,0.0,0.0,Mid,1.0347
1834,Female,62,Caucasian,459,"Fistula/abscess, surgery for (not inflammatory...",165.1,-68242,Operating Room,36894,1,...,0,1.227509,ventilatory failure,ventilatory failure,enteric fistula,0.0,0.0,0.0,High,3.7458
1835,Male,41,African American,458,"CABG alone, coronary artery bypass grafting",177.8,-1512,Direct Admit,18482,2,...,0,1.400000,s/p CABG < 7 days,s/p CABG < 7 days,acute myocardial infarction (with ST elevation),0.0,0.0,0.0,High,2.9590
1836,Male,41,African American,458,"Infarction, acute myocardial (MI)",177.8,-136,Direct Admit,19858,1,...,18482,1.400000,acute myocardial infarction (with ST elevation),,,0.0,0.0,0.0,High,0.9506


In [73]:
# Preprocessing plan: one (step name, transformer) pair per column of X, listed in
# column order, so the position of each pair is the column index it applies to.
column_plan = [
    ('gender',                         OneHotEncoder()),
    ('age',                            'passthrough'),    # TODO: try treating it as categorical
    ('ethnicity',                      OneHotEncoder()),
    ('hospitalid',                     OneHotEncoder()),  # TODO: '' (same note as for age)
    ('apacheadmissiondx',              'drop'),           # JK
    ('admissionheight',                'passthrough'),
    ('hospitaladmitoffset',            'passthrough'),
    ('hospitaladmitsource',            OneHotEncoder()),
    ('hospitaldischargeoffset',        'drop'),
    ('unitvisitnumber',                'passthrough'),
    ('admissionweight',                'passthrough'),
    ('unitdischargeoffset',            'drop'),           # obv (presumably: this is the prediction target)
    ('_avg_unit_stay',                 'passthrough'),
    ('_avg_hospital_stay',             'passthrough'),
    ('_admission_bmi',                 'passthrough'),
    ('_DIAGNOSIS_last1',               'drop'),
    ('_DIAGNOSIS_last2',               'drop'),
    ('_DIAGNOSIS_last3',               'drop'),
    ('_admissionAspirin',              'passthrough'),
    ('_admissionLisinopril',           'passthrough'),
    ('_admissionLasix',                'passthrough'),
    ('_respCare_currenthistoryseqnum', OneHotEncoder()),
    ('unabridgedunitlos',              'drop'),
]

# Attach the positional column selector ([0], [1], ..., [22]) to every step.
column_steps = [
    (step_name, step, [position])
    for position, (step_name, step) in enumerate(column_plan)
]

X_T = ColumnTransformer(transformers=column_steps).fit_transform(X)

# Show how the number of columns changes
print(X.shape, '->', X_T.shape)

(1838, 23) -> (1838, 208)


<center><h2><b>Entrenamiento y calcular Error</b></h2></center>

In [74]:
# avg df: 3484.0752 = 2.420139 dias.
def cv_avg_std(reg, X, y, scoring, cv=5):
    """Cross-validate ``reg`` and summarise the per-fold scores.

    Args:
        reg: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        scoring: Scoring name forwarded to ``cross_val_score``
            (the notebook uses ``'neg_mean_absolute_error'`` and ``'r2'``).
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        tuple ``(scores, avg, std_dev)``: the per-fold scores, their mean and
        their sample standard deviation.

    Raises:
        statistics.StatisticsError: if fewer than two fold scores are produced.
    """
    scores = cross_val_score(reg, X, y, cv=cv, scoring=scoring)

    # statistics.stdev is the sample standard deviation, i.e. sqrt(variance).
    return scores, stat.mean(scores), stat.stdev(scores)

def make_df(datos_reg):
    """Tabulate the collected metrics, one row per regressor.

    Args:
        datos_reg: dict mapping a regressor name to a dict with the keys
            ``'avg'``, ``'std_dev'``, ``'avg_r2'`` and ``'time'``.

    Returns:
        pandas.DataFrame with the columns ``Regresor``, ``Average MAE``
        (absolute value of ``'avg'``), ``Standard Deviation MAE``,
        ``Average R2`` and ``time``.
    """
    resultados = list(datos_reg.values())

    def columna(clave):
        # Pull one metric out of every regressor's result dict, in order.
        return [resultado[clave] for resultado in resultados]

    return pd.DataFrame({
        'Regresor':               list(datos_reg.keys()),
        'Average MAE':            [abs(valor) for valor in columna('avg')],
        'Standard Deviation MAE': columna('std_dev'),
        'Average R2':             columna('avg_r2'),
        'time':                   columna('time'),
    })

# Fixed seed: the forest is stochastic, so without it the metrics change on every run
# and the MAE and R2 evaluations below would be computed on different forests.
SEED = 42

datos_reg = {}
regressors = [
    ('Random Forest', RandomForestRegressor(random_state=SEED))
]

# Time the evaluation and collect the cross-validated metrics of each regressor
for reg_name, reg in regressors:
    # perf_counter is a monotonic clock meant for measuring elapsed time.
    start_time = time.perf_counter()

    # Each metric gets its own name so the R2 fold scores no longer overwrite the MAE ones.
    maes, avg, std_dev = cv_avg_std(reg, X_T, y, 'neg_mean_absolute_error')
    r2_scores, r2, r2_std_dev = cv_avg_std(reg, X_T, y, 'r2')

    # 'time' covers both cross-validation runs.
    datos_reg[reg_name] = {
        'avg': avg,
        'std_dev': std_dev,
        'time': time.perf_counter() - start_time,
        'avg_r2': r2,
    }

make_df(datos_reg)

Unnamed: 0,Regresor,Average MAE,Standard Deviation MAE,Average R2,time
0,Random Forest,2522.362636,400.980818,-0.009394,21.648863
