In [8]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json
import time
import statistics as stat

import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

#!pip install xboost
#from xgboost import XGBRegressor

<center><h2><b>Leer DB</b></h2></center>

In [9]:
#
# medication: Columns (11) have mixed types. Specify dtype option on import or set low_memory=False-
# TODO: Feature idea - 
#
def read_csvs():
    datasets = [ 'admissiondrug', 'admissionDx', '_allergyDrug', '_allergyNonDrug', 'apacheApsVar', 'apachePatientResult', 'apachePredVar', 'carePlanCareProvider', 'carePlanEOL',
                 'carePlanGeneral', 'carePlanGoal','carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital', 'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab',
                 'note', 'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient', 'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment', 'vitalAperiodic', 'vitalPeriodic']

    dfs = {}

    for ds_name in datasets:
        #dfs[ds_name.lower()] = (pd.read_csv('../db/csv/' + ds_name + '.csv'), )
        dfs[ds_name.lower()] = pd.read_csv('../db/csv_clean/' + ds_name + '.csv')
    
    return dfs

def sql_query(q):
    conn = db.connect('../db/sqlite/eicu_v2_0_1_clean.sqlite3')
    df = pd.read_sql_query(q, conn)
    conn.close()
    
    return df

def make_relationships(dfs):
    relationships = []
    pk_fk = json.loads( open('keys.json').read() )
    i = 0

    for ds_name in pk_fk:
        #if pk_fk[ds_name]['pk'] != False:
        #    dfs[ds_name][0].set_index(pk_fk[ds_name]['pk'])
        
        if pk_fk[ds_name]['fk'] != False and ds_name not in ('hospital', 'medication'):
            #print(ds_name, pk[ds_name]['fk'])
            fk_atr, target_table, target_atr = pk_fk[ds_name]['fk']
            
            #print((target_table, target_atr, ds_name, fk_atr))
            relationships.append((target_table, target_atr, ds_name, fk_atr))
            
    relationships.append(('hospital', 'hospitalid', 'patient', 'hospitalid'))

    return relationships

#------------------------------------------------------------------------------------------------------

dfs = read_csvs()
relationships = make_relationships(dfs)


In [10]:
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 400)
#X.head(10)
#sql_query("""
#""").head(100)

<center><h2><b>Transformación de columnas</b></h2></center>

In [11]:
X_query = """
    SELECT *
    FROM patient P LEFT JOIN diagnosis        D     ON P.patientunitstayid = D.patientunitstayid
                   LEFT JOIN admissiondrug   AD     ON P.patientunitstayid = AD.patientunitstayid
                   LEFT JOIN respiratoryCare RC     ON P.patientunitstayid = RC.patientunitstayid
                   LEFT JOIN physicalExam    PE     ON P.patientunitstayid = PE.patientunitstayid
                   LEFT JOIN admissionDx     ADX    ON P.patientunitstayid = ADX.patientunitstayid
                   LEFT JOIN carePlanCareProvider C ON P.patientunitstayid = C.patientunitstayid
                   LEFT JOIN infusiondrug ID        ON P.patientunitstayid = ID.patientunitstayid
                   LEFT JOIN carePlanGeneral CG ON P.patientunitstayid = CG.patientunitstayid
                   LEFT JOIN carePlanInfectiousDisease CGID ON P.patientunitstayid = CGID.patientunitstayid
                   LEFT JOIN carePlanGoal CPG ON P.patientunitstayid = CPG.patientunitstayid
"""

X = sql_query(X_query).drop(columns=['patientunitstayid'])
y = X['unitdischargeoffset']

# Reemplazar NaN por un valor por defecto
X['ASPIRIN'] = X['ASPIRIN'].fillna(0.0)
X['LISINOPRIL'] = X['LISINOPRIL'].fillna(0.0)
X['LASIX'] = X['LASIX'].fillna(0.0)
X['currenthistoryseqnum'] = X['currenthistoryseqnum'].fillna('_Unknown')

for col in 'BPD_Current BPD_Highest BPD_Lowest BPS_Current BPS_Highest BPS_Lowest Blood_Loss Dialysis_Net O2Sat_Current O2Sat_Highest O2Sat_Lowest Urine Intubated Comatose Ventilated Motor Verbal Eyes'.split():
    X[col] = X[col].fillna(-1)

for col in 'Cardiovascular Respiratory Neurologic Care_Patient_Family Care_Pulmonary Care_Fluid_Balance_Treatments Care_Activity_Safety Care_Cardiovascular Care_Infection_Labs'.split():
    X[col] = X[col].fillna(0)
    
for col in ['Categoria_Hospital', 'Categoria_Cardiology', 'Categoria_Internal_Medicine', 'Intervencion_I', 'Intervencion_II', 'Intervencion_III', 'Intervencion_IV']:
    X[col] = X[col].fillna(0)
    
for col in ['last_propofol', 'last_propofol_drugrate', 'last_propofol_infusionrate', 'last_propofol_drugamount', 'last_propofol_volumeoffluid', 'last_propofol_patientweight', 'last_insulin',
 'last_insulin_drugrate', 'last_insulin_infusionrate', 'last_insulin_drugamount', 'last_insulin_volumeoffluid', 'last_insulin_patientweight', 'last_amiodarone', 'last_amiodarone_drugrate',
 'last_amiodarone_infusionrate', 'last_amiodarone_drugamount', 'last_amiodarone_volumeoffluid', 'last_amiodarone_patientweight', 'last_dexmedetomidine', 'last_dexmedetomidine_drugrate', 'last_dexmedetomidine_infusionrate',
 'last_dexmedetomidine_drugamount', 'last_dexmedetomidine_volumeoffluid', 'last_dexmedetomidine_patientweight', 'last_pantoprazole', 'last_pantoprazole_drugrate', 'last_pantoprazole_infusionrate', 'last_pantoprazole_drugamount',
 'last_pantoprazole_volumeoffluid', 'last_pantoprazole_patientweight', 'last_fentanyl', 'last_fentanyl_drugrate', 'last_fentanyl_infusionrate', 'last_fentanyl_drugamount', 'last_fentanyl_volumeoffluid',
 'last_fentanyl_patientweight', 'last_norepinephrine', 'last_norepinephrine_drugrate', 'last_norepinephrine_infusionrate', 'last_norepinephrine_drugamount', 'last_norepinephrine_volumeoffluid', 'last_norepinephrine_patientweight',
 'last_midazolam', 'last_midazolam_drugrate', 'last_midazolam_infusionrate', 'last_midazolam_drugamount', 'last_midazolam_volumeoffluid', 'last_midazolam_patientweight', 'last_heparin',
 'last_heparin_drugrate', 'last_heparin_infusionrate', 'last_heparin_drugamount', 'last_heparin_volumeoffluid', 'last_heparin_patientweight']:
        X[col] = X[col].fillna(-1)
        
for col in ['Categoria_Ventilacion', 'Categoria_DVT_Prophylaxis', 'Categoria_Airway', 'Categoria_Care_Limitation', 'Categoria_Stress_Ulcer_Prophylaxis']:
    X[col] = X[col].fillna(0)
    
for col in ['infectdiseasesite', 'infectdiseaseassessment']:
    X[col] = X[col].fillna('_None')

In [12]:
X.head(15)

Unnamed: 0,gender,age,ethnicity,hospitalid,apacheadmissiondx,admissionheight,hospitaladmitoffset,hospitaladmitsource,hospitaldischargeoffset,unitvisitnumber,admissionweight,unitdischargeoffset,avg_unit_stay,avg_hospital_stay,admission_bmi,last1,last2,last3,ASPIRIN,LISINOPRIL,LASIX,currenthistoryseqnum,BPD_Current,BPD_Highest,BPD_Lowest,BPS_Current,BPS_Highest,BPS_Lowest,Blood_Loss,Dialysis_Net,O2Sat_Current,O2Sat_Highest,O2Sat_Lowest,Urine,Intubated,Comatose,Ventilated,Motor,Verbal,Eyes,Cardiovascular,Respiratory,Neurologic,Categoria_Hospital,Categoria_Cardiology,Categoria_Internal_Medicine,Intervencion_I,Intervencion_II,Intervencion_III,Intervencion_IV,last_norepinephrine,last_norepinephrine_drugrate,last_norepinephrine_infusionrate,last_norepinephrine_drugamount,last_norepinephrine_volumeoffluid,last_norepinephrine_patientweight,last_heparin,last_heparin_drugrate,last_heparin_infusionrate,last_heparin_drugamount,last_heparin_volumeoffluid,last_heparin_patientweight,last_amiodarone,last_amiodarone_drugrate,last_amiodarone_infusionrate,last_amiodarone_drugamount,last_amiodarone_volumeoffluid,last_amiodarone_patientweight,last_dexmedetomidine,last_dexmedetomidine_drugrate,last_dexmedetomidine_infusionrate,last_dexmedetomidine_drugamount,last_dexmedetomidine_volumeoffluid,last_dexmedetomidine_patientweight,last_insulin,last_insulin_drugrate,last_insulin_infusionrate,last_insulin_drugamount,last_insulin_volumeoffluid,last_insulin_patientweight,last_propofol,last_propofol_drugrate,last_propofol_infusionrate,last_propofol_drugamount,last_propofol_volumeoffluid,last_propofol_patientweight,last_midazolam,last_midazolam_drugrate,last_midazolam_infusionrate,last_midazolam_drugamount,last_midazolam_volumeoffluid,last_midazolam_patientweight,last_fentanyl,last_fentanyl_drugrate,last_fentanyl_infusionrate,last_fentanyl_drugamount,last_fentanyl_volumeoffluid,last_fentanyl_patientweight,last_pantoprazole,last_pantoprazole_drugrate,last_pantoprazole_infusionrate,last_pantoprazole_drugamount,last_pantoprazole_volumeoffluid,last_pantoprazole_patientweight,Categoria_Ventilacion,Categoria_DVT_Prophylaxis,Categoria_Airway,Categoria_Care_Limitation,Categoria_Stress_Ulcer_Prophylaxis,infectdiseasesite,infectdiseaseassessment,Care_Patient_Family,Care_Pulmonary,Care_Fluid_Balance_Treatments,Care_Activity_Safety,Care_Cardiovascular,Care_Infection_Labs
0,Female,87,Caucasian,59,"CHF, congestive heart failure",157.5,-2258,Emergency Department,366,2,67.6,344,0,0,-1.0,,,,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0
1,Female,87,Caucasian,59,"Rhythm disturbance (atrial, supraventricular)",157.5,-8,Emergency Department,2616,1,46.5,2250,344,366,3.387097,,,,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,1.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,2.0,2.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0
2,Male,76,Caucasian,68,"Endarterectomy, carotid",167.0,-1,Operating Room,1218,1,77.5,793,0,0,2.154839,s/p cartoid endarterectomy,bradycardia,coronary artery disease,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,1.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0
3,Female,34,Caucasian,56,"Overdose, other toxin, poison or drug",172.7,-23,Emergency Department,1138,1,60.3,1121,0,0,2.864013,,,,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,0.0,0.0,1.0,0.0,0.0,28.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,1.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0
4,Male,61,Caucasian,68,"GI perforation/rupture, surgery for",177.8,-10,Emergency Department,5263,1,91.7,1369,0,0,1.938931,sepsis,sepsis,diverticulitis of colon,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,0.0,0.0,0.0,0.0,0.0,153.0,0.0,1.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,2.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0
5,Female,55,Caucasian,63,"CHF, congestive heart failure",157.5,-495,Operating Room,610,2,85.45,610,0,0,-1.0,,,,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0
6,Female,55,Caucasian,63,"Endarterectomy, carotid",157.5,-19,Operating Room,1086,1,72.5,476,610,610,2.172414,,,,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,1.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0
7,Female,60,Hispanic,67,"Coma/change in level of consciousness (for hepatic see GI, for diabetic see Endocrine, if related to cardiac arrest, see CV)",154.9,0,Emergency Department,3962,1,95.6,2381,0,0,1.620293,,,,0.0,0.0,0.0,Low,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,3,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3.0,1.0,2.0,1.0,1.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0
8,Male,28,Caucasian,61,"Overdose, other toxin, poison or drug",182.9,-1,Emergency Department,1272,1,91.8,1268,0,0,1.992375,,,,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,4,4,0.0,0.0,1.0,0.0,0.0,-82.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,2.0,1.0,1.0,1.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0
9,Female,34,Caucasian,68,"Sepsis, pulmonary",165.1,-121,Emergency Department,387,2,80.2,387,0,0,-1.0,,,,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
X_T = ColumnTransformer(transformers=[
    ('gender',                  OneHotEncoder(),  [0]),
    ('age',                     'passthrough',    [1]), # TODO: Probar categórica
    ('ethnicity',               OneHotEncoder(),  [2]),
    ('hospitalid',              OneHotEncoder(),  [3]), # TODO: ''
    ('apacheadmissiondx',       'drop',           [4]), # JK
    ('admissionheight',         'passthrough',    [5]),
    ('hospitaladmitoffset',     'passthrough',    [6]),
    ('hospitaladmitsource',      OneHotEncoder(), [7]),
    ('hospitaldischargeoffset', 'drop',           [8]),
    ('unitvisitnumber',         'passthrough',    [9]),    
    ('admissionweight',         'passthrough',    [10]),
    ('unitdischargeoffset',     'drop',           [11]), # obv
    ('_avg_unit_stay',          'passthrough',    [12]),
    ('_avg_hospital_stay',      'passthrough',    [13]),
    ('_admission_bmi',          'passthrough',    [14]),
    ('_DIAGNOSIS_last1',        'drop',           [15]),
    ('_DIAGNOSIS_last2',        'drop',           [16]),
    ('_DIAGNOSIS_last3',        'drop',           [17]),
    ('_admissionAspirin',       'passthrough',    [18]),
    ('_admissionLisinopril',    'passthrough',    [19]),
    ('_admissionLasix',         'passthrough',    [20]),
    ('_respCare_currenthistoryseqnum',   OneHotEncoder(),  [21]),
    
    ('a0', 'passthrough',  [22]),
    ('a1', 'passthrough',  [23]),
    ('a2', 'passthrough',  [24]),
    ('a3', 'passthrough',  [25]),
    ('a4', 'passthrough',  [26]),
    ('a5', 'passthrough',  [27]),
    ('a6', 'passthrough',  [28]),
    ('a7', 'passthrough',  [29]),
    ('a8', 'passthrough',  [30]),
    ('a9', 'passthrough',  [31]),
    ('b1', 'passthrough',  [32]),    
    ('b2', 'passthrough',  [33]),
    ('b3', 'passthrough',  [34]),
    ('b4', 'passthrough',  [35]),
    ('b5', 'passthrough',  [36]),
    ('b6', 'passthrough',  [37]),
    ('b7', 'passthrough',  [38]),
    ('b8', 'passthrough',  [39]),

    # AdmissionDx
    ('_admissionCardiovascular', 'passthrough',  [40]),
    ('_admissionRespiratory', 'passthrough',  [41]),
    ('_admissionNeurologic', 'passthrough',  [42]),
  
    # CarePlanProvider
# Random Forest	2273.361306	358.72646	0.140683	82.065168
    ('Categoria_Hospital', 'passthrough',  [43]), 
    ('Categoria_Cardiology', 'passthrough',  [44]),
    ('Categoria_Internal_Medicine', 'passthrough',  [45]),
    ('Intervencion_I', 'passthrough',  [46]),
    ('Intervencion_II', 'passthrough',  [47]),
    ('Intervencion_III', 'passthrough',  [48]), 
    ('Intervencion_IV', 'passthrough',  [49]),
    
    # Medicamentos
    ('9287', 'drop', [50]),
    ('9419', 'drop', [51]),
    ('8161', 'drop', [52]),
    ('2615', 'drop', [53]),
    ('7038', 'drop', [54]),
    ('1696', 'drop', [55]),
    ('7340', 'drop', [56]),
    ('4964', 'drop', [57]),
    ('3927', 'drop', [58]),
    ('1041', 'drop', [59]),
    ('3996', 'drop', [60]),
    ('1947', 'drop', [61]),
    ('3232', 'drop', [62]),
    ('3410', 'drop', [63]),
    ('1411', 'drop', [64]),
    ('7636', 'drop', [65]),
    ('6900', 'drop', [66]),
    ('9622', 'drop', [67]),
    ('5275', 'drop', [68]),
    ('9092', 'drop', [69]),
    ('7364', 'drop', [70]),
    ('3343', 'drop', [71]),
    ('6196', 'drop', [72]),
    ('4519', 'drop', [73]),
    ('4634', 'drop', [74]),
    ('1233', 'drop', [75]),
    ('9876', 'drop', [76]),
    ('8832', 'drop', [77]),
    ('9536', 'drop', [78]),
    ('9307', 'drop', [79]),
    ('4331', 'drop', [80]),
    ('2764', 'drop', [81]),
    ('2197', 'drop', [82]),
    ('3814', 'drop', [83]),
    ('7092', 'drop', [84]),
    ('7680', 'drop', [85]),
    ('1068', 'drop', [86]),
    ('5827', 'drop', [87]),
    ('9052', 'drop', [88]),
    ('3484', 'drop', [89]),
    ('2130', 'drop', [90]),
    ('2185', 'drop', [91]),
    ('6193', 'drop', [92]),
    ('3634', 'drop', [93]),
    ('5877', 'drop', [94]),
    ('7850', 'drop', [95]),
    ('2354', 'drop', [96]),
    ('1046', 'drop', [97]),
    ('4857', 'drop', [98]),
    ('3274', 'drop', [99]),
    ('9907', 'drop', [100]),
    ('9037', 'drop', [101]),
    ('9950', 'drop', [102]),
    ('1253', 'drop', [103]),
    
    # CarePlanGeneral
    ('Categoria_Ventilacion','passthrough',  [104]),
    ('Categoria_DVT_Prophylaxis', 'passthrough',  [105]),
    ('Categoria_Airway', 'passthrough',  [106]),
    ('Categoria_Care_Limitation', 'passthrough',  [107]),
    ('Categoria_Stress_Ulcer_Prophylaxis', 'passthrough',  [108]),
    
    # CarePlanInfectiousDisease
    ('CPID_tipo_infeccion', OneHotEncoder(),  [109]),
    ('CPID_seguridad_infeccion', OneHotEncoder(),  [110]),

    
     
    
    # CarePlanGoal
    ('Care_Patient_Family', 'passthrough',  [111]),
    ('Care_Pulmonary', 'passthrough',  [112]),
    ('Care_Fluid_Balance_Treatments', 'passthrough',  [113]),
    ('Care_Activity_Safety', 'passthrough',  [114]),
    ('Care_Cardiovascular', 'passthrough',  [115]),
    ('Care_Infection_Labs', 'passthrough',  [116])
]).fit_transform(X)

# Mostrar el cambio en columnas
print(X.shape, '->', X_T.shape)

(2531, 117) -> (2531, 277)


<center><h2><b>Entrenamiento y calcular Error</b></h2></center>

In [21]:
from xgboost import XGBRegressor
# avg df: 3484.0752 = 2.420139 dias.
def cv_avg_std(reg, X, y, scoring):
    maes = cross_val_score(reg, X, y, cv=5, scoring=scoring)
    avg = stat.mean(maes)
    std_dev = stat.variance(maes)**(1/2)
    
    return maes, avg, std_dev

def make_df(datos_reg):
    error_df = pd.DataFrame()

    error_df['Regresor']                = datos_reg.keys()
    error_df['Average MAE']             = [ abs(dato['avg']) for dato in datos_reg.values() ]
    error_df['Standard Deviation MAE']  = [ dato['std_dev'] for dato in datos_reg.values() ]
    error_df['Average R2']              = [ dato['avg_r2'] for dato in datos_reg.values() ]
    error_df['time']                    = [ dato['time'] for dato in datos_reg.values() ]
    
    return error_df

datos_reg = {}
regressors = [
    #('RF', RandomForestRegressor()
    ('Random Forest', RandomForestRegressor(
      criterion="absolute_error",
      max_features=48,
      max_depth=50,
      random_state=600738951),

# XGBRegressor	2077.741577	181.879263	0.309546	13.085054
#    ( 'XGBRegressor', XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
    )
]

# Medir tiempo y hacer predicciones para cada regresor
for reg_name, reg in regressors:
    start_time = time.time()

    maes, avg, std_dev = cv_avg_std(reg, X_T, y, 'neg_mean_absolute_error')
    maes, r2, _ = cv_avg_std(reg, X_T, y, 'r2')
    
    datos_reg[reg_name] = { 'avg': avg, 'std_dev': std_dev, 'time': time.time() - start_time }
    datos_reg[reg_name]['avg_r2'] = r2

make_df(datos_reg)

Unnamed: 0,Regresor,Average MAE,Standard Deviation MAE,Average R2,time
0,Random Forest,1998.258554,108.00929,0.347924,248.894837
