In [212]:
import sqlite3 as db
import pandas as pd
#import featuretools as ft
import json
import time
import statistics as stat
import random

import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

!pip install xgboost
from xgboost import XGBRegressor



<center><h2><b>Leer DB</b></h2></center>

In [213]:
#
# medication: Columns (11) have mixed types. Specify dtype option on import or set low_memory=False-
# TODO: Feature idea - 
#
def read_csvs():
    datasets = [ 'admissiondrug', 'admissionDx', '_allergyDrug', '_allergyNonDrug', 'apacheApsVar', 'apachePatientResult', 'apachePredVar', 'carePlanCareProvider', 'carePlanEOL',
                 'carePlanGeneral', 'carePlanGoal','carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital', 'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab',
                 'note', 'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient', 'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment', 'vitalAperiodic', 'vitalPeriodic']

    dfs = {}

    for ds_name in datasets:
        #dfs[ds_name.lower()] = (pd.read_csv('../db/csv/' + ds_name + '.csv'), )
        dfs[ds_name.lower()] = pd.read_csv('../db/csv_clean/' + ds_name + '.csv')
    
    return dfs

def sql_query(q):
    conn = db.connect('../db/sqlite/eicu_v2_0_1_clean.sqlite3')
    df = pd.read_sql_query(q, conn)
    conn.close()
    
    return df

def make_relationships(dfs):
    relationships = []
    pk_fk = json.loads( open('keys.json').read() )
    i = 0

    for ds_name in pk_fk:
        #if pk_fk[ds_name]['pk'] != False:
        #    dfs[ds_name][0].set_index(pk_fk[ds_name]['pk'])
        
        if pk_fk[ds_name]['fk'] != False and ds_name not in ('hospital', 'medication'):
            #print(ds_name, pk[ds_name]['fk'])
            fk_atr, target_table, target_atr = pk_fk[ds_name]['fk']
            
            #print((target_table, target_atr, ds_name, fk_atr))
            relationships.append((target_table, target_atr, ds_name, fk_atr))
            
    relationships.append(('hospital', 'hospitalid', 'patient', 'hospitalid'))

    return relationships

#------------------------------------------------------------------------------------------------------

dfs = read_csvs()
relationships = make_relationships(dfs)


<center><h2><b>Transformación de columnas</b></h2></center>

In [214]:
X_query = """
    SELECT *
    FROM patient P LEFT JOIN diagnosis        D     ON P.patientunitstayid = D.patientunitstayid
                   LEFT JOIN admissiondrug   AD     ON P.patientunitstayid = AD.patientunitstayid
                   LEFT JOIN respiratoryCare RC     ON P.patientunitstayid = RC.patientunitstayid
                   LEFT JOIN physicalExam    PE     ON P.patientunitstayid = PE.patientunitstayid
                   LEFT JOIN admissionDx     ADX    ON P.patientunitstayid = ADX.patientunitstayid
                   LEFT JOIN carePlanCareProvider C ON P.patientunitstayid = C.patientunitstayid
                   LEFT JOIN infusiondrug ID        ON P.patientunitstayid = ID.patientunitstayid
                   LEFT JOIN carePlanGeneral CG ON P.patientunitstayid = CG.patientunitstayid
                   LEFT JOIN carePlanInfectiousDisease CGID ON P.patientunitstayid = CGID.patientunitstayid
                   LEFT JOIN carePlanGoal CPG ON P.patientunitstayid = CPG.patientunitstayid
                   LEFT JOIN vitalAperiodic VAP ON P.patientunitstayid = VAP.patientunitstayid
                   LEFT JOIN vitalPeriodic VP ON P.patientunitstayid = VP.patientunitstayid
                   LEFT JOIN medication M ON P.patientunitstayid = M.patientunitstayid
                   LEFT JOIN allergy AL ON P.patientunitstayid = AL.patientunitstayid
                   LEFT JOIN max_offset MO ON P.patientunitstayid = MO.patientunitstayid
"""

X = sql_query(X_query).drop(columns=['patientunitstayid'])
y = X['unitdischargeoffset']

X['apacheadmissiondx'] = X['apacheadmissiondx'].fillna('_Unknown')

# Reemplazar NaN por un valor por defecto
X['ASPIRIN'] = X['ASPIRIN'].fillna(0.0)
X['LISINOPRIL'] = X['LISINOPRIL'].fillna(0.0)
X['LASIX'] = X['LASIX'].fillna(0.0)
X['currenthistoryseqnum'] = X['currenthistoryseqnum'].fillna('_Unknown')

for col in 'BPD_Current BPD_Highest BPD_Lowest BPS_Current BPS_Highest BPS_Lowest Blood_Loss Dialysis_Net O2Sat_Current O2Sat_Highest O2Sat_Lowest Urine Intubated Comatose Ventilated Motor Verbal Eyes'.split():
    X[col] = X[col].fillna(-1)

for col in 'Cardiovascular Respiratory Neurologic Care_Patient_Family Care_Pulmonary Care_Fluid_Balance_Treatments Care_Activity_Safety Care_Cardiovascular Care_Infection_Labs'.split():
    X[col] = X[col].fillna(0)
    
for col in ['Categoria_Hospital', 'Categoria_Cardiology', 'Categoria_Internal_Medicine', 'Intervencion_I', 'Intervencion_II', 'Intervencion_III', 'Intervencion_IV']:
    X[col] = X[col].fillna(0)
    
for col in ['last_propofol', 'last_propofol_drugrate', 'last_propofol_infusionrate', 'last_propofol_drugamount', 'last_propofol_volumeoffluid', 'last_propofol_patientweight', 'last_insulin',
 'last_insulin_drugrate', 'last_insulin_infusionrate', 'last_insulin_drugamount', 'last_insulin_volumeoffluid', 'last_insulin_patientweight', 'last_amiodarone', 'last_amiodarone_drugrate',
 'last_amiodarone_infusionrate', 'last_amiodarone_drugamount', 'last_amiodarone_volumeoffluid', 'last_amiodarone_patientweight', 'last_dexmedetomidine', 'last_dexmedetomidine_drugrate', 'last_dexmedetomidine_infusionrate',
 'last_dexmedetomidine_drugamount', 'last_dexmedetomidine_volumeoffluid', 'last_dexmedetomidine_patientweight', 'last_pantoprazole', 'last_pantoprazole_drugrate', 'last_pantoprazole_infusionrate', 'last_pantoprazole_drugamount',
 'last_pantoprazole_volumeoffluid', 'last_pantoprazole_patientweight', 'last_fentanyl', 'last_fentanyl_drugrate', 'last_fentanyl_infusionrate', 'last_fentanyl_drugamount', 'last_fentanyl_volumeoffluid',
 'last_fentanyl_patientweight', 'last_norepinephrine', 'last_norepinephrine_drugrate', 'last_norepinephrine_infusionrate', 'last_norepinephrine_drugamount', 'last_norepinephrine_volumeoffluid', 'last_norepinephrine_patientweight',
 'last_midazolam', 'last_midazolam_drugrate', 'last_midazolam_infusionrate', 'last_midazolam_drugamount', 'last_midazolam_volumeoffluid', 'last_midazolam_patientweight', 'last_heparin',
 'last_heparin_drugrate', 'last_heparin_infusionrate', 'last_heparin_drugamount', 'last_heparin_volumeoffluid', 'last_heparin_patientweight']:
        X[col] = X[col].fillna(-1)
        
for col in ['Categoria_Ventilacion', 'Categoria_DVT_Prophylaxis', 'Categoria_Airway', 'Categoria_Care_Limitation', 'Categoria_Stress_Ulcer_Prophylaxis']:
    X[col] = X[col].fillna(0)
    
for col in ['infectdiseasesite', 'infectdiseaseassessment']:
    X[col] = X[col].fillna('_None')

# vitalAperiodic
X['last_aperiodic_off'] = X['last_aperiodic_off'].fillna(-1)
for col in ['last_aperiodic_systolic', 'last_aperiodic_diastolic', 'last_aperiodic_mean']:
    X[col] = X[col].fillna(X[col].median())
    
# vitalPeriodic
for col in ['temperature', 'sao2', 'respiration', 'cvp', 'heartrate']:
    X[col] = X[col].fillna(X[col].median())

# Medication
X['last_order_offset'] = X['last_order_offset'].fillna(-1)

# Allergy
for col in ['nDrugsAllergic', 'nNondrugsAllergic', 'totalAllergic']:
    X[col] = X[col].fillna(0)
    
# Diagnosis
for col in ['last1_off', 'last2_off', 'last3_off', 'last4_off']:
    X[col] = X[col].fillna(-1)

In [215]:
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 400)
#X.head(10)
#sql_query("""
#""").head(100)

X.head(10)

Unnamed: 0,gender,age,ethnicity,hospitalid,apacheadmissiondx,admissionheight,hospitaladmitoffset,hospitaladmitsource,hospitaldischargeoffset,unitvisitnumber,admissionweight,unitdischargeoffset,avg_unit_stay,avg_hospital_stay,admission_bmi,last1,last2,last3,last4,last1_off,last2_off,last3_off,last4_off,ASPIRIN,LISINOPRIL,LASIX,currenthistoryseqnum,BPD_Current,BPD_Highest,BPD_Lowest,BPS_Current,BPS_Highest,BPS_Lowest,Blood_Loss,Dialysis_Net,O2Sat_Current,O2Sat_Highest,O2Sat_Lowest,Urine,Intubated,Comatose,Ventilated,Motor,Verbal,Eyes,Cardiovascular,Respiratory,Neurologic,Categoria_Hospital,Categoria_Cardiology,Categoria_Internal_Medicine,Intervencion_I,Intervencion_II,Intervencion_III,Intervencion_IV,last_pantoprazole,last_pantoprazole_drugrate,last_pantoprazole_infusionrate,last_pantoprazole_drugamount,last_pantoprazole_volumeoffluid,last_pantoprazole_patientweight,last_fentanyl,last_fentanyl_drugrate,last_fentanyl_infusionrate,last_fentanyl_drugamount,last_fentanyl_volumeoffluid,last_fentanyl_patientweight,last_propofol,last_propofol_drugrate,last_propofol_infusionrate,last_propofol_drugamount,last_propofol_volumeoffluid,last_propofol_patientweight,last_amiodarone,last_amiodarone_drugrate,last_amiodarone_infusionrate,last_amiodarone_drugamount,last_amiodarone_volumeoffluid,last_amiodarone_patientweight,last_norepinephrine,last_norepinephrine_drugrate,last_norepinephrine_infusionrate,last_norepinephrine_drugamount,last_norepinephrine_volumeoffluid,last_norepinephrine_patientweight,last_heparin,last_heparin_drugrate,last_heparin_infusionrate,last_heparin_drugamount,last_heparin_volumeoffluid,last_heparin_patientweight,last_insulin,last_insulin_drugrate,last_insulin_infusionrate,last_insulin_drugamount,last_insulin_volumeoffluid,last_insulin_patientweight,last_dexmedetomidine,last_dexmedetomidine_drugrate,last_dexmedetomidine_infusionrate,last_dexmedetomidine_drugamount,last_dexmedetomidine_volumeoffluid,last_dexmedetomidine_patientweight,last_midazolam,last_midazolam_drugrate,last_midazolam_infusionrate,last_midazolam_drugamount,last_midazolam_volumeoffluid,last_midazolam_patientweight,Categoria_Ventilacion,Categoria_DVT_Prophylaxis,Categoria_Airway,Categoria_Care_Limitation,Categoria_Stress_Ulcer_Prophylaxis,infectdiseasesite,infectdiseaseassessment,Care_Patient_Family,Care_Pulmonary,Care_Fluid_Balance_Treatments,Care_Activity_Safety,Care_Cardiovascular,Care_Infection_Labs,last_aperiodic_off,last_aperiodic_systolic,last_aperiodic_diastolic,last_aperiodic_mean,temperature,sao2,respiration,cvp,heartrate,last_order_offset,nDrugsAllergic,nNondrugsAllergic,totalAllergic,max_offset
0,Female,87,Caucasian,59,,157.5,-2258,Emergency Department,366,2,67.6,344,0,0,-1.0,,,,,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,334.0,153.0,78.0,103.0,-1.0,-1.0,-1.0,-1.0,-1.0,1486.0,0.0,0.0,0.0,1486.0
1,Female,87,Caucasian,59,"rhythm disturbance (atrial, supraventricular)",157.5,-8,Emergency Department,2616,1,46.5,2250,344,366,3.387097,,,,,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,1.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,14.0,14.0,14.0,242.0,899.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,2204.0,175.0,77.0,102.0,-1.0,-1.0,-1.0,-1.0,-1.0,1396.0,0.0,0.0,0.0,2244.0
2,Male,76,Caucasian,68,"endarterectomy, carotid",167.0,-1,Operating Room,1218,1,77.5,793,0,0,2.154839,s/p cartoid endarterectomy,bradycardia,coronary artery disease,,10.0,10.0,10.0,-1.0,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,13.0,13.0,13.0,10.0,13.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,754.0,107.0,54.0,78.0,-1.0,-1.0,-1.0,-1.0,-1.0,1033.0,0.0,0.0,0.0,1183.0
3,Female,34,Caucasian,56,"overdose, other toxin, poison or drug",172.7,-23,Emergency Department,1138,1,60.3,1121,0,0,2.864013,,,,,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,0.0,0.0,1.0,0.0,0.0,28.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,28.0,288.0,28.0,28.0,288.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,1043.0,124.0,70.0,93.0,-1.0,-1.0,-1.0,-1.0,-1.0,1013.0,0.0,0.0,0.0,1118.0
4,Male,61,Caucasian,68,"gi perforation/rupture, surgery for",177.8,-10,Emergency Department,5263,1,91.7,1369,0,0,1.938931,sepsis,sepsis,diverticulitis of colon,s/p exploratory laparotomy,330.0,167.0,13.0,167.0,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,0.0,0.0,0.0,0.0,0.0,153.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,14.0,139.0,14.0,14.0,153.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,1180.0,132.0,71.0,95.0,-1.0,-1.0,-1.0,-1.0,-1.0,3542.0,0.0,0.0,0.0,5225.0
5,Female,55,Caucasian,63,,157.5,-495,Operating Room,610,2,85.45,610,0,0,-1.0,,,,,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,500.0,120.0,71.0,89.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,605.0
6,Female,55,Caucasian,63,"endarterectomy, carotid",157.5,-19,Operating Room,1086,1,72.5,476,610,610,2.172414,,,,,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,5,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,43.0,43.0,43.0,43.0,43.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,466.0,127.0,64.0,89.0,-1.0,-1.0,-1.0,-1.0,-1.0,918.0,0.0,0.0,0.0,918.0
7,Female,60,Hispanic,67,"coma/change in level of consciousness (for hepatic see gi, for diabetic see endocrine, if related to cardiac arrest, see cv)",154.9,0,Emergency Department,3962,1,95.6,2381,0,0,1.620293,,,,,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,Low,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,3,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3104.0,51.0,1194.0,51.0,51.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,2335.0,111.0,56.0,76.0,-1.0,-1.0,-1.0,-1.0,-1.0,2469.0,0.0,0.0,0.0,3852.0
8,Male,28,Caucasian,61,"overdose, other toxin, poison or drug",182.9,-1,Emergency Department,1272,1,91.8,1268,0,0,1.992375,,,,,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,4,4,0.0,0.0,1.0,0.0,0.0,-82.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-82.0,1507.0,-82.0,-82.0,-36.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,1258.0,145.0,98.0,115.0,-1.0,96.0,26.0,-1.0,111.0,102.0,0.0,0.0,0.0,1258.0
9,Female,34,Caucasian,68,,165.1,-121,Emergency Department,387,2,80.2,387,0,0,-1.0,,,,,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,_Unknown,-1.0,-1,-1,-1,-1,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,_None,_None,0.0,0.0,0.0,0.0,0.0,0.0,261.0,97.0,65.0,76.0,-1.0,-1.0,-1.0,-1.0,-1.0,382.0,0.0,0.0,0.0,382.0


In [222]:
transformers = [
    # Patient
    ('gender',                  OneHotEncoder(),  []),
    ('age',                     'passthrough',    []), # TODO: Probar categórica
    ('ethnicity',               OneHotEncoder(),  []),
    ('hospitalid',              OneHotEncoder(),  []), # TODO: ''
    ('apacheadmissiondx',       'drop',  [4]), # JK
    ('admissionheight',         'passthrough',    []),
    ('hospitaladmitoffset',     'passthrough',    []),
    ('hospitaladmitsource',      OneHotEncoder(), []),
    ('hospitaldischargeoffset', 'drop',           []),
    ('unitvisitnumber',         'passthrough',    []),    
    ('admissionweight',         'passthrough',    []),
    ('unitdischargeoffset',     'drop',           []), # obv
    ('_avg_unit_stay',          'passthrough',    []),
    ('_avg_hospital_stay',      'passthrough',    []),
    ('_admission_bmi',          'passthrough',    []),
    
    # Diagnosis
    ('_DIAGNOSIS_last1',        'drop',           []),
    ('_DIAGNOSIS_last2',        'drop',           []),
    ('_DIAGNOSIS_last3',        'drop',           []),
    ('_DIAGNOSIS_last4',        'drop',           []),


    ('_DIAGNOSIS_last1_off',        'passthrough',           []),
    ('_DIAGNOSIS_last2_off',        'passthrough',           []),
    ('_DIAGNOSIS_last3_off',        'passthrough',           []),
    ('_DIAGNOSIS_last4_off',        'passthrough',           []),
    
    # AdmissionDrug
    ('_admissionAspirin',       'passthrough',    []),
    ('_admissionLisinopril',    'passthrough',    []),
    ('_admissionLasix',         'passthrough',    []),
    
    # respCare
    ('_respCare_currenthistoryseqnum',   OneHotEncoder(),  []),
    
    ('a0', 'passthrough',  []),
    ('a1', 'passthrough',  []),
    ('a2', 'passthrough',  []),
    ('a3', 'passthrough',  []),
    ('a4', 'passthrough',  []),
    ('a5', 'passthrough',  []),
    ('a6', 'passthrough',  []),
    ('a7', 'passthrough',  []),
    ('a8', 'passthrough',  []),
    ('a9', 'passthrough',  []),
    ('b1', 'passthrough',  []),    
    ('b2', 'passthrough',  []),
    ('b3', 'passthrough',  []),
    ('b4', 'passthrough',  []),
    ('b5', 'passthrough',  []),
    ('b6', 'passthrough',  []),
    ('b7', 'passthrough',  []),
    ('b8', 'passthrough',  []),

    # AdmissionDx
    ('_admissionCardiovascular', 'passthrough',  []),
    ('_admissionRespiratory', 'passthrough',  []),
    ('_admissionNeurologic', 'passthrough',  []),
  
    # CarePlanProvider
# Random Forest	2273.361306	358.72646	0.140683	82.065168
    ('Categoria_Hospital', 'passthrough',  []), 
    ('Categoria_Cardiology', 'passthrough',  []),
    ('Categoria_Internal_Medicine', 'passthrough',  []),
    ('Intervencion_I', 'passthrough',  []),
    ('Intervencion_II', 'passthrough',  []),
    ('Intervencion_III', 'passthrough',  []), 
    ('Intervencion_IV', 'passthrough',  []),
    
    # Medicamentos
    ('9287', 'drop', []),
    ('9419', 'drop', []),
    ('8161', 'drop', []),
    ('2615', 'drop', []),
    ('7038', 'drop', []),
    ('1696', 'drop', []),
    ('7340', 'drop', []),
    ('4964', 'drop', []),
    ('3927', 'drop', []),
    ('1041', 'drop', []),
    ('3996', 'drop', []),
    ('1947', 'drop', []),
    ('3232', 'drop', []),
    ('3410', 'drop', []),
    ('1411', 'drop', []),
    ('7636', 'drop', []),
    ('6900', 'drop', []),
    ('9622', 'drop', []),
    ('5275', 'drop', []),
    ('9092', 'drop', []),
    ('7364', 'drop', []),
    ('3343', 'drop', []),
    ('6196', 'drop', []),
    ('4519', 'drop', []),
    ('4634', 'drop', []),
    ('1233', 'drop', []),
    ('9876', 'drop', []),
    ('8832', 'drop', []),
    ('9536', 'drop', []),
    ('9307', 'drop', []),
    ('4331', 'drop', []),
    ('2764', 'drop', []),
    ('2197', 'drop', []),
    ('3814', 'drop', []),
    ('7092', 'drop', []),
    ('7680', 'drop', []),
    ('1068', 'drop', []),
    ('5827', 'drop', []),
    ('9052', 'drop', []),
    ('3484', 'drop', []),
    ('2130', 'drop', []),
    ('2185', 'drop', []),
    ('6193', 'drop', []),
    ('3634', 'drop', []),
    ('5877', 'drop', []),
    ('7850', 'drop', []),
    ('2354', 'drop', []),
    ('1046', 'drop', []),
    ('4857', 'drop', []),
    ('3274', 'drop', []),
    ('9907', 'drop', []),
    ('9037', 'drop', []),
    ('9950', 'drop', []),
    ('1253', 'drop', []),
    
    # CarePlanGeneral
    ('Categoria_Ventilacion','passthrough',  []),
    ('Categoria_DVT_Prophylaxis', 'passthrough',  []),
    ('Categoria_Airway', 'passthrough',  []),
    ('Categoria_Care_Limitation', 'passthrough',  []),
    ('Categoria_Stress_Ulcer_Prophylaxis', 'passthrough',  []),
    
    # CarePlanInfectiousDisease
    ('CPID_tipo_infeccion', OneHotEncoder(),  []),
    ('CPID_seguridad_infeccion', OneHotEncoder(),  []),
    
    ('last_aperiodic_off', 'passthrough',  []),
    ('last_aperiodic_systolic', 'passthrough',  []),
    ('last_aperiodic_diastolic', 'passthrough',  []),
    ('last_aperiodic_mean', 'passthrough',  []),
    
    # vitalPeriodic
    ('VP_temp', 'passthrough',  []),
    ('VP_sao2', 'passthrough',  []),
    ('VP_respiration', 'passthrough',  []),
    ('VP_cvp', 'passthrough',  []),
    ('VP_heartrate', 'passthrough',  []),

    
    # Medication
    ('last_order_offset', 'passthrough', []),
    
    # Allergy
    ('nDrugsAllergic', OneHotEncoder(),  []),
    ('nNondrugsAllergic', OneHotEncoder(),  []),
    ('totalAllergic', OneHotEncoder(),  []),
    
    # Max Offset
    ('maxOffset', 'drop',  []),
    
]

for i in range(len(transformers)):
    if len(transformers[i][2]) > 0: transformers[i][2].pop()
    transformers[i][2].append(i)

X_T = ColumnTransformer(transformers=transformers).fit_transform(X)

# Mostrar el cambio en columnas
print(X.shape, '->', X_T.shape)

(2531, 136) -> (2531, 322)


<center><h2><b>Busqueda de Hiperparametros</b></h2></center>

In [217]:
param_grid={
    "n_estimators": [50, 60, 70, 80],
    "min_samples_leaf": [1, 2],
    "min_samples_split": [2, 3],
    "max_depth": [50, 55, 65, 70, 60],
    "criterion": ["absolute_error"],
    "random_state": [random.randint(0,100000000) for x in range (10)], 
}

#reg = RandomForestRegressor()  

#cv = RandomizedSearchCV(reg, param_grid, random_state=42, n_jobs=-1, scoring="r2", n_iter=25)
#cv.fit(X_T, y)

#display("Best score: ", cv.best_score_)
#display("Best parameters:", cv.best_params_)

<center><h2><b>Entrenamiento y calcular Error</b></h2></center>

In [227]:
# from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import BaggingRegressor



# avg df: 3484.0752 = 2.420139 dias.
def cv_avg_std(reg, X, y, scoring):
    maes = cross_val_score(reg, X, y, cv=5, scoring=scoring)
    avg = stat.mean(maes)
    std_dev = stat.variance(maes)**(1/2)
    
    return maes, avg, std_dev

def make_df(datos_reg):
    error_df = pd.DataFrame()

    error_df['Regresor']                = datos_reg.keys()
    error_df['Average MAE']             = [ abs(dato['avg']) for dato in datos_reg.values() ]
    error_df['Standard Deviation MAE']  = [ dato['std_dev'] for dato in datos_reg.values() ]
    error_df['Average R2']              = [ dato['avg_r2'] for dato in datos_reg.values() ]
    error_df['time']                    = [ dato['time'] for dato in datos_reg.values() ]
    
    return error_df

lr = LinearRegression()

elasticnet = ElasticNet(random_state=42, l1_ratio=0.1, alpha=0)


rfreg = RandomForestRegressor(
  criterion="absolute_error",
  max_features=48,
  max_depth=50,
  random_state=600738951)


xboostreg = XGBRegressor()


datos_reg = {}
regressors = [
    #('RF', RandomForestRegressor()
    # 0,28
    #('Lineal',LinearRegression()
    # 0.569196
    #(VotingRegressor([('lr', lr), ('en', elasticnet), ('rfreg', rfreg)], weights=[1, 1, 2])
    # 0.633962
    #('VotingRegressor', VotingRegressor([('xboost', xboostreg), ('rfreg', rfreg)], weights=[1, 2])
    # 0.630538
    # ('BaggingRegressor', BaggingRegressor(base_estimator=rfreg, n_estimators=10, random_state=600738951)
    
    # 1487.222797	131.556619	0.658033
    ('Random Forest', RandomForestRegressor(
        n_estimators = 121,
        min_samples_leaf = 2,
        min_samples_split = 3,
        max_depth = 91,
        max_features = 189,
        warm_start = False,
        bootstrap = False,
        random_state = 14684358)
      #criterion="absolute_error",
      #max_features=48,
      #max_depth=50,
      #random_state=600738951),

     # ~0,60 no me acuerdo
     # ('XGBRegressor', XGBRegressor()
    )
]

# Medir tiempo y hacer predicciones para cada regresor
for reg_name, reg in regressors:
    start_time = time.time()

    maes, avg, std_dev = cv_avg_std(reg, X_T, y, 'neg_mean_absolute_error')
    maes, r2, _ = cv_avg_std(reg, X_T, y, 'r2')
    
    datos_reg[reg_name] = { 'avg': avg, 'std_dev': std_dev, 'time': time.time() - start_time }
    datos_reg[reg_name]['avg_r2'] = r2

make_df(datos_reg)

Unnamed: 0,Regresor,Average MAE,Standard Deviation MAE,Average R2,time
0,Random Forest,311.064815,114.4654,0.943713,62.677147
