In [142]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json

import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error

<center><h2><b>Leer DB</b></h2></center>

In [143]:
#
# medication: Columns (11) have mixed types. Specify dtype option on import or set low_memory=False-
#
def read_csvs():
    datasets = [ 'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar', 'apachePatientResult', 'apachePredVar', 'carePlanCareProvider', 'carePlanEOL', 'carePlanGeneral',
                 'carePlanGoal', 'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital', 'infusiondrug', 'intakeOutput', 'lab',
                 'medication',
                 'microLab', 'note',
                 'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient', 'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment', 'vitalAperiodic',
                 'vitalPeriodic']

    dfs = {}

    for ds_name in datasets:
        #dfs[ds_name.lower()] = (pd.read_csv('../db/csv/' + ds_name + '.csv'), )
        dfs[ds_name.lower()] = pd.read_csv('../db/csv_clean/' + ds_name + '.csv')
    
    return dfs

def read_query(q):
    conn = db.connect('../db/sqlite/eicu_v2_0_1.sqlite3')
    df = pd.read_sql_query(q, conn)
    conn.close()
    
    return df

def make_relationships(dfs):
    relationships = []
    pk_fk = json.loads( open('keys.json').read() )
    i = 0

    for ds_name in pk_fk:
        #if pk_fk[ds_name]['pk'] != False:
        #    dfs[ds_name][0].set_index(pk_fk[ds_name]['pk'])
        
        if pk_fk[ds_name]['fk'] != False and ds_name not in ('hospital', 'medication'):
            #print(ds_name, pk[ds_name]['fk'])
            fk_atr, target_table, target_atr = pk_fk[ds_name]['fk']
            
            #print((target_table, target_atr, ds_name, fk_atr))
            relationships.append((target_table, target_atr, ds_name, fk_atr))
            
    relationships.append(('hospital', 'hospitalid', 'patient', 'hospitalid'))

    return relationships

#------------------------------------------------------------------------------------------------------

dfs = read_csvs()
relationships = make_relationships(dfs)

#feature_matrix, feature_defs = ft.dfs(
#    dataframes=dfs,
#    relationships=relationships,
#    target_dataframe_name='patient',
#)

#dfs['patient'][0]

<center><h2><b>Transformación de columnas</b></h2></center>

In [144]:
# Dividir en X e Y
X = dfs['patient'][:-100]
Y = dfs['patient'][-100:]

def unionColumnas(X, Y, columnName):
    return list(set(X[columnName]).union(set(Y[columnName])))

categGender = unionColumnas(X, Y, "gender")
categEthnicity = unionColumnas(X, Y, "ethnicity")
categHospitalAdmitSource = unionColumnas(X, Y, "hospitaladmitsource")

transformersX = [
    ('patientunitstayid', 'drop', [0]),
    ('gender', OneHotEncoder(categories=[categGender]), [1]),
    ('age', 'passthrough', [2]), # TODO: Probar categórica
    ('ethnicity', OneHotEncoder(categories=[categEthnicity]), [3]),
    ('hospitalid', 'passthrough', [4]), # TODO: ''
    ('apacheadmissiondx', 'drop', [5]), # JK
    ('admissionheight', 'passthrough', [6]),
    ('hospitaladmitoffset', 'passthrough', [7]),
    ('hospitaladmitsource', OneHotEncoder(categories=[categHospitalAdmitSource]), [8]),
    ('hospitaldischargeoffset', 'passthrough', [9]),
    ('unitvisitnumber', 'passthrough', [10]),    
    ('admissionweight', 'passthrough', [11]),
    ('unitdischargeoffset', 'drop', [12]), # obv
]

# Normbrar el orden de columnas de 0 a n-1
transformersY = deepcopy(transformersX)[:12] #Eliminar unitdischargeoffset

for i, t in enumerate(transformersY): # Renumerar los indices de las columnas
    transformersY[i][2].pop()
    transformersY[i][2].append(i)

# Transformar columnas
X_T = ColumnTransformer(transformers=transformersX).fit_transform(X)
Y_T = ColumnTransformer(transformers=transformersY).fit_transform(Y)

<center><h2><b>Entrenamiento</b></h2></center>

In [145]:
rfreg = RandomForestRegressor()
trained_rf = rfreg.fit(X_T, X['unitdischargeoffset'])

y_pred_rf = trained_rf.predict(Y_T)

<center><h2><b>Calcular Error</b></h2></center>

In [149]:
mae = mean_absolute_error(X['unitdischargeoffset'][:100], y_pred_rf)
avg = sum(X['unitdischargeoffset']) / len(X['unitdischargeoffset'])

print('MAE:', round(mae, 4))
print('AVG:', round(avg, 4))
print('DIFF:', round(abs(mae-avg), 4))

MAE: 4711.1175
AVG: 3841.591
DIFF: 869.5265
