In [140]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json

In [141]:
# Run a SQL query
def sql_query(q, db_path='../db/sqlite/eicu_v2_0_1.sqlite3', params=None):
    """Run a SQL query against a SQLite database and return the result as a DataFrame.

    Parameters
    ----------
    q : str
        SQL statement to execute.
    db_path : str, optional
        Path of the SQLite file to open. Defaults to the database file this
        notebook has always used.
    params : sequence or dict, optional
        Values bound to the placeholders in ``q``; forwarded unchanged to
        ``pd.read_sql_query``. Prefer this over string formatting when the
        query contains variable values.

    Returns
    -------
    pandas.DataFrame
        The rows produced by the query.
    """
    conn = db.connect(db_path)
    try:
        return pd.read_sql_query(q, conn, params=params)
    finally:
        # Close the connection even when the query raises, so the handle is not leaked.
        conn.close()

# Read all the CSVs
def read_csvs(csv_dir='../db/csv/', datasets=None):
    """Load a set of CSV tables into a dict of DataFrames keyed by table name.

    Parameters
    ----------
    csv_dir : str, optional
        Directory that holds the ``<table>.csv`` files. Defaults to the
        directory this notebook has always read from.
    datasets : iterable of str, optional
        Table names (file names without the ``.csv`` extension) to load.
        When omitted, the notebook's full list of tables is loaded.

    Returns
    -------
    dict[str, pandas.DataFrame]
        One DataFrame per requested table, in the order the names were given.
    """
    import os  # local import: the notebook's import cell does not bring in os

    if datasets is None:
        datasets = [ 'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar', 'apachePatientResult', 'apachePredVar', 'carePlanCareProvider', 'carePlanEOL', 'carePlanGeneral',
                     'carePlanGoal', 'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital', 'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab', 'note',
                     'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient', 'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment', 'vitalAperiodic',
                     'vitalPeriodic']

    return {
        ds_name: pd.read_csv(os.path.join(csv_dir, ds_name + '.csv'))
        for ds_name in datasets
    }

# Load the CSV tables into a dict of DataFrames keyed by table name.
dfs = read_csvs()
has_dropped_keys = False # Guard flag so the keys are not dropped again every time the next cell is re-run

  dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')


In [142]:
# Drop the IDs from all tables (first column)
def drop_keys(dfs, keep=('hospital', 'patient')):
    """Remove the first column of every table in ``dfs``, modifying the dict in place.

    Parameters
    ----------
    dfs : dict[str, pandas.DataFrame]
        Tables keyed by name. Each affected entry is replaced by a new
        DataFrame without its first column.
    keep : collection of str, optional
        Names of tables that are left untouched. Defaults to ``'hospital'``
        and ``'patient'`` so that hospitalId / patientUnitStayId are not dropped.

    Returns
    -------
    None

    Notes
    -----
    The column is chosen by position, so calling this twice on the same dict
    removes a second column; callers are expected to guard against that.
    """
    for name, table in dfs.items():
        if name in keep:
            continue
        if len(table.columns) == 0:
            # Nothing to drop; indexing the first column would raise IndexError.
            continue
        dfs[name] = table.drop(columns=[table.columns[0]])

# Drop the leading ID columns only once: drop_keys removes the first column by position,
# so re-running this cell without the flag would strip another column from every table.
if not has_dropped_keys:
    drop_keys(dfs)
    has_dropped_keys = True

In [143]:
# Display the carePlanGeneral table.
dfs['carePlanGeneral']

Unnamed: 0,patientunitstayid,activeupondischarge,cplitemoffset,cplgroup,cplitemvalue
0,174826,True,49,Ventilation,Spontaneous - adequate
1,174826,True,49,Care Limitation,Full therapy
2,174826,True,49,Stress Ulcer Prophylaxis,Proton pump inhibitor
3,174826,True,49,Airway,Not intubated/normal airway
4,174826,True,49,DVT Prophylaxis,Compression devices
...,...,...,...,...,...
33143,3334564,True,1,Care Limitation,Full therapy
33144,3334564,True,2231,DVT Prophylaxis,Combined device and drug therapy
33145,3334564,False,1,Ventilation,Spontaneous - adequate
33146,3334564,False,1,Activity,Do not elevate HOB


In [144]:
def has_Ventilation(row):
    """Return 1 if the row's 'cplgroup' equals 'Ventilation', otherwise 0."""
    if row['cplgroup'] == 'Ventilation':
        return 1
    return 0

def has_DVT_Prophylaxis(row):
    """Return 1 if the row's 'cplgroup' equals 'DVT Prophylaxis', otherwise 0."""
    if row['cplgroup'] == 'DVT Prophylaxis':
        return 1
    return 0

def has_Airway(row):
    """Return 1 if the row's 'cplgroup' equals 'Airway', otherwise 0."""
    if row['cplgroup'] == 'Airway':
        return 1
    return 0

def has_Care_Limitation(row):
    """Return 1 if the row's 'cplgroup' equals 'Care Limitation', otherwise 0."""
    if row['cplgroup'] == 'Care Limitation':
        return 1
    return 0

def has_Stress_Ulcer_Prophylaxis(row):
    """Return 1 if the row's 'cplgroup' equals 'Stress Ulcer Prophylaxis', otherwise 0."""
    if row['cplgroup'] == 'Stress Ulcer Prophylaxis':
        return 1
    return 0

In [145]:
# Columns of carePlanGeneral that are discarded.
care_plan_columns_to_drop = [
    'activeupondischarge',
    'cplitemvalue',  # Widely varying values that stray from the patients' outcome
]

# There are 15 rows with a null value in the cplitemvalue column, so those rows are removed.
# Note that dropna() discards a row when any of its columns is null.
dfs['carePlanGeneral'] = dfs['carePlanGeneral'].dropna()

dfs['carePlanGeneral'] = dfs['carePlanGeneral'].drop(columns=care_plan_columns_to_drop)

In [146]:
# Show the first five entries of the cplgroup value counts.
dfs['carePlanGeneral']['cplgroup'].value_counts()[:5]

# Counts observed when this cell was run:
#Ventilation                                  3715
#DVT Prophylaxis                              3296
#Airway                                       3141
#Care Limitation                              2967
#Stress Ulcer Prophylaxis                     2946

Ventilation                 3715
DVT Prophylaxis             3296
Airway                      3141
Care Limitation             2967
Stress Ulcer Prophylaxis    2946
Name: cplgroup, dtype: int64

In [147]:
# Preview the first 25 rows of carePlanGeneral.
dfs['carePlanGeneral'].head(25)

Unnamed: 0,patientunitstayid,cplitemoffset,cplgroup
0,174826,49,Ventilation
1,174826,49,Care Limitation
2,174826,49,Stress Ulcer Prophylaxis
3,174826,49,Airway
4,174826,49,DVT Prophylaxis
5,223303,5,Stress Ulcer Prophylaxis
6,223303,5,Airway
7,223303,5,Care Limitation
8,223303,5,DVT Prophylaxis
9,223303,5,Ventilation


In [148]:
# Source table and the stay identifier the indicator columns are grouped by.
care_plan = dfs["carePlanGeneral"]
_dfs = care_plan['patientunitstayid']

# One 0/1 indicator per row for each cplgroup value handled by the has_* helpers.
category_Ventilation = care_plan.apply(has_Ventilation, axis=1)
category_DVT_Prophylaxis = care_plan.apply(has_DVT_Prophylaxis, axis=1)
category_Airway = care_plan.apply(has_Airway, axis=1)
category_Care_Limitation = care_plan.apply(has_Care_Limitation, axis=1)
category_Stress_Ulcer_Prophylaxis = care_plan.apply(has_Stress_Ulcer_Prophylaxis, axis=1)

# Wrap each indicator in a single-column frame that carries its final column name.
_category_Ventilation = pd.DataFrame(
    category_Ventilation, columns=['Categoria_Ventilacion'])
_category_DVT_Prophylaxis = pd.DataFrame(
    category_DVT_Prophylaxis, columns=['Categoria_DVT_Prophylaxis'])
_category_Airway = pd.DataFrame(
    category_Airway, columns=['Categoria_Airway'])
_category_Care_Limitation = pd.DataFrame(
    category_Care_Limitation, columns=['Categoria_Care_Limitation'])
_category_Stress_Ulcer_Prophylaxis = pd.DataFrame(
    category_Stress_Ulcer_Prophylaxis, columns=['Categoria_Stress_Ulcer_Prophylaxis'])

# Put the stay id and the indicators side by side, then add the indicators up per stay.
# Duplicate rows are not removed beforehand, so every occurrence contributes to the totals.
indicator_frames = [
    _dfs,
    _category_Ventilation,
    _category_DVT_Prophylaxis,
    _category_Airway,
    _category_Care_Limitation,
    _category_Stress_Ulcer_Prophylaxis,
]
categoria = (
    pd.concat(indicator_frames, axis=1)
    .groupby(['patientunitstayid'])
    .sum()
    .reset_index()
)

# Replace the row-level table with the per-stay counts.
dfs['carePlanGeneral'] = categoria

In [149]:
# Display carePlanGeneral, which now holds the category counts per patientunitstayid.
dfs['carePlanGeneral']

Unnamed: 0,patientunitstayid,Categoria_Ventilacion,Categoria_DVT_Prophylaxis,Categoria_Airway,Categoria_Care_Limitation,Categoria_Stress_Ulcer_Prophylaxis
0,141764,0,0,0,0,0
1,141765,1,1,1,2,2
2,143870,1,1,1,1,1
3,144815,1,1,1,1,1
4,145427,1,1,1,1,2
...,...,...,...,...,...,...
2450,3351763,2,1,2,1,2
2451,3352230,3,2,3,1,2
2452,3352231,1,1,1,1,1
2453,3352333,3,5,3,6,5


In [150]:
# Print the column summary of carePlanGeneral, followed by an empty line.
dfs['carePlanGeneral'].info()
print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2455 entries, 0 to 2454
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   patientunitstayid                   2455 non-null   int64
 1   Categoria_Ventilacion               2455 non-null   int64
 2   Categoria_DVT_Prophylaxis           2455 non-null   int64
 3   Categoria_Airway                    2455 non-null   int64
 4   Categoria_Care_Limitation           2455 non-null   int64
 5   Categoria_Stress_Ulcer_Prophylaxis  2455 non-null   int64
dtypes: int64(6)
memory usage: 115.2 KB

