In [757]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json
import random

In [758]:
# Run a SQL query against the SQLite database
def sql_query(q, db_path='../db/sqlite/eicu_v2_0_1.sqlite3'):
    """Execute a SQL query and return its result set as a DataFrame.

    Parameters
    ----------
    q : str
        SQL text handed to ``pandas.read_sql_query``.
    db_path : str, optional
        Path of the SQLite database file to open. Defaults to the path
        that was previously hard-coded, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.

    Raises
    ------
    Whatever ``sqlite3.connect`` or ``pandas.read_sql_query`` raise; the
    exception is propagated unchanged.
    """
    conn = db.connect(db_path)
    try:
        return pd.read_sql_query(q, conn)
    finally:
        # Release the connection even when the query fails
        conn.close()

# Read every dataset CSV
def read_csvs():
    """Load each dataset from ``../db/csv/<name>.csv``.

    Returns
    -------
    dict
        Maps every dataset name to the DataFrame read from its CSV file,
        in the order the names are listed below.
    """
    table_names = (
        'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar',
        'apachePatientResult', 'apachePredVar', 'carePlanCareProvider',
        'carePlanEOL', 'carePlanGeneral', 'carePlanGoal',
        'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital',
        'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab',
        'note', 'nurseAssessment', 'nurseCare', 'nurseCharting',
        'pastHistory', 'patient', 'physicalExam', 'respiratoryCare',
        'respiratoryCharting', 'treatment', 'vitalAperiodic',
        'vitalPeriodic',
    )

    return {
        table: pd.read_csv('../db/csv/' + table + '.csv')
        for table in table_names
    }

# Load all tables once; `dfs` maps a table name to its DataFrame
dfs = read_csvs()
has_dropped_keys = False # Guard flag so the keys are not dropped again every time the next cell is re-run

  dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')


In [759]:
# Drop the ID column (the first column) of every table
def drop_keys(dfs):
    """Remove the first column from each table in ``dfs``, in place.

    The 'hospital' and 'patient' tables are left untouched so that their
    hospitalId / patientUnitStayId columns are kept. Entries of ``dfs`` are
    rebound to the trimmed frames; nothing is returned.
    """
    untouched = {'hospital', 'patient'}

    for table_name in list(dfs):
        if table_name in untouched:
            continue
        frame = dfs[table_name]
        leading_column = frame.columns.values[0]
        dfs[table_name] = frame.drop(columns=[leading_column])

# Drop the ID columns only once: the flag turns a re-run of this cell into a no-op,
# otherwise every run would strip one more leading column from each table
if not has_dropped_keys:
    drop_keys(dfs)
    has_dropped_keys = True

In [760]:
# Preview the carePlanCareProvider table
dfs['carePlanCareProvider']

Unnamed: 0,patientunitstayid,careprovidersaveoffset,providertype,specialty,interventioncategory,managingphysician,activeupondischarge
0,149713,11,,family practice,I,Managing,True
1,157016,2,,obstetrics/gynecology,I,Managing,True
2,165840,26,,internal medicine,I,Managing,True
3,174826,49,,critical care medicine (CCM),,Managing,True
4,174956,3,,cardiology,Unknown,Managing,True
...,...,...,...,...,...,...,...
5622,3333069,135,,surgery-trauma,II,Consulting,False
5623,3333069,3396,,critical care medicine (CCM),II,Consulting,True
5624,3334564,550,,surgery-trauma,II,Managing,True
5625,3334564,-7,,surgery-trauma,II,Managing,False


In [761]:
# Frequency of the managingphysician values (at most the four most common)
print(dfs['carePlanCareProvider']['managingphysician'].value_counts()[:4])
print()
# Column dtypes and non-null counts of the table
dfs['carePlanCareProvider'].info()
print()

Managing      3046
Consulting    2581
Name: managingphysician, dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5627 entries, 0 to 5626
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   patientunitstayid       5627 non-null   int64  
 1   careprovidersaveoffset  5627 non-null   int64  
 2   providertype            0 non-null      float64
 3   specialty               5089 non-null   object 
 4   interventioncategory    4224 non-null   object 
 5   managingphysician       5627 non-null   object 
 6   activeupondischarge     5627 non-null   bool   
dtypes: bool(1), float64(1), int64(2), object(3)
memory usage: 269.4+ KB



In [762]:
def clean_interventioncategory(categ):
    """Return ``categ`` unchanged, or a random category when it is missing.

    A value counts as missing when it is NaN (the only value that is not
    equal to itself) or the literal string 'Unknown'. Missing values are
    replaced by one of 'I', 'II' or 'III', picked with ``random.randint``.
    """
    is_nan = not (categ == categ)
    if is_nan or categ == 'Unknown':
        replacements = ('I', 'II', 'III')
        return replacements[random.randint(0, 2)]
    return categ

In [763]:
# Clean carePlanCareProvider in one chain:
#   1. drop the rows whose specialty is null,
#   2. drop the columns that are not used,
#   3. fill in the missing / 'Unknown' intervention categories.
# The chain builds a new frame instead of assigning a column into a filtered
# slice, and errors='ignore' lets the cell be re-run after the columns are
# already gone (a second run used to raise KeyError).
df = dfs['carePlanCareProvider']

dfs['carePlanCareProvider'] = (
    df.dropna(subset=['specialty'])
      .drop(
          columns=[
              'providertype',         # all nulls
              'managingphysician',    # not considered important for the algorithm to learn from
              'activeupondischarge',  # not considered important for the algorithm to learn from
          ],
          errors='ignore',
      )
      .assign(
          interventioncategory=lambda frame: frame['interventioncategory'].apply(clean_interventioncategory)
      )
)

In [764]:
# Preview the table after the cleaning step
dfs['carePlanCareProvider']

Unnamed: 0,patientunitstayid,careprovidersaveoffset,specialty,interventioncategory
0,149713,11,family practice,I
1,157016,2,obstetrics/gynecology,I
2,165840,26,internal medicine,I
3,174826,49,critical care medicine (CCM),II
4,174956,3,cardiology,III
...,...,...,...,...
5621,3333069,3396,cardiology,II
5622,3333069,135,surgery-trauma,II
5623,3333069,3396,critical care medicine (CCM),II
5624,3334564,550,surgery-trauma,II


In [765]:
# Three most frequent specialties (the value is not displayed because it is not the cell's last expression)
# -> internal medicine:1005 /  cardiology:561 / hospitalist:554
dfs['carePlanCareProvider']['specialty'].value_counts().head(3)

# Filter the table down to the most common specialties
df = dfs['carePlanCareProvider']
dfs['carePlanCareProvider'] = df.loc[
    df['specialty'].isin(['internal medicine', 'cardiology', 'hospitalist'])
]

dfs['carePlanCareProvider']

Unnamed: 0,patientunitstayid,careprovidersaveoffset,specialty,interventioncategory
2,165840,26,internal medicine,I
4,174956,3,cardiology,III
6,202294,3,internal medicine,I
7,210642,3,hospitalist,I
8,223303,5,internal medicine,III
...,...,...,...,...
5613,3231363,4768,hospitalist,II
5614,3231363,14,hospitalist,II
5615,3231363,4768,hospitalist,III
5616,3241571,124,hospitalist,III


In [766]:
# Add up the offsets that share the same patient id and the same specialty
def clean_careprovidersaveoffset(row, df):
    """Total ``careprovidersaveoffset`` of the rows in ``df`` matching ``row``.

    A row of ``df`` matches when both its 'patientunitstayid' and its
    'specialty' equal the values found in ``row``. Returns the sum of the
    'careprovidersaveoffset' column over those rows.
    """
    same_patient = df['patientunitstayid'] == row['patientunitstayid']
    same_specialty = df['specialty'] == row['specialty']
    matching_offsets = df.loc[same_patient & same_specialty, 'careprovidersaveoffset']
    return matching_offsets.sum()

In [767]:
# Replace every offset by the total of its (patient, specialty) group.
# A grouped transform computes all totals in one pass instead of re-scanning
# the whole frame for every row, and `assign` builds a new frame rather than
# writing a column into a filtered slice.
# Input: 2120 rows × 4 columns
care_providers = dfs['carePlanCareProvider']
total_offset = (
    care_providers
    .groupby(['patientunitstayid', 'specialty'])['careprovidersaveoffset']
    .transform('sum')
)

# Remove the rows that are now duplicated -> 1416 rows × 4 columns
# NOTE: re-running this cell sums the already-aggregated rows again.
dfs['carePlanCareProvider'] = (
    care_providers
    .assign(careprovidersaveoffset=total_offset)
    .drop_duplicates()
)

In [768]:
# Preview the table after summing the offsets and removing duplicate rows
dfs['carePlanCareProvider']

Unnamed: 0,patientunitstayid,careprovidersaveoffset,specialty,interventioncategory
2,165840,26,internal medicine,I
4,174956,3,cardiology,III
6,202294,3,internal medicine,I
7,210642,3,hospitalist,I
8,223303,5,internal medicine,III
...,...,...,...,...
5609,3155534,-6,internal medicine,III
5613,3231363,9550,hospitalist,II
5615,3231363,9550,hospitalist,III
5616,3241571,124,hospitalist,III


In [769]:
# set(dfs["carePlanCareProvider"]["interventioncategory"]) -> I / II / III / IV
# specialty -> internal medicine:1005 /  cardiology:561 / hospitalist:554

def has_category_I(row):
    """Return 1 when the row's intervention category is 'I', otherwise 0."""
    return int(row["interventioncategory"] == 'I')
def has_category_II(row):
    """Return 1 when the row's intervention category is 'II', otherwise 0."""
    return int(row["interventioncategory"] == 'II')
def has_category_III(row):
    """Return 1 when the row's intervention category is 'III', otherwise 0."""
    return int(row["interventioncategory"] == 'III')
def has_category_IV(row):
    """Return 1 when the row's intervention category is 'IV', otherwise 0."""
    return int(row["interventioncategory"] == 'IV')

def has_hospitalist(row):
    """Return the row's offset when its specialty is 'hospitalist', otherwise 0."""
    if row['specialty'] != 'hospitalist':
        return 0
    return row['careprovidersaveoffset']

def has_cardiology(row):
    """Return the row's offset when its specialty is 'cardiology', otherwise 0."""
    if row['specialty'] != 'cardiology':
        return 0
    return row['careprovidersaveoffset']

def has_internal_medicine(row):
    """Return the row's offset when its specialty is 'internal medicine', otherwise 0."""
    if row['specialty'] != 'internal medicine':
        return 0
    return row['careprovidersaveoffset']

In [770]:
# Build one row of features per patient from the cleaned carePlanCareProvider table.
# Vectorised comparisons replace the row-by-row `apply(axis=1)` calls; the
# resulting columns, dtypes and index are the same.
# NOTE: this cell replaces dfs['carePlanCareProvider'] with the aggregated
# table, so running it a second time fails on the missing input columns.
care = dfs["carePlanCareProvider"]
_dfs = care["patientunitstayid"]

# Intervention flags: 1 when the row has that category, else 0.
# Grouping by patient and taking the max yields "patient has at least one".
Intervention = care[["patientunitstayid"]].assign(**{
    'Intervencion_' + category: (care["interventioncategory"] == category).astype('int64')
    for category in ('I', 'II', 'III', 'IV')
})
Intervention = Intervention.groupby(['patientunitstayid']).max()

# Specialty columns: the row's offset when it belongs to that specialty, else 0.
specialty_columns = {
    'Categoria_Hospital': 'hospitalist',
    'Categoria_Cardiology': 'cardiology',
    'Categoria_Internal_Medicine': 'internal medicine',
}
medicine = care[["patientunitstayid"]].assign(**{
    column: care['careprovidersaveoffset'].where(care['specialty'] == specialty, 0)
    for column, specialty in specialty_columns.items()
})
# Same idea as the interventions, but summing per patient. Identical rows are
# removed first so a repeated (patient, specialty) total is counted only once.
medicine = medicine.drop_duplicates()
medicine = medicine.groupby(['patientunitstayid']).sum()

# Both frames are indexed by patientunitstayid, so they line up column-wise
dfs['carePlanCareProvider'] = pd.concat([medicine, Intervention], axis=1)

In [771]:
# Show the per-patient table with patientunitstayid as a regular column.
# The reset_index() result is only displayed; it is not stored back into dfs.
dfs['carePlanCareProvider'].reset_index()

Unnamed: 0,patientunitstayid,Categoria_Hospital,Categoria_Cardiology,Categoria_Internal_Medicine,Intervencion_I,Intervencion_II,Intervencion_III,Intervencion_IV
0,141765,14,0,0,0,0,1,0
1,144815,0,0,28,1,0,0,0
2,145427,0,0,153,1,0,1,0
3,148611,0,0,-82,1,0,0,0
4,151179,0,0,2005,1,0,1,0
...,...,...,...,...,...,...,...,...
1120,3351295,0,768,0,0,1,0,0
1121,3351297,0,1854,0,0,1,0,0
1122,3352230,0,1996,0,0,1,0,0
1123,3352231,0,534,0,0,1,0,0


In [772]:
# Column dtypes and non-null counts of the final per-patient table
dfs['carePlanCareProvider'].info()
print()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1125 entries, 141765 to 3353113
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Categoria_Hospital           1125 non-null   int64
 1   Categoria_Cardiology         1125 non-null   int64
 2   Categoria_Internal_Medicine  1125 non-null   int64
 3   Intervencion_I               1125 non-null   int64
 4   Intervencion_II              1125 non-null   int64
 5   Intervencion_III             1125 non-null   int64
 6   Intervencion_IV              1125 non-null   int64
dtypes: int64(7)
memory usage: 70.3 KB

