In [180]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json
import re
import math

In [181]:
# Run a SQL query
def sql_query(q, db_path='../db/sqlite/eicu_v2_0_1.sqlite3'):
    """Run a SQL query against a SQLite database file and return the result.

    Parameters
    ----------
    q : str
        SQL statement passed to ``pd.read_sql_query``.
    db_path : str, optional
        Path of the SQLite file to open. Defaults to the path this notebook
        has always used, so existing ``sql_query(q)`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The rows produced by the query.

    Raises
    ------
    Whatever ``pd.read_sql_query`` raises for an invalid query; the connection
    is closed before the exception propagates.
    """
    conn = db.connect(db_path)
    try:
        return pd.read_sql_query(q, conn)
    finally:
        # Close even when the query fails, so the connection is never leaked.
        conn.close()

# Read every CSV
def read_csvs():
    """Load each table listed below from ``../db/csv/<name>.csv``.

    Returns
    -------
    dict
        Maps each table name to the DataFrame read from its CSV file, in the
        order the names are listed.
    """
    table_names = (
        'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar',
        'apachePatientResult', 'apachePredVar', 'carePlanCareProvider',
        'carePlanEOL', 'carePlanGeneral', 'carePlanGoal',
        'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital',
        'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab',
        'note', 'nurseAssessment', 'nurseCare', 'nurseCharting',
        'pastHistory', 'patient', 'physicalExam', 'respiratoryCare',
        'respiratoryCharting', 'treatment', 'vitalAperiodic', 'vitalPeriodic',
    )

    return {
        name: pd.read_csv('../db/csv/' + name + '.csv')
        for name in table_names
    }

# Load every CSV table into the dict `dfs`, keyed by table name.
dfs = read_csvs()
has_dropped_keys = False # So the keys are not dropped again every time the next cell is re-run

  dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')


In [182]:
# Drop the ID column (the first column) of every table
def drop_keys(dfs):
    """Remove the first column of each table in ``dfs``.

    The 'hospital' and 'patient' tables are left untouched. Every other entry
    of the dict is replaced by a copy of its DataFrame without the first
    column. The dict is modified in place and nothing is returned.
    """
    untouched = ('hospital', 'patient')  # do not drop hospitalId or patientUnitStayId

    for name, table in list(dfs.items()):
        if name in untouched:
            continue
        first_column = table.columns.values[0]
        dfs[name] = table.drop(columns=[first_column])

# Run drop_keys only once: each call removes the current first column of every
# table, so calling it again on re-run would strip a further column.
if not has_dropped_keys:
    drop_keys(dfs)
    has_dropped_keys = True

In [183]:
# Display the admissionDx table before the cleaning steps in the cells below.
dfs['admissionDx']

Unnamed: 0,patientunitstayid,admitdxenteredoffset,admitdxpath,admitdxname,admitdxtext
0,2900423,162,admission diagnosis|Non-operative Organ System...,Cardiovascular,Cardiovascular
1,2900423,162,admission diagnosis|Was the patient admitted f...,No,No
2,2900423,162,admission diagnosis|All Diagnosis|Non-operativ...,"Sepsis, pulmonary","Sepsis, pulmonary"
3,2902156,944,admission diagnosis|All Diagnosis|Non-operativ...,"Rhythm disturbance (atrial, supraventricular)","Rhythm disturbance (atrial, supraventricular)"
4,2902156,944,admission diagnosis|Non-operative Organ System...,Cardiovascular,Cardiovascular
...,...,...,...,...,...
7573,2898513,50,admission diagnosis|All Diagnosis|Non-operativ...,"Encephalopathy, hepatic","Encephalopathy, hepatic"
7574,2899929,11,admission diagnosis|Was the patient admitted f...,Yes,Yes
7575,2899929,11,admission diagnosis|Elective|Yes,Yes,Yes
7576,2899929,11,admission diagnosis|All Diagnosis|Operative|Di...,"TURP, transurethral prostate resection for ben...","TURP, transurethral prostate resection for ben..."


In [184]:
def clean_admitdxname(row):
    """Return the leading word of ``row['admitdxname']``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'admitdxname'`` (a DataFrame row or a dict).

    Returns
    -------
    The first run of word characters at the start of the name, e.g.
    ``'Sepsis, pulmonary'`` -> ``'Sepsis'``. If the value is not a string, or
    does not start with a word character, it is returned unchanged.
    """
    admitdxname = row['admitdxname']

    # Non-string values (for example a missing value) have nothing to extract.
    if not isinstance(admitdxname, str):
        return admitdxname

    # Raw string: '\w' inside a normal string literal is an invalid escape sequence.
    m = re.search(r'^(\w+)', admitdxname)

    # No leading word (empty text, leading punctuation): keep the value as is
    # instead of failing on `None.group`.
    if m is None:
        return admitdxname

    return m.group(0)

In [185]:
# Drop columns
admission_dx = dfs['admissionDx'].drop(
    columns=[
        'admitdxtext',  # In the vast majority of cases it equals admitdxname
    ],
    errors='ignore',  # re-running this cell must not fail once the column is gone
)

# Reduce every diagnosis name to its leading word
admission_dx['admitdxname'] = admission_dx.apply(clean_admitdxname, axis=1)

# Filter out the Yes / No answers, they carry little meaning as diagnoses.
# .copy() makes the stored table an independent frame rather than a slice.
dfs['admissionDx'] = admission_dx[~admission_dx['admitdxname'].isin(['Yes', 'No'])].copy()

In [186]:
# Count how often each cleaned diagnosis name occurs.
dfs['admissionDx']['admitdxname'].value_counts()

Cardiovascular      983
Respiratory         363
Neurologic          318
Sepsis              288
Gastrointestinal    264
                   ... 
Myasthenia            1
Heart                 1
Tamponade             1
Hematomas             1
Ventriculostomy       1
Name: admitdxname, Length: 160, dtype: int64

In [187]:
#Cardiovascular      983
#Respiratory         363
#Neurologic          318

def has_common_admission_C(row):
    """Return 1 when the row's ``admitdxname`` is "Cardiovascular", else 0.

    Surrounding whitespace in the name is ignored.
    """
    diagnosis = row['admitdxname'].strip()
    return int(diagnosis == "Cardiovascular")

def has_common_admission_R(row):
    """Return 1 when the row's ``admitdxname`` is 'Respiratory', else 0.

    Surrounding whitespace in the name is ignored.
    """
    diagnosis = row['admitdxname'].strip()
    return 1 if diagnosis == 'Respiratory' else 0

def has_common_admission_N(row):
    """Return 1 when the row's ``admitdxname`` is 'Neurologic', else 0.

    Surrounding whitespace in the name is ignored.
    """
    diagnosis = row['admitdxname'].strip()

    # Equality, not `in`: `diagnosis in 'Neurologic'` is a substring test and
    # would also accept '' or fragments such as 'Neuro' or 'logic'.
    if diagnosis == 'Neurologic':
        return 1
    else:
        return 0

In [188]:
admission_rows = dfs["admissionDx"]

# Create the new columns: keep the stay id and add one 0/1 indicator per
# common admission category, all sharing the original row index.
dfs["admissionDx"] = pd.DataFrame({
    "patientunitstayid": admission_rows["patientunitstayid"],
    'AdmissionCardiovascular': admission_rows.apply(has_common_admission_C, axis=1),
    'AdmissionRespiratory': admission_rows.apply(has_common_admission_R, axis=1),
    'AdmissionNeurologic': admission_rows.apply(has_common_admission_N, axis=1),
})

In [189]:
# Collapse to one row per patientunitstayid; max() keeps a 1 in an indicator
# column if any of that stay's rows had a 1.
dfs["admissionDx"] = dfs["admissionDx"].groupby(['patientunitstayid']).max()

# NOTE(review): the reset_index() result is only displayed, not assigned, so
# dfs["admissionDx"] keeps patientunitstayid as its index — confirm this is intended.
dfs["admissionDx"].reset_index()

Unnamed: 0,patientunitstayid,AdmissionCardiovascular,AdmissionRespiratory,AdmissionNeurologic
0,141765,1,0,0
1,143870,1,0,0
2,144815,0,0,1
3,145427,0,0,0
4,147307,1,0,0
...,...,...,...,...
2216,3351763,0,0,0
2217,3352230,1,0,0
2218,3352231,1,0,0
2219,3352333,0,0,0


In [190]:
# Show the first 20 rows of the per-stay indicator table.
display(dfs['admissionDx'].head(20))

Unnamed: 0_level_0,AdmissionCardiovascular,AdmissionRespiratory,AdmissionNeurologic
patientunitstayid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
141765,1,0,0
143870,1,0,0
144815,0,0,1
145427,0,0,0
147307,1,0,0
147784,0,0,1
148611,0,0,1
149713,1,0,0
151179,1,0,0
151867,0,0,0


In [191]:
# Print the index, column dtypes and non-null counts of the final table.
dfs['admissionDx'].info()
print()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2221 entries, 141765 to 3353113
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   AdmissionCardiovascular  2221 non-null   int64
 1   AdmissionRespiratory     2221 non-null   int64
 2   AdmissionNeurologic      2221 non-null   int64
dtypes: int64(3)
memory usage: 69.4 KB

