In [2]:
##########################################################################################
#                            Stuff a tener en cuenta                                     #
##########################################################################################
# 1. Data profiling
# 2. Visualizations para outliers
# 3. Datos irrelevantes
# 4. Eliminar duplicados
# 5. Calcular varianza de columnas
# 6. Usar random forest para calcular variables importantes
# 7. Conversión de tipos (a tipos indicados en la documentación en principio)
# 8. Dividir datos complejos
# 9. Arreglar typos
# 10. Feature engineering
# 11. Estandarizar datos
# 12. Eliminar columnas con muchos valores nulos o poca varianza
# 13. Hacer algo con valores null, "" y NaN
# 14. Si hay atributos derivados, comprobar que son correctos
# 15. Hacer algo con datos inválidos de acuerdo a la documentación
# 16. Darle una vuelta a si los datos tienen sentido
# 17. Considerar si tiene sentido considerar datos externos (ej. riesgo de muerte dada una enfermedad)

In [3]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json

In [4]:
def read_query(q, db_path='../db/sqlite/eicu_v2_0_1.sqlite3'):
    """Run a SQL query against a SQLite file and return the rows as a DataFrame.

    Parameters
    ----------
    q : str
        SQL text, handed unchanged to ``pandas.read_sql_query``.
    db_path : str, optional
        Path of the SQLite database file to open. The default is the path
        that used to be hard-coded here, so ``read_query(q)`` is unchanged.

    Returns
    -------
    pandas.DataFrame
        The result set produced by ``q``.

    Raises
    ------
    Exception
        Anything raised by ``sqlite3.connect`` or ``pandas.read_sql_query``
        propagates to the caller; the connection is closed first.
    """
    conn = db.connect(db_path)
    try:
        return pd.read_sql_query(q, conn)
    finally:
        # Close even when the query fails; previously a failing query left
        # the connection (and its file handle) open.
        conn.close()

def read_csvs(csv_dir='../db/csv/', datasets=None, **read_csv_kwargs):
    """Load a set of ``<name>.csv`` files into a dict of DataFrames.

    Parameters
    ----------
    csv_dir : str, optional
        Directory containing the CSV files. Defaults to the directory that
        used to be hard-coded here. A trailing slash is optional.
    datasets : iterable of str, optional
        Table names to load; each is read from ``<csv_dir>/<name>.csv``.
        When omitted, the full built-in list of table names below is used.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv`` for every
        file (for example ``dtype=...`` or ``low_memory=False``).

    Returns
    -------
    dict
        Maps each table name to its DataFrame, in the order the names
        were given.

    Raises
    ------
    Exception
        Anything ``pandas.read_csv`` raises (e.g. for a missing file)
        propagates to the caller.
    """
    # Function-scope import: ``os`` is not among the notebook's imports.
    import os

    if datasets is None:
        datasets = (
            'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar',
            'apachePatientResult', 'apachePredVar', 'carePlanCareProvider',
            'carePlanEOL', 'carePlanGeneral', 'carePlanGoal',
            'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital',
            'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab',
            'note', 'nurseAssessment', 'nurseCare', 'nurseCharting',
            'pastHistory', 'patient', 'physicalExam', 'respiratoryCare',
            'respiratoryCharting', 'treatment', 'vitalAperiodic',
            'vitalPeriodic',
        )

    # NOTE(review): the saved output of the loading cell shows a warning that
    # points at the read_csv call. If it is a DtypeWarning (mixed-type
    # columns), pass ``dtype=...`` or ``low_memory=False`` through
    # ``read_csv_kwargs`` -- confirm against the full warning text.
    return {
        ds_name: pd.read_csv(os.path.join(csv_dir, ds_name + '.csv'), **read_csv_kwargs)
        for ds_name in datasets
    }

In [5]:
# Load every CSV table once; `dfs` maps each table name to its DataFrame.
dfs = read_csvs()

  dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')


In [6]:
############################################################################
# Columnas con muchos null (comparativamente; hay otras con 36K y 3K null) #
############################################################################

# allergy (2474)
# 9   drugname              1480 non-null   object 
# 12  drughiclseqno         1480 non-null   float64

# apachePatientResult (3676)
# 20  actualventdays                 1042 non-null   float64
# 21  predventdays                   1004 non-null   float64
# 22  unabridgedactualventdays       1042 non-null   float64

# apachePredVar (2205)
# 27  electivesurgery    342 non-null    float64

# carePlanCareProvider (5627)
# 3   providertype            0 non-null      float64

# carePlanGoal (3633)
# 4   cplgoalvalue         2128 non-null   object

# carePlanInfectiousDisease (112)
# 6   responsetotherapy        2 non-null      object
# 7   treatment                62 non-null     object

# customLab (30)
# 5   labotherresult     18 non-null     float64

# infusiondrug (38256)
# 5   infusionrate       16779 non-null  float64
# 6   drugamount         13813 non-null  float64
# 7   volumeoffluid      14108 non-null  float64
# 8   patientweight      4036 non-null   float64

# medication (75604)
# 11  loadingdose         8 non-null      object

# microLab (342)
# 5   antibiotic          139 non-null    object
# 6   sensitivitylevel    136 non-null    object

# respiratoryCare (5436)
# 4   airwaytype            794 non-null    object 
# 5   airwaysize            636 non-null    float64
# 6   airwayposition        965 non-null    object 
# 7   cuffpressure          103 non-null    float64
# 12  apneaparams           111 non-null    object 
# 13  lowexhmvlimit         2050 non-null   float64
# 14  hiexhmvlimit          542 non-null    float64
# 15  lowexhtvlimit         551 non-null    float64
# 16  hipeakpreslimit       452 non-null    float64
# 17  lowpeakpreslimit      177 non-null    float64
# 18  hirespratelimit       837 non-null    float64
# 19  lowrespratelimit      286 non-null    float64
# 20  sighpreslimit         0 non-null      float64
# 21  lowironoxlimit        0 non-null      float64
# 22  highironoxlimit       0 non-null      float64
# 23  meanairwaypreslimit   0 non-null      float64
# 24  peeplimit             200 non-null    float64
# 25  cpaplimit             253 non-null    float64
# 26  setapneainterval      255 non-null    float64
# 27  setapneatv            222 non-null    float64
# 28  setapneaippeephigh    23 non-null     float64
# 29  setapnearr            245 non-null    float64
# 30  setapneapeakflow      240 non-null    float64
# 31  setapneainsptime      112 non-null    float64
# 32  setapneaie            0 non-null      float64
# 33  setapneafio2          247 non-null    float64

# vitalAperiodic (274088)
# 6   paop                  169 non-null     float64
# 7   cardiacoutput         14235 non-null   float64
# 8   cardiacinput          11865 non-null   float64
# 9   svr                   17789 non-null   float64
# 10  svri                  3440 non-null    float64
# 11  pvr                   169 non-null     float64
# 12  pvri                  168 non-null     float64

# vitalPeriodic
# 3   temperature        112820 non-null   float64
# 6   respiration        1381351 non-null  float64
# 7   cvp                202284 non-null   float64
# 8   etco2              72780 non-null    float64
# 9   systemicsystolic   226475 non-null   float64
# 10  systemicdiastolic  226471 non-null   float64
# 11  systemicmean       228400 non-null   float64
# 12  pasystolic         32083 non-null    float64
# 13  padiastolic        32080 non-null    float64
# 14  pamean             32632 non-null    float64
# 15  st1                613267 non-null   float64
# 16  st2                656963 non-null   float64
# 17  st3                596372 non-null   float64
# 18  icp                13193 non-null    float64


#--------------------------------------------------------------------------------------------------------------------------------------------------------

# Profile every loaded table: print its name, then the DataFrame.info()
# summary (columns, non-null counts, dtypes), then a blank separator line.
for name, frame in dfs.items():
    print(name + '\n')
    frame.info()
    print()


# admissionDrug
#dfs['medication']['loadingdose'].value_counts()

admissiondrug

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7417 entries, 0 to 7416
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   admissiondrugid     7417 non-null   int64  
 1   patientunitstayid   7417 non-null   int64  
 2   drugoffset          7417 non-null   int64  
 3   drugenteredoffset   7417 non-null   int64  
 4   drugnotetype        7417 non-null   object 
 5   specialtytype       7417 non-null   object 
 6   usertype            7417 non-null   object 
 7   rxincluded          7417 non-null   bool   
 8   writtenineicu       7417 non-null   bool   
 9   drugname            7417 non-null   object 
 10  drugdosage          7417 non-null   float64
 11  drugunit            7417 non-null   object 
 12  drugadmitfrequency  7417 non-null   object 
 13  drughiclseqno       7417 non-null   int64  
dtypes: bool(2), float64(1), int64(5), object(6)
memory usage: 710.0+ KB

admissionDx

<class 

In [40]:
# Show the 'treatment' table (the notebook renders a truncated view of it).
dfs['treatment']

Unnamed: 0,treatmentid,patientunitstayid,treatmentoffset,treatmentstring,activeupondischarge
0,9579899,242895,838,cardiovascular|arrhythmias|anticoagulant admin...,False
1,8788989,242895,512,cardiovascular|consultations|Cardiology consul...,False
2,10293108,242895,838,cardiovascular|non-operative procedures|extern...,False
3,9017080,242895,70,pulmonary|vascular disorders|VTE prophylaxis|l...,False
4,9853526,242895,70,cardiovascular|consultations|Cardiology consul...,False
...,...,...,...,...,...
38285,82612510,3352230,2176,pulmonary|ventilation and oxygenation|mechanic...,False
38286,82477425,3352230,2176,neurologic|pain / agitation / altered mentatio...,False
38287,82402829,3352231,117,cardiovascular|non-operative procedures|intraa...,False
38288,82335549,3352333,317,gastrointestinal|consultations|Gastroenterolog...,False


In [41]:
table_name = 'treatment'

# For each column of the selected table, print the column name and display
# how often each distinct value occurs.
for atr in dfs[table_name].columns:
    print(atr)
    print()
    # Index with the label exactly as pandas reports it. The earlier lookup
    # used atr.lower(), which raises KeyError for any label containing
    # upper-case characters and AttributeError for non-string labels.
    # TODO: (carried over from disabled code) also report columns whose
    # variance * 100 is below 100.
    display(dfs[table_name][atr].value_counts())
    print()

treatmentid



9579899     1
64618005    1
67227072    1
61875865    1
65895216    1
           ..
27696729    1
27086453    1
25255149    1
24088341    1
82251705    1
Name: treatmentid, Length: 38290, dtype: int64


patientunitstayid



1363755    1668
408514      537
1325742     482
3116577     438
3239262     437
           ... 
2934903       1
2921129       1
2918903       1
2917555       1
3353113       1
Name: patientunitstayid, Length: 1910, dtype: int64


treatmentoffset



38      175
63      144
32      137
24      135
41      132
       ... 
365       1
3337      1
307       1
3035      1
1465      1
Name: treatmentoffset, Length: 2399, dtype: int64


treatmentstring



pulmonary|ventilation and oxygenation|mechanical ventilation                                   1088
pulmonary|radiologic procedures / bronchoscopy|chest x-ray                                      750
cardiovascular|intravenous fluid|normal saline administration                                   541
gastrointestinal|medications|antiemetic|serotonin antagonist|ondansetron                        458
gastrointestinal|medications|stress ulcer prophylaxis|pantoprazole                              446
                                                                                               ... 
neurologic|neuromyopathy therapy|reversal of neuromuscular blockade|pyridostigmine                1
cardiovascular|vascular disorders|anticoagulant administration|low molecular weight heparin       1
renal|medications|immunosuppressive therapy|tacrolimus                                            1
cardiovascular|hypertension|analgesics|bolus parenteral analgesics                                1



activeupondischarge



False    27401
True     10889
Name: activeupondischarge, dtype: int64


