In [2]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json
import re

In [72]:
# Hacer una query SQL
def sql_query(q):
    conn = db.connect('../db/sqlite/eicu_v2_0_1.sqlite3')
    df = pd.read_sql_query(q, conn)
    conn.close()
    
    return df

# Leer todos los CSV
def read_csvs():
    #import os
    #print( os.getcwd())
    datasets = [ 'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar', 'apachePatientResult', 'apachePredVar', 'carePlanCareProvider', 'carePlanEOL', 'carePlanGeneral',
                 'carePlanGoal', 'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital', 'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab', 'note',
                 'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient', 'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment', 'vitalAperiodic',
                 'vitalPeriodic']

    dfs = {}

    for ds_name in datasets:
        dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')
    
    return dfs

dfs = read_csvs()
has_dropped_keys = False # Para que no se droppeen cada vez que se ejecuta la celda siguiente

  dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')


In [None]:
sql_query("""
    SELECT *
    FROM infusiondrug
    WHERE drugname LIKE '%Insulin%'
""").head(20) # 4 + 5 + 4 + 5 + 4

In [89]:
dfs['infusiondrug']['drugname'].value_counts().head(20)

Fentanyl (mcg/hr)              2893
Propofol (mcg/kg/min)          2528
Norepinephrine (mcg/min)       2344
Norepinephrine (ml/hr)         2053
Propofol (ml/hr)               1974
Fentanyl (ml/hr)               1912
Insulin (units/hr)             1383
Heparin (ml/hr)                1306
Midazolam (mg/hr)              1217
Dexmedetomidine (ml/hr)         870
Propofol ()                     820
Amiodarone (ml/hr)              799
Midazolam (ml/hr)               771
Heparin (units/hr)              768
Fentanyl ()                     677
Pantoprazole (ml/hr)            653
Dexmedetomidine (mcg/kg/hr)     652
Diltiazem (mg/hr)               609
Insulin (ml/hr)                 588
Amiodarone (mg/min)             556
Name: drugname, dtype: int64

In [20]:
# Reemplazar NaN numéricos por un valor por defecto
dfs['infusiondrug']['drugrate'] = dfs['infusiondrug']['drugrate'].fillna(-1)
dfs['infusiondrug']['infusionrate'] = dfs['infusiondrug']['infusionrate'].fillna(-1)
dfs['infusiondrug']['drugamount'] = dfs['infusiondrug']['drugamount'].fillna(-1)
dfs['infusiondrug']['volumeoffluid'] = dfs['infusiondrug']['volumeoffluid'].fillna(-1)
dfs['infusiondrug']['patientweight'] = dfs['infusiondrug']['patientweight'].fillna(-1)

In [21]:
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 400)

df = dfs['infusiondrug']
df[df['patientweight'] == df['patientweight']]

Unnamed: 0,infusiondrugid,patientunitstayid,infusionoffset,drugname,drugrate,infusionrate,drugamount,volumeoffluid,patientweight
0,40215081,1461035,768,Volume (mL) Magnesium (ml/hr),25,-1.0,-1.0,-1.0,-1.0
1,38752780,1461035,648,Volume (mL) Magnesium (ml/hr),25,-1.0,-1.0,-1.0,-1.0
2,36960718,1461035,-1812,Volume (mL) Magnesium (ml/hr),25,-1.0,-1.0,-1.0,-1.0
3,38679313,1461035,-611,Volume (mL) Magnesium (ml/hr),25.42,-1.0,-1.0,-1.0,-1.0
4,40681648,1461035,828,Volume (mL) Magnesium (ml/hr),25,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...
38251,42808150,1827129,15,IVF (ml/hr),75,75.0,-1.0,-1.0,-1.0
38252,44183396,1827129,2069,IVF (ml/hr),50,50.0,-1.0,-1.0,-1.0
38253,42738145,1827129,1534,IVF (ml/hr),50,50.0,-1.0,-1.0,-1.0
38254,42128446,1827129,5904,Epidural (ml/hr),14,14.0,-1.0,-1.0,-1.0


In [117]:
#
# Mantener las medidas más reciente para cada medicamento de los 10 que sean más frecuentes
#

# Medicamentos a tener en cuenta
acceptedDrugs = set(['fentanyl', 'propofol', 'norepinephrine', 'insulin', 'heparin', 'midazolam', 'dexmedetomidine', 'amiodarone', 'heparin', 'pantoprazole'])

def parse_drug_name(drug):
    drug = drug.lower()
    m = re.search('(.+)\(.+?$', drug)

    if type(m) == re.Match:
        return m.group(1).strip()
    else:
        return drug

patients = {}
columns = []
for drug in acceptedDrugs:
    columns += ['last_' + drug, 'last_' + drug + '_drugrate', 'last_' + drug + '_infusionrate',
                'last_' + drug + '_drugamount', 'last_' + drug + '_volumeoffluid', 'last_' + drug + '_patientweight']

for i, row in df.iterrows():
    pusID = row['patientunitstayid']
    drug  = parse_drug_name(row['drugname'])
    drugrate  = row['drugrate']
    infusionrate = row['infusionrate']
    drugamount = row['drugamount']
    volumeoffluid = row['volumeoffluid']
    patientweight = row['patientweight']
    infusionOffset = row['infusionoffset'] 
    
    if infusionOffset < 0 or drug not in acceptedDrugs:
        continue
    else:
        if pusID not in patients:
            patients[pusID] = {}
            
            for targetdrug in acceptedDrugs:
                patients[pusID][targetdrug] = { 'drugrate': -1, 'infusionrate': -1, 'drugamount': -1, 'volumeoffluid': -1, 'patientweight': -1, 'lastOffset': float('inf') }
        else:
            if infusionOffset < patients[pusID][drug]['lastOffset']:
                patients[pusID][drug] = { 'drugrate': drugrate, 'infusionrate': infusionrate, 'drugamount': drugamount, 'volumeoffluid': volumeoffluid, 'patientweight': patientweight, 'lastOffset': infusionOffset }

new_df = pd.DataFrame(columns=columns)
keys = list(patients.keys())

new_df['patientunitstayid'] = keys

for drug in acceptedDrugs:
    new_df['last_' + drug] = [ -1 if patients[key][drug]['lastOffset'] == float('inf') else patients[key][drug]['lastOffset'] for key in keys ]
    new_df['last_' + drug + '_drugrate'] = [ patients[key][drug]['drugrate'] for key in keys ]
    new_df['last_' + drug + '_infusionrate'] = [ patients[key][drug]['infusionrate'] for key in keys ]
    new_df['last_' + drug + '_drugamount'] = [ patients[key][drug]['drugamount'] for key in keys ]
    new_df['last_' + drug + '_volumeoffluid'] = [ patients[key][drug]['volumeoffluid'] for key in keys ]
    new_df['last_' + drug + '_patientweight'] = [ patients[key][drug]['patientweight'] for key in keys ]

new_df.head(10)

Unnamed: 0,last_propofol,last_propofol_drugrate,last_propofol_infusionrate,last_propofol_drugamount,last_propofol_volumeoffluid,last_propofol_patientweight,last_insulin,last_insulin_drugrate,last_insulin_infusionrate,last_insulin_drugamount,last_insulin_volumeoffluid,last_insulin_patientweight,last_amiodarone,last_amiodarone_drugrate,last_amiodarone_infusionrate,last_amiodarone_drugamount,last_amiodarone_volumeoffluid,last_amiodarone_patientweight,last_dexmedetomidine,last_dexmedetomidine_drugrate,last_dexmedetomidine_infusionrate,last_dexmedetomidine_drugamount,last_dexmedetomidine_volumeoffluid,last_dexmedetomidine_patientweight,last_pantoprazole,last_pantoprazole_drugrate,last_pantoprazole_infusionrate,last_pantoprazole_drugamount,last_pantoprazole_volumeoffluid,last_pantoprazole_patientweight,last_fentanyl,last_fentanyl_drugrate,last_fentanyl_infusionrate,last_fentanyl_drugamount,last_fentanyl_volumeoffluid,last_fentanyl_patientweight,last_norepinephrine,last_norepinephrine_drugrate,last_norepinephrine_infusionrate,last_norepinephrine_drugamount,last_norepinephrine_volumeoffluid,last_norepinephrine_patientweight,last_midazolam,last_midazolam_drugrate,last_midazolam_infusionrate,last_midazolam_drugamount,last_midazolam_volumeoffluid,last_midazolam_patientweight,last_heparin,last_heparin_drugrate,last_heparin_infusionrate,last_heparin_drugamount,last_heparin_volumeoffluid,last_heparin_patientweight,patientunitstayid
0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,3757,0.5,16.7,900.0,500.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,1805312
1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,1378,4.0,15.0,4.0,250.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,1598701
2,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,912,2.0,7.5,4.0,250.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,1607519
3,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,1821,0.2,4.2,0.4,100.0,84.1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,6,1.0,1.9,8.0,250.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,2721454
4,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,58,3.0,11.3,4.0,250.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,3110594
5,27,20,10.3,500.0,50.0,86.1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,3597,0.7,15.1,0.2,50.0,86.1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,177,50,5.0,1.0,100.0,-1.0,27,4.0,3.8,16.0,250.0,-1.0,177,5,5.0,100.0,100.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,426976
6,-1,-1,-1.0,-1.0,-1.0,-1.0,18,8,8.0,100.0,100.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,168,10.67,10.0,16.0,250.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,472359
7,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,559,1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,1373,8.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,965562
8,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,560,80,8.0,1.0,100.0,-1.0,560,6.99,13.1,8.0,250.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,1852625
9,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,43,1.83,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,-1,-1,-1.0,-1.0,-1.0,-1.0,2787683


In [119]:
len(columns)

54

In [11]:
dfs['infusiondrug'] = dfs['infusiondrug'].drop_duplicates()
dfs['infusiondrug'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38256 entries, 0 to 38255
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   infusiondrugid     38256 non-null  int64  
 1   patientunitstayid  38256 non-null  int64  
 2   infusionoffset     38256 non-null  int64  
 3   drugname           38256 non-null  object 
 4   drugrate           38155 non-null  object 
 5   infusionrate       16779 non-null  float64
 6   drugamount         13813 non-null  float64
 7   volumeoffluid      14108 non-null  float64
 8   patientweight      4036 non-null   float64
dtypes: float64(4), int64(3), object(2)
memory usage: 2.9+ MB


In [17]:
# Redondear offset al múltiplo de 50 más cercano hacia abajo para eliminar duplicados
dfs['diagnosis']['diagnosisoffset'] = dfs['diagnosis'].apply(lambda row: (int(row['diagnosisoffset'])//50) * 50, axis=1)
dfs['diagnosis'] = dfs['diagnosis'].drop_duplicates()

In [18]:
new_df = pd.DataFrame(columns=['patientunitstayid', 'diag1', 'diag2', 'diag3', 'additional1', 'additional2', 'additional3'])

def clean_diagnosisstring(group):
    pusID, group_df = group
    priority = {
        'Primary': [],
        'Major':  [],
        'Other': []
    }
    
    for i, row in group_df.iterrows():
        if row['diagnosisstring'] == row['diagnosisstring'] and row['diagnosispriority'] == row['diagnosispriority']: # not NaN
            diag = row['diagnosisstring'].split('|')[-1]
            priority_lv = row['diagnosispriority']
            
            priority[priority_lv].append(diag)
    
    primary_str = ( ['None'] + priority['Primary'] )[-1]
    major_str   = ( ['None'] + priority['Major']   )[-1]
    other_str   = ( ['None'] + priority['Other']   )[-1]

    return [ pusID, primary_str, major_str, other_str ]
    
new_df = pd.DataFrame(columns=['patientunitstayid', 'lastPrimary', 'lastMajor', 'lastOther'])
for i,  group in enumerate(dfs['diagnosis'].groupby('patientunitstayid')):
    row = clean_diagnosisstring(group)
    new_df.loc[i] = row

new_df

Unnamed: 0,patientunitstayid,lastPrimary,lastMajor,lastOther
0,143870,s/p cartoid endarterectomy,bradycardia,coronary artery disease
1,145427,diverticulitis of colon,diverticulitis of colon,due to bowel perforation
2,151179,septic shock,hypotension,infectious dermatitis
3,151867,s/p exploratory laparotomy,,viscus perforation
4,151900,septic shock,acute respiratory distress,pneumonia
...,...,...,...,...
2150,3351763,ventilatory failure,,enteric fistula
2151,3352230,acute myocardial infarction (with ST elevation),acute myocardial infarction (with ST elevation),IABP
2152,3352231,acute myocardial infarction (with ST elevation),,
2153,3352333,obstruction of colon,alcohol,


In [19]:
#for i, row in dfs['diagnosis'].iterrows():
#    print(row['icd9code'], ' -> ', row['diagnosisstring'])

pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 400)
dfs['diagnosis'][dfs['diagnosis']['patientunitstayid'] == 3351763].head(400)

Unnamed: 0,diagnosisid,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
19242,46219997,3351763,False,50,surgery|respiratory failure|ventilatory failure,"518.81, J96.00",Primary
19243,46076409,3351763,False,1000,surgery|general surgery postop issues|enteric fistula,,Other
19244,46219983,3351763,True,2950,surgery|respiratory failure|ventilatory failure,"518.81, J96.00",Primary
19245,46076410,3351763,True,2400,surgery|general surgery postop issues|enteric fistula,,Other


In [20]:
dfs['diagnosis']

# Crear _last_diagnosis a partir de diagnosisstring

#def clean_diagnosisstring(row):
#    if row['diagnosisstring'] != row['diagnosisstring']: # If NaN
#        return ''
#    else:
#        return row['diagnosisstring'].split('|')[-1] + ' / ' + row['diagnosispriority']
    
#dfs['diagnosis']['primary'] = dfs['diagnosis'].apply(clean_diagnosisstring, axis=1)

Unnamed: 0,diagnosisid,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
0,7607199,346380,False,5000,cardiovascular|ventricular disorders|hypertension,"401.9, I10",Other
1,7570429,346380,False,650,neurologic|altered mental status / pain|change in mental status,"780.09, R41.82",Major
2,7705483,346380,True,5000,cardiovascular|shock / hypotension|hypotension,"458.9, I95.9",Major
3,7848601,346380,True,5000,neurologic|altered mental status / pain|schizophrenia,"295.90, F20.9",Major
4,7451475,346380,False,5000,pulmonary|disorders of vasculature|pulmonary embolism|thrombus,"415.19, I26.99",Major
...,...,...,...,...,...,...,...
24973,43897237,3158501,True,0,neurologic|altered mental status / pain|encephalopathy|metabolic,"348.31, G93.41",Other
24974,44151864,3158501,True,0,hematology|white blood cell disorders|leukocytosis,"288.8, D72.829",Other
24975,44379558,3158501,True,0,pulmonary|disorders of acid base|respiratory alkalosis|etiology unknown,"276.2, E87.2",Other
24976,44000639,3158501,True,0,neurologic|altered mental status / pain|delirium,"293.0, F05",Other


In [21]:
dfs['diagnosis'] = dfs['diagnosis'].drop(columns=[
    'activeupondischarge',
    'diagnosisstring',
    'icd9code',
    'diagnosispriority'
]) # No creo que importe si el análisis se ha resuelto en la estancia