In [4]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json

In [5]:
def sql_query(q, db_path='../db/sqlite/eicu_v2_0_1.sqlite3'):
    """Run a SQL query against a SQLite database and return the rows as a DataFrame.

    Args:
        q: SQL statement passed to ``pd.read_sql_query``.
        db_path: Path of the SQLite file to open. Defaults to the database
            location that was previously hard-coded in this function.

    Returns:
        pandas.DataFrame with the query result.

    Raises:
        Whatever ``sqlite3.connect`` or ``pd.read_sql_query`` raise (for
        example on an invalid query); the connection is closed either way.
    """
    conn = db.connect(db_path)
    try:
        return pd.read_sql_query(q, conn)
    finally:
        # Close even when the query fails so the connection is not leaked.
        conn.close()

def read_csvs(csv_dir='../db/csv_clean/', datasets=None):
    """Read a set of CSV tables into a dictionary of DataFrames.

    Args:
        csv_dir: Directory containing one ``<name>.csv`` file per table.
            Defaults to the directory that was previously hard-coded.
        datasets: Iterable of table names (without the ``.csv`` extension).
            When ``None``, the full list of tables used by this notebook is read.

    Returns:
        dict mapping each table name to the DataFrame read from its CSV file.

    Raises:
        Whatever ``pd.read_csv`` raises, e.g. when a file is missing.
    """
    import os  # function-scope import keeps this block self-contained

    if datasets is None:
        datasets = [
            'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar', 'apachePatientResult',
            'apachePredVar', 'carePlanCareProvider', 'carePlanEOL', 'carePlanGeneral',
            'carePlanGoal', 'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital',
            'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab', 'note',
            'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient',
            'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment',
            'vitalAperiodic', 'vitalPeriodic',
        ]

    return {
        ds_name: pd.read_csv(os.path.join(csv_dir, ds_name + '.csv'))
        for ds_name in datasets
    }

# Load every table once; later cells read from and write back into this dict.
dfs = read_csvs()
# Guard flag so that keys are not dropped again each time the following cell is re-run.
# NOTE(review): no cell visible here reads this flag — confirm it is still used.
has_dropped_keys = False

In [6]:
# Keep only the first occurrence of each fully identical diagnosis row.
deduplicated_diagnosis = dfs['diagnosis'].drop_duplicates(keep='first')
dfs['diagnosis'] = deduplicated_diagnosis

# Summarise columns, dtypes and non-null counts of the de-duplicated table.
deduplicated_diagnosis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24748 entries, 0 to 24747
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   patientunitstayid    24748 non-null  int64 
 1   activeupondischarge  24748 non-null  bool  
 2   diagnosisoffset      24748 non-null  int64 
 3   diagnosisstring      24748 non-null  object
 4   icd9code             21012 non-null  object
 5   diagnosispriority    24748 non-null  object
dtypes: bool(1), int64(2), object(3)
memory usage: 1.2+ MB


In [7]:
# Floor each offset to the nearest lower multiple of 50 so that rows differing only
# by a small offset become identical and are removed by drop_duplicates().
# The arithmetic is vectorised (no row-wise apply) and .assign() builds a new frame,
# so nothing is written onto the result of an earlier drop_duplicates() call.
dfs['diagnosis'] = (
    dfs['diagnosis']
    .assign(diagnosisoffset=lambda frame: frame['diagnosisoffset'].astype('int64') // 50 * 50)
    .drop_duplicates()
)

In [28]:
# NOTE(review): dead assignment — `new_df` is re-created with different columns further
# down in this cell, after the helper definition, before anything reads this frame.
new_df = pd.DataFrame(columns=['patientunitstayid', 'diag1', 'diag2', 'diag3', 'additional1', 'additional2', 'additional3'])

def clean_diagnosisstring(group):
    """Summarise one groupby group as its last diagnosis per priority level.

    Args:
        group: ``(key, frame)`` pair as yielded by ``DataFrame.groupby``. The
            frame must have ``diagnosisstring`` and ``diagnosispriority``
            columns; ``diagnosisoffset`` is used for ordering when present.

    Returns:
        list ``[key, last_primary, last_major, last_other]``. Each diagnosis is
        the final ``|``-separated segment of ``diagnosisstring``; the literal
        string ``'None'`` is used when the group has no entry for that level.

    Raises:
        KeyError: if a non-missing priority is not 'Primary', 'Major' or 'Other'.
    """
    pusID, group_df = group

    # "Last" must mean latest by offset, not by file order: the raw rows are not
    # guaranteed to be sorted. mergesort is stable, so rows with equal offsets
    # keep their original relative order.
    if 'diagnosisoffset' in group_df.columns:
        group_df = group_df.sort_values('diagnosisoffset', kind='mergesort')

    last_by_priority = {'Primary': 'None', 'Major': 'None', 'Other': 'None'}

    for diag_path, priority_lv in zip(group_df['diagnosisstring'], group_df['diagnosispriority']):
        # Rows with a missing diagnosis or priority carry no usable information.
        if pd.isna(diag_path) or pd.isna(priority_lv):
            continue
        if priority_lv not in last_by_priority:
            raise KeyError(priority_lv)
        # Later rows overwrite earlier ones, leaving the last diagnosis per level.
        last_by_priority[priority_lv] = diag_path.split('|')[-1]

    return [
        pusID,
        last_by_priority['Primary'],
        last_by_priority['Major'],
        last_by_priority['Other'],
    ]
    
    
# Compute one summary row per patientunitstayid and build the frame in a single
# step; growing it with `new_df.loc[i] = row` copies the frame on every iteration.
new_df = pd.DataFrame(
    [clean_diagnosisstring(group) for group in dfs['diagnosis'].groupby('patientunitstayid')],
    columns=['patientunitstayid', 'lastPrimary', 'lastMajor', 'lastOther'],
)

new_df

Unnamed: 0,patientunitstayid,lastPrimary,lastMajor,lastOther
0,143870,s/p cartoid endarterectomy,bradycardia,coronary artery disease
1,145427,diverticulitis of colon,diverticulitis of colon,due to bowel perforation
2,151179,septic shock,hypotension,infectious dermatitis
3,151867,s/p exploratory laparotomy,,viscus perforation
4,151900,septic shock,acute respiratory distress,pneumonia
...,...,...,...,...
2150,3351763,ventilatory failure,,enteric fistula
2151,3352230,acute myocardial infarction (with ST elevation),acute myocardial infarction (with ST elevation),IABP
2152,3352231,acute myocardial infarction (with ST elevation),,
2153,3352333,obstruction of colon,alcohol,


In [30]:
# Widen the pandas display limits so long diagnosis strings are shown in full.
display_options = {
    'display.max_rows': 2000,
    'display.max_columns': 2000,
    'display.width': None,
    'display.max_colwidth': 400,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Show the diagnosis rows (up to 400) recorded for one specific stay.
stay_mask = dfs['diagnosis']['patientunitstayid'] == 3351763
dfs['diagnosis'].loc[stay_mask].head(400)

Unnamed: 0,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
19068,3351763,False,50,surgery|respiratory failure|ventilatory failure,"518.81, J96.00",Primary
19069,3351763,False,1000,surgery|general surgery postop issues|enteric fistula,,Other
19070,3351763,True,2950,surgery|respiratory failure|ventilatory failure,"518.81, J96.00",Primary
19071,3351763,True,2400,surgery|general surgery postop issues|enteric fistula,,Other


In [23]:
# Display the diagnosis table as it currently stands.
dfs['diagnosis']

# Create _last_diagnosis from diagnosisstring
# NOTE(review): the commented-out draft below is a row-wise variant that shares its
# name with the group-based clean_diagnosisstring defined in another cell; it is
# kept only for reference and is not executed.

#def clean_diagnosisstring(row):
#    if row['diagnosisstring'] != row['diagnosisstring']: # If NaN
#        return ''
#    else:
#        return row['diagnosisstring'].split('|')[-1] + ' / ' + row['diagnosispriority']
    
#dfs['diagnosis']['primary'] = dfs['diagnosis'].apply(clean_diagnosisstring, axis=1)

Unnamed: 0,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
0,346380,False,5000,cardiovascular|ventricular disorders|hypertension,"401.9, I10",Other
1,346380,False,650,neurologic|altered mental status / pain|change...,"780.09, R41.82",Major
2,346380,True,5000,cardiovascular|shock / hypotension|hypotension,"458.9, I95.9",Major
3,346380,True,5000,neurologic|altered mental status / pain|schizo...,"295.90, F20.9",Major
4,346380,False,5000,pulmonary|disorders of vasculature|pulmonary e...,"415.19, I26.99",Major
...,...,...,...,...,...,...
24743,3158501,True,0,neurologic|altered mental status / pain|enceph...,"348.31, G93.41",Other
24744,3158501,True,0,hematology|white blood cell disorders|leukocyt...,"288.8, D72.829",Other
24745,3158501,True,0,pulmonary|disorders of acid base|respiratory a...,"276.2, E87.2",Other
24746,3158501,True,0,neurologic|altered mental status / pain|delirium,"293.0, F05",Other


In [None]:
# Drop the descriptive diagnosis columns that are no longer needed.
# Original author's note: it probably does not matter whether the diagnosis was
# resolved during the stay.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the columns have already been removed.
dfs['diagnosis'] = dfs['diagnosis'].drop(
    columns=[
        'activeupondischarge',
        'diagnosisstring',
        'icd9code',
        'diagnosispriority',
    ],
    errors='ignore',
)