In [13]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json

In [14]:
# Run a SQL query
def sql_query(q, db_path='../db/sqlite/eicu_v2_0_1.sqlite3', params=None):
    """Run a SQL query against a SQLite database and return the rows as a DataFrame.

    Parameters
    ----------
    q : str
        SQL statement to execute.
    db_path : str, optional
        Path of the SQLite file to open. Defaults to the database file this
        notebook has always used, so existing ``sql_query(q)`` calls are unchanged.
    params : sequence or mapping, optional
        Values bound to the placeholders in ``q``; forwarded unchanged to
        ``pandas.read_sql_query``. Prefer this over formatting values into ``q``.

    Returns
    -------
    pandas.DataFrame
        The result set of the query.
    """
    conn = db.connect(db_path)
    try:
        return pd.read_sql_query(q, conn, params=params)
    finally:
        # Close the connection even when the query raises, so the handle is not leaked.
        conn.close()

# Read all the CSVs
def read_csvs(csv_dir='../db/csv/', datasets=None):
    """Load a set of CSV tables into a dict of DataFrames keyed by table name.

    Parameters
    ----------
    csv_dir : str, optional
        Directory containing one ``<name>.csv`` file per table. Defaults to the
        directory this notebook has always read from.
    datasets : iterable of str, optional
        Table names (file names without the ``.csv`` extension) to load.
        When omitted, the full list of tables below is loaded.

    Returns
    -------
    dict
        Maps each table name to the DataFrame read from ``<csv_dir>/<name>.csv``,
        in the order the names were given.
    """
    import os  # local import: only needed to build the file paths

    if datasets is None:
        datasets = [
            'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar', 'apachePatientResult',
            'apachePredVar', 'carePlanCareProvider', 'carePlanEOL', 'carePlanGeneral',
            'carePlanGoal', 'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital',
            'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab', 'note',
            'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient',
            'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment',
            'vitalAperiodic', 'vitalPeriodic',
        ]

    return {
        ds_name: pd.read_csv(os.path.join(csv_dir, ds_name + '.csv'))
        for ds_name in datasets
    }

# Load every CSV table once into a dict keyed by table name.
dfs = read_csvs()
has_dropped_keys = False # Guard flag so that the keys are not dropped again each time the next cell is executed

  dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')


In [None]:
# Raise pandas' display limits so that wide tables and long strings are not truncated.
display_limits = {
    'display.max_rows': 2000,
    'display.max_columns': 2000,
    'display.width': None,
    'display.max_colwidth': 400,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

# Preview the first 100 rows of the diagnosis table.
dfs['diagnosis'].head(100)

In [16]:
# Discard rows that are exact copies of an earlier row, store the result back
# in the table dict, then print a summary of the columns that remain.
diagnosis_deduplicated = dfs['diagnosis'].drop_duplicates()
dfs['diagnosis'] = diagnosis_deduplicated
diagnosis_deduplicated.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24978 entries, 0 to 24977
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   diagnosisid          24978 non-null  int64 
 1   patientunitstayid    24978 non-null  int64 
 2   activeupondischarge  24978 non-null  bool  
 3   diagnosisoffset      24978 non-null  int64 
 4   diagnosisstring      24978 non-null  object
 5   icd9code             21206 non-null  object
 6   diagnosispriority    24978 non-null  object
dtypes: bool(1), int64(3), object(3)
memory usage: 1.4+ MB


In [17]:
# Round each offset down to the nearest multiple of OFFSET_BIN_SIZE so that rows
# that differ only slightly in offset can compare equal when duplicates are dropped.
OFFSET_BIN_SIZE = 50

# Work on an explicit copy: the current frame is itself the result of
# drop_duplicates(), and assigning a column into such a derived frame can
# trigger pandas' SettingWithCopyWarning.
diagnosis_binned = dfs['diagnosis'].copy()

# Vectorised equivalent of int(offset) // 50 * 50 applied row by row.
diagnosis_binned['diagnosisoffset'] = (
    diagnosis_binned['diagnosisoffset'].astype('int64') // OFFSET_BIN_SIZE * OFFSET_BIN_SIZE
)

# NOTE(review): drop_duplicates() compares every column, including diagnosisid.
# If diagnosisid is unique per row this call removes nothing; confirm whether a
# `subset=` excluding it was intended.
dfs['diagnosis'] = diagnosis_binned.drop_duplicates()

In [18]:
def clean_diagnosisstring(group):
    """Summarise one stay's diagnoses as the latest entry for each priority level.

    Parameters
    ----------
    group : tuple
        A ``(patientunitstayid, DataFrame)`` pair, as produced by iterating over
        ``DataFrame.groupby('patientunitstayid')``. The frame must have the
        columns ``diagnosisstring`` and ``diagnosispriority``.

    Returns
    -------
    list
        ``[patientunitstayid, primary, major, other]``. Each of the last three
        is the final ``|``-separated component of the latest diagnosis string
        with that priority, or the string ``'None'`` when the stay has no
        diagnosis of that priority.

    Raises
    ------
    KeyError
        If a row carries a priority other than 'Primary', 'Major' or 'Other'.

    Notes
    -----
    "Latest" is decided by ``diagnosisoffset`` when that column is present,
    because the rows of a stay are not guaranteed to arrive in offset order.
    Rows with equal offsets keep their original relative order.
    """
    pusID, group_df = group

    if 'diagnosisoffset' in group_df.columns:
        # mergesort is stable, so ties keep the original row order.
        group_df = group_df.sort_values('diagnosisoffset', kind='mergesort')

    last_by_priority = {
        'Primary': 'None',
        'Major': 'None',
        'Other': 'None',
    }

    for diag_string, priority_lv in zip(group_df['diagnosisstring'], group_df['diagnosispriority']):
        # Rows with a missing diagnosis string or priority carry no usable information.
        if pd.isna(diag_string) or pd.isna(priority_lv):
            continue

        if priority_lv not in last_by_priority:
            raise KeyError(
                'unexpected diagnosispriority %r for patientunitstayid %r' % (priority_lv, pusID)
            )

        # Keep only the most specific (last) level of the diagnosis hierarchy;
        # later rows overwrite earlier ones, so the latest entry wins.
        last_by_priority[priority_lv] = diag_string.split('|')[-1]

    return [
        pusID,
        last_by_priority['Primary'],
        last_by_priority['Major'],
        last_by_priority['Other'],
    ]
    
# Build one summary row per stay. The rows are collected first and the frame is
# created once, instead of enlarging it with `.loc[i] = row` on every iteration,
# which re-allocates the frame each time and leaves every column as dtype object.
new_df = pd.DataFrame(
    [clean_diagnosisstring(group) for group in dfs['diagnosis'].groupby('patientunitstayid')],
    columns=['patientunitstayid', 'lastPrimary', 'lastMajor', 'lastOther'],
)

new_df

Unnamed: 0,patientunitstayid,lastPrimary,lastMajor,lastOther
0,143870,s/p cartoid endarterectomy,bradycardia,coronary artery disease
1,145427,diverticulitis of colon,diverticulitis of colon,due to bowel perforation
2,151179,septic shock,hypotension,infectious dermatitis
3,151867,s/p exploratory laparotomy,,viscus perforation
4,151900,septic shock,acute respiratory distress,pneumonia
...,...,...,...,...
2150,3351763,ventilatory failure,,enteric fistula
2151,3352230,acute myocardial infarction (with ST elevation),acute myocardial infarction (with ST elevation),IABP
2152,3352231,acute myocardial infarction (with ST elevation),,
2153,3352333,obstruction of colon,alcohol,


In [19]:
# Raise pandas' display limits so that long diagnosis strings are shown in full.
display_limits = {
    'display.max_rows': 2000,
    'display.max_columns': 2000,
    'display.width': None,
    'display.max_colwidth': 400,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

# Show (up to 400 of) the diagnosis rows recorded for one example stay.
dfs['diagnosis'].loc[lambda diagnosis: diagnosis['patientunitstayid'] == 3351763].head(400)

Unnamed: 0,diagnosisid,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
19242,46219997,3351763,False,50,surgery|respiratory failure|ventilatory failure,"518.81, J96.00",Primary
19243,46076409,3351763,False,1000,surgery|general surgery postop issues|enteric fistula,,Other
19244,46219983,3351763,True,2950,surgery|respiratory failure|ventilatory failure,"518.81, J96.00",Primary
19245,46076410,3351763,True,2400,surgery|general surgery postop issues|enteric fistula,,Other


In [20]:
# Display the diagnosis table in its current state.
dfs['diagnosis']

# Create _last_diagnosis from diagnosisstring
# NOTE(review): everything below is commented out and never executed; it looks
# like an earlier row-wise draft of clean_diagnosisstring. Consider deleting it.

#def clean_diagnosisstring(row):
#    if row['diagnosisstring'] != row['diagnosisstring']: # If NaN
#        return ''
#    else:
#        return row['diagnosisstring'].split('|')[-1] + ' / ' + row['diagnosispriority']
    
#dfs['diagnosis']['primary'] = dfs['diagnosis'].apply(clean_diagnosisstring, axis=1)

Unnamed: 0,diagnosisid,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
0,7607199,346380,False,5000,cardiovascular|ventricular disorders|hypertension,"401.9, I10",Other
1,7570429,346380,False,650,neurologic|altered mental status / pain|change in mental status,"780.09, R41.82",Major
2,7705483,346380,True,5000,cardiovascular|shock / hypotension|hypotension,"458.9, I95.9",Major
3,7848601,346380,True,5000,neurologic|altered mental status / pain|schizophrenia,"295.90, F20.9",Major
4,7451475,346380,False,5000,pulmonary|disorders of vasculature|pulmonary embolism|thrombus,"415.19, I26.99",Major
...,...,...,...,...,...,...,...
24973,43897237,3158501,True,0,neurologic|altered mental status / pain|encephalopathy|metabolic,"348.31, G93.41",Other
24974,44151864,3158501,True,0,hematology|white blood cell disorders|leukocytosis,"288.8, D72.829",Other
24975,44379558,3158501,True,0,pulmonary|disorders of acid base|respiratory alkalosis|etiology unknown,"276.2, E87.2",Other
24976,44000639,3158501,True,0,neurologic|altered mental status / pain|delirium,"293.0, F05",Other


In [21]:
# Drop the descriptive diagnosis columns from the table.
# Original author's note: it probably does not matter whether the diagnosis
# was resolved during the stay.
dfs['diagnosis'] = dfs['diagnosis'].drop(
    columns=[
        'activeupondischarge',
        'diagnosisstring',
        'icd9code',
        'diagnosispriority',
    ],
    # Columns that are already gone are skipped, so re-running this cell does
    # not raise KeyError.
    errors='ignore',
)