In [127]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json

import matplotlib.pyplot as plt
import seaborn as sns
import re


In [128]:
def sql_query(q, db_path='../db/sqlite/eicu_v2_0_1.sqlite3', params=None):
    """Run a SQL query against a SQLite database and return the result.

    Args:
        q: SQL statement to execute.
        db_path: Path of the SQLite file to open. Defaults to the path this
            notebook originally had hard-coded.
        params: Optional bind parameters forwarded to ``pd.read_sql_query``;
            prefer these over building ``q`` with string formatting.

    Returns:
        pandas.DataFrame holding the rows produced by ``q``.
    """
    conn = db.connect(db_path)
    try:
        return pd.read_sql_query(q, conn, params=params)
    finally:
        # Close the connection even when the query raises.
        conn.close()

def read_csvs():
    """Read every listed CSV table into a dict of DataFrames.

    Returns:
        dict mapping each table name to the DataFrame read from
        ``../db/csv/<name>.csv``, in the order the names are listed below.
    """
    table_names = (
        'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar',
        'apachePatientResult', 'apachePredVar', 'carePlanCareProvider',
        'carePlanEOL', 'carePlanGeneral', 'carePlanGoal',
        'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital',
        'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab',
        'note', 'nurseAssessment', 'nurseCare', 'nurseCharting',
        'pastHistory', 'patient', 'physicalExam', 'respiratoryCare',
        'respiratoryCharting', 'treatment', 'vitalAperiodic', 'vitalPeriodic',
    )

    return {
        table: pd.read_csv('../db/csv/' + table + '.csv')
        for table in table_names
    }

# Load all tables once; later cells read and overwrite entries of this dict.
dfs = read_csvs()
has_dropped_keys = False # Guard so the keys are not dropped again every time the next cell is run

  dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')


In [129]:
def drop_keys(dfs):
    """Drop the ID column (the first column) of every table in ``dfs``.

    The dict is updated in place: each affected entry is replaced by a new
    DataFrame without its first column. The 'hospital' and 'patient' tables
    are skipped so that hospitalId / patientUnitStayId are not dropped.

    Args:
        dfs: dict mapping table name to DataFrame.
    """
    untouched_tables = ('hospital', 'patient')

    for table_name in list(dfs):
        if table_name in untouched_tables:
            continue

        table = dfs[table_name]
        first_column = table.columns.values[0]
        dfs[table_name] = table.drop(columns=[first_column])

# Drop the key columns only if that has not been done yet, so that re-running
# this cell does not strip a further column from every table.
if not has_dropped_keys:
    drop_keys(dfs)
    has_dropped_keys = True

In [130]:
# Show the medication table before any of its columns are cleaned.
dfs['medication']

Unnamed: 0,patientunitstayid,drugorderoffset,drugstartoffset,drugivadmixture,drugordercancelled,drugname,drughiclseqno,dosage,routeadmin,frequency,loadingdose,prn,drugstopoffset,gtc
0,141765,134,1396,No,No,WARFARIN SODIUM 5 MG PO TABS,2812.0,5 3,PO,,,No,2739,0
1,141765,1,-188,No,No,5 ML VIAL : DILTIAZEM HCL 25 MG/5ML IV SOLN,182.0,15 3,IV,Once PRN,,Yes,171,38
2,141765,115,856,No,No,ASPIRIN EC 81 MG PO TBEC,1820.0,81 3,PO,Daily,,No,2739,0
3,141765,114,316,No,No,DILTIAZEM HCL 30 MG PO TABS,182.0,30 3,PO,Q6H SCH,,No,2739,0
4,141765,115,856,No,No,LISINOPRIL 5 MG PO TABS,132.0,5 3,PO,Daily,,No,2428,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75599,3353113,3367,3789,No,Yes,,6249.0,1 EACH,TRANSDERM,DAILY,,No,0,97
75600,3353113,2,-55,No,No,,20971.0,100 ML,INTRAVEN,DRIP,,No,2721,83
75601,3353113,2,-2258,No,Yes,LORAZEPAM,4846.0,2-4 MG,INTRAVEN,,,Yes,0,83
75602,3353113,2,-86,No,Yes,,23379.0,20 MG,INTRAMUSC,Q4HRSPRN,,Yes,-65,80


In [131]:
def clean_respiratoryCare(row, df):
    """Return the largest ``currenthistoryseqnum`` recorded for the row's stay.

    Args:
        row: Mapping or Series exposing a ``patientunitstayid`` entry.
        df: DataFrame with ``patientunitstayid`` and
            ``currenthistoryseqnum`` columns.

    Returns:
        The maximum of ``currenthistoryseqnum`` over the rows of ``df`` whose
        ``patientunitstayid`` equals the one in ``row``.
    """
    same_stay = df['patientunitstayid'] == row['patientunitstayid']
    # Single .loc lookup instead of chained indexing.
    return df.loc[same_stay, 'currenthistoryseqnum'].max()


def clean_dosage(row):
    """Extract the first integer that appears in the row's ``dosage`` value.

    Args:
        row: Mapping or Series exposing a ``dosage`` entry.

    Returns:
        The first run of digits found in ``dosage`` as an int, or -1 when the
        value is missing (None / NaN) or contains no digits.
    """
    dosage = row['dosage']

    # Missing values arrive as None or float NaN; NaN is the only float that
    # is not equal to itself.
    if dosage is None or (isinstance(dosage, float) and dosage != dosage):
        return -1

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', str(dosage))
    return int(match.group(0)) if match else -1

In [132]:
# TODO: Parse dosage together with drugname

# Build the cleaned table in one chained expression and assign it once, so a
# failure in any step leaves dfs['medication'] untouched instead of half-cleaned.
dfs['medication'] = (
    dfs['medication']
    .drop(columns=[
        'drugorderoffset', # Irrelevant
        'drugivadmixture', # Does not seem relevant
        'drugordercancelled', # Irrelevant
        #'drugname', # Very incomplete and redundant with drughiclseqno
        'loadingdose', # Mostly NULL
        'prn', # ? Redundant with frequency
        'gtc', # ? Does not seem to be relevant
        'drugstartoffset', # Assumption: the real impact lies in the medication, not in its timing
        'drugstopoffset' # Assumption: the real impact lies in the medication, not in its timing
    ])
    # Drop rows whose dosage is missing
    .dropna(subset=['dosage'])
    # Replace the free-text dosage with the first integer it contains
    .assign(dosage=lambda medication: medication.apply(clean_dosage, axis=1))
)

In [133]:
# Inspect the first rows of the medication table after cleaning.
dfs ['medication'].head(50)

Unnamed: 0,patientunitstayid,drugname,drughiclseqno,dosage,routeadmin,frequency
0,141765,WARFARIN SODIUM 5 MG PO TABS,2812.0,5,PO,
1,141765,5 ML VIAL : DILTIAZEM HCL 25 MG/5ML IV SOLN,182.0,15,IV,Once PRN
2,141765,ASPIRIN EC 81 MG PO TBEC,1820.0,81,PO,Daily
3,141765,DILTIAZEM HCL 30 MG PO TABS,182.0,30,PO,Q6H SCH
4,141765,LISINOPRIL 5 MG PO TABS,132.0,5,PO,Daily
5,143870,ASPIRIN EC 81 MG PO TBEC,1820.0,81,PO,Daily
6,143870,1 ML - DIPHENHYDRAMINE HCL 50 MG/ML IJ SOLN,4480.0,25,IV,Q15 Min PRN
7,143870,METOPROLOL TARTRATE 25 MG PO TABS,2102.0,25,PO,Once PRN
8,143870,CLOPIDOGREL BISULFATE 75 MG PO TABS,17539.0,75,PO,Daily
9,143870,,2104.0,25,PO,Q Evening


In [134]:
# Print the per-column non-null counts and dtypes of the cleaned table.
dfs['medication'].info()
print()  # emits a trailing blank line after the summary

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66299 entries, 0 to 75603
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patientunitstayid  66299 non-null  int64  
 1   drugname           41607 non-null  object 
 2   drughiclseqno      62480 non-null  float64
 3   dosage             66299 non-null  int64  
 4   routeadmin         66281 non-null  object 
 5   frequency          56263 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 3.5+ MB

