In [576]:
import sqlite3 as db
import pandas as pd
import featuretools as ft
import json

import matplotlib.pyplot as plt
import seaborn as sns


In [577]:
# Run a SQL query against the SQLite database
def sql_query(q, db_path='../db/sqlite/eicu_v2_0_1.sqlite3'):
    """Execute a SQL query and return the result as a DataFrame.

    Parameters
    ----------
    q : str
        SQL statement handed to ``pd.read_sql_query``.
    db_path : str, optional
        Path of the SQLite database file to open. Defaults to the
        database file this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The rows produced by the query.

    Raises
    ------
    Whatever ``sqlite3.connect`` or ``pd.read_sql_query`` raise; the
    connection is closed before the exception propagates.
    """
    conn = db.connect(db_path)
    try:
        # try/finally so the connection is released even when the query fails
        return pd.read_sql_query(q, conn)
    finally:
        conn.close()

# Read all the CSV tables
def read_csvs(csv_dir='../db/csv/', datasets=None):
    """Load CSV tables into a dict of DataFrames keyed by table name.

    Parameters
    ----------
    csv_dir : str, optional
        Directory holding one ``<table>.csv`` file per table. Defaults to
        the directory this notebook has always read from.
    datasets : iterable of str, optional
        Table names (file names without the ``.csv`` suffix) to load.
        When omitted, the notebook's default table list below is used.

    Returns
    -------
    dict
        ``{table_name: DataFrame}`` in the order the names were given.
    """
    import os  # local import: only needed to join the directory and file name

    if datasets is None:
        datasets = [
            'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar', 'apachePatientResult',
            'apachePredVar', 'carePlanCareProvider', 'carePlanEOL', 'carePlanGeneral',
            'carePlanGoal', 'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital',
            'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab', 'note',
            'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient',
            'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment',
            'vitalAperiodic', 'vitalPeriodic',
        ]

    return {
        ds_name: pd.read_csv(os.path.join(csv_dir, ds_name + '.csv'))
        for ds_name in datasets
    }

# Load every table once; `dfs` maps table name -> DataFrame and is used by the cells below.
dfs = read_csvs()
has_dropped_keys = False # Guard so the keys are not dropped again every time the next cell is re-run

  dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')


In [578]:
# Drop the ID (first column) of every table
def drop_keys(dfs):
    """Strip the first column from each table stored in ``dfs``.

    The 'hospital' and 'patient' entries are left untouched so their
    leading identifier column is kept. The dict is updated in place
    (each affected entry is replaced by a new DataFrame); nothing is
    returned.
    """
    keep_first_column = ('hospital', 'patient')  # do not drop hospitalId / patientUnitStayId

    for table_name, table in dfs.items():
        if table_name in keep_first_column:
            continue
        first_column = table.columns.values[0]
        dfs[table_name] = table.drop(columns=[first_column])

# Run the drop only once: calling drop_keys again would strip the next
# leading column from every table, so the flag makes re-running this cell safe.
if not has_dropped_keys:
    drop_keys(dfs)
    has_dropped_keys = True

In [579]:
# Preview the admissiondrug table
dfs['admissiondrug']

Unnamed: 0,patientunitstayid,drugoffset,drugenteredoffset,drugnotetype,specialtytype,usertype,rxincluded,writtenineicu,drugname,drugdosage,drugunit,drugadmitfrequency,drughiclseqno
0,281479,420,444,Daily Progress,eCM Primary,THC Physician,False,True,NOVOLOG ...,0.0,,,20769
1,281479,24,31,Admission,eCM Primary,THC Nurse,True,True,NOVOLOG ...,0.0,,,20769
2,292154,242,243,Daily Progress,eCM Primary,Other,False,True,ALLOPURINOL ...,0.0,,,1100
3,292154,53,69,Admission,eCM Primary,THC Nurse,False,True,DILTIAZEM 24HR CD ...,0.0,,,182
4,292154,242,243,Daily Progress,eCM Primary,Other,False,True,CALCIUM CARBONATE ...,0.0,,,1163
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7412,3246445,1,155,Admission,eCM Primary,THC Nurse,True,True,LIPITOR ...,0.0,,,12404
7413,3246445,1,155,Admission,eCM Primary,THC Nurse,True,True,MYSOLINE ...,0.0,,,1886
7414,3246445,1,155,Admission,eCM Primary,THC Nurse,True,True,CLARITIN ...,0.0,,,7605
7415,3246445,1,155,Admission,eCM Primary,THC Nurse,True,True,FLOMAX ...,0.0,,,13864


In [580]:
# Top three drug names by number of rows in the admissiondrug table
dfs['admissiondrug']['drugname'].value_counts()[:3]

# ASPIRIN 251
# LISINOPRIL 206
# LASIX 161

# These are the drugs most often used at admission
def has_commun_drug_A(row):
    """Return 1 if the row's 'drugname', stripped of surrounding whitespace, is exactly "ASPIRIN"; otherwise 0."""
    is_aspirin = row['drugname'].strip() == "ASPIRIN"
    return 1 if is_aspirin else 0

def has_commun_drug_L(row):
    """Flag LISINOPRIL rows.

    Returns 1 when ``row['drugname']``, once leading/trailing whitespace
    is removed, equals 'LISINOPRIL'; returns 0 for anything else.
    """
    return int(row['drugname'].strip() == 'LISINOPRIL')

def has_commun_drug_Li(row):
    """Return 1 if the row's drug is LASIX, otherwise 0.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose a string under the 'drugname' key.

    Returns
    -------
    int
        1 when the name, stripped of surrounding whitespace, is exactly
        'LASIX'; 0 otherwise.
    """
    drug = row['drugname'].strip()

    # Exact comparison: `drug in 'LASIX'` is a substring test and would also
    # flag '', 'LA', 'SIX', ... as LASIX.
    if drug == 'LASIX':
        return 1
    else:
        return 0

In [581]:
# Discard the columns that are not used below, leaving patientunitstayid and drugname.
# `columns=` already selects the column axis, so the redundant `axis=1` is not passed.
dfs['admissiondrug'] = dfs['admissiondrug'].drop(columns=[
    'drugoffset', 'drugenteredoffset', 'drugnotetype', 'specialtytype', 'rxincluded', 'writtenineicu', 'drugunit', 'drugdosage',
    'drugadmitfrequency', 'drughiclseqno', 'usertype'
])

In [582]:
# Stay identifiers, set aside so they can be re-attached to the indicator columns
_dfs = dfs["admissiondrug"]["patientunitstayid"]

# One 0/1 indicator Series per common drug, computed row by row
commun_drug_A   = dfs["admissiondrug"].apply(has_commun_drug_A, axis=1)
commun_drug_L   = dfs["admissiondrug"].apply(has_commun_drug_L, axis=1)
commun_drug_Li  = dfs["admissiondrug"].apply(has_commun_drug_Li, axis=1)

# Wrap each indicator in a single-column frame labelled with the drug name
_commun_drug_A  = pd.DataFrame(commun_drug_A, columns=['ASPIRIN'])
_commun_drug_L  = pd.DataFrame(commun_drug_L, columns=['LISINOPRIL'])
_commun_drug_Li = pd.DataFrame(commun_drug_Li, columns=['LASIX'])

# Place the id and the three indicators side by side, then remove duplicate rows
dfs["admissiondrug"] = pd.concat(
    [_dfs, _commun_drug_A, _commun_drug_L, _commun_drug_Li],
    axis=1,
).drop_duplicates()

In [583]:
# Collapse to one row per patientunitstayid: the max of each 0/1 flag is 1
# when any of that stay's rows carried the drug.
dfs["admissiondrug"] = dfs["admissiondrug"].groupby(['patientunitstayid']).max()

dfs["admissiondrug"]

Unnamed: 0_level_0,ASPIRIN,LISINOPRIL,LASIX
patientunitstayid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
281479,0,0,0
292154,1,0,0
310446,0,1,0
332296,0,1,0
350263,0,0,0
...,...,...,...
3237226,0,0,0
3237558,1,0,0
3238529,1,0,1
3245076,1,0,0


Convertimos los 1 y los 0 en True o False

In [584]:
def int_to_boolean(data, column):
    """Return a copy of ``data`` with ``column`` cast to bool.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to convert; it is not modified.
    column : label
        Name of the column to cast with ``astype(bool)``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``column`` has boolean dtype.
    """
    # Copy the frame that was passed in (the previous version ignored the
    # argument and always copied the global dfs["admissiondrug"]).
    converted = data.copy()
    converted[column] = converted[column].astype(bool)
    return converted

In [588]:
# Cast each 0/1 drug flag to bool, one column per call
dfs["admissiondrug"] = int_to_boolean (dfs["admissiondrug"], 'ASPIRIN')
dfs["admissiondrug"] = int_to_boolean (dfs["admissiondrug"], 'LISINOPRIL')
dfs["admissiondrug"] = int_to_boolean (dfs["admissiondrug"], 'LASIX')

dfs["admissiondrug"]

Unnamed: 0_level_0,ASPIRIN,LISINOPRIL,LASIX
patientunitstayid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
281479,False,False,False
292154,True,False,False
310446,False,True,False
332296,False,True,False
350263,False,False,False
...,...,...,...
3237226,False,False,False
3237558,True,False,False
3238529,True,False,True
3245076,True,False,False


In [586]:
# Column dtypes and non-null counts of the per-stay flag table.
# NOTE(review): the saved output of this cell lists int64 columns and its execution
# count (586) precedes the bool-conversion cell (588), so the output looks stale —
# re-run after the conversion to refresh it.
dfs['admissiondrug'].info()
print()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551 entries, 281479 to 3246445
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   ASPIRIN     551 non-null    int64
 1   LISINOPRIL  551 non-null    int64
 2   LASIX       551 non-null    int64
dtypes: int64(3)
memory usage: 17.2 KB

