In [1]:
import sqlite3 as db
import pandas as pd
#import featuretools as ft
import json
import random
import re

<center><h2><b>Leer DB</b></h2></center>

In [2]:
# Hacer una query SQL
def sql_query(q):
    conn = db.connect('../db/sqlite/eicu_v2_0_1.sqlite3')
    df = pd.read_sql_query(q, conn)
    conn.close()
    
    return df

# Leer todos los CSV
def read_csvs():
    #import os
    #print( os.getcwd())
    datasets = [ 'admissiondrug', 'admissionDx', 'allergy', 'apacheApsVar', 'apachePatientResult', 'apachePredVar', 'carePlanCareProvider', 'carePlanEOL', 'carePlanGeneral',
                 'carePlanGoal', 'carePlanInfectiousDisease', 'customLab', 'diagnosis', 'hospital', 'infusiondrug', 'intakeOutput', 'lab', 'medication', 'microLab', 'note',
                 'nurseAssessment', 'nurseCare', 'nurseCharting', 'pastHistory', 'patient', 'physicalExam', 'respiratoryCare', 'respiratoryCharting', 'treatment', 'vitalAperiodic',
                 'vitalPeriodic']

    dfs = {}

    for ds_name in datasets:
        dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')
    
    return dfs

dfs = read_csvs()
has_dropped_keys = False # Para que no se droppeen cada vez que se ejecuta la celda siguiente

  dfs[ds_name] = pd.read_csv('../db/csv/' + ds_name + '.csv')


In [3]:
# Dropear IDs de todas las tablas (primera columna)
def drop_keys(dfs):
    for df_key in dfs.keys():
        if df_key not in ('hospital', 'patient'): # No dropear hospitalId o patientUnitStayId
            df = dfs[df_key]
            dfs[df_key] = df.drop(columns=[df.columns.values[0]])

if not has_dropped_keys:
    drop_keys(dfs)
    has_dropped_keys = True

In [4]:
# Eliminar filas duplicadas
for df_name in dfs:
    dfs[df_name] = dfs[df_name].drop_duplicates()

In [5]:
# TODO: Ajustar todos los offsets al múltiplo más cercano de 100 mins para reducir el número de filas duplicadas

<center><h2><b>admissionDx</b></h2></center>

In [6]:
# Son los medicamentos más usados en las admisiones
def has_commun_drug_A(row):
    #return 1 if row['drugname'].strip() == "ASPIRIN" else 0
    drug = row ['drugname'].strip()
       
    if drug == "ASPIRIN":
        return 1
    else:
        return 0

def has_commun_drug_L(row):
    drug = row ['drugname'].strip()
    
    if drug == 'LISINOPRIL':
        return 1
    else:
        return 0

def has_commun_drug_Li(row):
    drug = row ['drugname'].strip()
    
    if drug in 'LASIX':
        return 1
    else:
        return 0

In [7]:
dfs['admissiondrug'] = dfs['admissiondrug'].drop(columns=[
    'drugoffset', 'drugenteredoffset', 'drugnotetype', 'specialtytype', 'rxincluded', 'writtenineicu', 'drugunit', 'drugdosage',
    'drugadmitfrequency', 'drughiclseqno', 'usertype'
], axis=1)

In [8]:
_dfs = dfs["admissiondrug"]["patientunitstayid"]

# Creamos nuevas columnas
commun_drug_A   = dfs["admissiondrug"].apply(lambda row : has_commun_drug_A(row), axis=1)
commun_drug_L   = dfs["admissiondrug"].apply(lambda row : has_commun_drug_L(row), axis=1)
commun_drug_Li  = dfs["admissiondrug"].apply(lambda row : has_commun_drug_Li(row), axis=1)


_commun_drug_A  = pd.DataFrame(commun_drug_A, columns=['ASPIRIN'])
_commun_drug_L  = pd.DataFrame(commun_drug_L, columns=['LISINOPRIL'])
_commun_drug_Li = pd.DataFrame(commun_drug_Li, columns=['LASIX'])


dfs["admissiondrug"] = pd.concat([_dfs, _commun_drug_A,_commun_drug_L,_commun_drug_Li], axis=1)

#Eliminamos columnas duplicadas
dfs['admissiondrug'] = dfs['admissiondrug'].drop_duplicates()

In [9]:
dfs["admissiondrug"] = dfs["admissiondrug"].groupby(['patientunitstayid']).max()

In [10]:
def int_to_boolean (data, column):
    data = dfs["admissiondrug"].copy()                 
    data[column] = data[column].astype(bool)          
    return data

In [11]:
dfs["admissiondrug"] = int_to_boolean (dfs["admissiondrug"], 'ASPIRIN')
dfs["admissiondrug"] = int_to_boolean (dfs["admissiondrug"], 'LISINOPRIL')
dfs["admissiondrug"] = int_to_boolean (dfs["admissiondrug"], 'LASIX')
dfs["admissiondrug"] = dfs["admissiondrug"].reset_index()

<center><h2><b>admissionDx</b></h2></center>

In [12]:
# Drop columns
#dfs['admissionDx'] = dfs['admissionDx'].drop(columns=[
#    'admitdxtext', # En la gran mayoría de los casos es igual a admitdxname
#], axis=1)

In [13]:
#
# Nota: Es una mierda porque tiene mucho más sentido con onehot, pero hay un huevo de diagnósticos y como 10~20 notas adicionales
# TODO: diag1 = 4 ??
#

# admitdxpath - admitdxname: Cada patientunistayid tiene n filas relacionadas con todos las diagnosis encontradas.
def parse_admitdx(group):
    pusID, group_df = group
    diagnosis = []
    additionalinfo = []
    
    for i, row in group_df.iterrows():
        if row['admitdxname'] in ('No', 'Yes'):
            path_parts = row['admitdxpath'].split('|')
            mode = path_parts[-2]
            
            additionalinfo.append(mode + ' ' + row['admitdxname'])
        else:
            diagnosis.append(row['admitdxname'])
            
    # Rellenar 3 huecos en diagnosis y 2 en additionalinfo
    diagnosis = (sorted(diagnosis) + ['No' for _ in range(3)])[:3]
    additionalinfo = (sorted(additionalinfo) + ['No' for _ in range(3)])[:3]
    
    return [pusID] + diagnosis + additionalinfo

new_df = pd.DataFrame(columns=['patientunitstayid', 'diag1', 'diag2', 'diag3', 'additional1', 'additional2', 'additional3'])
for i,  group in enumerate(dfs['admissionDx'].groupby('patientunitstayid')):
    row = parse_admitdx(group)
    new_df.loc[i] = row

#new_df

In [14]:
def clean_admitdxname(row):
    admitdxname = row ['admitdxname']
    
    m = re.search('^(\w+)', admitdxname)
    
    return m.group(0)

# Drop columns
dfs['admissionDx'] = dfs['admissionDx'].drop(columns=[
    'admitdxtext', # En la gran mayoría de los casos es igual a admitdxname
], axis=1)

dfs['admissionDx']['admitdxname'] = dfs['admissionDx'].apply(lambda row : clean_admitdxname(row), axis=1)

# Filtramos y quitamos los Yes y No, no tienen mucho sentido
df = dfs['admissionDx']
dfs['admissionDx'] = df[df['admitdxname'] != 'Yes']
df = dfs['admissionDx']
dfs['admissionDx'] = df[df['admitdxname'] != 'No']

#Cardiovascular      983
#Respiratory         363
#Neurologic          318

def has_common_admission_C(row):
    drug = row ['admitdxname'].strip()
       
    if drug == "Cardiovascular": 
        return 1
    else:
        return 0

def has_common_admission_R(row):
    drug = row ['admitdxname'].strip()
    
    if drug == 'Respiratory':
        return 1
    else:
        return 0

def has_common_admission_N(row):
    drug = row ['admitdxname'].strip()
    
    if drug in 'Neurologic':
        return 1
    else:
        return 0

_dfs = dfs["admissionDx"]["patientunitstayid"]

# Creamos nuevas columnas
common_admission_C   = dfs["admissionDx"].apply(lambda row : has_common_admission_C(row), axis=1)
common_admission_R   = dfs["admissionDx"].apply(lambda row : has_common_admission_R(row), axis=1)
common_admission_N   = dfs["admissionDx"].apply(lambda row : has_common_admission_N(row), axis=1)


_common_admission_C  = pd.DataFrame(common_admission_C, columns=['Cardiovascular'])
_common_admission_R  = pd.DataFrame(common_admission_R, columns=['Respiratory'])
_common_admission_N  = pd.DataFrame(common_admission_N, columns=['Neurologic'])


dfs["admissionDx"] = pd.concat([_dfs, _common_admission_C,_common_admission_R,_common_admission_N], axis=1)

dfs["admissionDx"] = dfs["admissionDx"].groupby(['patientunitstayid']).max()

dfs["admissionDx"] = dfs["admissionDx"].reset_index()

<center><h2><b>allergy</b></h2></center>

In [15]:
dfs['allergy'] = dfs['allergy'].drop(columns=[
    'allergyenteredoffset', # No sirve de mucho
    'allergynotetype', # No es relevante
    'usertype', # No importa mucho quien la haya encontrado
    'rxincluded', # Probablemente no tenga mucha influencia
    'specialtytype', # Probablemente no tenga mucha influencia
    'writtenineicu' # TODO: Es posible que esta sea importante
])

#
# Dividir en dos tablas de drogas y no drogas para quitar NaNs y eliminar columnas
#
dfs['_allergyDrug']    = dfs['allergy'][dfs['allergy']['allergytype'] == 'Drug']\
    .drop(columns=['allergytype', 'allergyname']) # allergyname siempre es igual a drugname
dfs['_allergyNonDrug'] = dfs['allergy'][dfs['allergy']['allergytype'] == 'Non Drug']\
    .drop(columns=['allergytype', 'drugname', 'drughiclseqno'])

dfs.pop('allergy')

Unnamed: 0,patientunitstayid,allergyoffset,drugname,allergytype,allergyname,drughiclseqno
0,243097,2549,,Non Drug,penicillins,
1,243097,1288,CODEINE PHOSPHATE,Drug,CODEINE PHOSPHATE,1721.0
2,243097,2549,CODEINE PHOSPHATE,Drug,CODEINE PHOSPHATE,1721.0
3,243097,21,,Non Drug,penicillins,
4,243097,3988,CODEINE PHOSPHATE,Drug,CODEINE PHOSPHATE,1721.0
...,...,...,...,...,...,...
2470,3351763,14,,Non Drug,latex,
2471,3351763,14,LEVAQUIN,Drug,LEVAQUIN,12383.0
2472,3353113,77,,Non Drug,Contrast Dye,
2473,3353113,77,PENICILLIN G BENZATHINE,Drug,PENICILLIN G BENZATHINE,3941.0


<center><h2><b>apacheapsvar</b></h2></center>

In [16]:
# ID ya dropeada

<center><h2><b>apachepatientresult</b></h2></center>

<center><h2><b>Apachepredvar</b></h2></center>

<center><h2><b>careplancareprovider</b></h2></center>

In [17]:
# Quitar NaN y Uknown de interventioncategory reemplazando los valores por los 3 más frecuentes
def clean_interventioncategory(categ):
    if categ == categ and categ != 'Unknown': # No NaN
        return categ
    else:
        rand = random.randint(0, 2)
        categs = ['I', 'II', 'III']
        return categs[rand]


dfs['carePlanCareProvider'] = dfs['carePlanCareProvider'].drop(columns=[
    'providertype', # Todo nulls
    'managingphysician', # No consideramos que sean importantes para que el algoritmo aprenda
    'activeupondischarge', # No consideramos que sean importantes para que el algoritmo aprenda
])

# Rellenamos categoria de Intervención
dfs['carePlanCareProvider']['interventioncategory']   = dfs['carePlanCareProvider']['interventioncategory'].apply(clean_interventioncategory)

In [18]:
dfs['carePlanCareProvider']['specialty'].value_counts()[:3] # -> internal medicine:1005 /  cardiology:561 / hospitalist:554

# Filtramos y nos quedamos con las specialty más comunes
df = dfs['carePlanCareProvider']
dfs['carePlanCareProvider'] = df[df.specialty.isin (['internal medicine', 'cardiology', 'hospitalist'])]

# Sumamos los offsets con mismo patientid y misma especialidad 
def clean_careprovidersaveoffset(row, df):           
    return  df[(df['patientunitstayid']==row['patientunitstayid']) & (df['specialty']==row['specialty'])]['careprovidersaveoffset'].sum()

# 2120 rows × 4 columns
dfs['carePlanCareProvider']['careprovidersaveoffset']   =  dfs['carePlanCareProvider'].apply(lambda row : clean_careprovidersaveoffset(row, dfs['carePlanCareProvider']), axis = 1)

# Quitamos duplicidad -> 1416 rows × 4 columns
dfs['carePlanCareProvider'] = dfs['carePlanCareProvider'].drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs['carePlanCareProvider']['careprovidersaveoffset']   =  dfs['carePlanCareProvider'].apply(lambda row : clean_careprovidersaveoffset(row, dfs['carePlanCareProvider']), axis = 1)


In [19]:
# set(dfs["carePlanCareProvider"]["interventioncategory"]) -> I / II / III / IV
# specialty -> internal medicine:1005 /  cardiology:561 / hospitalist:554

def has_category_I (row):
    return 1 if row["interventioncategory"] == 'I' else 0
def has_category_II (row):
    return 1 if row["interventioncategory"] == 'II' else 0
def has_category_III (row):
    return 1 if row["interventioncategory"] == 'III' else 0
def has_category_IV (row):
    return 1 if row["interventioncategory"] == 'IV' else 0

def has_hospitalist (row):
    return row['careprovidersaveoffset'] if row['specialty'] == 'hospitalist' else 0

def has_cardiology (row):
    return row['careprovidersaveoffset'] if row['specialty'] == 'cardiology' else 0

def has_internal_medicine (row):
    return row['careprovidersaveoffset'] if row['specialty'] == 'internal medicine' else 0

In [20]:
_dfs = dfs["carePlanCareProvider"]["patientunitstayid"]

# Creamos nuevas columnas
category_I   = dfs["carePlanCareProvider"].apply(lambda row : has_category_I(row), axis=1)
category_II  = dfs["carePlanCareProvider"].apply(lambda row : has_category_II(row), axis=1)
category_III = dfs["carePlanCareProvider"].apply(lambda row : has_category_III(row), axis=1)
category_IV  = dfs["carePlanCareProvider"].apply(lambda row : has_category_IV(row), axis=1)

category_hospital           = dfs["carePlanCareProvider"].apply(lambda row : has_hospitalist(row), axis=1)
category_cardiology         = dfs["carePlanCareProvider"].apply(lambda row : has_cardiology(row), axis=1)
category_internal_medicine  = dfs["carePlanCareProvider"].apply(lambda row : has_internal_medicine(row), axis=1)

# Vemos si hay intervenciones y de que tipo son
_category_I    = pd.DataFrame(category_I, columns=['Intervencion_I'])
_category_II   = pd.DataFrame(category_II, columns=['Intervencion_II'])
_category_III  = pd.DataFrame(category_III, columns=['Intervencion_III'])
_category_IV   = pd.DataFrame(category_IV, columns=['Intervencion_IV'])

# Juntamos intervenciones y agrupamos en base a patientId escogiendo el max (1 or 0)
Intervention = pd.concat([_dfs, _category_I,_category_II,_category_III, _category_IV], axis=1)
Intervention = Intervention.groupby(['patientunitstayid']).max()

# Similar a intervenciones pero con las especialidades más comunes
_category_hospital           = pd.DataFrame(category_hospital, columns=['Categoria_Hospital'])
_category_cardiology         = pd.DataFrame(category_cardiology, columns=['Categoria_Cardiology'])
_category_internal_medicine  = pd.DataFrame(category_internal_medicine, columns=['Categoria_Internal_Medicine'])

# Similar pero sumando 
medicine = pd.concat([_dfs, _category_hospital ,_category_cardiology,_category_internal_medicine ], axis=1)
medicine = medicine.drop_duplicates()
medicine = medicine.groupby(['patientunitstayid']).sum()

dfs['carePlanCareProvider'] = pd.concat([medicine, Intervention], axis=1)
dfs['carePlanCareProvider'] = dfs['carePlanCareProvider'].reset_index()

<center><h2><b>Careplaneol</b></h2></center>

In [21]:
# Ya está lo suficientemente limpia

<center><h2><b>careplangeneral</b></h2></center>

In [22]:
# Hay 15 filas con valor nulo en la columna cplitemvalue, asi que eliminamos esas filas
dfs['carePlanGeneral'] = dfs['carePlanGeneral'].dropna()

dfs['carePlanGeneral'] = dfs['carePlanGeneral'].drop(columns=[
    'activeupondischarge', 
    'cplitemvalue' # Valores muy dispares que se alejan de la solucion de los paciente
], axis=1)

In [23]:
#Ventilation                                  3715
#DVT Prophylaxis                              3296
#Airway                                       3141
#Care Limitation                              2967
#Stress Ulcer Prophylaxis                     2946

def has_Ventilation (row):
    return 1 if row['cplgroup'] == 'Ventilation' else 0

def has_DVT_Prophylaxis (row):
    return 1 if row['cplgroup'] == 'DVT Prophylaxis' else 0

def has_Airway (row):
    return 1 if row['cplgroup'] == 'Airway' else 0

def has_Care_Limitation (row):
    return 1 if row['cplgroup'] == 'Care Limitation' else 0

def has_Stress_Ulcer_Prophylaxis  (row):
    return 1 if row['cplgroup'] == 'Stress Ulcer Prophylaxis' else 0

In [24]:
_dfs = dfs["carePlanGeneral"]['patientunitstayid']

category_Ventilation               = dfs["carePlanGeneral"].apply(lambda row : has_Ventilation(row), axis=1)
category_DVT_Prophylaxis           = dfs["carePlanGeneral"].apply(lambda row : has_DVT_Prophylaxis(row), axis=1)
category_Airway                    = dfs["carePlanGeneral"].apply(lambda row : has_Airway(row), axis=1)
category_Care_Limitation           = dfs["carePlanGeneral"].apply(lambda row : has_Care_Limitation(row), axis=1)
category_Stress_Ulcer_Prophylaxis  = dfs["carePlanGeneral"].apply(lambda row : has_Stress_Ulcer_Prophylaxis(row), axis=1)

_category_Ventilation                = pd.DataFrame(category_Ventilation, columns=['Categoria_Ventilacion'])
_category_DVT_Prophylaxis            = pd.DataFrame(category_DVT_Prophylaxis, columns=['Categoria_DVT_Prophylaxis'])
_category_Airway                     = pd.DataFrame(category_Airway, columns=['Categoria_Airway'])
_category_Care_Limitation            = pd.DataFrame(category_Care_Limitation, columns=['Categoria_Care_Limitation'])
_category_Stress_Ulcer_Prophylaxis   = pd.DataFrame(category_Stress_Ulcer_Prophylaxis, columns=['Categoria_Stress_Ulcer_Prophylaxis'])

categoria = pd.concat([_dfs, _category_Ventilation ,_category_DVT_Prophylaxis,_category_Airway, _category_Care_Limitation, _category_Stress_Ulcer_Prophylaxis ], axis=1)
#categoria = categoria.drop_duplicates()
categoria = categoria.groupby(['patientunitstayid']).sum()
categoria = categoria.reset_index()

dfs['carePlanGeneral'] = categoria

<center><h2><b>careplangoal</b></h2></center>

In [25]:
# Reemplazar NaN por un valor por defecto
dfs['carePlanGoal']['cplitemvalue'] = dfs['carePlanGoal']['cplgoalvalue'].fillna('_Unknown')

<center><h2><b>careplaninfectiousdisease</b></h2></center>

In [26]:
dfs['carePlanInfectiousDisease'] = dfs['carePlanInfectiousDisease'].drop(columns=[
    'responsetotherapy', # Solo tiene 2 no nulls
    'treatment' # La mitad son nulls y no es muy útil para empezar
])

<center><h2><b>customlab</b></h2></center>

In [27]:
dfs['customLab'] = dfs['customLab'].drop(columns=[
    'labotherresult', # Igual a labothervaluetext
])

# Normalizar a minúsculas
dfs['customLab']['labothervaluetext'] = dfs['customLab']['labothervaluetext'].apply(lambda val: val.lower())

<center><h2><b>diagnosis</b></h2></center>

In [28]:
# Reemplazar NaN por un valor por defecto
dfs['diagnosis']['icd9code'] = dfs['diagnosis']['icd9code'].fillna('_Unknown')

# Redondear offset al múltiplo de 50 más cercano hacia abajo para eliminar duplicados
# TODO: Hacer en todas las tablas
#dfs['diagnosis']['diagnosisoffset'] = dfs['diagnosis'].apply(lambda row: (int(row['diagnosisoffset'])//50) * 50, axis=1)
#dfs['diagnosis'] = dfs['diagnosis'].drop_duplicates()

In [29]:
# MANTENER 3 DIAGNOSIS MÁS IMPORTANTES POR SECUENCIA Y PRIORIDAD
new_df = pd.DataFrame(columns=['patientunitstayid', 'diag1', 'diag2', 'diag3', 'additional1', 'additional2', 'additional3'])

def clean_diagnosisstring(group):
    pusID, group_df = group
    priority = {
        'Primary': [],
        'Major':  [],
        'Other': []
    }
    
    # Obtener las diagnossis de un paciente
    for i, row in group_df.iterrows():
        if row['diagnosisstring'] == row['diagnosisstring'] and row['diagnosispriority'] == row['diagnosispriority']: # not NaN
            diag = row['diagnosisstring'].split('|')[-1]
            priority_lv = row['diagnosispriority']
            
            priority[priority_lv].append(diag)
    
    return [pusID] + (priority['Primary'] + priority['Major'] + priority['Other'] + ['None', 'None', 'None'])[:3]

new_df = pd.DataFrame(columns=['patientunitstayid', 'last1', 'last2', 'last3'])
for i,  group in enumerate(dfs['diagnosis'].groupby('patientunitstayid')):
    row = clean_diagnosisstring(group)
    new_df.loc[i] = row

dfs['diagnosis'] = new_df

<center><h2><b>hospital</b></h2></center>

In [30]:
# Reemplazar NaN por un valor por defecto
dfs['hospital']['numbedscategory'] = dfs['hospital']['numbedscategory'].fillna('_Unknown')
dfs['hospital']['region'] = dfs['hospital']['region'].fillna('_Unknown')

<center><h2><b>Infusiondrug</b></h2></center>

In [31]:
# Reemplazar NaN numéricos por un valor por defecto
dfs['infusiondrug']['drugrate'] = dfs['infusiondrug']['drugrate'].fillna(-1)
dfs['infusiondrug']['infusionrate'] = dfs['infusiondrug']['infusionrate'].fillna(-1)
dfs['infusiondrug']['drugamount'] = dfs['infusiondrug']['drugamount'].fillna(-1)
dfs['infusiondrug']['volumeoffluid'] = dfs['infusiondrug']['volumeoffluid'].fillna(-1)
dfs['infusiondrug']['patientweight'] = dfs['infusiondrug']['patientweight'].fillna(-1)

In [32]:
#
# Mantener las medidas más reciente para cada medicamento de los 10 que sean más frecuentes
#

# Medicamentos a tener en cuenta
acceptedDrugs = set(['fentanyl', 'propofol', 'norepinephrine', 'insulin', 'heparin', 'midazolam', 'dexmedetomidine', 'amiodarone', 'heparin', 'pantoprazole'])

def parse_drug_name(drug):
    drug = drug.lower()
    m = re.search('(.+)\(.+?$', drug)

    if type(m) == re.Match:
        return m.group(1).strip()
    else:
        return drug

patients = {}
columns = []
for drug in acceptedDrugs:
    columns += ['last_' + drug, 'last_' + drug + '_drugrate', 'last_' + drug + '_infusionrate',
                'last_' + drug + '_drugamount', 'last_' + drug + '_volumeoffluid', 'last_' + drug + '_patientweight']

for i, row in dfs['infusiondrug'].iterrows():
    pusID          = row['patientunitstayid']
    drug           = parse_drug_name(row['drugname'])
    drugrate       = row['drugrate']
    infusionrate   = row['infusionrate']
    drugamount     = row['drugamount']
    volumeoffluid  = row['volumeoffluid']
    patientweight  = row['patientweight']
    infusionOffset = row['infusionoffset'] 
    
    if infusionOffset < 0 or drug not in acceptedDrugs:
        continue
    else:
        if pusID not in patients:
            patients[pusID] = {}
            
            for targetdrug in acceptedDrugs:
                patients[pusID][targetdrug] = { 'drugrate': -1, 'infusionrate': -1, 'drugamount': -1, 'volumeoffluid': -1, 'patientweight': -1, 'lastOffset': float('inf') }
        else:
            if infusionOffset > patients[pusID][drug]['lastOffset']:
                patients[pusID][drug] = { 'drugrate': drugrate, 'infusionrate': infusionrate, 'drugamount': drugamount, 'volumeoffluid': volumeoffluid, 'patientweight': patientweight, 'lastOffset': infusionOffset }

new_df = pd.DataFrame(columns=columns)
keys = list(patients.keys())

new_df['patientunitstayid'] = keys

for drug in acceptedDrugs:
    new_df['last_' + drug] = [ -1 if patients[key][drug]['lastOffset'] == float('inf') else patients[key][drug]['lastOffset'] for key in keys ]
    new_df['last_' + drug + '_drugrate'] = [ patients[key][drug]['drugrate'] for key in keys ]
    new_df['last_' + drug + '_infusionrate'] = [ patients[key][drug]['infusionrate'] for key in keys ]
    new_df['last_' + drug + '_drugamount'] = [ patients[key][drug]['drugamount'] for key in keys ]
    new_df['last_' + drug + '_volumeoffluid'] = [ patients[key][drug]['volumeoffluid'] for key in keys ]
    new_df['last_' + drug + '_patientweight'] = [ patients[key][drug]['patientweight'] for key in keys ]

dfs['infusiondrug'] = new_df

<center><h2><b>intakeoutput</b></h2></center>

In [33]:
dfs['intakeOutput'] = dfs['intakeOutput'].drop(columns=[
    'intaketotal', # Computada en otra variable
    'outputtotal', # Computada en otra variable
    'cellpath', # Redundante en celltext
    'cellvaluetext' # dependiente de cellvaluenumeric
], axis=1)

<center><h2><b>lab</b></h2></center>

In [34]:
# Quitar filas sin laboresult
df = dfs['lab']
dfs['lab'] = df[df['labresult'].notnull()]

dfs['lab'] = dfs['lab'].drop(columns=[
    'labtypeid', # Irrelevante
    'labname', # Irrelevante
    'labresulttext', # Redundante con labResult
    'labresultrevisedoffset' # Reundante con labresultoffset
], axis=1)

<center><h2><b>medication</b></h2></center>

In [35]:
# df = dfs['medication']

# name_code = {}
# code_name = {}
# for i, row in df.iterrows():
#     if row['drugname'] == row['drugname'] and row['drughiclseqno'] == row['drughiclseqno']:
#         name_code[row['drugname']] = row['drughiclseqno']
#         code_name[row['drughiclseqno']] = row['drugname']

# for i, row in df.iterrows():
#     drugname = row['drugname']
#     code = row['drughiclseqno']
    
#     if drugname != drugname: # NaN
#         if code in code_name:
#             row['drugname'] = code_name[code]
#     if code != code: # NaN
#         if drugname in name_code:
#             row['drughiclseqno'] = name_code[drugname]


In [36]:
# TODO: Parsear dosage con drugname
# TODO: Arreglar drugname con drughiclseqno

# Quitar filas sin laboresult
df = dfs['medication']
dfs['medication'] = df[df['drugname'].notnull()]

dfs['medication'] = dfs['medication'].drop(columns=[
    'drugorderoffset', # Irrelevante
    'drugivadmixture', # No parece relevante
    'drugordercancelled', # Irrelevante
    'loadingdose', # Solamente hay 8 non-nulls
    #'prn', # ? Redundante con frequency
    #'gtc' # ? No parece que tenga relevancia
], axis=1)

#
# TODO:
# 4   drughiclseqno      42150 non-null  float64
# 5   dosage             41607 non-null  object 
# 6   routeadmin         44926 non-null  object 
# 7   frequency          39889 non-null  object 
# 8   prn                44941 non-null  object 
#

<center><h2><b>microLab</b></h2></center>

In [37]:
dfs['microLab'] = dfs['microLab'].drop(columns=[
    'antibiotic', # TODO: ? Irrelevante
    'sensitivitylevel', # No parece relevante
], axis=1)

<center><h2><b>note</b></h2></center>

In [38]:
# Reemplazar NaN por un valor por defecto
dfs['note']['notetext'] = dfs['note']['notetext'].fillna('_Unknown')

dfs['note'] = dfs['note'].drop(columns=[
    'noteenteredoffset', # Irrelevante
    'notetype', # No parece relevante
    'notevalue', # Parece tener información sobre notetext
], axis=1)

<center><h2><b>nurseAssessment</b></h2></center>

In [39]:
# TODO: ? Parece que hay múltiples assessments de diferentes enfermeras al mismo tiempo con diagnósticos diferentes.
# TODO: Unir celllabel	cellattribute	cellattributevalue
# TODO: Hacer algo con cellattributepath

# Reemplazar NaN por un valor por defecto
dfs['nurseAssessment']['cellattributevalue'] = dfs['nurseAssessment']['cellattributevalue'].fillna('_Unknown')

dfs['nurseAssessment'] = dfs['nurseAssessment'].drop(columns=[
    'nurseassessentryoffset', # Irrelevante
], axis=1)

<center><h2><b>nurseCare</b></h2></center>

In [40]:
dfs['nurseCare'] = dfs['nurseCare'].drop(columns=[
    'nursecareentryoffset', # Irrelevante
    'celllabel' # cellattribute es más detallada
], axis=1)

<center><h2><b>nurseCharting</b></h2></center>

In [41]:
# Hacer algo con esta mierda

dfs['nurseCharting'] = dfs['nurseCharting'].drop(columns=[
    'nursingchartentryoffset' # Irrelevante
], axis=1)

<center><h2><b>pastHistory</b></h2></center>

In [42]:
dfs['pastHistory'] = dfs['pastHistory'].drop(columns=[
    'pasthistoryenteredoffset', # Irrelevante
    'pasthistoryvaluetext' # Igual que pasthistoryvalue
], axis=1)

<center><h2><b>patient</b></h2></center>

In [43]:
#
# Añadir a paciente la media del n de minutos que ha estado en la UCI
#
def parse_avg_unit_stay(row, df):
    total_mins = 0
    total_entries = 0
    
    previous_visits_df = df[
        df['patienthealthsystemstayid'] == row['patienthealthsystemstayid']
    ][
        df['patientunitstayid'] < row['patientunitstayid']
    ]
    
    # Obtener las diagnossis de un paciente
    for i, row in previous_visits_df.iterrows():
        mins = row['unitdischargeoffset']
        total_mins += mins
        total_entries += 1
    
    return 0 if total_mins == 0 else total_mins//total_entries # Integer division

avg_unit_stay = []
for i,  row in dfs['patient'].iterrows():
    n = parse_avg_unit_stay(row, dfs['patient'])
    avg_unit_stay.append(n)

dfs['patient']['avg_unit_stay'] = avg_unit_stay

  previous_visits_df = df[


In [44]:
#
# Añadir a paciente la media del n de minutos que ha estado en el hospital
#
def parse_avg_hospital_stay(row, df):
    total_mins = 0
    total_entries = 0
    
    previous_visits_df = df[
        df['patienthealthsystemstayid'] == row['patienthealthsystemstayid']
    ][
        df['patientunitstayid'] < row['patientunitstayid']
    ]
    
    # Obtener las diagnossis de un paciente
    for i, row in previous_visits_df.iterrows():
        mins = row['hospitaldischargeoffset']
        total_mins += mins
        total_entries += 1
    
    return 0 if total_mins == 0 else total_mins//total_entries # Integer division

avg_unit_stay = []
for i,  row in dfs['patient'].iterrows():
    n = parse_avg_hospital_stay(row, dfs['patient'])
    avg_unit_stay.append(n)

dfs['patient']['avg_hospital_stay'] = avg_unit_stay

  previous_visits_df = df[


In [45]:
#
# Añadir BMI en admisión
#
def bmi(row):
    height = row['admissionheight']
    weight = row['admissionweight']
    
    if height != height or weight != weight: # has NaN
        return -1
    else:
        return height/weight

dfs['patient']['admission_bmi'] = dfs['patient'].apply(bmi, axis=1)

In [46]:
import math

def clean_age(row):
    age = row['age']
    
    if age == '':
        return 50 # TODO: Median
    elif age == '> 89':
        return 100
    elif math.isnan(float(age)):
        return 50 # ""
    else:
        return int(age)

In [47]:
#Sepsis, pulmonary              -> 121                                                                                                                                                                    121
#Diabetic ketoacidosis          -> 105                                                                                                                                                                 105
#CHF, congestive heart failure  -> 102   

def randomApache():
    num = random.randint(1, 3)
    
    if(num == 1): 
        return 'CHF, congestive heart failure'
    
    elif (num == 2): 
        return 'Diabetic ketoacidosis'
    
    else: return 'Sepsis, pulmonary'
     
def clean_apacheadmissiondx(row):
    apache = row['apacheadmissiondx']
    
    if apache != apache:
        return randomApache() # ""
    else:
        return apache

In [48]:
# 177.80    151
# 172.70    146
# 167.60    133
# 165.10    127
                                                                                                                                                    
def randomAdmissionheight():
    num = random.randint(1, 4)
    
    if(num == 1): 
        return 177.80
    
    elif (num == 2): 
        return 172.70
    
    elif (num == 3): 
        return 167.60
    
    else: return 165.10
     
def clean_admissionheight(row):
    admissionheight = row['admissionheight']
    
    if math.isnan(float(admissionheight)):
        return randomAdmissionheight() # ""
    else:
        return admissionheight

In [49]:
# Random
def clean_gender(row):
    gender = row['gender']
    
    if gender != gender:
        if random.randint(1, 2) % 2 == 0: return 'Male'
        else: return 'Famale'
    else:
        return gender

In [50]:
# Caucasian
def clean_ethnicity(row):
    ethnicity = row['ethnicity']
    
    if ethnicity != ethnicity:
        return 'Caucasian'
    else:
        return ethnicity

In [51]:
# Emergency Department
def clean_hospitaladmitsource(row):
    hospitalAdmit = row['hospitaladmitsource']
    
    if hospitalAdmit != hospitalAdmit:
        return 'Emergency Department' # ""
    else:
        return hospitalAdmit


In [52]:
def clean_admissionweight(row, df):
    admissionweight = row['admissionweight']
    
    if math.isnan(float(admissionweight)):
        return  df[df['age']==row['age']]['admissionweight'].median()
    else:
        return admissionweight    

In [53]:
# TODO: Feature idea - BMI
# TODO: ? Cual es más importante, unitDischargeOffset u hospitalDischargeOffset  

# Drop
dfs['patient'] = dfs['patient'].drop(columns=[
    'wardid', # ? Tiene pinta de que no va a ser muy importante
    'patienthealthsystemstayid', # Irrelevante
    'hospitaldischargeyear', # Irrelevante. Solo hay datos de los años 2014 y 2015
    'hospitaladmittime24', # Drop a priori. No parece importante
    'hospitaldischargetime24', # Drop a priori. No parece importante
    'hospitaldischargelocation', # No parece muy relevante
    'unittype', # Parece que ya está presente en otras partes
    'unitadmittime24', # No parece muy relevante
    'unitadmitsource', # ?
    'unitstaytype', # No parece muy relevante
    'dischargeweight', # No parece que sea muy relevante el peso cuando esté muerto
    'unitdischargetime24', # Irrelevante
    'unitdischargelocation', # Irrelevante
    'hospitaldischargestatus', # Irrelevante
    'unitdischargestatus',
    'uniquepid'
], axis=1)

# Eliminar NaN TODO
#dfs['patient'] = dfs['patient'].dropna()

# Age
dfs['patient']['age']                 = dfs['patient'].apply(clean_age, axis=1)

# Apacheadmissiondx
dfs['patient']['apacheadmissiondx']   = dfs['patient'].apply(clean_apacheadmissiondx, axis=1)

# Hospitaladmitsource
dfs['patient']['hospitaladmitsource'] = dfs['patient'].apply(clean_hospitaladmitsource, axis=1)

# Gender
dfs['patient']['gender']              = dfs['patient'].apply(clean_gender, axis=1)

# ethnicity
dfs['patient']['ethnicity']           = dfs['patient'].apply(clean_ethnicity, axis=1)

# admissionheight
dfs['patient']['admissionheight']     = dfs['patient'].apply(clean_admissionheight, axis=1)

# admissionweight -> OUTLAYERS TODO:
dfs['patient']['admissionweight']     = dfs['patient'].apply(lambda row : clean_admissionweight(row, dfs['patient']), axis=1)

<center><h2><b>physicalExam</b></h2></center>

In [54]:
#set(dfs['physicalExam']['physicalexamvalue'])
dfs['physicalExam']['physicalexamtext'] = dfs['physicalExam']['physicalexamtext'].fillna('_Unknown')
# BPDiastolic current
# BPDiastolic highest
# BPDiastolic lowest
# BPSystolic current
# BPSystolic highest
# BPSystolic lowest

# Blood loss
# Dialysis net

# O2 Sat current
# O2 Sat lowest
# O2 Sat highest
# Urine
# Intubated
# Comatose
# Ventilated

def physical_exam_info(group):
    pusID, group_df = group
    d = {
        'BP (diastolic) Current': -1,
        'BP (diastolic) Highest': -1,
        'BP (diastolic) Lowest' : -1,
        
        'BP (systolic) Current': -1,
        'BP (systolic) Highest': -1,
        'BP (systolic) Lowest' : -1,
        
        'Blood Loss': -1,
        'Dialysis Net': -1,
        'O2 Sat% Current': -1,
        'O2 Sat% Highest': -1,
        'O2 Sat% Lowest': -1,
        'Urine': -1,
        'Intubated': -1,
        'Comatose': -1,
        'Ventilated': -1,
        
        'Motor Score': -1,
        'Verbal Score': -1,
        'Eyes Score': -1
    }
    
    last_offset = float('inf')
    
    # Obtener las diagnossis de un paciente
    for i, row in group_df.iterrows():
        off = row['physicalexamoffset']
        
        if off < last_offset:
            last_offset = off
            diag = row['physicalexamvalue']
            val = row['physicalexamtext']
            
            
            if diag in d:
                d[diag] = val
    
    for i, row in group_df.iterrows():
        path = row['physicalexampath']
        part = path.split('/')[-2]
            
        if part in d:
            d[part] = row['physicalexamtext']
    
    return [pusID, d['BP (diastolic) Current'], d['BP (diastolic) Highest'], d['BP (diastolic) Lowest'],
            d['BP (systolic) Current'], d['BP (systolic) Highest'], d['BP (systolic) Lowest'], d['Blood Loss'],
            d['Dialysis Net'], d['O2 Sat% Current'], d['O2 Sat% Highest'], d['O2 Sat% Lowest'], d['Urine'], d['Intubated'], d['Comatose'], d['Ventilated'],
            d['Motor Score'], d['Verbal Score'], d['Eyes Score']]

new_df = pd.DataFrame(columns=[
    'patientunitstayid', 'BPD_Current', 'BPD_Highest', 'BPD_Lowest', 'BPS_Current', 'BPS_Highest', 'BPS_Lowest', 'Blood_Loss', 'Dialysis_Net',
    'O2Sat_Current', 'O2Sat_Highest', 'O2Sat_Lowest', 'Urine', 'Intubated', 'Comatose', 'Ventilated', 'Motor', 'Verbal', 'Eyes'
])

for i,  group in enumerate(dfs['physicalExam'].groupby('patientunitstayid')):
    row = physical_exam_info(group)
    new_df.loc[i] = row

dfs['physicalExam'] = new_df

<center><h2><b>respiratoryCare</b></h2></center>

In [55]:
# Me quedo con el numero máximo de cada paciente
def clean_respiratoryCare(row, df):
    respiratoryCare = row['currenthistoryseqnum']
           
    return  df[df['patientunitstayid']==row['patientunitstayid']]['currenthistoryseqnum'].max()

# Clasifico
def range_currenthistoryseqnum(row, df):
    median = 3.00  # df['currenthistoryseqnum'].median()
    currenthistoryseqnum = row ['currenthistoryseqnum']
    
    if (currenthistoryseqnum<median):
        return 'Low'
    elif currenthistoryseqnum == median:
        return 'Mid'
    else:
        return 'High' 

In [56]:
dfs['respiratoryCare']['currenthistoryseqnum'] = dfs['respiratoryCare'].apply(lambda row : clean_respiratoryCare(row, dfs['respiratoryCare']), axis=1)
dfs['respiratoryCare']['currenthistoryseqnum'] = dfs['respiratoryCare'].apply(lambda row : range_currenthistoryseqnum(row, dfs['respiratoryCare']), axis=1)

dfs['respiratoryCare'] = dfs['respiratoryCare'].drop(columns=[
    'airwaysize', 'airwayposition', 'cuffpressure', 'apneaparams', 'lowexhmvlimit', 'hiexhmvlimit', 'lowexhtvlimit', 'hipeakpreslimit', 'lowpeakpreslimit',
    'hirespratelimit', 'lowrespratelimit', 'sighpreslimit', 'lowironoxlimit', 'highironoxlimit', 'meanairwaypreslimit', 'peeplimit', 'cpaplimit',
    'setapneainterval', 'setapneatv', 'setapneaippeephigh', 'setapnearr', 'setapneapeakflow', 'setapneainsptime', 'setapneaie', 'setapneafio2', 
    'respcarestatusoffset', 'airwaytype', 'ventstartoffset', 'ventstartoffset', 'ventstartoffset', 'priorventendoffset', 'ventendoffset', 'priorventstartoffset'
], axis=1)

#
# TODO: airwaytype tiene nulls
#

<center><h2><b>respiratoryCharting</b></h2></center>

In [57]:
# TODO: ?

dfs['respiratoryCharting'] = dfs['respiratoryCharting'].drop(columns=[
    'respchartentryoffset' # Irrelevante
], axis=1)

<center><h2><b>treatment</b></h2></center>

In [58]:
# TODO: ?

dfs['treatment'] = dfs['treatment'].drop(columns=[
    'activeupondischarge', # Irrelevante    
], axis=1)

<center><h2><b>vitalAperiodic</b></h2></center>

In [59]:
# TODO

dfs['vitalAperiodic'] = dfs['vitalAperiodic'].drop(columns=[
], axis=1)


#
# TODO: Tiene valores nulls
#

<center><h2><b>vitalPeriodic</b></h2></center>

In [60]:
# TODO

dfs['vitalPeriodic'] = dfs['vitalPeriodic'].drop(columns=[
], axis=1)

#
# TODO: Tiene valores nulls
#

---

<center><h2><b>Eliminar duplicados</b></h2></center>

In [61]:
for df_name in dfs:
    n = len(dfs[df_name].duplicated())
    dfs[df_name] = dfs[df_name].drop_duplicates()
    
    diff = n - len(dfs[df_name].duplicated())
    if diff > 0:
        print(df_name, ':', diff)


lab : 3989
medication : 48
microLab : 122
note : 27
pastHistory : 4
respiratoryCare : 5041
treatment : 137
_allergyDrug : 2
_allergyNonDrug : 1


---

<center><h2><b>Exportar a CSV</b></h2></center>

In [62]:
for df_name in dfs:
    dfs[df_name].to_csv('../db/csv_clean/' + df_name + '.csv', index=False)

---

<center><h2><b>Exportar a SQL</b></h2></center>

In [64]:
import os

sqlite_path = "../db/sqlite/eicu_v2_0_1_clean.sqlite3"
#os.remove(sqlite_path)
conn = db.connect(sqlite_path)

for df_name in dfs:
    dfs[df_name].to_sql(df_name, conn, index=False)