## Load data files

In [2]:
import pandas as pd
import numpy as np
import math

# Load the five CSV inputs from the data/ directory.
# The paths below are listed in the same order as the names being assigned.
(
    df_cohort,
    df_demo,
    df_vital,
    df_lab_before,
    df_vasopressor_before,
) = map(pd.read_csv, (
    'data/crrt_patients_cohort_final.csv',
    'data/demographics_final.csv',
    'data/vital_signs_final.csv',
    'data/lab_results_before_avg.csv',
    'data/vasopressor_before.csv',
))

# Lookup from each detailed race label to one of four coarse groups
# (UNKNOWN, WHITE, BLACK, OTHERS). Labels that are not listed below have no
# entry in the mapping.
race_map = {
    detailed_label: race_group
    for race_group, detailed_labels in (
        ('UNKNOWN', (
            'UNKNOWN',
            'PATIENT DECLINED TO ANSWER',
            'UNABLE TO OBTAIN',
        )),
        ('WHITE', (
            'WHITE',
            'WHITE - EASTERN EUROPEAN',
            'WHITE - BRAZILIAN',
            'WHITE - OTHER EUROPEAN',
            'WHITE - RUSSIAN',
        )),
        ('BLACK', (
            'BLACK/AFRICAN AMERICAN',
            'BLACK/AFRICAN',
            'BLACK/CAPE VERDEAN',
            'BLACK/CARIBBEAN ISLAND',
        )),
        ('OTHERS', (
            'HISPANIC/LATINO - PUERTO RICAN',
            'AMERICAN INDIAN/ALASKA NATIVE',
            'ASIAN - CHINESE',
            'ASIAN - ASIAN INDIAN',
            'PORTUGUESE',
            'HISPANIC OR LATINO',
            'MULTIPLE RACE/ETHNICITY',
            'ASIAN - SOUTH EAST ASIAN',
            'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
            'ASIAN',
            'HISPANIC/LATINO - DOMINICAN',
            'HISPANIC/LATINO - CUBAN',
            'HISPANIC/LATINO - GUATEMALAN',
            'HISPANIC/LATINO - MEXICAN',
            'HISPANIC/LATINO - SALVADORAN',
            'HISPANIC/LATINO - HONDURAN',
            'ASIAN - KOREAN',
            'HISPANIC/LATINO - COLUMBIAN',
            'OTHER',
        )),
    )
    for detailed_label in detailed_labels
}

# Group race categories
# Missing values become 'UNKNOWN', then each detailed label is collapsed via
# race_map; labels without an entry in race_map are left as they are.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df_demo['race']: an in-place method on a
# selected column is chained assignment, which raises a FutureWarning in
# pandas 2.x and no longer updates df_demo under Copy-on-Write (the default
# from pandas 3.0).
df_demo['race'] = df_demo['race'].fillna('UNKNOWN').replace(race_map)

# Binarise the citrate / heparin flags: 1 when the value is present and
# non-zero, 0 when it is zero or missing.
# The previous per-row test (`x != 0 or math.isnan(x)==False`) was true for
# every input -- 0 is not NaN, and NaN != 0 -- so both columns were always
# overwritten with 1.
df_cohort['citrate_yn'] = (df_cohort['citrate_yn'].notna() & df_cohort['citrate_yn'].ne(0)).astype(int)
df_cohort['heparin_yn'] = (df_cohort['heparin_yn'].notna() & df_cohort['heparin_yn'].ne(0)).astype(int)

# Censor filters with duration > 120 hours
# Step 1: clear the clot flag for rows whose *raw* duration exceeds 120 h.
# This must run before the cap below: testing `== 120` after capping would
# also clear the flag for a filter that clotted at exactly 120 h.
df_cohort['filter_clotted'] = np.where(df_cohort['crrt_duration_hrs'] > 120, 0, df_cohort['filter_clotted'])
# Step 2: cap the duration itself at 120 h (missing durations stay missing).
df_cohort['crrt_duration_hrs'] = df_cohort['crrt_duration_hrs'].clip(upper=120)

# Hours from CRRT start to the first recorded clot, rounded to whole hours.
# Rows without a first-clot timestamp come out as NaN.
df_cohort['time_to_first_clot'] = (
    pd.to_datetime(df_cohort['first_clot_present_time'])
    .sub(pd.to_datetime(df_cohort['crrt_starttime']))
    .dt.total_seconds()
    .div(3600)
    .round(0)
)


In [3]:
def collate_data(keys, cohort, demo, vital, lab_before, vaso_before):
    """Join the per-table DataFrames onto a set of key rows.

    Parameters
    ----------
    keys : DataFrame with columns subject_id, stay_id and crrt_starttime.
    cohort : DataFrame inner-joined to ``keys`` on all three key columns,
        so key rows without a cohort match are dropped.
    demo : DataFrame left-joined on subject_id and stay_id.
    vital : DataFrame left-joined on stay_id and crrt_starttime.
    lab_before : DataFrame left-joined on subject_id, stay_id and
        crrt_starttime.
    vaso_before : accepted for interface compatibility but currently unused;
        the vasopressor join is disabled (see the comment in the body).

    Returns
    -------
    DataFrame with the columns of every joined table; rows that have no
    match in a left-joined table carry NaN in that table's columns.
    """
    merged = keys.merge(cohort, how='inner', on=['subject_id', 'stay_id', 'crrt_starttime'])

    # (table, join columns) pairs, applied as left joins in this order.
    left_joins = (
        (demo, ['subject_id', 'stay_id']),
        (vital, ['stay_id', 'crrt_starttime']),
        (lab_before, ['subject_id', 'stay_id', 'crrt_starttime']),
    )
    for table, join_cols in left_joins:
        merged = merged.merge(table, how='left', on=join_cols)

    # Disabled vasopressor join, kept for reference:
    # merged = merged.merge(vaso_before, how='left', on=['subject_id', 'stay_id', 'crrt_starttime', 'hadm_id'])
    return merged

## Data collation and analysis

In [4]:
# Take the key columns of every cohort row and join all tables onto them.
# NOTE(review): assumes each (subject_id, stay_id, crrt_starttime) triple is
# unique in df_cohort; otherwise the inner self-join in collate_data would
# multiply the duplicated rows -- confirm.
all_keys = df_cohort.loc[:, ['subject_id', 'stay_id', 'crrt_starttime']]
all_merged = collate_data(
    keys=all_keys,
    cohort=df_cohort,
    demo=df_demo,
    vital=df_vital,
    lab_before=df_lab_before,
    vaso_before=df_vasopressor_before,
)

### Identify data fields with a large number of missing values

In [7]:
# Row count of the merged table, followed by the number of missing values
# per column, largest first (shown as the cell output).
print('Total records', all_merged.shape[0])
(
    all_merged
    .isna()
    .sum()
    .sort_values(ascending=False)
    .to_dict()
)

Total records 3281


{'ddimer_before': 3277,
 'globulin_before': 3272,
 'crp_before': 3266,
 'direct_bill_before': 3081,
 'troponint_before': 2943,
 'albumin_before': 2588,
 'fibrinogen_before': 2502,
 'dbp_ni_12hrs_after': 2204,
 'sbp_ni_12hrs_after': 2204,
 'mbp_ni_12hrs_after': 2202,
 'alt_before': 1945,
 'alp_before': 1937,
 'ast_before': 1924,
 'dbp_ni_12hrs_before': 1908,
 'sbp_ni_12hrs_before': 1908,
 'mbp_ni_12hrs_before': 1903,
 'total_bill_before': 1899,
 'tmp': 1428,
 'inr_before': 1094,
 'pt_before': 1094,
 'first_clot_present_time': 1072,
 'time_to_first_clot': 1072,
 'ptt_before': 987,
 'lactate_before': 765,
 'wbc_before': 752,
 'platelets_before': 720,
 'hematocrit_before': 616,
 'postfilter_replacement_rate': 418,
 'admit_weight': 349,
 'prefilter_replacement_rate': 330,
 'temperature_12hrs_after': 294,
 'temperature_12hrs_before': 291,
 'calcium_before': 245,
 'glucose_before': 209,
 'sbp_12hrs_after': 192,
 'dbp_12hrs_after': 192,
 'creatine_before': 178,
 'nitrogen_before': 178,
 'sbp_1