In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the merged raw extract.
raw_data = pd.read_csv('../data/raw/df_merged11.csv')

# Exclude hospice discharges. Rows whose discharge_location is missing are
# kept, because a missing value compares as "not equal" to 'HOSPICE'.
not_hospice = raw_data['discharge_location'].ne('HOSPICE')
no_hospice_raw_data = raw_data[not_hospice]

# Rows remaining after the filter vs. rows loaded.
print(len(no_hospice_raw_data), len(raw_data))

35289 36423


In [3]:
# Keep a single row per hospital admission (first occurrence of each hadm_id).
dedup_no_hospice_raw_data = no_hospice_raw_data.drop_duplicates(subset=['hadm_id'])

# Report: total admissions, admissions without a deathtime, admissions with one.
deathtime_missing = dedup_no_hospice_raw_data['deathtime'].isna()
print(len(dedup_no_hospice_raw_data),
      len(dedup_no_hospice_raw_data[deathtime_missing]),
      len(dedup_no_hospice_raw_data[~deathtime_missing]))

12660 11251 1409


In [4]:
# Snapshot the column labels of the deduplicated frame; later cells iterate
# over this Index to report null counts and to pick the procedure columns.
columns = dedup_no_hospice_raw_data.columns

In [5]:
# Restrict the cohort to admissions that have a recorded ICU stay duration.
has_icu_duration = dedup_no_hospice_raw_data['icu_stay_duration'].notna()
dedup_no_hospice_icu_raw_data = dedup_no_hospice_raw_data[has_icu_duration]

# Report: total rows, rows without a deathtime, rows with a deathtime.
icu_deathtime_missing = dedup_no_hospice_icu_raw_data['deathtime'].isna()
print(len(dedup_no_hospice_icu_raw_data),
      len(dedup_no_hospice_icu_raw_data[icu_deathtime_missing]),
      len(dedup_no_hospice_icu_raw_data[~icu_deathtime_missing]))

6104 4866 1238


In [6]:
# Count the missing values in every column of the ICU cohort and print one
# "<column> <count>" line per column, in column order.
null_counts = dedup_no_hospice_icu_raw_data[columns].isnull().sum()
for name, n_missing in null_counts.items():
    print(name, n_missing)

subject_id 0
hadm_id 0
gender 0
anchor_age 0
race 0
admittime 0
dischtime_x 0
hospital_expire_flag 4864
admission_type 0
insurance 0
marital_status 709
stay_id 0
intime 0
outtime 0
los 0
deathtime 4866
discharge_location 49
item_51006 693
item_51221 719
item_51222 706
item_51265 714
item_51301 705
item_51237 907
item_51274 902
item_51275 922
item_50863 2427
item_50878 2354
item_50885 2510
total_drugs 21
unique_drugs 21
lods_score 172
oasis_score 0
procedure_221214 2818
procedure_221216 5406
procedure_221217 4832
procedure_221223 5474
procedure_221255 5816
procedure_223253 3941
procedure_224263 4227
procedure_224264 4791
procedure_224267 5607
procedure_224268 6014
procedure_224269 5865
procedure_224270 5845
procedure_224272 6028
procedure_224274 4003
procedure_224275 958
procedure_224276 5018
procedure_224277 1912
procedure_224385 5033
procedure_224560 5691
procedure_225202 6036
procedure_225204 5949
procedure_225399 5975
procedure_225400 5536
procedure_225401 4497
procedure_225402 3697

- ID columns: subject_id, hadm_id

- Convert to a boolean (null vs. not null): ventilation_duration; stay_id (turn into 'went into ICU or not'); procedure columns (probably can only pick 1, since all 301)

- Write in the discussion: people choose which features to record, and this selective recording induces bias.

- Write in the discussion: if a feature's missingness is too strongly correlated with death, remove that feature.


In [7]:
# ---------------------------------------------------------------------------
# Column roles used by the cleaning cells below.
# ---------------------------------------------------------------------------

# Columns removed outright. First round: timestamps and discharge / length of
# stay fields, ...
drop_columns = ['admittime', 'dischtime_x', 'starttime', 'endtime',
                'intime', 'outtime', 'hospital_expire_flag', 'los', 'discharge_location']
# ... lab items with many missing values in the null report above, ...
drop_columns.extend(['item_50863', 'item_50878', 'item_50885',
                     'item_51237', 'item_51274', 'item_51275'])
# ... and further measurement columns.
drop_columns.extend(['ph', 'PaO2', 'calcium', 'ptt', 'PaCO2'])

# second round of dropping columns after chisq, spearmann corr and mann-whitney-u tests
# (each label appears exactly once; 'procedure_221216' used to be listed twice)
drop_columns.extend(['item_51221', 'item_51006', 'total_drugs', 'inr', 'diastolic_bp',
                     'sodium', 'temperature', 'procedure_221216',
                     'procedure_221217', 'procedure_221255', 'procedure_224264',
                     'procedure_224268', 'procedure_224269', 'procedure_224272',
                     'procedure_224277', 'procedure_224560', 'procedure_225202',
                     'procedure_225204', 'procedure_225399', 'procedure_225402',
                     'procedure_225433', 'procedure_225437', 'procedure_225439',
                     'procedure_225440', 'procedure_225441', 'procedure_225448',
                     'procedure_225462', 'procedure_225468', 'procedure_225789',
                     'procedure_225794', 'procedure_225805', 'procedure_225814',
                     'procedure_225966', 'procedure_227719', 'procedure_228129',
                     'procedure_229526', 'procedure_229532'
                     ])

# Guard against copy/paste slips when this list is edited again.
assert len(drop_columns) == len(set(drop_columns)), 'duplicate label in drop_columns'

# Columns whose missing values are replaced with 0. For 'deathtime' this turns
# "no recorded death" into the value 0, which a later cell compares against.
# NOTE(review): 'marital_status' is filled with 0 and is also listed in
# one_hot_columns, so missing values would become their own category - confirm.
fillna_columns = ['unique_drugs',
                  'marital_status', 'oasis_score', 'lods_score', 'deathtime']

# Columns reduced to a presence flag (1 = value recorded, 0 = missing): the two
# fixed columns plus every procedure column that is not being dropped.
dropped_labels = set(drop_columns)
booleanise_columns = ['ventilation_duration', 'stay_id']
booleanise_columns.extend(
    name for name in columns
    if 'procedure' in name and name not in dropped_labels
)

# Categorical columns earmarked for one-hot encoding.
one_hot_columns = ['gender', 'race']
one_hot_columns.extend(['insurance', 'marital_status'])

# Duration columns whose inclusion is still undecided.
maybe_keep = ['hospital_stay_duration', 'icu_stay_duration']

In [8]:
# Remove the excluded columns. `.drop` returns a new frame, so the filtered
# cohort frame itself is left unchanged.
drop_columns_df = dedup_no_hospice_icu_raw_data.drop(columns=drop_columns)

# Replace missing values with 0 in the selected columns. The filled column is
# assigned back rather than calling `df[col].fillna(0, inplace=True)`: that
# chained in-place form operates on an intermediate object, raises a
# FutureWarning, and no longer updates the parent frame under copy-on-write
# (pandas 3.0).
for column in fillna_columns:
    drop_columns_df[column] = drop_columns_df[column].fillna(0)

# Collapse each "booleanise" column to a presence flag: 1 where a value was
# recorded, 0 where it was missing (vectorised instead of a row-wise apply).
for column in booleanise_columns:
    drop_columns_df[column] = drop_columns_df[column].notna().astype('int64')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  drop_columns_df[column].fillna(0, inplace=True)


In [9]:
# Number of columns left after dropping, filling and booleanising.
drop_columns_df.shape[1]

76

In [10]:
# Re-check the missing-value count of every remaining column and print one
# "<column> <count>" line per column, in column order.
remaining_nulls = drop_columns_df.isnull().sum()
for name, n_missing in remaining_nulls.items():
    print(name, n_missing)

subject_id 0
hadm_id 0
gender 0
anchor_age 0
race 0
admission_type 0
insurance 0
marital_status 0
stay_id 0
deathtime 0
item_51222 706
item_51265 714
item_51301 705
unique_drugs 0
lods_score 0
oasis_score 0
procedure_221214 0
procedure_221223 0
procedure_223253 0
procedure_224263 0
procedure_224267 0
procedure_224270 0
procedure_224274 0
procedure_224275 0
procedure_224276 0
procedure_224385 0
procedure_225400 0
procedure_225401 0
procedure_225427 0
procedure_225430 0
procedure_225432 0
procedure_225444 0
procedure_225446 0
procedure_225451 0
procedure_225454 0
procedure_225457 0
procedure_225459 0
procedure_225464 0
procedure_225469 0
procedure_225470 0
procedure_225752 0
procedure_225792 0
procedure_225802 0
procedure_225817 0
procedure_226124 0
procedure_226236 0
procedure_226475 0
procedure_227194 0
procedure_227712 0
procedure_228125 0
procedure_228127 0
procedure_228128 0
procedure_228130 0
procedure_228715 0
procedure_229298 0
procedure_229351 0
procedure_229380 0
procedure_2295

In [14]:
# Complete-case cohort: discard every row that still has a missing value in
# any column.
dropna_df = drop_columns_df.dropna(axis=0, how='any')

In [15]:
# Cohort sizes as (total rows, rows with deathtime == 0, rows with deathtime != 0).
# deathtime was filled with 0 where missing, so 0 marks "no recorded death".
no_death_recorded = dropna_df['deathtime'] == 0
(len(dropna_df),
 int(no_death_recorded.sum()),
 int((~no_death_recorded).sum()))

(4621, 3778, 843)

In [17]:
# Persist the complete-case cohort, creating the output directory if needed.
curated_dir = '../data/curated'
curated_csv = '../data/curated/rm_na_df_v2.csv'

os.makedirs(curated_dir, exist_ok=True)
dropna_df.to_csv(curated_csv, index=False)