## Update the CVD mortality dataset using both ED-visit and inpatient admission data

In [51]:
import pandas as pd
import os
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Dataset split to process. The cells below only check for "train" and "test".
datatype = "test"

In [52]:
# Directory holding the raw VCHAMPS CSV files for each supported split.
raw_data_dirs = {
    "train": "/home/bhatti/dataset/VCHAMPS",
    "test": "/data/public/MLA/VCHAMPS-Test",
}
if datatype not in raw_data_dirs:
    # Fail here with a clear message instead of a NameError in a later cell
    # when in_ad / ed / death were never assigned.
    raise ValueError(f"datatype must be one of {sorted(raw_data_dirs)}, got {datatype!r}")

raw_dir = raw_data_dirs[datatype]
# The file names embed the split name, e.g. inpatient_admissions_train.csv.
in_ad = pd.read_csv(f"{raw_dir}/inpatient_admissions_{datatype}.csv", index_col=0)
ed = pd.read_csv(f"{raw_dir}/ed_visits_{datatype}.csv", index_col=0)
death = pd.read_csv(f"{raw_dir}/death_{datatype}.csv", index_col=0)

In [53]:
# Number of patients that appear in both the inpatient and the ED tables.
len(set(in_ad["Internalpatientid"]).intersection(set(ed["Internalpatientid"])))

13622

In [54]:
# Show the column names of the inpatient admissions table.
in_ad.columns

Index(['Internalpatientid', 'Age at admission', 'Admission date',
       'Discharge date', 'Admitting unit service', 'Discharging unit service',
       'Admitting specialty', 'Discharging specialty',
       'First listed discharge diagnosis icd10 subcategory',
       'Second listed discharge diagnosis icd10 subcategory',
       'Discharge disposition', 'Died during admission',
       'Outpatientreferralflag', 'Serviceconnectedflag', 'Agentorangeflag',
       'State'],
      dtype='object')

In [55]:
# Map the ED-visit column names onto the inpatient admission column names so
# that both tables can be stacked on a shared schema.
ed_to_inpatient_names = {
    'Age at ed visit': 'Age at admission',
    'Ed visit start date': 'Admission date',
    'Discharge date ed': 'Discharge date',
    'Died during ed visit': 'Died during admission',
    'First listed diagnosis icd10 subcategory': 'First listed discharge diagnosis icd10 subcategory',
    'Second listed diagnosis icd10 subcategory': 'Second listed discharge diagnosis icd10 subcategory',
}
ed = ed.rename(columns=ed_to_inpatient_names)

In [56]:
# Show the ED-visit column names after the rename.
ed.columns

Index(['Internalpatientid', 'Age at admission', 'Admission date',
       'Discharge date', 'Died during admission',
       'First listed discharge diagnosis icd10 subcategory',
       'Second listed discharge diagnosis icd10 subcategory', 'State'],
      dtype='object')

In [57]:
# Columns taken from the inpatient table when stacking it under the ED visits.
shared_columns = [
    'Internalpatientid',
    'Age at admission',
    'Admission date',
    'Discharge date',
    'Died during admission',
    'First listed discharge diagnosis icd10 subcategory',
    'Second listed discharge diagnosis icd10 subcategory',
    'State',
]
inpatient_shared = in_ad[shared_columns]
# Row order is ED visits first, then inpatient admissions; the original row
# labels of both tables are kept.
full = pd.concat([ed, inpatient_shared])

In [58]:
# Show the column names of the combined ED + inpatient table.
full.columns

Index(['Internalpatientid', 'Age at admission', 'Admission date',
       'Discharge date', 'Died during admission',
       'First listed discharge diagnosis icd10 subcategory',
       'Second listed discharge diagnosis icd10 subcategory', 'State'],
      dtype='object')

In [59]:
# Display the combined table.
full

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Died during admission,First listed discharge diagnosis icd10 subcategory,Second listed discharge diagnosis icd10 subcategory,State
1,102421,69.520563,2017-11-06 18:22:30.0,2017-11-07 12:11:15.0,No,Systolic (congestive) heart failure,"Urinary tract infection, site not specified",New York
2,104374,44.119446,2020-09-23 15:46:58.0,2020-09-23 22:44:24.0,No,"Pneumonia, unspecified organism",Type 2 diabetes mellitus with ophthalmic compl...,Virginia
8,115748,63.721484,2020-05-02 04:51:35.0,2020-05-02 06:54:34.0,No,Other difficulties with micturition,"Urinary tract infection, site not specified",Florida
10,117891,74.097866,2023-05-10 18:28:53.0,2023-05-11 02:58:23.0,No,"Sepsis, unspecified organism","Urinary tract infection, site not specified",Arizona
18,119001,72.116844,2020-01-05 16:57:31.0,2020-01-05 21:58:47.0,No,Mechanical complication of urinary catheter,Other surgical procedures as the cause of abno...,California
...,...,...,...,...,...,...,...,...
665801,99850,73.529082,2016-06-19 21:06:27.0,2016-06-22 04:17:40.0,No,"Benign neoplasm of colon, unspecified",Non-ST elevation (NSTEMI) myocardial infarction,Nevada
665805,99861,77.926770,2002-09-27 05:37:30.0,2002-10-16 06:58:24.0,No,"Chronic obstructive pulmonary disease, unspeci...","Finding of other specified substances, not nor...",Louisiana
665815,99883,58.530994,2001-10-23 01:52:50.0,2001-10-24 03:37:33.0,No,Other restrictive cardiomyopathy,Not specified,Texas
665824,99920,65.035879,2003-10-02 14:43:38.0,2003-10-08 06:12:01.0,No,"Cutaneous abscess, furuncle and carbuncle of l...",Unspecified atrial fibrillation and atrial flu...,Oregon


In [60]:
# Print, for every column, the fraction of rows whose value is missing.
total_rows = len(full)
for column_name in full.columns:
    missing_rows = int(full[column_name].isna().sum())
    print(column_name, missing_rows/total_rows)

Internalpatientid 0.0
Age at admission 0.0
Admission date 0.0
Discharge date 0.000547496164039615
Died during admission 0.0
First listed discharge diagnosis icd10 subcategory 0.0
Second listed discharge diagnosis icd10 subcategory 0.0
State 0.0


#### Died

In [61]:
# Encode the Yes/No death flag as 1/0; any other value is left unchanged.
died_flag_codes = {"Yes": 1, "No": 0}
full["Died during admission"] = full["Died during admission"].replace(died_flag_codes)

#### Stay length

In [62]:
# Parse both timestamps, then express the stay as whole days plus the
# remaining part of a day rounded to two decimals.
timestamp_format = '%Y-%m-%d %H:%M:%S.%f'
for date_column in ("Admission date", "Discharge date"):
    full[date_column] = pd.to_datetime(full[date_column], format=timestamp_format)

stay_duration = full["Discharge date"] - full["Admission date"]
fraction_of_day = (stay_duration.dt.seconds/3600/24).round(2)
full["stay_length"] = stay_duration.dt.days + fraction_of_day

In [63]:
# Rows without a stay length receive the mean stay length of the other rows.
mean_stay_length = full["stay_length"].mean()
full["stay_length"] = full["stay_length"].fillna(mean_stay_length)

### Age at admission

In [64]:
# Print the youngest and the oldest age at admission, one per line.
age_at_admission = full["Age at admission"]
print(age_at_admission.min(), age_at_admission.max(), sep="\n")

18.01635608248468
108.11996633860262


In [65]:
# Start every age-band indicator at 0; age_category switches the matching one to 1.
for age_band_column in ('Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100', 'Age 100-120'):
    full[age_band_column] = [0] * len(full)
def age_category(data):
    """Set the age-band indicator columns of ``data`` in place.

    Each band covers the half-open interval (lower, upper]: a row gets 1 in
    'Age <lower>-<upper>' when its 'Age at admission' is strictly greater than
    ``lower`` and at most ``upper``. Rows outside every band (for example an
    age of exactly 20 or below) are not modified.

    Returns the same ``data`` object.
    """
    ages = data['Age at admission']
    band_limits = ((20, 40), (40, 60), (60, 80), (80, 100), (100, 120))
    for lower, upper in band_limits:
        in_band = (ages > lower) & (ages <= upper)
        data.loc[in_band, f'Age {lower}-{upper}'] = 1
    return data
# Fill in the age-band indicator columns, then display the table.
full = age_category(full)
full

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Died during admission,First listed discharge diagnosis icd10 subcategory,Second listed discharge diagnosis icd10 subcategory,State,stay_length,Age 20-40,Age 40-60,Age 60-80,Age 80-100,Age 100-120
1,102421,69.520563,2017-11-06 18:22:30,2017-11-07 12:11:15,0,Systolic (congestive) heart failure,"Urinary tract infection, site not specified",New York,0.74,0,0,1,0,0
2,104374,44.119446,2020-09-23 15:46:58,2020-09-23 22:44:24,0,"Pneumonia, unspecified organism",Type 2 diabetes mellitus with ophthalmic compl...,Virginia,0.29,0,1,0,0,0
8,115748,63.721484,2020-05-02 04:51:35,2020-05-02 06:54:34,0,Other difficulties with micturition,"Urinary tract infection, site not specified",Florida,0.09,0,0,1,0,0
10,117891,74.097866,2023-05-10 18:28:53,2023-05-11 02:58:23,0,"Sepsis, unspecified organism","Urinary tract infection, site not specified",Arizona,0.35,0,0,1,0,0
18,119001,72.116844,2020-01-05 16:57:31,2020-01-05 21:58:47,0,Mechanical complication of urinary catheter,Other surgical procedures as the cause of abno...,California,0.21,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665801,99850,73.529082,2016-06-19 21:06:27,2016-06-22 04:17:40,0,"Benign neoplasm of colon, unspecified",Non-ST elevation (NSTEMI) myocardial infarction,Nevada,2.30,0,0,1,0,0
665805,99861,77.926770,2002-09-27 05:37:30,2002-10-16 06:58:24,0,"Chronic obstructive pulmonary disease, unspeci...","Finding of other specified substances, not nor...",Louisiana,19.06,0,0,1,0,0
665815,99883,58.530994,2001-10-23 01:52:50,2001-10-24 03:37:33,0,Other restrictive cardiomyopathy,Not specified,Texas,1.07,0,1,0,0,0
665824,99920,65.035879,2003-10-02 14:43:38,2003-10-08 06:12:01,0,"Cutaneous abscess, furuncle and carbuncle of l...",Unspecified atrial fibrillation and atrial flu...,Oregon,5.64,0,0,1,0,0


#### CVD

In [66]:
# Diagnosis-name fragments treated as cardiovascular. cd_diagnosis joins them
# with '|' and searches both listed discharge diagnoses with str.contains,
# whose default matching is case-sensitive.
# NOTE(review): the displayed row whose second diagnosis starts with
# "Unspecified atrial fibrillation and atrial flu..." ends up with
# cd_diagnosis == 0, presumably because "Atrial fibrillation" / "Atrial flutter"
# are capitalised here - confirm whether that is intended.
cardiovascular = ["Heart failure, unspecified", "Other heart failure",
                  "Cardiogenic shock", "Hypertensive heart disease with heart failure",
                  "Hypertensive heart and kidney disease with heart failure", "Unstable angina", "Other forms of chronic ischemic heart disease",
                  "Atherosclerotic heart disease of native coronary artery", "Atrial fibrillation", "Atrial flutter",
                  "Supraventricular tachycardia", "Ventricular tachycardia"]

In [67]:
# Start the cardiovascular-diagnosis flag at 0 for every row.
full["cd_diagnosis"] = [0] * len(full)
def cd_diagnosis(data, keywords=None):
    """Flag rows whose first or second listed discharge diagnosis is cardiovascular.

    The mask is computed from ``data`` itself (previously it was computed from
    the notebook-level ``full`` frame, so the argument was ignored).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'First listed discharge diagnosis icd10 subcategory' and
        'Second listed discharge diagnosis icd10 subcategory'. The
        'cd_diagnosis' column is set to 1 on matching rows, in place.
    keywords : iterable of str, optional
        Diagnosis fragments interpreted as regular expressions and OR-ed
        together. Defaults to the notebook-level ``cardiovascular`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    import re

    if keywords is None:
        keywords = cardiovascular

    # These names contain parentheses, so they are matched literally
    # (escaped) rather than as regular expressions.
    literal_names = [
        "Systolic (congestive) heart failure",
        "Diastolic (congestive) heart failure",
        "Combined systolic (congestive) and diastolic (congestive) heart failure",
        "ST elevation (STEMI) myocardial infarction",
        "Non-ST elevation (NSTEMI) myocardial infarction",
    ]
    pattern = '|'.join(list(keywords) + [re.escape(name) for name in literal_names])

    # na=False: a missing diagnosis counts as "not cardiovascular" instead of
    # producing a NaN entry that cannot be used as a boolean mask.
    first_is_cvd = data["First listed discharge diagnosis icd10 subcategory"].str.contains(pattern, na=False)
    second_is_cvd = data["Second listed discharge diagnosis icd10 subcategory"].str.contains(pattern, na=False)

    data.loc[first_is_cvd | second_is_cvd, "cd_diagnosis"] = 1
    return data
# Set the cardiovascular-diagnosis flag, then display the table.
full = cd_diagnosis(full)
full


Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Died during admission,First listed discharge diagnosis icd10 subcategory,Second listed discharge diagnosis icd10 subcategory,State,stay_length,Age 20-40,Age 40-60,Age 60-80,Age 80-100,Age 100-120,cd_diagnosis
1,102421,69.520563,2017-11-06 18:22:30,2017-11-07 12:11:15,0,Systolic (congestive) heart failure,"Urinary tract infection, site not specified",New York,0.74,0,0,1,0,0,1
2,104374,44.119446,2020-09-23 15:46:58,2020-09-23 22:44:24,0,"Pneumonia, unspecified organism",Type 2 diabetes mellitus with ophthalmic compl...,Virginia,0.29,0,1,0,0,0,0
8,115748,63.721484,2020-05-02 04:51:35,2020-05-02 06:54:34,0,Other difficulties with micturition,"Urinary tract infection, site not specified",Florida,0.09,0,0,1,0,0,0
10,117891,74.097866,2023-05-10 18:28:53,2023-05-11 02:58:23,0,"Sepsis, unspecified organism","Urinary tract infection, site not specified",Arizona,0.35,0,0,1,0,0,0
18,119001,72.116844,2020-01-05 16:57:31,2020-01-05 21:58:47,0,Mechanical complication of urinary catheter,Other surgical procedures as the cause of abno...,California,0.21,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665801,99850,73.529082,2016-06-19 21:06:27,2016-06-22 04:17:40,0,"Benign neoplasm of colon, unspecified",Non-ST elevation (NSTEMI) myocardial infarction,Nevada,2.30,0,0,1,0,0,1
665805,99861,77.926770,2002-09-27 05:37:30,2002-10-16 06:58:24,0,"Chronic obstructive pulmonary disease, unspeci...","Finding of other specified substances, not nor...",Louisiana,19.06,0,0,1,0,0,0
665815,99883,58.530994,2001-10-23 01:52:50,2001-10-24 03:37:33,0,Other restrictive cardiomyopathy,Not specified,Texas,1.07,0,1,0,0,0,0
665824,99920,65.035879,2003-10-02 14:43:38,2003-10-08 06:12:01,0,"Cutaneous abscess, furuncle and carbuncle of l...",Unspecified atrial fibrillation and atrial flu...,Oregon,5.64,0,0,1,0,0,0


### Final dataset

In [68]:
def final_set_before(ids, group):
    """Aggregate all visits of one patient into a single-row feature frame.

    Parameters
    ----------
    ids
        Patient identifier, stored in the 'Internalpatientid' column.
    group : pandas.DataFrame
        The patient's visits. The last row is used as the final visit when
        deriving ``died_by_cvd``, so the caller should sort the rows
        chronologically. Required columns: 'Age at admission', 'stay_length',
        'cd_diagnosis', 'Died during admission' and the five 'Age a-b'
        indicator columns.

    Returns
    -------
    pandas.DataFrame
        One row with visit counts, stay-length and age statistics, age-band
        counts, CVD indicators and the ``died_by_cvd`` label.
    """
    ages = group["Age at admission"]
    stays = group["stay_length"]

    # Visits are counted as the number of distinct admission ages.
    num_admissions = ages.nunique()

    age_mean = ages.mean()
    # std() is NaN for a single observation; report a spread of 0 instead.
    age_std = 0 if num_admissions == 1 else ages.std()
    min_age = ages.min()
    max_age = ages.max()

    # Visits per (started) year of age covered by the patient's records.
    freq = num_admissions/(math.floor(max_age - min_age) + 1)

    min_stay = stays.min()
    # Fixed: this previously used .min(), so stay_max always equalled stay_min.
    max_stay = stays.max()
    stay_mean = stays.mean()
    stay_std = 0 if stays.nunique() == 1 else stays.std()

    num_cvd_admission = group['cd_diagnosis'].sum()
    cvd = 1 if num_cvd_admission > 0 else 0

    # The label is 1 only when the last visit ended in death and carried a
    # cardiovascular diagnosis.
    last_visit = group.iloc[-1]
    if (last_visit["Died during admission"] == 1) and (last_visit['cd_diagnosis'] == 1):
        died_by_cvd = 1
    else:
        died_by_cvd = 0

    return pd.DataFrame(data={
        'Internalpatientid': [ids],
        'num_stays': [num_admissions],
        'stay_length': [stays.sum()],
        "num_cvd_admission": [num_cvd_admission],
        "CVD": [cvd],
        'Age 20-40': [group["Age 20-40"].sum()],
        'Age 40-60': [group["Age 40-60"].sum()],
        'Age 60-80': [group["Age 60-80"].sum()],
        'Age 80-100': [group["Age 80-100"].sum()],
        'Age 100-120': [group["Age 100-120"].sum()],
        "age_mean": [age_mean],
        "age_std": [age_std],
        "age_min": [min_age],
        "age_max": [max_age],
        "stay_min": [min_stay],
        "stay_max": [max_stay],
        "stay_mean": [stay_mean],
        "stay_std": [stay_std],
        "freq": [round(freq, 2)],
        "died_by_cvd": [died_by_cvd],
    })

In [69]:
# Build one feature row per patient: order each patient's visits by admission
# date (so the last row is the latest visit), aggregate them, then stack the
# single-row frames.
per_patient_rows = [
    final_set_before(
        ids,
        group.sort_values(by=['Admission date'], ascending=True).reset_index(drop=True),
    )
    for ids, group in tqdm(full.groupby("Internalpatientid"))
]
tidy_dataset = pd.concat(per_patient_rows)

  0%|          | 0/24003 [00:00<?, ?it/s]

In [70]:
# Read the previously saved inpatient feature table for the current split and
# keep only the patient id, the unit/specialty features and 'Age at death'.
# Note that these are the same paths the last cell of this notebook writes to.
if datatype == "train":
    inpatient_before_path = "/home/hassan/lily/MLA/FDA/inpatient_cvd_mortality.csv"
if datatype == "test":
    inpatient_before_path = "/home/hassan/lily/MLA/FDA/inpatient_cvd_mortality_test.csv"
inpatient_before = pd.read_csv(inpatient_before_path, index_col=0)
print(inpatient_before.columns)

inpatient_feature_columns = [
    'Internalpatientid',
    'num_unique_units', 'num_transfers',
    'unique_admitting_specialty', 'unique_discharging_specialty',
    'DOMICILIARY', 'MEDICINE', 'NHCU', 'NON-COUNT', 'OTHERS', 'PSYCHIATRY',
    'SURGERY', 'Medical', 'Mental',
    'Others_Specialty', 'Rehab', 'Gerontology', 'Age at death',
]
inpatient_before = inpatient_before[inpatient_feature_columns]

Index(['Internalpatientid', 'num_stays', 'stay_length', 'num_unique_units',
       'num_transfers', 'num_cvd_admission', 'AO', 'CVD',
       'unique_admitting_specialty', 'unique_discharging_specialty',
       'DOMICILIARY', 'MEDICINE', 'NHCU', 'NON-COUNT', 'OTHERS', 'PSYCHIATRY',
       'SURGERY', 'Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100',
       'Age 100-120', 'age_mean', 'age_std', 'age_min', 'age_max', 'stay_min',
       'stay_max', 'stay_mean', 'stay_std', 'freq', 'Medical', 'Mental',
       'Others_Specialty', 'Rehab', 'Gerontology', 'died_by_cvd',
       'Age at death'],
      dtype='object')


In [71]:
# Attach the death table to every patient, then keep only the patients for
# whom an age at death is available.
with_death = tidy_dataset.merge(right=death, how="left", on="Internalpatientid")
has_age_at_death = with_death["Age at death"].notna()
tidy_dataset = with_death[has_age_at_death]

In [72]:
# Attach the inpatient-only features. Patients without a row in
# inpatient_before get missing values in those columns (left join).
tidy_dataset = tidy_dataset.merge(right = inpatient_before, how="left", on="Internalpatientid")

In [73]:
# For every column that has missing values: print the missing fraction, then
# replace the missing entries with that column's mean.
for column_name in tidy_dataset.columns:
    missing_rows = int(tidy_dataset[column_name].isna().sum())
    if missing_rows == 0:
        continue
    print(column_name, missing_rows/len(tidy_dataset))
    column_mean = tidy_dataset[column_name].mean()
    tidy_dataset[column_name] = tidy_dataset[column_name].fillna(column_mean)

num_unique_units 0.04031147050014974
num_transfers 0.04031147050014974
unique_admitting_specialty 0.04031147050014974
unique_discharging_specialty 0.04031147050014974
DOMICILIARY 0.04031147050014974
MEDICINE 0.04031147050014974
NHCU 0.04031147050014974
NON-COUNT 0.04031147050014974
OTHERS 0.04031147050014974
PSYCHIATRY 0.04031147050014974
SURGERY 0.04031147050014974
Medical 0.04031147050014974
Mental 0.04031147050014974
Others_Specialty 0.04031147050014974
Rehab 0.04031147050014974
Gerontology 0.04031147050014974
Age at death_y 0.04031147050014974


In [74]:
# Drop the death-age column that came from inpatient_before (merge suffix _y).
# NOTE(review): the death-table column presumably stays in the saved file as
# "Age at death_x", while the cell that loads inpatient_before selects
# "Age at death" from the same path - confirm that a re-run still works.
tidy_dataset = tidy_dataset.drop(columns=["Age at death_y"])

In [75]:
# Write the combined feature table to the file for the current split; nothing
# is written for any other value of datatype.
output_paths = {
    "train": '/home/hassan/lily/MLA/FDA/inpatient_cvd_mortality.csv',
    "test": '/home/hassan/lily/MLA/FDA/inpatient_cvd_mortality_test.csv',
}
if datatype in output_paths:
    tidy_dataset.to_csv(output_paths[datatype])