**features**:

*Age bins*: number of ED visits at which the patient's age fell within each age range (20-40, 40-60, 60-80, 80-100, 100-120)

age_mean: the mean admission age for each patient

age_std: the std of the admission age for each patient

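freq: the number of ED visits per year (visit count divided by the number of whole years between the youngest and oldest visit age, plus one)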


num_cvd_readmission: number of repeat ED visits with a cardiovascular diagnosis (cardiovascular visits beyond the first one)

CVD (cardiovascular): whether the patient has a cardiovascular diagnosis on any ED visit

readmission: whether this patient has been admitted again

num_stays: total number of unit stays 

stay_length: total length of emergency visits in days (visits with a missing length are imputed with the mean visit length)

Died: whether this patient died during any ED visit

In [1]:
import pandas as pd
import os
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [2]:
# Directory holding the VCHAMPS CSV files. The path is relative, so it resolves
# against the kernel's working directory.
DATA_PATH = "../../../../bhatti/dataset/VCHAMPS"

In [3]:
# Load the ED-visits training table; the first CSV column is used as the row index.
dataset_path = 'ed_visits_train.csv'
load_path = os.path.join(DATA_PATH, dataset_path)
ed = pd.read_csv(load_path, index_col=0)

In [4]:
# Inpatient table, read from the current working directory (not from DATA_PATH).
# NOTE(review): presumably written by a companion inpatient preprocessing notebook - confirm.
inpatient = pd.read_csv("inpatient_full.csv")

In [5]:
# Report, for every column of the ED table, how many rows are missing a value.
for column_name, missing_count in ed.isna().sum().items():
    print(column_name, missing_count)

Internalpatientid 0
Age at ed visit 0
Ed visit start date 0
Discharge date ed 78
Died during ed visit 0
First listed diagnosis icd10 subcategory 0
Second listed diagnosis icd10 subcategory 0
State 0


### Check overlap between the inpatient dataset and the ED visits dataset

In [5]:
# Collect the distinct patient ids of each table once, then list the ids that
# appear in only one of the two tables.
inpatient_ids = set(inpatient["Internalpatientid"].unique())
ed_ids = set(ed["Internalpatientid"].unique())

print("These patients are in inpatient dataset but not in ed_visits")
print(list(inpatient_ids - ed_ids))

print("These patients are in ed dataset but not in inpatient")
print(list(ed_ids - inpatient_ids))

These patients are in inpatient dataset but not in ed_visits
[3, 5, 131078, 131080, 131082, 18, 21, 27, 131101, 131103, 32, 36, 131108, 131112, 131116, 131117, 47, 49, 131121, 56, 57, 61, 131139, 70, 131142, 72, 131143, 131147, 131149, 79, 131154, 131157, 87, 131159, 89, 91, 93, 94, 131167, 101, 111, 113, 131186, 115, 121, 123, 131195, 131196, 129, 132, 133, 131206, 144, 131217, 131219, 131223, 154, 131227, 163, 131237, 168, 131240, 172, 175, 131248, 177, 131249, 131250, 181, 131254, 185, 188, 189, 192, 194, 131267, 197, 204, 131277, 206, 207, 131283, 212, 131284, 131288, 218, 219, 221, 131295, 131298, 229, 231, 131304, 131314, 249, 252, 131328, 131334, 264, 266, 268, 272, 274, 275, 131346, 277, 131348, 131353, 131354, 284, 131358, 131366, 131368, 131374, 303, 304, 311, 131387, 131392, 321, 322, 131393, 131395, 131396, 331, 332, 131403, 131407, 344, 131422, 352, 359, 131434, 366, 369, 370, 374, 377, 380, 131452, 131456, 131458, 131462, 391, 131467, 131468, 131471, 131475, 131476, 406, 

In [10]:
# Count the patients that appear in both the inpatient and the ED tables.
shared_patient_ids = set(inpatient["Internalpatientid"].unique()).intersection(
    ed["Internalpatientid"].unique()
)
print("There are")
print(len(shared_patient_ids))
print("patients that are both in inpatient and ed_visits dataset")

There are
51689
patients that are both in inpatient and ed_visits dataset


### Preprocessing

In [11]:
# Display the raw ED table before any feature columns are added.
ed

Unnamed: 0,Internalpatientid,Age at ed visit,Ed visit start date,Discharge date ed,Died during ed visit,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,State
0,101689,64.361251,2021-08-26 00:53:30.0,2021-08-26 04:24:27.0,No,Systolic (congestive) heart failure,"Chronic kidney disease, unspecified",Virginia
3,107210,71.317300,2022-05-18 11:21:28.0,2022-05-18 18:30:21.0,No,"Sepsis, unspecified organism","Hypotension, unspecified",West Virginia
4,107866,61.164579,2017-06-26 15:41:11.0,2017-06-26 23:33:50.0,No,Other nonspecific abnormal finding of lung field,"Hypotension, unspecified",Pennsylvania
5,107906,66.101996,2017-02-17 06:42:34.0,2017-02-17 08:08:28.0,No,Dysuria,"Chronic kidney disease, unspecified",New York
6,108522,73.598130,2018-09-30 02:47:49.0,2018-09-30 04:28:57.0,No,Type 2 diabetes mellitus with circulatory comp...,Type 2 diabetes mellitus with unspecified comp...,Illinois
...,...,...,...,...,...,...,...,...
701954,99934,83.538682,2022-07-22 09:19:20.0,2022-07-22 09:42:57.0,No,"Pain in limb, hand, foot, fingers and toes",Not specified,Oklahoma
701955,99936,71.860371,2016-08-29 23:10:25.0,2016-08-30 01:43:36.0,No,Pain in joint,Not specified,New York
701958,99987,67.470347,2012-12-06 09:53:07.0,2012-12-06 10:41:48.0,No,Not specified,Not specified,Maryland
701960,9999,62.070707,2015-03-14 20:14:42.0,2015-03-14 22:35:46.0,No,Not specified,Not specified,Utah


#### Age bins

In [6]:
# Create one indicator column per age bin, with every visit starting at 0.
for age_bin_label in ('Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100', 'Age 100-120'):
    ed[age_bin_label] = 0
def age_category(data):
    """Add one 0/1 indicator column per 20-year age bin to ``data``.

    The bins are (20, 40], (40, 60], (60, 80], (80, 100] and (100, 120]: the
    lower bound is exclusive and the upper bound inclusive. A visit whose age
    falls outside every bin (or is missing) gets 0 in all five columns.

    The indicator columns are created (or overwritten) here, so the function
    does not depend on the caller having initialised them beforehand.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the numeric column 'Age at ed visit'.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place, with the columns
        'Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100', 'Age 100-120'.
    """
    age = data['Age at ed visit']
    for lower in range(20, 120, 20):
        upper = lower + 20
        # Comparisons with a missing age evaluate to False, i.e. indicator 0.
        data[f'Age {lower}-{upper}'] = ((age > lower) & (age <= upper)).astype('int64')
    return data
# Apply the age binning to the ED table, then display it to inspect the indicator columns.
ed = age_category(ed)
ed

Unnamed: 0,Internalpatientid,Age at ed visit,Ed visit start date,Discharge date ed,Died during ed visit,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,State,Age 20-40,Age 40-60,Age 60-80,Age 80-100,Age 100-120
0,101689,64.361251,2021-08-26 00:53:30.0,2021-08-26 04:24:27.0,No,Systolic (congestive) heart failure,"Chronic kidney disease, unspecified",Virginia,0,0,1,0,0
3,107210,71.317300,2022-05-18 11:21:28.0,2022-05-18 18:30:21.0,No,"Sepsis, unspecified organism","Hypotension, unspecified",West Virginia,0,0,1,0,0
4,107866,61.164579,2017-06-26 15:41:11.0,2017-06-26 23:33:50.0,No,Other nonspecific abnormal finding of lung field,"Hypotension, unspecified",Pennsylvania,0,0,1,0,0
5,107906,66.101996,2017-02-17 06:42:34.0,2017-02-17 08:08:28.0,No,Dysuria,"Chronic kidney disease, unspecified",New York,0,0,1,0,0
6,108522,73.598130,2018-09-30 02:47:49.0,2018-09-30 04:28:57.0,No,Type 2 diabetes mellitus with circulatory comp...,Type 2 diabetes mellitus with unspecified comp...,Illinois,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
701954,99934,83.538682,2022-07-22 09:19:20.0,2022-07-22 09:42:57.0,No,"Pain in limb, hand, foot, fingers and toes",Not specified,Oklahoma,0,0,0,1,0
701955,99936,71.860371,2016-08-29 23:10:25.0,2016-08-30 01:43:36.0,No,Pain in joint,Not specified,New York,0,0,1,0,0
701958,99987,67.470347,2012-12-06 09:53:07.0,2012-12-06 10:41:48.0,No,Not specified,Not specified,Maryland,0,0,1,0,0
701960,9999,62.070707,2015-03-14 20:14:42.0,2015-03-14 22:35:46.0,No,Not specified,Not specified,Utah,0,0,1,0,0


#### Died

In [7]:
# Encode the death flag as integers ("No" -> 0, "Yes" -> 1); any other value is left as-is.
ed["Died during ed visit"] = ed["Died during ed visit"].replace({"No":0, "Yes": 1})

#### Length of Visits

In [8]:
# Length of each ED visit in days: the whole days of (discharge - start) plus the
# remaining seconds expressed as a fraction of a day, rounded to two decimals.
# Visits without a discharge timestamp get NaN here.
#
# Both timestamp columns are parsed in one pass each instead of calling
# pd.to_datetime twice per row inside iterrows(), which is very slow on a
# table of this size.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S.%f'
visit_start = pd.to_datetime(ed["Ed visit start date"], format=TIMESTAMP_FORMAT)
visit_end = pd.to_datetime(ed["Discharge date ed"], format=TIMESTAMP_FORMAT)
visit_duration = visit_end - visit_start

# Python's built-in round() on plain floats is kept deliberately: numpy's
# rounding can differ from it on values that sit exactly on a half step.
day_fraction = [
    round(float(seconds) / 3600 / 24, 2) for seconds in visit_duration.dt.seconds
]
stay_length = visit_duration.dt.days + day_fraction

ed["stay_length"] = stay_length # in terms of day

In [9]:
# Impute missing visit lengths with the mean of the observed ones.
# `assign` builds a new frame for the imputed rows instead of writing into a
# filtered slice of `ed`, which is what raised SettingWithCopyWarning.
# As before, the imputed rows end up appended after the rows with observed lengths.
missing_length = ed["stay_length"].isna()
mean_stay_length = ed.loc[~missing_length, "stay_length"].mean()
nan_admission = ed[missing_length].assign(stay_length=mean_stay_length)
ed = pd.concat([ed[~missing_length], nan_admission])
ed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_admission["stay_length"] = ed["stay_length"].mean()


Unnamed: 0,Internalpatientid,Age at ed visit,Ed visit start date,Discharge date ed,Died during ed visit,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,State,Age 20-40,Age 40-60,Age 60-80,Age 80-100,Age 100-120,stay_length
0,101689,64.361251,2021-08-26 00:53:30.0,2021-08-26 04:24:27.0,0,Systolic (congestive) heart failure,"Chronic kidney disease, unspecified",Virginia,0,0,1,0,0,0.150000
3,107210,71.317300,2022-05-18 11:21:28.0,2022-05-18 18:30:21.0,0,"Sepsis, unspecified organism","Hypotension, unspecified",West Virginia,0,0,1,0,0,0.300000
4,107866,61.164579,2017-06-26 15:41:11.0,2017-06-26 23:33:50.0,0,Other nonspecific abnormal finding of lung field,"Hypotension, unspecified",Pennsylvania,0,0,1,0,0,0.330000
5,107906,66.101996,2017-02-17 06:42:34.0,2017-02-17 08:08:28.0,0,Dysuria,"Chronic kidney disease, unspecified",New York,0,0,1,0,0,0.060000
6,108522,73.598130,2018-09-30 02:47:49.0,2018-09-30 04:28:57.0,0,Type 2 diabetes mellitus with circulatory comp...,Type 2 diabetes mellitus with unspecified comp...,Illinois,0,0,1,0,0,0.070000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680314,50832,76.352814,2022-01-23 22:53:18.0,,0,Not specified,Not specified,Florida,0,0,1,0,0,0.363437
689791,93787,69.830609,2011-11-15 02:05:52.0,,0,Not specified,Not specified,Texas,0,0,1,0,0,0.363437
690295,74113,78.492049,2009-12-21 20:15:23.0,,0,Not specified,Not specified,Texas,0,0,1,0,0,0.363437
691594,8936,74.625727,2011-11-21 16:52:24.0,,0,Not specified,Not specified,New York,0,0,1,0,0,0.363437


### Cardiovascular

In [10]:
# Diagnosis names treated as cardiovascular. cd_diagnosis joins them with '|' into a
# regular-expression pattern, so entries must not contain unescaped regex metacharacters
# (names with parentheses are matched separately as literal text there).
cardiovascular = ["Heart failure, unspecified", "Other heart failure",
                  "Cardiogenic shock", "Hypertensive heart disease with heart failure",
                  "Hypertensive heart and kidney disease with heart failure", "Unstable angina", "Other forms of chronic ischemic heart disease",
                  "Atherosclerotic heart disease of native coronary artery", "Atrial fibrillation", "Atrial flutter",
                  "Supraventricular tachycardia", "Ventricular tachycardia"]

In [12]:
# Start with every visit unflagged (0); cd_diagnosis marks cardiovascular visits with 1.
ed["cd_diagnosis"] = [0] * len(ed)
def cd_diagnosis(data, regex_terms=None, literal_terms=(
        "Systolic (congestive) heart failure",
        "Diastolic (congestive) heart failure",
        "Combined systolic (congestive) and diastolic (congestive) heart failure",
        "ST elevation (STEMI) myocardial infarction",
        "Non-ST elevation (NSTEMI) myocardial infarction")):
    """Flag visits whose first or second listed diagnosis is cardiovascular.

    A visit is flagged when either diagnosis column contains one of the
    search terms (case-sensitive substring match).

    The mask is computed from ``data`` itself. The previous version read the
    notebook-global ``ed`` inside the function, so calling it on any other
    frame flagged rows according to ``ed`` rather than the argument.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'First listed diagnosis icd10 subcategory' and
        'Second listed diagnosis icd10 subcategory' (string columns).
    regex_terms : iterable of str, optional
        Terms joined with '|' and interpreted as regular expressions.
        Defaults to the notebook-level ``cardiovascular`` list.
    literal_terms : iterable of str, optional
        Terms matched as literal text (they contain parentheses, which would
        otherwise be read as regex groups).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place, with an integer 0/1
        column 'cd_diagnosis' (created or overwritten).
    """
    import re  # local import: only needed to escape the literal terms

    if regex_terms is None:
        regex_terms = cardiovascular

    # One combined pattern lets each column be scanned once instead of six times.
    pattern = '|'.join(list(regex_terms) + [re.escape(term) for term in literal_terms])

    flagged = pd.Series(False, index=data.index)
    if pattern:  # an empty pattern would match every row
        for column in ("First listed diagnosis icd10 subcategory",
                       "Second listed diagnosis icd10 subcategory"):
            # na=False: a missing diagnosis never counts as a match.
            flagged |= data[column].str.contains(pattern, na=False)

    data["cd_diagnosis"] = flagged.astype("int64")
    return data
# Flag the cardiovascular visits in the ED table.
ed = cd_diagnosis(ed)

In [13]:
# List the columns available for the per-patient aggregation.
ed.columns

Index(['Internalpatientid', 'Age at ed visit', 'Ed visit start date',
       'Discharge date ed', 'Died during ed visit',
       'First listed diagnosis icd10 subcategory',
       'Second listed diagnosis icd10 subcategory', 'State', 'Age 20-40',
       'Age 40-60', 'Age 60-80', 'Age 80-100', 'Age 100-120', 'stay_length',
       'cd_diagnosis'],
      dtype='object')

### Final dataset

In [14]:
def summarize_patient(patient_id, visits):
    """Collapse all ED visits of one patient into a single feature record.

    Parameters
    ----------
    patient_id : hashable
        The patient's 'Internalpatientid' value.
    visits : pandas.DataFrame
        That patient's rows of the ED table, including the derived columns
        'stay_length', 'cd_diagnosis' and the five 'Age a-b' indicators.

    Returns
    -------
    dict
        One value per output column of the tidy dataset.
    """
    ages = visits["Age at ed visit"]

    # Visits are counted as distinct ages at visit, not as rows.
    num_admissions = ages.nunique()

    # The sample std is NaN for a single visit; report zero spread instead.
    age_std = 0 if num_admissions == 1 else ages.std()

    # Visits per year: whole years between youngest and oldest visit age, plus one
    # so that a patient seen within a single year does not divide by zero.
    freq = num_admissions / (math.floor(ages.max() - ages.min()) + 1)

    cvd_visits = visits["cd_diagnosis"].sum()

    return {
        'Internalpatientid': patient_id,
        'num_stays': num_admissions,
        'stay_length': visits["stay_length"].sum(),
        # Cardiovascular visits beyond the first one.
        "num_cvd_readmission": max(0, cvd_visits - 1),
        "Readmission": int(num_admissions > 1),
        "Died": int(visits["Died during ed visit"].sum() > 0),
        "CVD": int(cvd_visits > 0),
        'Age 20-40': visits["Age 20-40"].sum(),
        'Age 40-60': visits["Age 40-60"].sum(),
        'Age 60-80': visits["Age 60-80"].sum(),
        'Age 80-100': visits["Age 80-100"].sum(),
        'Age 100-120': visits["Age 100-120"].sum(),
        "age_mean": ages.mean(),
        "age_std": age_std,
        "freq": round(freq, 2),
    }


# One record per patient, assembled into a single DataFrame at the end.
# Building a one-row DataFrame per patient and concatenating them was slow and
# left every row with index 0; the result now has a regular 0..n-1 index.
tidy_dataset = pd.DataFrame(
    [summarize_patient(ids, group) for ids, group in tqdm(ed.groupby("Internalpatientid"))]
)

  0%|          | 0/58240 [00:00<?, ?it/s]

In [17]:
# Pairwise correlation (pandas default: Pearson) of every feature with the Died flag.
tidy_dataset.corr()["Died"]

Internalpatientid      0.000753
num_stays              0.006434
stay_length            0.133586
num_cvd_readmission    0.002028
Readmission           -0.002943
Died                   1.000000
CVD                    0.001007
Age 20-40             -0.002744
Age 40-60             -0.002509
Age 60-80              0.009020
Age 80-100             0.001644
Age 100-120           -0.001472
age_mean               0.007940
age_std               -0.015164
freq                   0.021760
Name: Died, dtype: float64

In [16]:
# Write the per-patient feature table to the working directory; the DataFrame index
# is written as the first (unnamed) CSV column.
tidy_dataset.to_csv("ed_visits_full.csv")