In [1]:
import pandas as pd
import os
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [2]:
# Dataset split to process: "train", "quality", or any other value
# (the loading cell treats every other value as the test split).
datatype = 'quality'

In [65]:
# Directory and file name of the inpatient-admissions CSV for each named split.
known_splits = {
    "train": ("/home/bhatti/dataset/VCHAMPS", 'inpatient_admissions_train.csv'),
    "quality": ('/data/public/MLA/VCHAMPS-Quality/', "inpatient_admissions_qual_Formatfixed.csv"),
}
# Any datatype other than "train" or "quality" falls through to the test split.
DATA_PATH, dataset_path = known_splits.get(
    datatype, ('/data/public/MLA/VCHAMPS-Test/', "inpatient_admissions_test.csv")
)
load_path = os.path.join(DATA_PATH, dataset_path)

# The first CSV column is used as the index, then the first remaining column is dropped.
in_ad = pd.read_csv(load_path, index_col=0).iloc[:, 1:]
in_ad

Unnamed: 0,Internalpatientid,Age.at.admission,Admission.date,Discharge.date,Admitting.unit.service,Discharging.unit.service,Admitting.specialty,Discharging.specialty,First.listed.discharge.diagnosis.icd10.subcategory,Second.listed.discharge.diagnosis.icd10.subcategory,Discharge.disposition,Died.during.admission,Outpatientreferralflag,Serviceconnectedflag,Agentorangeflag,State
1,100012,55.317020,2004-10-25 08:54:01.0,2004-10-26 08:05:06.0,SURGERY,SURGERY,NEUROSURGERY,GENERAL SURGERY,Other and unspecified noninfective gastroenter...,Other specified disorders of white blood cells,Regular,,Yes,,,New Mexico
2,100399,85.706740,2010-03-24 19:31:38.0,2012-11-10 19:50:48.0,NHCU,NHCU,DOMICILIARY,NH HOSPICE,Unspecified mental disorder due to known physi...,"Malignant neoplasm of stomach, unspecified",Death without autopsy,,,No,No,Minnesota
3,100694,83.926120,2016-01-12 23:55:24.0,2016-01-13 20:55:24.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,Abnormal levels of other serum enzymes,Other acute ischemic heart diseases,Regular,,Yes,,No,Idaho
4,100694,88.611203,2020-09-20 09:28:13.0,2020-09-22 11:36:18.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,"Viral intestinal infection, unspecified",Hypo-osmolality and hyponatremia,Regular,,Yes,,No,Idaho
5,101407,88.925931,2009-05-03 10:15:50.0,2009-05-04 13:05:08.0,MEDICINE,MEDICINE,SPINAL CORD INJURY OBSERVATION,GENERAL(ACUTE MEDICINE),Unspecified dementia,Hypertensive chronic kidney disease with stage...,Regular,,Yes,,No,Louisiana
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4006,99870,87.481429,2008-12-07 11:05:40.0,2008-12-27 10:57:14.0,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),Spinal stenosis,"Other thoracic, thoracolumbar and lumbosacral ...",Regular,,No,,,South Dakota
4007,95448,47.736311,2009-03-21 14:55:07.0,2009-03-23 12:33:43.0,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),Other chest pain,Hypertensive chronic kidney disease with stage...,Regular,,Yes,,No,Nevada
4008,98416,78.000151,2005-09-12 01:19:43.0,2005-09-18 19:18:02.0,MEDICINE,MEDICINE,SUBSTANCE ABUSE TRMT UNIT,CARDIOLOGY,"Heart failure, unspecified","Nonrheumatic aortic valve disorder, unspecified",Regular,,No,,,California
4009,99137,88.265051,2019-01-13 21:01:36.0,2019-01-20 22:03:19.0,MEDICINE,MEDICINE,Not specified,GENERAL(ACUTE MEDICINE),Diastolic (congestive) heart failure,"Chronic kidney disease, stage 4 (severe)",Regular,,Yes,,No,Texas


In [66]:
if datatype == "quality":
    # The quality split uses dot-separated headers; map them onto the
    # space-separated names that the following cells index by.
    quality_header_map = {
        'Age.at.admission': "Age at admission",
        "Admission.date": "Admission date",
        "Discharge.date": "Discharge date",
        "Admitting.unit.service": "Admitting unit service",
        "Discharging.unit.service": "Discharging unit service",
        "Admitting.specialty": "Admitting specialty",
        "Discharging.specialty": "Discharging specialty",
        "First.listed.discharge.diagnosis.icd10.subcategory": "First listed discharge diagnosis icd10 subcategory",
        "Second.listed.discharge.diagnosis.icd10.subcategory": "Second listed discharge diagnosis icd10 subcategory",
        "Discharge.disposition": "Discharge disposition",
        "Died.during.admission": "Died during admission",
    }
    in_ad.rename(columns=quality_header_map, inplace=True)

In [67]:
in_ad.columns

Index(['Internalpatientid', 'Age at admission', 'Admission date',
       'Discharge date', 'Admitting unit service', 'Discharging unit service',
       'Admitting specialty', 'Discharging specialty',
       'First listed discharge diagnosis icd10 subcategory',
       'Second listed discharge diagnosis icd10 subcategory',
       'Discharge disposition', 'Died during admission',
       'Outpatientreferralflag', 'Serviceconnectedflag', 'Agentorangeflag',
       'State'],
      dtype='object')

In [68]:
# Default every admission to "no transfer"; transfer() switches the flag to 1 where needed.
in_ad['Transfer'] = [0] * len(in_ad)
def transfer(data):
    """Flag admissions that were discharged from a different unit service than they were admitted to.

    Sets ``Transfer`` to 1 on every row where ``Admitting unit service`` differs from
    ``Discharging unit service``. Other rows are left untouched. The frame is modified
    in place and also returned.
    """
    admitting_unit = data['Admitting unit service']
    discharging_unit = data['Discharging unit service']
    changed_unit = admitting_unit != discharging_unit
    data.loc[changed_unit, 'Transfer'] = 1
    return data
# Mark admissions whose admitting and discharging unit services differ.
in_ad = transfer(in_ad)

In [69]:
# Collapse rare admitting unit services into OTHERS and the unspecified/censored
# placeholders into NON-COUNT.
unit_service_buckets = {'REHAB MEDICINE':'OTHERS', 'BLIND REHAB':'OTHERS',
                        '(Censored)':'NON-COUNT', 'Not specified (no value)':'NON-COUNT', 'Not specified':'NON-COUNT',
                        'INTERMEDIATE MED':'OTHERS', 'NEUROLOGY':'OTHERS', 'SPINAL CORD INJURY':'OTHERS'}
in_ad["Admitting unit service"] = in_ad["Admitting unit service"].replace(unit_service_buckets)

# One-hot encode the bucketed service as 0/1 integers so the per-patient sums
# behave the same regardless of the pandas default dummy dtype.
unit_dummies = pd.get_dummies(in_ad['Admitting unit service'], dtype=int)

# The per-patient aggregation reads these columns by name, so make sure each one
# exists even when a split contains no admission for that service.
expected_unit_services = ["DOMICILIARY", "MEDICINE", "NHCU", "NON-COUNT", "OTHERS", "PSYCHIATRY", "SURGERY"]
for unit_service in expected_unit_services:
    if unit_service not in unit_dummies.columns:
        unit_dummies[unit_service] = 0

# Drop dummy columns left over from an earlier run of this cell so that
# re-running it does not create duplicate column names.
in_ad = pd.concat([in_ad.drop(columns=unit_dummies.columns, errors="ignore"), unit_dummies], axis=1)

In [70]:
# Length of each stay in days: whole days plus the remaining fraction of a day
# rounded to 2 decimals. Rows with a missing admission or discharge date get NaN.
stay_date_format = '%Y-%m-%d %H:%M:%S.%f'
admitted_at = pd.to_datetime(in_ad["Admission date"], format=stay_date_format)
discharged_at = pd.to_datetime(in_ad["Discharge date"], format=stay_date_format)
stay_duration = discharged_at - admitted_at

# Whole columns are parsed at once instead of row by row; the formula itself
# (days + rounded seconds/3600/24) is unchanged.
in_ad["stay_length"] = stay_duration.dt.days + (stay_duration.dt.seconds / 3600 / 24).round(2)  # in terms of day

In [71]:
# Average stay per admitting unit service, used to impute missing stay lengths.
mean_stay_length = in_ad.groupby("Admitting unit service")["stay_length"].mean()

# Separate the admissions without a stay length from the rest. Both parts are
# copied so that assigning columns to them later works on independent frames
# rather than on slices of the original.
stay_is_missing = in_ad["stay_length"].isna()
nan_admission = in_ad[stay_is_missing].copy()
in_ad = in_ad[~stay_is_missing].copy()

In [72]:
# Impute each missing stay length with the mean stay (rounded to 2 decimals)
# of the admission's admitting unit service.
admission_stay_length = [
    round(mean_stay_length[unit_service], 2)
    for unit_service in nan_admission["Admitting unit service"]
]

nan_admission["stay_length"] = admission_stay_length

In [73]:
# Re-attach the imputed rows. They are appended after the rows that had a
# measured stay length and keep their original index labels.
in_ad = pd.concat([in_ad, nan_admission])
in_ad

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,First listed discharge diagnosis icd10 subcategory,Second listed discharge diagnosis icd10 subcategory,...,State,Transfer,DOMICILIARY,MEDICINE,NHCU,NON-COUNT,OTHERS,PSYCHIATRY,SURGERY,stay_length
1,100012,55.317020,2004-10-25 08:54:01.0,2004-10-26 08:05:06.0,SURGERY,SURGERY,NEUROSURGERY,GENERAL SURGERY,Other and unspecified noninfective gastroenter...,Other specified disorders of white blood cells,...,New Mexico,0,0,0,0,0,0,0,1,0.97
2,100399,85.706740,2010-03-24 19:31:38.0,2012-11-10 19:50:48.0,NHCU,NHCU,DOMICILIARY,NH HOSPICE,Unspecified mental disorder due to known physi...,"Malignant neoplasm of stomach, unspecified",...,Minnesota,0,0,0,1,0,0,0,0,962.01
3,100694,83.926120,2016-01-12 23:55:24.0,2016-01-13 20:55:24.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,Abnormal levels of other serum enzymes,Other acute ischemic heart diseases,...,Idaho,0,0,0,0,1,0,0,0,0.88
4,100694,88.611203,2020-09-20 09:28:13.0,2020-09-22 11:36:18.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,"Viral intestinal infection, unspecified",Hypo-osmolality and hyponatremia,...,Idaho,0,0,0,0,1,0,0,0,2.09
5,101407,88.925931,2009-05-03 10:15:50.0,2009-05-04 13:05:08.0,MEDICINE,MEDICINE,SPINAL CORD INJURY OBSERVATION,GENERAL(ACUTE MEDICINE),Unspecified dementia,Hypertensive chronic kidney disease with stage...,...,Louisiana,0,0,1,0,0,0,0,0,1.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4010,99870,87.535970,2008-12-27 09:31:41.0,2009-01-28 07:16:34.0,NHCU,NHCU,ANESTHESIOLOGY,NH SHORT STAY SKILLED NURSING,Acute and subacute infective endocarditis,"Bacterial infection, unspecified",...,South Dakota,0,0,0,1,0,0,0,0,31.91
1407,156832,53.170385,2003-08-25 18:37:16.0,,NHCU,NHCU,DOD BEDS IN VA FACILITY,NH LONG-STAY CONTINUING CARE,Sequelae of cerebral infarction,Quadriplegia,...,North Carolina,0,0,0,1,0,0,0,0,92.07
1419,158742,70.533947,2014-05-17 03:46:53.0,,SURGERY,SURGERY,Not specified (no value),Not specified (no value),Not specified,Not specified,...,California,0,0,0,0,0,0,0,1,7.34
2709,21909,64.164001,2021-09-30 22:37:08.0,,OTHERS,SPINAL CORD INJURY,PERIPHERAL VASCULAR,SPINAL CORD INJURY,Not specified,Not specified,...,Virginia,0,0,0,0,0,1,0,0,103.08


In [74]:
# Start every age-band indicator at 0; age_category() switches on the matching band.
for age_band in ('Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100', 'Age 100-120'):
    in_ad[age_band] = [0] * len(in_ad)
def age_category(data):
    """Set the age-band indicator matching each row's ``Age at admission`` to 1.

    Bands are half-open on the left: an age belongs to a band when it is strictly
    greater than the lower bound and at most the upper bound. Ages of 20 or less,
    or above 120, match no band. Rows are only ever switched to 1, never reset.
    The frame is modified in place and also returned.
    """
    age = data['Age at admission']
    bands = (
        (20, 40, 'Age 20-40'),
        (40, 60, 'Age 40-60'),
        (60, 80, 'Age 60-80'),
        (80, 100, 'Age 80-100'),
        (100, 120, 'Age 100-120'),
    )
    for lower, upper, column in bands:
        in_band = (age > lower) & (age <= upper)
        data.loc[in_band, column] = 1
    return data
# Fill in the age-band indicators, then display the frame.
in_ad = age_category(in_ad)
in_ad

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,First listed discharge diagnosis icd10 subcategory,Second listed discharge diagnosis icd10 subcategory,...,NON-COUNT,OTHERS,PSYCHIATRY,SURGERY,stay_length,Age 20-40,Age 40-60,Age 60-80,Age 80-100,Age 100-120
1,100012,55.317020,2004-10-25 08:54:01.0,2004-10-26 08:05:06.0,SURGERY,SURGERY,NEUROSURGERY,GENERAL SURGERY,Other and unspecified noninfective gastroenter...,Other specified disorders of white blood cells,...,0,0,0,1,0.97,0,1,0,0,0
2,100399,85.706740,2010-03-24 19:31:38.0,2012-11-10 19:50:48.0,NHCU,NHCU,DOMICILIARY,NH HOSPICE,Unspecified mental disorder due to known physi...,"Malignant neoplasm of stomach, unspecified",...,0,0,0,0,962.01,0,0,0,1,0
3,100694,83.926120,2016-01-12 23:55:24.0,2016-01-13 20:55:24.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,Abnormal levels of other serum enzymes,Other acute ischemic heart diseases,...,1,0,0,0,0.88,0,0,0,1,0
4,100694,88.611203,2020-09-20 09:28:13.0,2020-09-22 11:36:18.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,"Viral intestinal infection, unspecified",Hypo-osmolality and hyponatremia,...,1,0,0,0,2.09,0,0,0,1,0
5,101407,88.925931,2009-05-03 10:15:50.0,2009-05-04 13:05:08.0,MEDICINE,MEDICINE,SPINAL CORD INJURY OBSERVATION,GENERAL(ACUTE MEDICINE),Unspecified dementia,Hypertensive chronic kidney disease with stage...,...,0,0,0,0,1.12,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4010,99870,87.535970,2008-12-27 09:31:41.0,2009-01-28 07:16:34.0,NHCU,NHCU,ANESTHESIOLOGY,NH SHORT STAY SKILLED NURSING,Acute and subacute infective endocarditis,"Bacterial infection, unspecified",...,0,0,0,0,31.91,0,0,0,1,0
1407,156832,53.170385,2003-08-25 18:37:16.0,,NHCU,NHCU,DOD BEDS IN VA FACILITY,NH LONG-STAY CONTINUING CARE,Sequelae of cerebral infarction,Quadriplegia,...,0,0,0,0,92.07,0,1,0,0,0
1419,158742,70.533947,2014-05-17 03:46:53.0,,SURGERY,SURGERY,Not specified (no value),Not specified (no value),Not specified,Not specified,...,0,0,0,1,7.34,0,0,1,0,0
2709,21909,64.164001,2021-09-30 22:37:08.0,,OTHERS,SPINAL CORD INJURY,PERIPHERAL VASCULAR,SPINAL CORD INJURY,Not specified,Not specified,...,0,1,0,0,103.08,0,0,1,0,0


In [75]:
# Diagnosis names treated as cardiovascular. cd_diagnosis() joins them with "|" and
# uses the result as a regular expression, so entries must not contain regex
# metacharacters such as parentheses.
cardiovascular = ["Heart failure, unspecified", "Other heart failure",
                  "Cardiogenic shock", "Hypertensive heart disease with heart failure",
                  "Hypertensive heart and kidney disease with heart failure", "Unstable angina", "Other forms of chronic ischemic heart disease",
                  "Atherosclerotic heart disease of native coronary artery", "Atrial fibrillation", "Atrial flutter",
                  "Supraventricular tachycardia", "Ventricular tachycardia"]

In [76]:
# Default every admission to "not cardiovascular"; cd_diagnosis() sets matching rows to 1.
in_ad["cd_diagnosis"] = [0] * len(in_ad)
def cd_diagnosis(data):
    """Set ``cd_diagnosis`` to 1 on rows with a cardiovascular discharge diagnosis.

    A row matches when its first or second listed discharge diagnosis contains
    either one of the names in the notebook-level ``cardiovascular`` list or one of
    the parenthesised heart-failure / myocardial-infarction names listed below.
    Missing diagnoses never match.

    The frame is modified in place and also returned. Rows that do not match are
    left untouched.
    """
    # These names contain parentheses, so they are matched as literal text
    # instead of being folded into the regular expression.
    literal_terms = (
        "Systolic (congestive) heart failure",
        "Diastolic (congestive) heart failure",
        "Combined systolic (congestive) and diastolic (congestive) heart failure",
        "ST elevation (STEMI) myocardial infarction",
        "Non-ST elevation (NSTEMI) myocardial infarction",
    )
    diagnosis_columns = (
        "First listed discharge diagnosis icd10 subcategory",
        "Second listed discharge diagnosis icd10 subcategory",
    )
    # The plain names are combined into one regex alternation.
    cardiovascular_pattern = '|'.join(cardiovascular)

    is_cvd = pd.Series(False, index=data.index)
    for column in diagnosis_columns:
        # Read from the frame that was passed in (not the notebook-level in_ad) so
        # the mask always belongs to the frame being updated.
        diagnosis = data[column]
        # na=False keeps the mask strictly boolean when a diagnosis is missing.
        is_cvd = is_cvd | diagnosis.str.contains(cardiovascular_pattern, na=False)
        for term in literal_terms:
            is_cvd = is_cvd | diagnosis.str.contains(term, regex=False, na=False)

    data.loc[is_cvd, "cd_diagnosis"] = 1
    return data
# Flag the cardiovascular admissions, then display the frame.
in_ad = cd_diagnosis(in_ad)
in_ad


Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,First listed discharge diagnosis icd10 subcategory,Second listed discharge diagnosis icd10 subcategory,...,OTHERS,PSYCHIATRY,SURGERY,stay_length,Age 20-40,Age 40-60,Age 60-80,Age 80-100,Age 100-120,cd_diagnosis
1,100012,55.317020,2004-10-25 08:54:01.0,2004-10-26 08:05:06.0,SURGERY,SURGERY,NEUROSURGERY,GENERAL SURGERY,Other and unspecified noninfective gastroenter...,Other specified disorders of white blood cells,...,0,0,1,0.97,0,1,0,0,0,0
2,100399,85.706740,2010-03-24 19:31:38.0,2012-11-10 19:50:48.0,NHCU,NHCU,DOMICILIARY,NH HOSPICE,Unspecified mental disorder due to known physi...,"Malignant neoplasm of stomach, unspecified",...,0,0,0,962.01,0,0,0,1,0,0
3,100694,83.926120,2016-01-12 23:55:24.0,2016-01-13 20:55:24.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,Abnormal levels of other serum enzymes,Other acute ischemic heart diseases,...,0,0,0,0.88,0,0,0,1,0,0
4,100694,88.611203,2020-09-20 09:28:13.0,2020-09-22 11:36:18.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,"Viral intestinal infection, unspecified",Hypo-osmolality and hyponatremia,...,0,0,0,2.09,0,0,0,1,0,0
5,101407,88.925931,2009-05-03 10:15:50.0,2009-05-04 13:05:08.0,MEDICINE,MEDICINE,SPINAL CORD INJURY OBSERVATION,GENERAL(ACUTE MEDICINE),Unspecified dementia,Hypertensive chronic kidney disease with stage...,...,0,0,0,1.12,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4010,99870,87.535970,2008-12-27 09:31:41.0,2009-01-28 07:16:34.0,NHCU,NHCU,ANESTHESIOLOGY,NH SHORT STAY SKILLED NURSING,Acute and subacute infective endocarditis,"Bacterial infection, unspecified",...,0,0,0,31.91,0,0,0,1,0,0
1407,156832,53.170385,2003-08-25 18:37:16.0,,NHCU,NHCU,DOD BEDS IN VA FACILITY,NH LONG-STAY CONTINUING CARE,Sequelae of cerebral infarction,Quadriplegia,...,0,0,0,92.07,0,1,0,0,0,0
1419,158742,70.533947,2014-05-17 03:46:53.0,,SURGERY,SURGERY,Not specified (no value),Not specified (no value),Not specified,Not specified,...,0,0,1,7.34,0,0,1,0,0,0
2709,21909,64.164001,2021-09-30 22:37:08.0,,OTHERS,SPINAL CORD INJURY,PERIPHERAL VASCULAR,SPINAL CORD INJURY,Not specified,Not specified,...,1,0,0,103.08,0,0,1,0,0,0


In [77]:
# Admitting-specialty values grouped into the five coarse categories that the next
# cell turns into indicator columns (Mental, Medical, Rehab, Gerontology, Others_Specialty).
# A specialty that appears in none of the lists gets no indicator set.

# Substance-abuse and mental-health specialties.
# NOTE(review): 'PLASTIC SURGERY' and 'GEN MEDICINE (ACUTE)' are listed here — confirm this is intended.
sub_abuse_and_mental = ['DRUG DEPENDENCE TRMT UNIT', 'PSYCHIATRIC MENTALLY INFIRM', 'SUBSTANCE ABUSE RES TRMT PROG', 'PLASTIC SURGERY', 'PSYCH RESID REHAB TRMT PROG', 'SUBSTANCE ABUSE INTERMED CARE', 'ACUTE PSYCHIATRY (<45 DAYS)', 'DOMICILIARY PTSD', 
                        'ALCOHOL DEPENDENCE TRMT UNIT', 'EVAL/BRF TRMT PTSD UNIT(EBTPU)', 'PTSD RESID REHAB PROG', 'PSYCH RESID REHAB PROG', 'PTSD CWT/TR', 
                        'PTSD RESIDENTIAL REHAB PROG', 'SUBST ABUSE STAR I, II & III', 'SUBSTANCE ABUSE TRMT UNIT', 'GEN INTERMEDIATE PSYCH', 'LONG TERM PSYCHIATRY(>45 DAYS)', 'SUBSTANCE ABUSE RESID PROG', 
                        'DOMICILIARY SUBSTANCE ABUSE', 'SIPU (SPEC INPT PTSD UNIT)', 'ZZALCOHOL DEPENDENCE TRMT UNIT', 'ZZSUBSTANCE ABUSE INTERMEDCARE', 'PSYCHIATRY', 'ZZDRUG DEPENDENCE TRMT UNIT',
                        'ZZSUBSTANCE ABUSE TRMT UNIT', 'GEN MEDICINE (ACUTE)', 'zSUBST ABUSE STAR I, II & III', 'ZZSUBST ABUSE STAR I,II,II', 'SUBST ABUSE CWT/TRANS RESID', 'HIGH INTENSITY GEN PSYCH INPAT', 'HALFWAY HOUSE']
# Medical and surgical specialties.
# NOTE(review): 'DOMICILIARY' is listed here while other DOMICILIARY variants sit in other groups — confirm.
medical = ['HEMATOLOGY/ONCOLOGY', 'GASTROENTEROLOGY', 'INTERMEDIATE MEDICINE', 'ANESTHESIOLOGY', 'PROCTOLOGY', 'CARDIAC SURGERY', 'TRANSPLANTATION', 'CARDIOLOGY', 'METABOLIC',
           'GENERAL(ACUTE MEDICINE)', 'PEDIATRICS', 'VASCULAR', 'OPHTHALMOLOGY', 'NEUROSURGERY', 'SURGICAL STEPDOWN', 'UROLOGY', 'PULMONARY, TUBERCULOSIS', 'PERIPHERAL VASCULAR', 
           'THORACIC SURGERY', 'MEDICAL STEP DOWN', 'GENERAL SURGERY', 'PULMONARY, NON-TB', 'EPILEPSY CENTER', 'NEUROLOGY', 'SPINAL CORD INJURY', 'ORAL SURGERY',
           'PODIATRY', 'EAR, NOSE, THROAT (ENT)', 'ENDOCRINOLOGY', 'CARDIAC-STEP DOWN UNIT', 'TELEMETRY', 'OB/GYN', 'ORTHOPEDIC', 'DOMICILIARY','ALLERGY', 'STROKE UNIT', 'DERMATOLOGY',
           'CARDIAC INTENSIVE CARE UNIT', 'HOSPICE FOR ACUTE CARE', 'SURGICAL ICU', 'MEDICAL ICU']
# Rehabilitation, nursing-home (NH), observation and residential specialties.
rehab = ['NH SHORT STAY REHABILITATION', 'BLIND REHAB OBSERVATION', 'BLIND REHAB', 'NH LONG STAY DEMENTIA CARE', 'NH SHORT-STAY CONTINUING CARE', 'NH HOSPICE', 'NEUROLOGY OBSERVATION', 'NH LONG-STAY MH RECOVERY', 
         'NH SHORT-STAY MH RECOVERY', 'NH SHORT STAY RESTORATIVE', 'REHABILITATION MEDICINE','NH SHORT STAY DEMENTIA CARE', 'RESPITE CARE (MEDICINE)', 'PM&R TRANSITIONAL REHAB', 
         'SPINAL CORD INJURY OBSERVATION', 'POLYTRAUMA REHAB UNIT', 'SURGICAL OBSERVATION', 'NHCU', 'NH SHORT STAY SKILLED NURSING', 'NH GEM NURSING HOME CARE', 'NH LONG STAY SKILLED NURSING', 'NH LONG-STAY CONTINUING CARE', 'ED OBSERVATION', 
         'MEDICAL OBSERVATION', 'REHAB MEDICINE OBSERVATION', 'PSYCHIATRIC OBSERVATION', 'NH LONG STAY SPINAL CORD INJ', 'DOD BEDS IN VA FACILITY', 'NON-DOD BEDS IN VA FACILITY', 'GENERAL CWT/TR', 'HOMELESS CWT/TRANS RESID', 'PRRTP', 'HIGH INTENSITY GEN INPT',
         'DOMICILIARY CHV', 'STAR I, II & III']
# GEM / GRECC / gerontology specialties (feeds the Gerontology indicator).
gem = ['GRECC-MED', 'SHORT STAY GRECC-GEM-NHCU', 'GEM DOMICILIARY', 'SHORT STAY GRECC-NHCU', 'GEM REHABILITATION MEDICINE', 'GRECC-GEM-REHAB', 'GEM NEUROLOGY', 
       'GEM ACUTE MEDICINE', 'LONG STAY GRECC-NHCU', 'GEM PSYCHIATRIC BEDS', 'GERONTOLOGY', 'GEM INTERMEDIATE CARE']
# Placeholder values for an unknown or censored specialty.
others = ['Not specified', '(Censored)', 'Not specified (no value)']

In [78]:
# Start every specialty-group indicator at 0; the function defined next sets the matching one to 1.
for specialty_flag in ('Mental', 'Medical', 'Rehab', 'Gerontology', 'Others_Specialty'):
    in_ad[specialty_flag] = [0] * len(in_ad)
def age_category(data):
    """Set the specialty-group indicator matching each row's ``Admitting specialty`` to 1.

    Membership is looked up in the notebook-level lists ``sub_abuse_and_mental``,
    ``medical``, ``rehab``, ``gem`` and ``others``. Rows are only ever switched to 1.
    The frame is modified in place and also returned.

    NOTE(review): despite its name this function has nothing to do with age, and it
    replaces the earlier age-band function of the same name. Renaming it needs the
    call site updated at the same time.
    """
    specialty = data['Admitting specialty']
    flag_members = (
        ('Mental', sub_abuse_and_mental),
        ('Medical', medical),
        ('Rehab', rehab),
        ('Gerontology', gem),
        ('Others_Specialty', others),
    )
    for flag, members in flag_members:
        data.loc[specialty.isin(members), flag] = 1
    return data
# Fill in the specialty-group indicators (age_category here is the specialty
# version defined just above), then display the frame.
in_ad = age_category(in_ad)
in_ad

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,First listed discharge diagnosis icd10 subcategory,Second listed discharge diagnosis icd10 subcategory,...,Age 40-60,Age 60-80,Age 80-100,Age 100-120,cd_diagnosis,Mental,Medical,Rehab,Gerontology,Others_Specialty
1,100012,55.317020,2004-10-25 08:54:01.0,2004-10-26 08:05:06.0,SURGERY,SURGERY,NEUROSURGERY,GENERAL SURGERY,Other and unspecified noninfective gastroenter...,Other specified disorders of white blood cells,...,1,0,0,0,0,0,1,0,0,0
2,100399,85.706740,2010-03-24 19:31:38.0,2012-11-10 19:50:48.0,NHCU,NHCU,DOMICILIARY,NH HOSPICE,Unspecified mental disorder due to known physi...,"Malignant neoplasm of stomach, unspecified",...,0,0,1,0,0,0,1,0,0,0
3,100694,83.926120,2016-01-12 23:55:24.0,2016-01-13 20:55:24.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,Abnormal levels of other serum enzymes,Other acute ischemic heart diseases,...,0,0,1,0,0,0,1,0,0,0
4,100694,88.611203,2020-09-20 09:28:13.0,2020-09-22 11:36:18.0,NON-COUNT,NON-COUNT,SPINAL CORD INJURY,MEDICAL OBSERVATION,"Viral intestinal infection, unspecified",Hypo-osmolality and hyponatremia,...,0,0,1,0,0,0,1,0,0,0
5,101407,88.925931,2009-05-03 10:15:50.0,2009-05-04 13:05:08.0,MEDICINE,MEDICINE,SPINAL CORD INJURY OBSERVATION,GENERAL(ACUTE MEDICINE),Unspecified dementia,Hypertensive chronic kidney disease with stage...,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4010,99870,87.535970,2008-12-27 09:31:41.0,2009-01-28 07:16:34.0,NHCU,NHCU,ANESTHESIOLOGY,NH SHORT STAY SKILLED NURSING,Acute and subacute infective endocarditis,"Bacterial infection, unspecified",...,0,0,1,0,0,0,1,0,0,0
1407,156832,53.170385,2003-08-25 18:37:16.0,,NHCU,NHCU,DOD BEDS IN VA FACILITY,NH LONG-STAY CONTINUING CARE,Sequelae of cerebral infarction,Quadriplegia,...,1,0,0,0,0,0,0,1,0,0
1419,158742,70.533947,2014-05-17 03:46:53.0,,SURGERY,SURGERY,Not specified (no value),Not specified (no value),Not specified,Not specified,...,0,1,0,0,0,0,0,0,0,1
2709,21909,64.164001,2021-09-30 22:37:08.0,,OTHERS,SPINAL CORD INJURY,PERIPHERAL VASCULAR,SPINAL CORD INJURY,Not specified,Not specified,...,0,1,0,0,0,0,1,0,0,0


In [79]:
def final_set_before(ids, group):
    """Summarise a set of admissions of one patient into a single row of features.

    Parameters
    ----------
    ids
        Patient identifier, stored in the ``Internalpatientid`` column of the result.
    group : pandas.DataFrame
        Admission rows to summarise. Besides the raw columns it must carry the
        engineered ones read below: ``stay_length``, ``Transfer``, ``cd_diagnosis``,
        the admitting-unit dummies, the age-band indicators and the specialty-group
        indicators.

    Returns
    -------
    pandas.DataFrame
        One row with counts, sums and age / stay statistics, indexed from 0.
    """
    ages = group["Age at admission"]
    stays = group["stay_length"]

    # Admissions are counted as distinct admission ages, not as rows.
    num_admissions = ages.nunique()

    age_mean = ages.mean()
    age_std = ages.std()
    # std() of a single value is NaN; report a spread of 0 instead.
    if num_admissions == 1: age_std = 0

    min_age = ages.min()
    max_age = ages.max()

    # Admissions per year of age covered (whole years between first and last admission, plus one).
    freq = num_admissions/(math.floor(max_age - min_age) + 1)

    min_stay = stays.min()
    # Longest stay: must be max(); using min() here made stay_max a copy of stay_min.
    max_stay = stays.max()
    stay_mean = stays.mean()
    stay_std = stays.std()
    if stays.nunique() == 1: stay_std = 0

    num_cvd_admission = group['cd_diagnosis'].sum()
    cvd = 1 if num_cvd_admission > 0 else 0

    # NOTE(review): this assumes "Died during admission" is numeric or entirely
    # missing; text values would not work with sum() > 0 — confirm the encoding.
    Died = 0
    if group["Died during admission"].sum() > 0: Died = 1

    df = pd.DataFrame(data={'Internalpatientid': [ids], 'num_stays': [num_admissions], 'stay_length': stays.sum(),
                            'num_unique_units': group["Admitting unit service"].nunique(), "num_transfers": group["Transfer"].sum(), 
                            "num_cvd_admission": [num_cvd_admission], "Died": [Died], "CVD": [cvd],
                            "unique_admitting_specialty": group["Admitting specialty"].nunique(), "unique_discharging_specialty": group["Discharging specialty"].nunique(),
                            "DOMICILIARY": group["DOMICILIARY"].sum(), "MEDICINE": group["MEDICINE"].sum(), "NHCU":group["NHCU"].sum(),
                            "NON-COUNT":group["NON-COUNT"].sum(), "OTHERS":group["OTHERS"].sum(), 'PSYCHIATRY': group['PSYCHIATRY'].sum(), 'SURGERY': group['SURGERY'].sum(),
                            'Age 20-40': group["Age 20-40"].sum(), 'Age 40-60': group["Age 40-60"].sum(), 'Age 60-80':group["Age 60-80"].sum(), 
                            'Age 80-100':group["Age 80-100"].sum(), 'Age 100-120':group["Age 100-120"].sum(), "age_mean": [age_mean], "age_std": [age_std], 
                            "age_min": [min_age], "age_max": [max_age], "stay_min": [min_stay], "stay_max": [max_stay], "stay_mean": [stay_mean],
                            "stay_std": [stay_std], "freq": [round(freq,2)], 'Medical': group["Medical"].sum(), 'Mental':group["Mental"].sum(), 
                            'Others_Specialty':group["Others_Specialty"].sum(), 'Rehab': group["Rehab"].sum(), 'Gerontology': group["Gerontology"].sum()
                            })
    df = df.reset_index(drop=True)
    return df

In [80]:
def final_set(ids, group):
    """Build the labelled feature row for one patient.

    ``group`` holds the patient's admissions in chronological order with a 0-based
    positional index. Exactly one row is produced:

    * a patient with a single admission gets features from that admission, a
      readmission time of 0 and a label of 0;
    * otherwise the features summarise every admission except the last one, and the
      last admission is the outcome: the readmission time is the gap in days between
      the previous discharge and the last admission.

    Columns added to the features from ``final_set_before``: ``CVD_readmission``,
    ``next_readmission_time``, ``Discharge date`` and ``readmission within 300 days``.

    NOTE(review): a negative gap (overlapping admissions) or a NaN gap (missing
    discharge date) is neither > 300 nor == 0, so it is labelled 1 — confirm intended.
    """
    date_format = '%Y-%m-%d %H:%M:%S.%f'
    n_rows = len(group)
    full = []
    ddl = 0
    for i in range(n_rows):
        # For now only the last row is used as the indicator, so the only position
        # processed is the one right before it (or the sole row of a single stay).
        is_before_last = (i + 1) == (n_rows - 1)
        if not (is_before_last or n_rows == 1):
            continue

        features = final_set_before(ids, group.iloc[0:i+1,])
        cvd_again = 0

        if n_rows == i + 1:
            # No later admission exists: 0 acts as the "no readmission" marker.
            gap_days = 0
            ddl = group.iloc[i]["Discharge date"]
        else:
            following = group.iloc[i + 1]
            ddl = group.iloc[i]["Discharge date"]
            next_admission = following["Admission date"]

            discharged = pd.to_datetime(ddl, format=date_format)
            readmitted = pd.to_datetime(next_admission, format=date_format)
            gap = pd.Timedelta(readmitted - discharged)
            # Whole days plus the remaining fraction of a day, rounded to 2 decimals.
            gap_days = gap.days
            gap_days += round(gap.seconds/3600/24, 2)

            # A CVD readmission needs an earlier CVD admission and a CVD indicator admission.
            if features["num_cvd_admission"].item() > 0 and following["cd_diagnosis"] == 1:
                cvd_again = 1

        features["CVD_readmission"] = cvd_again
        features["next_readmission_time"] = gap_days
        features["Discharge date"] = ddl

        outside_window = (gap_days > 300) or (gap_days == 0)
        features["readmission within 300 days"] = 0 if outside_window else 1
        full.append(features)
    return pd.concat(full)

In [81]:
# Build one labelled feature frame per patient. Each patient's admissions are
# ordered by admission date and re-indexed from 0 before being summarised.
patient_frames = [
    final_set(ids, admissions.sort_values(by=['Admission date'], ascending=True).reset_index(drop=True))
    for ids, admissions in tqdm(in_ad.groupby("Internalpatientid"))
]

tidy_dataset = pd.concat(patient_frames)

  0%|          | 0/632 [00:00<?, ?it/s]

In [82]:
tidy_dataset

Unnamed: 0,Internalpatientid,num_stays,stay_length,num_unique_units,num_transfers,num_cvd_admission,Died,CVD,unique_admitting_specialty,unique_discharging_specialty,...,freq,Medical,Mental,Others_Specialty,Rehab,Gerontology,CVD_readmission,next_readmission_time,Discharge date,readmission within 300 days
0,67,1,0.18,1,0,0,0,0,1,1,...,1.00,1,0,0,0,0,0,0.00,2019-04-29 22:39:20.0,0
0,200,4,17.90,2,0,0,0,0,4,3,...,0.25,1,0,0,2,0,0,130.54,2019-01-31 22:59:02.0,1
0,291,7,79.68,2,0,3,0,1,3,2,...,2.33,0,0,0,7,0,0,-6.93,2006-07-08 13:56:24.0,1
0,330,3,17.38,1,0,1,0,1,2,2,...,3.00,1,0,0,2,0,0,993.85,2019-06-28 14:50:58.0,0
0,351,3,22.14,1,0,3,0,1,2,2,...,3.00,0,2,0,0,1,1,77.26,2023-07-10 21:33:42.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,166881,1,2.52,1,0,1,0,1,1,1,...,1.00,1,0,0,0,0,0,410.96,2001-01-09 21:07:54.0,0
0,167102,3,10.25,1,0,0,0,0,3,1,...,0.38,2,0,0,1,0,0,2026.77,2011-11-27 12:57:02.0,0
0,167404,1,11.63,1,0,1,0,1,1,1,...,1.00,0,0,0,1,0,0,0.00,2004-01-30 11:12:01.0,0
0,167917,4,10.50,1,0,1,0,1,4,2,...,1.33,3,0,0,1,0,1,1354.04,2016-08-07 11:25:36.0,0


In [87]:
# Remove the "Died" column from the final feature set. errors="ignore" keeps this
# cell re-runnable: a second run would otherwise raise KeyError because the column
# is already gone.
tidy_dataset = tidy_dataset.drop(columns=['Died'], errors="ignore")
tidy_dataset.columns

Index(['Internalpatientid', 'num_stays', 'stay_length', 'num_unique_units',
       'num_transfers', 'num_cvd_admission', 'CVD',
       'unique_admitting_specialty', 'unique_discharging_specialty',
       'DOMICILIARY', 'MEDICINE', 'NHCU', 'NON-COUNT', 'OTHERS', 'PSYCHIATRY',
       'SURGERY', 'Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100',
       'Age 100-120', 'age_mean', 'age_std', 'age_min', 'age_max', 'stay_min',
       'stay_max', 'stay_mean', 'stay_std', 'freq', 'Medical', 'Mental',
       'Others_Specialty', 'Rehab', 'Gerontology', 'CVD_readmission',
       'next_readmission_time', 'Discharge date',
       'readmission within 300 days'],
      dtype='object')

In [88]:
# Write the feature table to CSV. Only the "quality" split is saved here; for any
# other datatype this cell writes nothing.
if datatype == "quality": tidy_dataset.to_csv("/home/hassan/lily/MLA/FDA/inpatient_simple_quality.csv")