**This table will not be combined with inpatient data**

**features**:

num_visits: total number of visits

stay_length: total length of unit stays (use service mean stay length to impute)

num_cvd_diagnosis: number of readmissions that are caused by cardiovascular disease

CVD: whether the patient has a cardiovascular diagnosis

*Different Age bins*: number of unit stays within this age range

age_mean: the mean admission age for each patient

age_std: the std of the admission age for each patient

AO: whether this patient has been exposed to Agent Orange


In [1]:
import pandas as pd

import numpy as np
import pandas as pd

import seaborn as sns
from importlib import reload
import matplotlib.pyplot as plt
import preprocess_data 
from tqdm.auto import tqdm
import math

reload(preprocess_data)

%matplotlib inline

In [2]:
# Load the outpatient visit records; the first CSV column becomes the row index.
outpatient_visits = pd.read_csv(
    '/data/public/MLA/VCHAMPS-Quality/outpatient_visits_qual.csv',
    index_col=0,
)

### Drop Combatflag, Ionizingradiationflag, Serviceconnectedflag, Swasiaconditionsflag, Agentorangeflag

In [3]:
# Remove the five flag columns named in the heading above.
outpatient_visits = outpatient_visits.drop(
    columns=[
        'Combatflag',
        'Ionizingradiationflag',
        'Serviceconnectedflag',
        'Swasiaconditionsflag',
        'Agentorangeflag',
    ]
)

### Age at visit

Divide the visit ages into bins: ≤20, (20, 40], (40, 60], (60, 80], (80, 100], (100, 120]

In [4]:
# Print the smallest and largest recorded visit age, one per line.
print(
    outpatient_visits["Age at visit"].min(),
    outpatient_visits["Age at visit"].max(),
    sep="\n",
)

22.08343539418352
104.45549526270456


In [5]:
# Create the six age-bin indicator columns, every row initialised to 0.
# Assigning a scalar lets pandas broadcast it to all rows, so no row-length
# Python list has to be built for each column.
for age_bin_column in (
    'Age 00-20',
    'Age 20-40',
    'Age 40-60',
    'Age 60-80',
    'Age 80-100',
    'Age 100-120',
):
    outpatient_visits[age_bin_column] = 0
def age_category(data):
    """Add one 0/1 indicator column per age bin, based on 'Age at visit'.

    The bins are: <=20, (20, 40], (40, 60], (60, 80], (80, 100] and
    (100, 120]. Each bin column is (re)written in full, so the function does
    not depend on the columns having been created beforehand and gives the
    same result when run more than once. Rows whose age is missing or above
    120 get 0 in every bin column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric 'Age at visit' column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the six 'Age ...' columns set.
    """
    age = data['Age at visit']
    # (exclusive lower bound, inclusive upper bound, column name);
    # None means the bin has no lower bound.
    bin_edges = [
        (None, 20, 'Age 00-20'),
        (20, 40, 'Age 20-40'),
        (40, 60, 'Age 40-60'),
        (60, 80, 'Age 60-80'),
        (80, 100, 'Age 80-100'),
        (100, 120, 'Age 100-120'),
    ]
    for lower, upper, column in bin_edges:
        in_bin = age <= upper
        if lower is not None:
            in_bin = in_bin & (age > lower)
        # Comparisons with NaN are False, so missing ages end up as 0.
        data[column] = in_bin.astype('int64')
    return data
# Flag the age bin of every visit, then show the frame as the cell output.
outpatient_visits = outpatient_visits.pipe(age_category)
outpatient_visits

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,State,Age 00-20,Age 20-40,Age 40-60,Age 60-80,Age 80-100,Age 100-120
461,100012,52.440184,2001-12-08 21:17:36.0,Not specified,Not specified,TELEPHONE TRIAGE,Texas,0,0,1,0,0,0
462,100012,52.570897,2002-01-25 15:54:35.0,Not specified,Not specified,EMERGENCY UNIT,New Mexico,0,0,1,0,0,0
463,100012,52.603491,2002-02-06 13:49:30.0,Not specified,Not specified,PRIMARY CARE/MEDICINE,New Mexico,0,0,1,0,0,0
464,100012,52.678927,2002-03-06 03:32:37.0,Not specified,Not specified,TELEPHONE TRIAGE,New Mexico,0,0,1,0,0,0
465,100012,52.706935,2002-03-16 09:14:15.0,Not specified,Not specified,TELEPHONE TRIAGE,New Mexico,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105223217,99944,79.489989,2012-01-04 14:10:45.0,Not specified,Not specified,OCCUPATIONAL THERAPY,Massachusetts,0,0,0,1,0,0
105223218,99944,80.012876,2012-07-13 16:57:06.0,Not specified,Not specified,HBPC - SOCIAL WORKER,Massachusetts,0,0,0,0,1,0
105223219,99944,80.320453,2012-11-03 03:01:01.0,Not specified,Not specified,HBPC Nursing (RN / LP),Massachusetts,0,0,0,0,1,0
105223220,99944,80.438034,2012-12-16 02:26:00.0,Not specified,Not specified,HBPC Nursing (RN / LP),Massachusetts,0,0,0,0,1,0


### Cardiovascular

In [6]:
# Diagnosis subcategory names treated as cardiovascular; matched as substrings
# of the first/second listed diagnosis columns.
cardiovascular = [
    "Heart failure, unspecified",
    "Other heart failure",
    "Cardiogenic shock",
    "Hypertensive heart disease with heart failure",
    "Hypertensive heart and kidney disease with heart failure",
    "Unstable angina",
    "Other forms of chronic ischemic heart disease",
    "Atherosclerotic heart disease of native coronary artery",
    "Atrial fibrillation",
    "Atrial flutter",
    "Supraventricular tachycardia",
    "Ventricular tachycardia",
]

In [7]:
# Create the CVD indicator column with every row set to 0. The scalar is
# broadcast by pandas, so no row-length Python list is built.
outpatient_visits["CVD_outpatient"] = 0
def cd_diagnosis(data, terms=None):
    """Set 'CVD_outpatient' to 1 for visits with a cardiovascular diagnosis.

    A visit is flagged when its first or second listed diagnosis contains,
    as a case-sensitive literal substring, any of the given terms or any of
    five fixed heart-failure / myocardial-infarction names. All names are
    regex-escaped and joined into one pattern, so each diagnosis column is
    scanned once and parentheses in the names are matched literally.
    Missing diagnoses count as "no match". The column is written in full,
    so it does not need to exist beforehand.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the string columns
        'First listed diagnosis icd10 subcategory' and
        'Second listed diagnosis icd10 subcategory'. Modified in place.
    terms : iterable of str, optional
        Diagnosis names to search for. Defaults to the notebook-level
        `cardiovascular` list.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'CVD_outpatient' set to 0/1.
    """
    import re  # local import so this cell does not depend on the import cell

    if terms is None:
        terms = cardiovascular
    # These names contain parentheses, which is why every term is escaped
    # before being used in a regular expression.
    literal_terms = list(terms) + [
        "Systolic (congestive) heart failure",
        "Diastolic (congestive) heart failure",
        "Combined systolic (congestive) and diastolic (congestive) heart failure",
        "ST elevation (STEMI) myocardial infarction",
        "Non-ST elevation (NSTEMI) myocardial infarction",
    ]
    pattern = '|'.join(re.escape(term) for term in literal_terms)

    in_first = data["First listed diagnosis icd10 subcategory"].str.contains(
        pattern, regex=True, na=False
    )
    in_second = data["Second listed diagnosis icd10 subcategory"].str.contains(
        pattern, regex=True, na=False
    )
    data["CVD_outpatient"] = (in_first | in_second).astype('int64')
    return data
# Flag cardiovascular visits, then show the frame as the cell output.
outpatient_visits = outpatient_visits.pipe(cd_diagnosis)
outpatient_visits


Unnamed: 0,Internalpatientid,Age at visit,Visit start date,First listed diagnosis icd10 subcategory,Second listed diagnosis icd10 subcategory,Stop code,State,Age 00-20,Age 20-40,Age 40-60,Age 60-80,Age 80-100,Age 100-120,CVD_outpatient
461,100012,52.440184,2001-12-08 21:17:36.0,Not specified,Not specified,TELEPHONE TRIAGE,Texas,0,0,1,0,0,0,0
462,100012,52.570897,2002-01-25 15:54:35.0,Not specified,Not specified,EMERGENCY UNIT,New Mexico,0,0,1,0,0,0,0
463,100012,52.603491,2002-02-06 13:49:30.0,Not specified,Not specified,PRIMARY CARE/MEDICINE,New Mexico,0,0,1,0,0,0,0
464,100012,52.678927,2002-03-06 03:32:37.0,Not specified,Not specified,TELEPHONE TRIAGE,New Mexico,0,0,1,0,0,0,0
465,100012,52.706935,2002-03-16 09:14:15.0,Not specified,Not specified,TELEPHONE TRIAGE,New Mexico,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105223217,99944,79.489989,2012-01-04 14:10:45.0,Not specified,Not specified,OCCUPATIONAL THERAPY,Massachusetts,0,0,0,1,0,0,0
105223218,99944,80.012876,2012-07-13 16:57:06.0,Not specified,Not specified,HBPC - SOCIAL WORKER,Massachusetts,0,0,0,0,1,0,0
105223219,99944,80.320453,2012-11-03 03:01:01.0,Not specified,Not specified,HBPC Nursing (RN / LP),Massachusetts,0,0,0,0,1,0,0
105223220,99944,80.438034,2012-12-16 02:26:00.0,Not specified,Not specified,HBPC Nursing (RN / LP),Massachusetts,0,0,0,0,1,0,0


In [8]:
# List the column names of the visit-level table at this point in the notebook.
outpatient_visits.columns

Index(['Internalpatientid', 'Age at visit', 'Visit start date',
       'First listed diagnosis icd10 subcategory',
       'Second listed diagnosis icd10 subcategory', 'Stop code', 'State',
       'Age 00-20', 'Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100',
       'Age 100-120', 'CVD_outpatient'],
      dtype='object')

In [9]:
# Remove the two free-text diagnosis columns from the visit-level table.
outpatient_visits = outpatient_visits.drop(
    columns=[
        'First listed diagnosis icd10 subcategory',
        'Second listed diagnosis icd10 subcategory',
    ]
)

### Final dataset

In [10]:
# Collapse the visit-level table to one row per patient.
# One plain dict is collected per patient and the frame is built once at the
# end. This avoids creating and concatenating a one-row DataFrame for every
# patient, and gives the result a unique 0..n-1 index (concatenating without
# ignore_index labelled every row 0).
age_bin_columns = ["Age 00-20", "Age 20-40", "Age 40-60",
                   "Age 60-80", "Age 80-100", "Age 100-120"]
tidy_columns = ["Internalpatientid", "num_visits", "CVD", "last_visit_date",
                *age_bin_columns, "age_mean", "age_std", "freq"]

patient_rows = []
for ids, group in tqdm(outpatient_visits.groupby("Internalpatientid")):
    ages = group["Age at visit"]

    age_mean = ages.mean()
    # std() is NaN for a single visit; use 0 when only one distinct age exists.
    age_std = 0 if ages.nunique() == 1 else ages.std()

    # freq = visits divided by (age span rounded down to whole years, plus 1),
    # so a patient whose visits fall within one year divides by 1.
    min_age = ages.min()
    max_age = ages.max()
    if pd.isna(min_age) or pd.isna(max_age):
        # No usable age for this patient; math.floor would raise on NaN.
        freq = float("nan")
    else:
        freq = len(group) / (math.floor(max_age - min_age) + 1)

    row = {
        "Internalpatientid": ids,
        "num_visits": len(group),
        # 1 if at least one visit carried a cardiovascular diagnosis.
        "CVD": int(group["CVD_outpatient"].sum() > 0),
        # max() of the column as stored; string dates compare lexicographically.
        "last_visit_date": group["Visit start date"].max(),
    }
    # Number of visits that fell into each age bin.
    for column in age_bin_columns:
        row[column] = group[column].sum()
    row["age_mean"] = age_mean
    row["age_std"] = age_std
    row["freq"] = round(freq, 2)
    patient_rows.append(row)

# Passing columns= fixes the column order and still yields an empty,
# correctly-labelled frame when there are no patients.
tidy_dataset = pd.DataFrame(patient_rows, columns=tidy_columns)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [11]:
# List the column names of the per-patient table built in the previous cell.
tidy_dataset.columns

Index(['Internalpatientid', 'num_visits', 'CVD', 'last_visit_date',
       'Age 00-20', 'Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100',
       'Age 100-120', 'age_mean', 'age_std', 'freq'],
      dtype='object')

In [None]:
# Write the per-patient table to CSV (the frame's index is written as the first column).
tidy_dataset.to_csv(
    "/home/hassan/lily/MLA/FDA/outpatient_mortality_quality.csv",
)