In [None]:
"""This is a public data source from TableShift."""
# For more information on datasets and access in TableShift, see:
# * https://tableshift.org/datasets.html
# * https://github.com/mlfoundations/tableshift
import os
import zipfile
from abc import ABC, abstractmethod
from typing import Sequence, Callable

import numpy as np
import pandas as pd

import utils
# diabetes_readmission.py and tabular_dataset.py are from TableShift
from diabetes_readmission import \
    DIABETES_READMISSION_RESOURCES, preprocess_diabetes_readmission, DIABETES_READMISSION_FEATURES, get_icd9
from tabular_dataset import *

import pandas as pd


def convert_numeric_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Utility function for automatically casting int-valued columns to float."""
    for c in df.columns:
        df[c] = df[c].convert_dtypes(convert_string=False,
                                     convert_boolean=False)
    return df


def complete_cases(df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna(axis=0, how='any')


def apply_column_missingness_threshold(df: pd.DataFrame,
                                       missingness_threshold=0.8) -> pd.DataFrame:
    miss = pd.isnull(df).sum() / len(df)

    dropcols = miss.index[miss >= missingness_threshold].tolist()
    df.drop(columns=dropcols, inplace=True)
    return df



class DataSource(ABC):
    """Abstract class to represent a generic data source."""

    def __init__(self, cache_dir: str,
                 preprocess_fn: Callable[[pd.DataFrame], pd.DataFrame],
                 resources: Sequence[str] = None,
                 download: bool = True,
                 ):
        self.cache_dir = cache_dir
        self.download = download

        self.preprocess_fn = preprocess_fn
        self.resources = resources
        self._initialize_cache_dir()

    def _initialize_cache_dir(self):
        """Create cache_dir if it does not exist."""
        utils.initialize_dir(self.cache_dir)

    def get_data(self) -> pd.DataFrame:
        """Fetch data from local cache or download if necessary."""
        self._download_if_not_cached()
        raw_data = self._load_data()
        return self.preprocess_fn(raw_data)

    def _download_if_not_cached(self):
        """Download files if they are not already cached."""
        for url in self.resources:
            utils.download_file(url, self.cache_dir)

    @abstractmethod
    def _load_data(self) -> pd.DataFrame:
        """Load the raw data from disk and return it.

        Any preprocessing should be performed in preprocess_fn, not here."""
        raise

    @property
    def is_cached(self) -> bool:
        """Check whether all resources exist in cache dir."""
        for url in self.resources:
            basename = utils.basename_from_url(url)
            fp = os.path.join(self.cache_dir, basename)
            if not os.path.exists(fp):
                return False
        return True

class DiabetesReadmissionDataSource(DataSource):
    def __init__(self, resources=DIABETES_READMISSION_RESOURCES,
                 preprocess_fn=preprocess_diabetes_readmission, **kwargs):
        super().__init__(resources=resources, preprocess_fn=preprocess_fn,
                         **kwargs)

    def _load_data(self) -> pd.DataFrame:
        # unzip the file
        zip_fp = os.path.join(self.cache_dir, "dataset_diabetes.zip")
        with zipfile.ZipFile(zip_fp, 'r') as zf:
            zf.extractall(self.cache_dir)
        # read the dataframe
        df = pd.read_csv(os.path.join(self.cache_dir, "dataset_diabetes",
                                      "diabetic_data.csv"),
                         na_values="?",
                         low_memory=False)
        return df

  from pandas.core import (
  from .autonotebook import tqdm as notebook_tqdm
2025-05-19 02:30:48,613	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-05-19 02:30:48,777	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
df_DiabetesReadmission = DiabetesReadmissionDataSource(cache_dir = '/Users/ruiqil/Documents/DistShift/tableshift_fetch/DiabetesReadmission').get_data()
# df_DiabetesReadmission = df_DiabetesReadmission[~(df_DiabetesReadmission.race == 'Other')]
df_DiabetesReadmission[['num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']].dtypes

num_lab_procedures    int64
num_procedures        int64
num_medications       int64
number_outpatient     int64
number_emergency      int64
number_inpatient      int64
number_diagnoses      int64
dtype: object

In [3]:
# for col in ['race', 'gender', 'age', 'weight']:
#     print(f"Summary for column: {col}")
#     print(df_DiabetesReadmission[col].value_counts(dropna=False).sort_values())
#     print("-" * 30)
df_DiabetesReadmission = df_DiabetesReadmission[~(df_DiabetesReadmission['gender'] == 'Unknown/Invalid')]
df_DiabetesReadmission['race_Caucasian'] = np.where((df_DiabetesReadmission['race'] == 'Caucasian'), 1, 0)
df_DiabetesReadmission['race_AfricanAmerican'] = np.where((df_DiabetesReadmission['race'] == 'AfricanAmerican'), 1, 0)
df_DiabetesReadmission['race_Hispanic'] = np.where((df_DiabetesReadmission['race'] == 'Hispanic'), 1, 0)
df_DiabetesReadmission['race_Asian'] = np.where((df_DiabetesReadmission['race'] == 'Asian'), 1, 0)
df_DiabetesReadmission = df_DiabetesReadmission[~(df_DiabetesReadmission.race == 'Other')]
df_DiabetesReadmission = df_DiabetesReadmission[(df_DiabetesReadmission.admission_source_id == 1) | (df_DiabetesReadmission.admission_source_id == 7)]
df_DiabetesReadmission = df_DiabetesReadmission[(df_DiabetesReadmission.admission_type_id == 1) | (df_DiabetesReadmission.admission_type_id == 2) | (df_DiabetesReadmission.admission_type_id == 3)]
df_DiabetesReadmission = df_DiabetesReadmission[df_DiabetesReadmission.discharge_disposition_id.isin([1, 3, 6, 22, 2, 5, 4, 23])]

df_DiabetesReadmission = df_DiabetesReadmission[~((df_DiabetesReadmission.age == '[0-10)') | (df_DiabetesReadmission.age == '[10-20)'))]

# df_DiabetesReadmission = df_DiabetesReadmission[~(df_DiabetesReadmission.race == 'Asian')]
df_DiabetesReadmission['gender'] = np.where((df_DiabetesReadmission['gender'] == 'Male'), 1, 0)
# df_DiabetesReadmission = df_DiabetesReadmission.join(pd.get_dummies(df_DiabetesReadmission["age"], prefix="age"))
# for c in ["payer_code", "medical_specialty"]:
#     freq = df_DiabetesReadmission[c].value_counts(normalize=True)
#     keep_categories = freq[freq > 0.01].index
#     df_DiabetesReadmission[c] = df_DiabetesReadmission[c].where(
#         df_DiabetesReadmission[c].isin(keep_categories), 
#         other=np.nan
#     )
#     df_DiabetesReadmission = df_DiabetesReadmission.join(pd.get_dummies(df_DiabetesReadmission[c], prefix=c))
df_DiabetesReadmission['age>=70'] = np.where(df_DiabetesReadmission.age.isin(['[70-80)', '[80-90)', '[90-100)']), 1, 0)
for col in ['race', 'gender', 'age', 'weight']:
    print(f"Summary for column: {col}")
    print(df_DiabetesReadmission[col].value_counts(dropna=False).sort_values())
    print("-" * 30)
df_DiabetesReadmission = df_DiabetesReadmission.drop(columns=['encounter_id', 'patient_nbr', 'race', 'age', 'weight', "payer_code", "medical_specialty"]).reset_index(drop=True)
df_DiabetesReadmission.columns

Summary for column: race
race
Asian                521
Hispanic            1419
AfricanAmerican    14693
Caucasian          56982
Name: count, dtype: int64
------------------------------
Summary for column: gender
gender
1    33773
0    39842
Name: count, dtype: int64
------------------------------
Summary for column: age
age
[20-30)      1257
[90-100)     1924
[30-40)      2828
[40-50)      7101
[80-90)     12406
[50-60)     12815
[60-70)     16696
[70-80)     18588
Name: count, dtype: int64
------------------------------
Summary for column: weight
weight
>200             2
[0-25)          10
[175-200)       10
[150-175)       27
[25-50)         45
[125-150)      107
[100-125)      479
[50-75)        516
[75-100)       874
NaN          71545
Name: count, dtype: int64
------------------------------


Index(['gender', 'admission_type_id', 'discharge_disposition_id',
       'admission_source_id', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'race_Caucasian', 'race_AfricanAmerican', 'race_Hispanic', 'race_Asian',
       'age>=70'],
      dtype='object')

In [4]:
# for col in ['max_glu_serum', 'A1Cresult', 'metformin', 'insulin', 'change', 'diabetesMed', 'readmitted']:
low_quality_col = ['repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
            'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 
            'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']
for col in low_quality_col:
    print(f"Summary for column: {col}")
    print(df_DiabetesReadmission[col].value_counts(dropna=False).sort_values())
    print("-" * 30)

df_DiabetesReadmission = df_DiabetesReadmission.drop(columns=low_quality_col).reset_index(drop=True)
df_DiabetesReadmission.columns

Summary for column: repaglinide
repaglinide
Down         31
Up           86
Steady     1199
No        72299
Name: count, dtype: int64
------------------------------
Summary for column: nateglinide
nateglinide
Down          9
Up           20
Steady      579
No        73007
Name: count, dtype: int64
------------------------------
Summary for column: chlorpropamide
chlorpropamide
Down          1
Up            4
Steady       37
No        73573
Name: count, dtype: int64
------------------------------
Summary for column: glimepiride
glimepiride
Down        147
Up          229
Steady     3712
No        69527
Name: count, dtype: int64
------------------------------
Summary for column: acetohexamide
acetohexamide
Steady        1
No        73614
Name: count, dtype: int64
------------------------------
Summary for column: glipizide
glipizide
Down        406
Up          527
Steady     8474
No        64208
Name: count, dtype: int64
------------------------------
Summary for column: glyburide
glybur

Index(['gender', 'admission_type_id', 'discharge_disposition_id',
       'admission_source_id', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'insulin', 'change', 'diabetesMed', 'readmitted', 'race_Caucasian',
       'race_AfricanAmerican', 'race_Hispanic', 'race_Asian', 'age>=70'],
      dtype='object')

In [5]:
for col in ['max_glu_serum', 'A1Cresult', 'metformin', 'insulin', 'change', 'diabetesMed']:
    print(f"Summary for column: {col}")
    print(df_DiabetesReadmission[col].value_counts(dropna=False).sort_values())
    print("-" * 30)
df_DiabetesReadmission["max_glu_serum>200"] = df_DiabetesReadmission["max_glu_serum"].map({"None": 0, "Norm": 0, ">200": 1, ">300": 1})
df_DiabetesReadmission["max_glu_serum>300"] = df_DiabetesReadmission["max_glu_serum"].map({"None": 0, "Norm": 0, ">200": 0, ">300": 1})
df_DiabetesReadmission["max_glu_serum"] = df_DiabetesReadmission["max_glu_serum"].map({"None": 0, "Norm": 1, ">200": 1, ">300": 1})
df_DiabetesReadmission["A1Cresult>7"] = df_DiabetesReadmission["A1Cresult"].map({"None": 0, "Norm": 0, ">7": 1, ">8": 1})
df_DiabetesReadmission["A1Cresult>8"] = df_DiabetesReadmission["A1Cresult"].map({"None": 0, "Norm": 0, ">7": 0, ">8": 1})
df_DiabetesReadmission["A1Cresult"] = df_DiabetesReadmission["A1Cresult"].map({"None": 0, "Norm": 1, ">7": 1, ">8": 1})
df_DiabetesReadmission['change'] = np.where((df_DiabetesReadmission['change'] == 'Ch'), 1, 0)
df_DiabetesReadmission['diabetesMed'] = np.where((df_DiabetesReadmission['diabetesMed'] == 'Yes'), 1, 0)
df_DiabetesReadmission['metformin_Up'] = np.where((df_DiabetesReadmission['metformin'] == 'Up'), 1, 0)
df_DiabetesReadmission['metformin_Down'] = np.where((df_DiabetesReadmission['metformin'] == 'Down'), 1, 0)
df_DiabetesReadmission['metformin_Steady'] = np.where((df_DiabetesReadmission['metformin'] == 'Steady'), 1, 0)
df_DiabetesReadmission['insulin_Up'] = np.where((df_DiabetesReadmission['insulin'] == 'Up'), 1, 0)
df_DiabetesReadmission['insulin_Down'] = np.where((df_DiabetesReadmission['insulin'] == 'Down'), 1, 0)
df_DiabetesReadmission['insulin_Steady'] = np.where((df_DiabetesReadmission['insulin'] == 'Steady'), 1, 0)
df_DiabetesReadmission = df_DiabetesReadmission.drop(columns=['metformin', 'insulin'])
df_DiabetesReadmission

Summary for column: max_glu_serum
max_glu_serum
>200        7
Norm       44
>300      255
NaN     73309
Name: count, dtype: int64
------------------------------
Summary for column: A1Cresult
A1Cresult
>7       2887
Norm     3912
>8       5694
NaN     61122
Name: count, dtype: int64
------------------------------
Summary for column: metformin
metformin
Down        425
Up          771
Steady    13994
No        58425
Name: count, dtype: int64
------------------------------
Summary for column: insulin
insulin
Up         8354
Down       9390
Steady    22139
No        33732
Name: count, dtype: int64
------------------------------
Summary for column: change
change
Ch    35103
No    38512
Name: count, dtype: int64
------------------------------
Summary for column: diabetesMed
diabetesMed
No     16442
Yes    57173
Name: count, dtype: int64
------------------------------


Unnamed: 0,gender,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,...,max_glu_serum>200,max_glu_serum>300,A1Cresult>7,A1Cresult>8,metformin_Up,metformin_Down,metformin_Steady,insulin_Up,insulin_Down,insulin_Steady
0,0,1,1,7,2,11,5,13,2,0,...,,,,,0,0,0,0,0,0
1,1,1,1,7,2,44,1,16,0,0,...,,,,,0,0,0,1,0,0
2,1,1,1,7,1,51,0,8,0,0,...,,,,,0,0,0,0,0,1
3,1,1,1,7,5,73,0,12,0,0,...,,,,,0,0,0,0,0,0
4,0,1,1,7,9,47,2,17,0,0,...,,,,,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73610,0,1,1,7,6,45,1,25,3,1,...,,,,,0,0,0,0,1,0
73611,1,1,3,7,3,51,0,16,0,0,...,,,1.0,1.0,0,0,1,0,1,0
73612,1,1,1,7,1,53,0,9,1,0,...,,,,,0,0,1,0,1,0
73613,0,2,3,7,10,45,2,21,0,0,...,,,,,0,0,0,1,0,0


In [6]:
from features import *
transforms = make_value_map_transforms(DIABETES_READMISSION_FEATURES)
# ColumnTransformer(transforms)
filtered_transforms = [
    t for t in transforms if any(col in t[2] for col in ['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2', 'diag_3'])
]

# Create a ColumnTransformer with only the filtered transformations
ct = ColumnTransformer(filtered_transforms)

In [7]:
df_transformed = ct.fit_transform(df_DiabetesReadmission[['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2', 'diag_3']].fillna("nan"))
df_transformed = pd.DataFrame(df_transformed, columns=remove_verbose_prefixes(ct.get_feature_names_out()))
df_transformed

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3
0,Emergency,Discharged to home,Emergency Room,Other current conditions in the mother classif...,Diabetes mellitus,Outcome of delivery
1,Emergency,Discharged to home,Emergency Room,Intestinal infections due to other organisms,"Diabetes mellitus type 1, uncontrolled Diabete...",Hypertensive chronic kidney disease
2,Emergency,Discharged to home,Emergency Room,Secondary malignant neoplasm of respiratory an...,Malignant neoplasm of pancreas,Diabetes mellitus
3,Emergency,Discharged to home,Emergency Room,Heart failure,Emphysema,Diabetes mellitus
4,Emergency,Discharged to home,Emergency Room,Diabetes with peripheral circulatory disorders,Hypertensive chronic kidney disease,Complications peculiar to certain specified pr...
...,...,...,...,...,...,...
73610,Emergency,Discharged to home,Emergency Room,Epilepsy and recurrent seizures,Late effects of cerebrovascular disease,Old myocardial infarction
73611,Emergency,Discharged/transferred to SNF,Emergency Room,"Diabetes mellitus type 1, uncontrolled Diabete...",Alcohol-induced mental disorders,Hypotension
73612,Emergency,Discharged to home,Emergency Room,Septicemia,Infections of kidney,Episodic mood disorders
73613,Urgent,Discharged/transferred to SNF,Emergency Room,Complications peculiar to certain specified pr...,Other and unspecified anemias,"Other complications of procedures, NEC"


In [8]:
df_DiabetesReadmission['admission_type_id'].value_counts(dropna=False).sort_values()

admission_type_id
2    13864
3    15565
1    44186
Name: count, dtype: int64

In [9]:
df_transformed['admission_type_id'].value_counts(dropna=False).sort_values()

admission_type_id
Urgent       13864
Elective     15565
Emergency    44186
Name: count, dtype: int64

In [10]:
df_DiabetesReadmission['admission_source_id'].value_counts(dropna=False).sort_values()

admission_source_id
1    24402
7    49213
Name: count, dtype: int64

In [11]:
df_transformed['admission_source_id'].value_counts(dropna=False).sort_values()

admission_source_id
Physician Referral    24402
Emergency Room        49213
Name: count, dtype: int64

In [12]:
df_DiabetesReadmission['discharge_disposition_id'].value_counts(dropna=False).sort_values()

discharge_disposition_id
23      339
4       600
5       807
2      1588
22     1805
6     10084
3     10231
1     48161
Name: count, dtype: int64

In [13]:
df_transformed['discharge_disposition_id'].value_counts(dropna=False).sort_values()

discharge_disposition_id
Discharged/transferred to a long term care hospital.                                  339
Discharged/transferred to ICF                                                         600
Discharged/transferred to another type of inpatient care institution                  807
Discharged/transferred to another short term hospital                                1588
Discharged/transferred to another rehab fac including rehab units of a hospital.     1805
Discharged/transferred to home with home health service                             10084
Discharged/transferred to SNF                                                       10231
Discharged to home                                                                  48161
Name: count, dtype: int64

In [14]:
def map_discharge_simple(x):
    # Treat NULL, "Not Mapped", or empty strings as missing
    if pd.isnull(x) or x.strip() == "" or x.lower() in ["null", "not mapped"]:
        return None
    x_lower = x.lower()
    # Group anything with "home" (for example: 'Discharged to home', 'Discharged/transferred to home with home health service')
    if "home" in x_lower:
        return "Home"
    # For others that mention 'transferred', 'discharged/transferred', or similar phrases
    elif "transferred" in x_lower or "discharged/transferred" in x_lower:
        return "Transferred"

In [15]:
df_transformed["discharge_group"] = df_transformed["discharge_disposition_id"].apply(map_discharge_simple)
print(df_transformed['discharge_group'].value_counts(dropna=False).sort_values())
df_DiabetesReadmission['Transferred'] = np.where((df_transformed['discharge_group'] == 'Transferred'), 1, 0)
df_DiabetesReadmission['Home'] = np.where((df_transformed['discharge_group'] == 'Home'), 1, 0)

discharge_group
Transferred    15370
Home           58245
Name: count, dtype: int64


In [16]:
df_DiabetesReadmission['Emergency_admission'] = np.where((df_DiabetesReadmission['admission_type_id'] == 1), 1, 0)
df_DiabetesReadmission['Elective_admission'] = np.where((df_DiabetesReadmission['admission_type_id'] == 3), 1, 0)
df_DiabetesReadmission['Urgent_admission'] = np.where((df_DiabetesReadmission['admission_type_id'] == 2), 1, 0)

df_DiabetesReadmission['Emergency Room'] = np.where((df_DiabetesReadmission['admission_source_id'] == 7), 1, 0)

df_DiabetesReadmission = df_DiabetesReadmission.drop(columns=['admission_type_id', 'admission_source_id', 'discharge_disposition_id'])
df_DiabetesReadmission.columns

Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed', 'readmitted',
       'race_Caucasian', 'race_AfricanAmerican', 'race_Hispanic', 'race_Asian',
       'age>=70', 'max_glu_serum>200', 'max_glu_serum>300', 'A1Cresult>7',
       'A1Cresult>8', 'metformin_Up', 'metformin_Down', 'metformin_Steady',
       'insulin_Up', 'insulin_Down', 'insulin_Steady', 'Transferred', 'Home',
       'Emergency_admission', 'Elective_admission', 'Urgent_admission',
       'Emergency Room'],
      dtype='object')

In [17]:
import pandas as pd

# -- 1) Calculate overall frequencies of diagnoses --

# Flatten all diagnosis columns into one long Series
all_diags = pd.concat([
    df_DiabetesReadmission["diag_1"],
    df_DiabetesReadmission["diag_2"],
    df_DiabetesReadmission["diag_3"]
], axis=0).dropna()

# Compute relative frequency
diag_counts = all_diags.value_counts(normalize=True)

# Use threshold = 0.005 for 0.5%
diag_keep = diag_counts[diag_counts > 0.005].index

icd_mapping = get_icd9()
for diag in diag_keep:
    print(diag, diag_counts[diag], icd_mapping[diag])

# -- 2) Create one-hot/multi-hot columns for each frequent diagnosis --
df_encoded = pd.DataFrame(index=df_DiabetesReadmission.index)

for diag in diag_keep:
    df_encoded[f'diag_{diag}'] = (
        df_DiabetesReadmission[['diag_1', 'diag_2', 'diag_3']]
        .isin([diag])                      # check if any diag matches this diag
        .any(axis=1)                      # True if diag_1 OR diag_2 OR diag_3 = diag
        .astype(int)
    )

df_encoded


428 0.0598563701649925 Heart failure
250 0.05867460569974092 Diabetes mellitus
276 0.04618880959956365 Disorders of fluid, electrolyte, and acid-base balance
401 0.04193445752465797 Essential hypertension
414 0.04143902549884096 Other forms of chronic ischemic heart disease
427 0.03868005999727285 Cardiac dysrhythmias
599 0.022885323394391165 Other disorders of urethra and urinary tract
403 0.019585473387573292 Hypertensive chronic kidney disease
496 0.019535475660197265 Chronic airway obstruction, not elsewhere classified
786 0.01752647606926958 Symptoms involving respiratory system and other chest symptoms
486 0.017499204581609928 Pneumonia, organism unspecified
780 0.016081087223308033 General symptoms
491 0.01519021862642607 Chronic bronchitis
682 0.014953865733375756 Other cellulitis and abscess
250.02 0.014431162219899096 Diabetes mellitus type 2, uncontrolled Diabetes mellitus type 1 Diabetes mellitus type 2 without mention of complication
585 0.014381164492523068 Chronic kidney

Unnamed: 0,diag_428,diag_250,diag_276,diag_401,diag_414,diag_427,diag_599,diag_403,diag_496,diag_786,...,diag_434,diag_424,diag_250.01,diag_38,diag_V45,diag_305,diag_998,diag_560,diag_577,diag_574
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
73613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [18]:
# df_DiabetesReadmission = df_DiabetesReadmission.join(df_encoded)
df_DiabetesReadmission = df_DiabetesReadmission.drop(columns=['diag_1', 'diag_2', 'diag_3'])
df_DiabetesReadmission

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,...,metformin_Steady,insulin_Up,insulin_Down,insulin_Steady,Transferred,Home,Emergency_admission,Elective_admission,Urgent_admission,Emergency Room
0,0,2,11,5,13,2,0,1,6,,...,0,0,0,0,0,1,1,0,0,1
1,1,2,44,1,16,0,0,0,7,,...,0,1,0,0,0,1,1,0,0,1
2,1,1,51,0,8,0,0,0,5,,...,0,0,0,1,0,1,1,0,0,1
3,1,5,73,0,12,0,0,0,8,,...,0,0,0,0,0,1,1,0,0,1
4,0,9,47,2,17,0,0,0,9,,...,0,0,0,1,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73610,0,6,45,1,25,3,1,2,9,,...,0,0,1,0,0,1,1,0,0,1
73611,1,3,51,0,16,0,0,0,9,,...,1,0,1,0,1,0,1,0,0,1
73612,1,1,53,0,9,1,0,0,13,,...,1,0,1,0,0,1,1,0,0,1
73613,0,10,45,2,21,0,0,1,9,,...,0,1,0,0,1,0,0,0,1,1


In [19]:
df_DiabetesReadmission.sum(axis=0)

gender                    33773.0
time_in_hospital         318312.0
num_lab_procedures      3194456.0
num_procedures            96625.0
num_medications         1180804.0
number_outpatient         28075.0
number_emergency          16005.0
number_inpatient          48553.0
number_diagnoses         558652.0
max_glu_serum               306.0
A1Cresult                 12493.0
change                    35103.0
diabetesMed               57173.0
readmitted                35880.0
race_Caucasian            56982.0
race_AfricanAmerican      14693.0
race_Hispanic              1419.0
race_Asian                  521.0
age>=70                   32918.0
max_glu_serum>200           262.0
max_glu_serum>300           255.0
A1Cresult>7                8581.0
A1Cresult>8                5694.0
metformin_Up                771.0
metformin_Down              425.0
metformin_Steady          13994.0
insulin_Up                 8354.0
insulin_Down               9390.0
insulin_Steady            22139.0
Transferred   

In [20]:
df_DiabetesReadmission.columns.drop('readmitted')

Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'change', 'diabetesMed', 'race_Caucasian', 'race_AfricanAmerican',
       'race_Hispanic', 'race_Asian', 'age>=70', 'max_glu_serum>200',
       'max_glu_serum>300', 'A1Cresult>7', 'A1Cresult>8', 'metformin_Up',
       'metformin_Down', 'metformin_Steady', 'insulin_Up', 'insulin_Down',
       'insulin_Steady', 'Transferred', 'Home', 'Emergency_admission',
       'Elective_admission', 'Urgent_admission', 'Emergency Room'],
      dtype='object')

In [21]:
df_DiabetesReadmission['readmitted'] = df_DiabetesReadmission['readmitted'].astype(int)
df_DiabetesReadmission = df_DiabetesReadmission[['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'change', 'diabetesMed', 'race_Caucasian', 'race_AfricanAmerican',
       'race_Hispanic', 'race_Asian', 'age>=70', 'max_glu_serum>200',
       'max_glu_serum>300', 'A1Cresult>7', 'A1Cresult>8', 'metformin_Up',
       'metformin_Down', 'metformin_Steady', 'insulin_Up', 'insulin_Down',
       'insulin_Steady', 'Transferred', 'Home', 'Emergency_admission',
       'Elective_admission', 'Urgent_admission', 'Emergency Room', 'readmitted']]
df_DiabetesReadmission.columns

Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'change', 'diabetesMed', 'race_Caucasian', 'race_AfricanAmerican',
       'race_Hispanic', 'race_Asian', 'age>=70', 'max_glu_serum>200',
       'max_glu_serum>300', 'A1Cresult>7', 'A1Cresult>8', 'metformin_Up',
       'metformin_Down', 'metformin_Steady', 'insulin_Up', 'insulin_Down',
       'insulin_Steady', 'Transferred', 'Home', 'Emergency_admission',
       'Elective_admission', 'Urgent_admission', 'Emergency Room',
       'readmitted'],
      dtype='object')

In [22]:
df_out0 = df_DiabetesReadmission[df_DiabetesReadmission['Emergency Room'] == 0]
df_out1 = df_DiabetesReadmission[df_DiabetesReadmission['Emergency Room'] == 1]
print(len(df_out0), len(df_out1))

24402 49213


In [23]:
df_out0

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,...,insulin_Up,insulin_Down,insulin_Steady,Transferred,Home,Emergency_admission,Elective_admission,Urgent_admission,Emergency Room,readmitted
11,0,11,42,2,19,0,0,0,8,,...,0,0,0,0,1,0,0,1,0,1
13,0,2,25,2,11,0,0,0,3,,...,0,0,1,0,1,1,0,0,0,1
22,0,13,48,2,18,0,0,1,8,,...,0,0,1,0,1,0,0,1,0,1
32,0,14,28,3,28,0,0,0,8,,...,0,1,0,0,1,0,1,0,0,0
36,0,3,57,1,21,0,0,0,6,,...,1,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73596,1,1,1,5,8,0,0,0,4,,...,0,0,1,0,1,0,1,0,0,0
73599,1,3,57,0,7,0,1,0,3,,...,0,0,0,0,1,0,0,1,0,0
73603,0,3,27,1,29,0,1,0,9,,...,0,0,1,0,1,0,1,0,0,0
73604,0,3,31,2,24,0,0,0,9,,...,0,1,0,0,1,0,1,0,0,1


In [24]:
df_out1

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,...,insulin_Up,insulin_Down,insulin_Steady,Transferred,Home,Emergency_admission,Elective_admission,Urgent_admission,Emergency Room,readmitted
0,0,2,11,5,13,2,0,1,6,,...,0,0,0,0,1,1,0,0,1,0
1,1,2,44,1,16,0,0,0,7,,...,1,0,0,0,1,1,0,0,1,0
2,1,1,51,0,8,0,0,0,5,,...,0,0,1,0,1,1,0,0,1,0
3,1,5,73,0,12,0,0,0,8,,...,0,0,0,0,1,1,0,0,1,1
4,0,9,47,2,17,0,0,0,9,,...,0,0,1,0,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73610,0,6,45,1,25,3,1,2,9,,...,0,1,0,0,1,1,0,0,1,1
73611,1,3,51,0,16,0,0,0,9,,...,0,1,0,1,0,1,0,0,1,1
73612,1,1,53,0,9,1,0,0,13,,...,0,1,0,0,1,1,0,0,1,0
73613,0,10,45,2,21,0,0,1,9,,...,1,0,0,1,0,0,0,1,1,0


In [25]:
# df_out0.drop(columns=['Emergency Room']).to_csv('target_DiabetesReadmission.csv', index=False)
# df_out1.drop(columns=['Emergency Room']).to_csv('source_DiabetesReadmission.csv', index=False)