# Texas Inpatient Discharg - Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from scipy import stats
import yaml, time, sys, os, glob

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

DATASET = "Texas_Inpatient_Discharge"
SPLIT_TRAINING = True
DEBUG = False
SEED = 42

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV


from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.feature_selection import SelectPercentile, chi2, RFECV

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier



from sklearn.model_selection import GridSearchCV, cross_val_score

In [3]:
df = pd.read_pickle(f"{ROOT}/data/df_train_preprocess_00_of_5.pkl")
print(df.shape)
df.head()

(199939, 40)


Unnamed: 0,TYPE_OF_ADMISSION,SOURCE_OF_ADMISSION,PAT_STATE,PAT_COUNTRY,PUBLIC_HEALTH_REGION,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,TARGET,PROVIDER_NAME_col_0,PROVIDER_NAME_col_1,PROVIDER_NAME_col_2,PROVIDER_NAME_col_3,PROVIDER_NAME_col_4,PROVIDER_NAME_col_5,PROVIDER_NAME_col_6,COUNTY_col_0,COUNTY_col_1,COUNTY_col_2,COUNTY_col_3,COUNTY_col_4,COUNTY_col_5,COUNTY_col_6,ADMITTING_DIAGNOSIS_col_0,ADMITTING_DIAGNOSIS_col_1,ADMITTING_DIAGNOSIS_col_2,ADMITTING_DIAGNOSIS_col_3,ADMITTING_DIAGNOSIS_col_4,ADMITTING_DIAGNOSIS_col_5,ADMITTING_DIAGNOSIS_col_6,PRINC_DIAG_CODE_col_0,PRINC_DIAG_CODE_col_1,PRINC_DIAG_CODE_col_2,PRINC_DIAG_CODE_col_3,PRINC_DIAG_CODE_col_4,PRINC_DIAG_CODE_col_5,PRINC_DIAG_CODE_col_6,POA_OTH_DIAG_CODE_COUNT,POA_E_CODE_COUNT
992358,1,1,TX,US,3,M,4,2,6,2,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,8,0
900799,1,1,TX,US,11,F,4,1,4,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,14,0
770151,1,1,TX,US,11,M,4,1,6,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,11,0
762640,1,1,TX,US,7,F,2,1,6,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,13,0
896831,4,5,TX,US,8,M,4,2,4,2,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199939 entries, 992358 to 410879
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   TYPE_OF_ADMISSION          199939 non-null  category
 1   SOURCE_OF_ADMISSION        199939 non-null  category
 2   PAT_STATE                  199939 non-null  category
 3   PAT_COUNTRY                199939 non-null  category
 4   PUBLIC_HEALTH_REGION       199939 non-null  category
 5   SEX_CODE                   199939 non-null  category
 6   RACE                       199939 non-null  category
 7   ETHNICITY                  199939 non-null  category
 8   ADMIT_WEEKDAY              199939 non-null  category
 9   TARGET                     199939 non-null  category
 10  PROVIDER_NAME_col_0        199939 non-null  category
 11  PROVIDER_NAME_col_1        199939 non-null  category
 12  PROVIDER_NAME_col_2        199939 non-null  category
 13  PROVIDER_

In [5]:
target="TARGET"

cat_features = [catFeature for catFeature in df.select_dtypes("category").columns if catFeature not in target]

#print(cat_features)

num_features=[numFeature for numFeature in df.select_dtypes("int").columns if numFeature not in target]

features= cat_features + num_features


# cat_features=[
#     "TYPE_OF_ADMISSION",
#     "SOURCE_OF_ADMISSION",
#     "PAT_STATE",
#     "PUBLIC_HEALTH_REGION",
#     #"admit_weekly",
#     #"ADMITTING_DIAGNOSIS",
#     #"PRINC_DIAG_CODE",
#     #"POA_PRINC_DIAG_CODE"
# ]

features


['TYPE_OF_ADMISSION',
 'SOURCE_OF_ADMISSION',
 'PAT_STATE',
 'PAT_COUNTRY',
 'PUBLIC_HEALTH_REGION',
 'SEX_CODE',
 'RACE',
 'ETHNICITY',
 'ADMIT_WEEKDAY',
 'PROVIDER_NAME_col_0',
 'PROVIDER_NAME_col_1',
 'PROVIDER_NAME_col_2',
 'PROVIDER_NAME_col_3',
 'PROVIDER_NAME_col_4',
 'PROVIDER_NAME_col_5',
 'PROVIDER_NAME_col_6',
 'COUNTY_col_0',
 'COUNTY_col_1',
 'COUNTY_col_2',
 'COUNTY_col_3',
 'COUNTY_col_4',
 'COUNTY_col_5',
 'COUNTY_col_6',
 'ADMITTING_DIAGNOSIS_col_0',
 'ADMITTING_DIAGNOSIS_col_1',
 'ADMITTING_DIAGNOSIS_col_2',
 'ADMITTING_DIAGNOSIS_col_3',
 'ADMITTING_DIAGNOSIS_col_4',
 'ADMITTING_DIAGNOSIS_col_5',
 'ADMITTING_DIAGNOSIS_col_6',
 'PRINC_DIAG_CODE_col_0',
 'PRINC_DIAG_CODE_col_1',
 'PRINC_DIAG_CODE_col_2',
 'PRINC_DIAG_CODE_col_3',
 'PRINC_DIAG_CODE_col_4',
 'PRINC_DIAG_CODE_col_5',
 'PRINC_DIAG_CODE_col_6',
 'POA_OTH_DIAG_CODE_COUNT',
 'POA_E_CODE_COUNT']

In [6]:
cat_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore')),
    ('selector', SelectPercentile(chi2,percentile=80)),
])

num_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler()),
])

preprocessor = ColumnTransformer(transformers=[
    ('cat',cat_transformer,cat_features),
    ('num',num_transformer,num_features),
])

In [7]:
models = {
    "LR": LogisticRegression(max_iter=1000),
    #"DT": DecisionTreeClassifier(),
    #"KNN": KNeighborsClassifier(),
    #"RF": RandomForestClassifier(),
    #"ET": ExtraTreesClassifier(),
    "XGB": XGBClassifier()
    #"CB": CatBoostClassifier(silent=True),
    #"ADA": AdaBoostClassifier(n_estimators=100,random_state=SEED),
    #"GBRT": GradientBoostingClassifier(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42) 
}

In [8]:
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=SEED)
X_train.shape, X_test.shape

((119963, 39), (79976, 39))

In [9]:
for name,model in models.items():
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])
    scores = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=5)
    print(name, "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

LR Accuracy: 0.53 (+/- 0.01)
XGB Accuracy: 0.57 (+/- 0.00)


In [10]:
#preprocessor.fit(X_train,y_train)
#preprocessor.transform(X_train)
#preprocessor.transform(X_test)

X_train

Unnamed: 0,TYPE_OF_ADMISSION,SOURCE_OF_ADMISSION,PAT_STATE,PAT_COUNTRY,PUBLIC_HEALTH_REGION,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,PROVIDER_NAME_col_0,PROVIDER_NAME_col_1,PROVIDER_NAME_col_2,PROVIDER_NAME_col_3,PROVIDER_NAME_col_4,PROVIDER_NAME_col_5,PROVIDER_NAME_col_6,COUNTY_col_0,COUNTY_col_1,COUNTY_col_2,COUNTY_col_3,COUNTY_col_4,COUNTY_col_5,COUNTY_col_6,ADMITTING_DIAGNOSIS_col_0,ADMITTING_DIAGNOSIS_col_1,ADMITTING_DIAGNOSIS_col_2,ADMITTING_DIAGNOSIS_col_3,ADMITTING_DIAGNOSIS_col_4,ADMITTING_DIAGNOSIS_col_5,ADMITTING_DIAGNOSIS_col_6,PRINC_DIAG_CODE_col_0,PRINC_DIAG_CODE_col_1,PRINC_DIAG_CODE_col_2,PRINC_DIAG_CODE_col_3,PRINC_DIAG_CODE_col_4,PRINC_DIAG_CODE_col_5,PRINC_DIAG_CODE_col_6,POA_OTH_DIAG_CODE_COUNT,POA_E_CODE_COUNT
632486,3,2,TX,US,03,F,4,2,5,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,4,0
353811,3,2,TX,US,05,M,3,2,4,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,8,0
188575,1,1,TX,US,01,F,4,2,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,5,0
151440,3,4,TX,US,06,F,3,2,2,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
732852,1,1,TX,US,06,F,4,1,4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488104,4,5,TX,US,03,F,5,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
874523,1,1,TX,US,04,F,3,2,5,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,9,0
715832,1,1,TX,US,06,M,3,2,4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0
783859,2,1,TX,US,09,F,4,2,7,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3,0


In [11]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('XGB', XGBClassifier()),
])

In [12]:
model.fit(X_train,y_train)

In [13]:
y_pred = model.predict(X_train)

print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.69      0.47      0.56     24526
           1       0.55      0.62      0.59     45874
           2       0.67      0.70      0.68     49563

    accuracy                           0.62    119963
   macro avg       0.64      0.60      0.61    119963
weighted avg       0.63      0.62      0.62    119963



In [14]:
param_grid = {
    'XGB__learning_rate': [0.01, 0.1, 0.5],
    'XGB__max_depth': [3, 5, 7],
    'XGB__n_estimators': [100, 500, 1000]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

#grid_search.fit(X_train, y_train)

#print("Best parameters:", grid_search.best_params_)
#print("Best score:", grid_search.best_score_)

Best parameters: {'XGB__learning_rate': 0.5, 'XGB__max_depth': 3, 'XGB__n_estimators': 500}
Best score: 0.5722681188815673

In [15]:
model2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('XGB', XGBClassifier(learning_rate=0.5, max_depth=3, n_estimators=500)),
])

model2.fit(X_train,y_train)

y_pred = model2.predict(X_train)

print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.66      0.45      0.54     24526
           1       0.53      0.60      0.56     45874
           2       0.66      0.68      0.67     49563

    accuracy                           0.60    119963
   macro avg       0.62      0.58      0.59    119963
weighted avg       0.61      0.60      0.60    119963

