# Texas Inpatient Discharg - Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from scipy import stats
import yaml, time, sys, os, glob

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

DATASET = "Texas_Inpatient_Discharge"
SPLIT_TRAINING = True
DEBUG = False
SEED = 42

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

In [2]:
from sklearn.model_selection import train_test_split


from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.feature_selection import SelectPercentile, chi2

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score

In [3]:
target="TARGET"

cat_features=[
    "TYPE_OF_ADMISSION",
    "SOURCE_OF_ADMISSION",
    "PAT_STATE",
    "PUBLIC_HEALTH_REGION",
]
num_features=[]

features= cat_features + num_features


In [4]:
df = pd.read_pickle(f"{ROOT}/data/df_train_preprocess_00_of_5.pkl")
print(df.shape)
df.head()

(199939, 40)


Unnamed: 0,TYPE_OF_ADMISSION,SOURCE_OF_ADMISSION,PAT_STATE,PAT_COUNTRY,PUBLIC_HEALTH_REGION,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,TARGET,PROVIDER_NAME_col_0,PROVIDER_NAME_col_1,PROVIDER_NAME_col_2,PROVIDER_NAME_col_3,PROVIDER_NAME_col_4,PROVIDER_NAME_col_5,PROVIDER_NAME_col_6,COUNTY_col_0,COUNTY_col_1,COUNTY_col_2,COUNTY_col_3,COUNTY_col_4,COUNTY_col_5,COUNTY_col_6,ADMITTING_DIAGNOSIS_col_0,ADMITTING_DIAGNOSIS_col_1,ADMITTING_DIAGNOSIS_col_2,ADMITTING_DIAGNOSIS_col_3,ADMITTING_DIAGNOSIS_col_4,ADMITTING_DIAGNOSIS_col_5,ADMITTING_DIAGNOSIS_col_6,PRINC_DIAG_CODE_col_0,PRINC_DIAG_CODE_col_1,PRINC_DIAG_CODE_col_2,PRINC_DIAG_CODE_col_3,PRINC_DIAG_CODE_col_4,PRINC_DIAG_CODE_col_5,PRINC_DIAG_CODE_col_6,POA_OTH_DIAG_CODE_COUNT,POA_E_CODE_COUNT
992358,1,1,TX,US,3,M,4,2,6,2,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,8,0
900799,1,1,TX,US,11,F,4,1,4,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,14,0
770151,1,1,TX,US,11,M,4,1,6,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,11,0
762640,1,1,TX,US,7,F,2,1,6,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,13,0
896831,4,5,TX,US,8,M,4,2,4,2,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0


In [8]:
cat_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore')),
    ('selector', SelectPercentile(chi2,percentile=80)),
])

num_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler()),
])

preprocessor = ColumnTransformer(transformers=[
    ('cat',cat_transformer,cat_features),
    ('num',num_transformer,num_features),
])

# model = RandomForestClassifier()
    
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier()),
])

In [10]:
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=SEED)
X_train.shape, X_test.shape

((119963, 4), (79976, 4))

In [11]:
#preprocessor.fit(X_train,y_train)
# preprocessor.transform(X_train)
# preprocessor.transform(X_test)


In [12]:
model.fit(X_train,y_train)

In [14]:
y_pred = model.predict(X_train)

print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.66      0.22      0.33     24526
           1       0.44      0.63      0.52     45874
           2       0.56      0.51      0.53     49563

    accuracy                           0.50    119963
   macro avg       0.55      0.45      0.46    119963
weighted avg       0.53      0.50      0.48    119963

