In [1]:
import datetime as dt
import os
import sys

import numpy as np
import pandas as pd
from scipy import interp
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_curve, average_precision_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
import sqlalchemy as sa
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('../')
from utilities import sql_utils as su
from utilities import model_eval_utils as meu

# The warehouse connection URL is read from the environment rather than
# hard-coded in the notebook.
DWH = os.getenv('MIMIC_DWH')
if not DWH:
    # create_engine(None) fails with an opaque argument error; stop early with
    # a message that names the missing variable instead.
    raise EnvironmentError("The MIMIC_DWH environment variable must be set to "
                           "a SQLAlchemy database URL")
engine = create_engine(DWH)

# Allow up to 1000 columns/rows in displayed frames and render floats with
# three decimals.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.set_option('display.float_format', lambda x: '%.3f' % x)

  """)


In [None]:
QUERY = """
select
  *
from datasets.model_demog_dx
"""
# Read the full result of QUERY into a DataFrame; the context manager
# releases the connection once the read has finished.
with engine.begin() as connection:
    df = pd.read_sql(sql=QUERY, con=connection)

## Exploring The Data

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Count how many rows carry each value of hospital_expire_flag (the column
# used as the model target further down).
df.hospital_expire_flag.value_counts()

In [None]:
# Summary statistics computed separately for each insurance group.
df.groupby('insurance').describe()

## Machine Learning

### Data Pre-Processing

In [None]:
def split_dataset(dataset, train_percentage, feature_headers, target_header,
                  random_state=None, stratify=False):
    """
    Split the dataset into train and test partitions.

    Args:
    dataset: DataFrame holding both the feature columns and the target column
    train_percentage: size of the training partition, forwarded unchanged as
        ``train_size`` to ``train_test_split``. Pass a fraction such as 0.7;
        scikit-learn interprets an int as an absolute number of rows, so 70
        would mean 70 rows rather than 70%.
    feature_headers: columns that are features to include
    target_header: column that is the outcome variable of interest
    random_state: optional seed (or RandomState) forwarded to
        ``train_test_split`` so the split can be reproduced. The default of
        None keeps the previous unseeded behaviour.
    stratify: when True, the split preserves the class proportions of the
        target column in both partitions. Defaults to False (previous
        behaviour).
    :return: train_x, test_x, train_y, test_y
    """
    target = dataset[target_header]

    # Split dataset into train and test dataset
    train_x, test_x, train_y, test_y = train_test_split(
        dataset[feature_headers],
        target,
        train_size=train_percentage,
        random_state=random_state,
        # train_test_split expects the label array itself (or None) here.
        stratify=target if stratify else None,
    )
    return train_x, test_x, train_y, test_y

In [None]:
# One-hot encode each categorical column. dummy_na=True adds an explicit
# indicator column for missing values. Each pair is (source column, prefix of
# the generated column names); the unpacking order matches the pair order.
(admission_type_dummies,
 insurance_dummies,
 language_dummies,
 marital_dummies,
 ethnicity_dummies) = (
    pd.get_dummies(df[source_column], prefix=column_prefix, dummy_na=True)
    for source_column, column_prefix in [
        ('admission_type', 'admission_type'),
        ('insurance', 'insurance'),
        ('language', 'language'),
        ('marital_status', 'marital'),
        ('ethnicity', 'ethnicity'),
    ]
)

In [None]:
# Place the one-hot encoded blocks next to the original columns (column-wise
# concatenation) to form the modelling frame.
df_model = pd.concat(
    objs=[
        df,
        admission_type_dummies,
        insurance_dummies,
        language_dummies,
        marital_dummies,
        ethnicity_dummies,
    ],
    axis='columns',
)

In [None]:
# Preview the modelling frame, including the appended dummy columns.
df_model.head()

In [None]:
# Fixed seed so that Restart & Run All reproduces the same train/test split.
RANDOM_SEED = 42

target_header = 'hospital_expire_flag'
demog_features = ['is_male',
                  # 'age_at_admit',
                 ]
# Columns whose name contains 'ccs' are all taken as features.
ccs_features = [c for c in df.columns if 'ccs' in c]
# Full feature list: every dummy column, then demographics, then the ccs columns.
feature_headers = (list(admission_type_dummies.columns) +
                   list(insurance_dummies.columns) +
                   list(language_dummies.columns) +
                   list(marital_dummies.columns) +
                   list(ethnicity_dummies.columns) +
                   demog_features + ccs_features)
X = df_model.loc[:, feature_headers]
# Use target_header rather than repeating the column name.
y = df_model[target_header]

# split_dataset is called without a random_state, and scikit-learn then draws
# from NumPy's global RNG; seeding it here makes the 70/30 split repeatable
# every time this cell runs.
np.random.seed(RANDOM_SEED)
train_x, test_x, train_y, test_y = split_dataset(df_model, 0.7, feature_headers, target_header)

In [None]:
# Report the (rows, columns) shape of every partition produced by the split.
for shape_label, partition in [("Train_x Shape :: ", train_x),
                               ("Train_y Shape :: ", train_y),
                               ("Test_x Shape :: ", test_x),
                               ("Test_y Shape :: ", test_y)]:
    print(shape_label, partition.shape)

### Random Forest Classifier

In [None]:
# A fixed random_state makes the forest (and therefore every metric and
# feature importance reported below) identical from run to run.
clf = RandomForestClassifier(random_state=42)
# fit() returns the estimator itself, so trained_model and clf refer to the
# same fitted object.
trained_model = clf.fit(train_x, train_y)

#### Evaluation

In [None]:
# Stratified k-fold splitter used for the cross-validated ROC curves.
N_CV_SPLITS = 6
cv = StratifiedKFold(n_splits=N_CV_SPLITS)

In [None]:
# Creating ROC Curve with Cross Validation
# NOTE(review): meu.draw_cv_roc_curve is not visible from this notebook; it
# presumably fits the estimator on each fold. trained_model is the same object
# as clf, so an in-place refit would silently replace the model trained on the
# full training set that the later evaluation cells use. Pass an unfitted copy
# with identical hyper-parameters instead.
cv_clf = clf.__class__(**clf.get_params())
meu.draw_cv_roc_curve(cv_clf, cv, train_x, train_y)

In [None]:
# Precision-recall curve on the held-out test partition. trained_model is the
# object returned by clf.fit(), i.e. clf itself, and is the name the other
# evaluation cells use.
meu.plot_precision_recall_curve(trained_model, test_x, test_y)

In [None]:
# Class predictions for the held-out test partition; reused by the accuracy
# and confusion-matrix cells.
predictions = trained_model.predict(test_x)

In [None]:
# Accuracy on the data the model was fitted on versus the held-out data,
# followed by the raw test confusion matrix.
train_accuracy = accuracy_score(train_y, trained_model.predict(train_x))
test_accuracy = accuracy_score(test_y, predictions)
test_confusion = confusion_matrix(test_y, predictions)

print("Train Accuracy :: ", train_accuracy)
print("Test Accuracy  :: ", test_accuracy)
print(" Confusion matrix ", test_confusion)

In [None]:
# Render the test-set confusion matrix with the project's plotting helper.
meu.show_confusion_matrix(confusion_matrix(test_y, predictions))

#### Feature Importances

In [None]:
importances = trained_model.feature_importances_
# Standard deviation of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in trained_model.estimators_], axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

# Slicing stops at the end of the array, so this lists at most 25 features and
# no longer raises IndexError when the model has fewer than 25.
for ranking, feature_idx in enumerate(indices[:25], start=1):
    column = train_x.columns[feature_idx]
    print("{ranking}. feature {column} ({importance})".format(ranking=ranking,
                                                             column=column,
                                                             importance=importances[feature_idx]))

In [None]:
# Plot the feature importances of the forest
# Cap the bar count at the number of available features so the bar heights,
# error bars and tick labels always have matching lengths.
top_features = min(10, len(indices))
top_indices = indices[:top_features]

plt.figure(figsize=(20, 10))
plt.title("Feature importances")
plt.bar(range(top_features),
        importances[top_indices],
        color="r",
        # Error bars: spread of the importance across the individual trees.
        yerr=std[top_indices],
        align="center")
# Column names are long; rotate them so neighbouring labels do not overlap.
plt.xticks(range(top_features), train_x.columns[top_indices],
           rotation=45, ha="right")
plt.ylabel("Importance")
plt.xlim([-1, top_features])
plt.show()