#**Patients Readmission Prediction (Maven Hospital Challenge)**

##This is an extension of the Maven Hospital Dashboard Challenge project. Using data that resembles real-world Hospital provided by Maven, binary classification model was built to predict patients readmission.

In [1]:
import zipfile
with zipfile.ZipFile('Hospital_Patient_Records.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [2]:
import pandas as pd

In [3]:
df1 = pd.read_csv('/content/data_dictionary.csv')
df2 = pd.read_csv('/content/encounters.csv')
df3 = pd.read_csv('/content/organizations.csv')
df4 = pd.read_csv('/content/patients.csv')
df5 = pd.read_csv('/content/payers.csv')
df6 = pd.read_csv('/content/procedures.csv')

In [4]:
df2.columns

Index(['Id', 'START', 'STOP', 'PATIENT', 'ORGANIZATION', 'PAYER',
       'ENCOUNTERCLASS', 'CODE', 'DESCRIPTION', 'BASE_ENCOUNTER_COST',
       'TOTAL_CLAIM_COST', 'PAYER_COVERAGE', 'REASONCODE',
       'REASONDESCRIPTION'],
      dtype='object')

In [5]:
df2.head()

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
0,32c84703-2481-49cd-d571-3899d5820253,2011-01-02T09:26:36Z,2011-01-02T12:58:36Z,3de74169-7f67-9304-91d4-757e0f3a14d2,d78e84ec-30aa-3bba-a33a-f29a3a454662,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,ambulatory,185347001,Encounter for problem (procedure),85.55,1018.02,0.0,,
1,c98059da-320a-c0a6-fced-c8815f3e3f39,2011-01-03T05:44:39Z,2011-01-03T06:01:42Z,d9ec2e44-32e9-9148-179a-1653348cc4e2,d78e84ec-30aa-3bba-a33a-f29a3a454662,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,outpatient,308335008,Patient encounter procedure,142.58,2619.36,0.0,,
2,4ad28a3a-2479-782b-f29c-d5b3f41a001e,2011-01-03T14:32:11Z,2011-01-03T14:47:11Z,73babadf-5b2b-fee7-189e-6f41ff213e01,d78e84ec-30aa-3bba-a33a-f29a3a454662,7caa7254-5050-3b5e-9eae-bd5ea30e809c,outpatient,185349003,Encounter for check up (procedure),85.55,461.59,305.27,,
3,c3f4da61-e4b4-21d5-587a-fbc89943bc19,2011-01-03T16:24:45Z,2011-01-03T16:39:45Z,3b46a0b7-0f34-9b9a-c319-ace4a1f58c0b,d78e84ec-30aa-3bba-a33a-f29a3a454662,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,162673000,General examination of patient (procedure),136.8,1784.24,0.0,,
4,a9183b4f-2572-72ea-54c2-b3cd038b4be7,2011-01-03T17:36:53Z,2011-01-03T17:51:53Z,fa006887-d93c-d302-8b89-f3c25f88c0e1,d78e84ec-30aa-3bba-a33a-f29a3a454662,42c4fca7-f8a9-3cd1-982a-dd9751bf3e2a,ambulatory,390906007,Follow-up encounter,85.55,234.72,0.0,55822004.0,Hyperlipidemia


In [6]:
# patients that were admitted and readmitted overtime

## Note
# Important field encounters('START','STOP',PATIENT')
# how to solve
# step 1: find the duplicated patient key inside encounters dataframe
#         - how about group patient key with the time start and stop? result: 27791 rows, what does that mean? it means 27791 patients were admitted (could be the same patients multiple time)

# try to group patient key with time start and time stop (it means a patient could undergo multiple encounters at the same time frame)
df2_filtered = df2.groupby(['PATIENT','START','STOP']).size().reset_index(name = "Count").sort_values(by = "Count", ascending = False)
#27791 admissions
# df2_filtered

# further grouped patient key with different timeframe in df2_filtered
df2_filtered = df2_filtered.groupby("PATIENT").size().reset_index(name = "Count").sort_values(by = "Count", ascending = False)
#974 rows (number of patients admitted)
# df2_filtered

# filter admitted count that's > 1
df2_filtered = df2_filtered[df2_filtered['Count'] > 1]
#854 patients readmitted
# df2_filtered

# to create a new column 'readmitted' inside df2 and put 1 or 0 in it, based on the ID that was in df2_filtered
df2['readmitted'] = df2['PATIENT'].isin(df2_filtered['PATIENT']).astype(int)
# df2

###**Binary Classification**

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
import gensim.downloader as api
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import torch
import numpy as np


from google.colab import userdata
userdata.get('Hospital_Bin_Classification')

# Feature Engineering
df2['START'] = pd.to_datetime(df2['START'])
df2['STOP'] = pd.to_datetime(df2['STOP'])
df2['DURATION'] = ((df2['STOP'] - df2['START']).dt.total_seconds())/(60*60)

# Encoding descriptions to vector

# GloVe
glove_model = api.load("glove-wiki-gigaword-100")
nltk.download('punkt')
nltk.download('stopwords')

def text_to_glove(text, model, dim=100):
  words = word_tokenize(text.lower())
  words = [word for word in words if word.isalpha()] # keep only alphabetic words
  words = [word for word in words if word not in stopwords.words('english')] # remove stopwords
  word_vectors = [model[word] for word in words if word in model]
  if len(word_vectors) > 0:
    return np.mean(word_vectors, axis = 0)
  else:
    return np.zeros(dim)






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:


# Apply the function to the entire 'description' column
dim = 100
df2['glove_embedding'] = df2['DESCRIPTION'].apply(lambda x: text_to_glove(x, glove_model, dim))

# Convert the embeddings into a DataFrame
glove_embeddings_df = pd.DataFrame(df2['glove_embedding'].tolist(), columns=[f'glove_feature_{i}' for i in range(dim)])

# Combine with the original DataFrame
df2_combined = pd.concat([df2.drop(columns=['DESCRIPTION','glove_embedding']), glove_embeddings_df], axis = 1)

# Encode categorical features
df2_combined = pd.get_dummies(df2_combined, columns = ['ENCOUNTERCLASS'], drop_first = True)

#Handle missing data
df2_combined.dropna()

# Labelling
y = df2_combined['readmitted']
X = df2_combined.drop(columns = ['Id', 'START', 'STOP', 'PATIENT', 'ORGANIZATION', 'PAYER',
       'CODE', 'REASONCODE',
       'REASONDESCRIPTION','readmitted'])

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

# Model Selection and Training

# RandomForest without cross validation
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("RF Accuracy:",accuracy_score(y_test,y_pred))
print("RF Classification Report:\n", classification_report(y_test,y_pred))
print("RF ROC-AUC:", roc_auc_score(y_test,rf_model.predict_proba(X_test)[:, 1]))
print("RF Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

#Hyperparameter tuning
param_grid = {
    'n_estimators': [100,200],
    'max_depth': [10,20],
    'min_samples_split': [2,5],
    'min_samples_leaf': [1,2]
}

# RandomForest With cross validation
grid_search = GridSearchCV(estimator = rf_model, param_grid = param_grid, cv= 5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Final Evaluation
y_pred_best = best_model.predict(X_test)
print("Tuned RF Model Accuracy:", accuracy_score(y_test, y_pred_best))

print("Tuned RF Model ROC_AUC:", roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1]))
print("Tuned RF Model Confusion Matrix:\n", confusion_matrix(y_test,y_pred))

# XGBoost classifier
ratio = y_train.value_counts()[0]/y_train.value_counts()[1]
xgb_model = XGBClassifier(
      objective = 'binary:logistic', # For binary classification
      eval_metric = 'logloss',  # Log loss is commonly used for binary classification
      use_label_encoder = False, # New versions of XGBoost require this to avoid warnings
      random_state = 42,
      scale_pos_weight = ratio # ratio of class distribution of majority to minority
)

xgb_model.fit(X_train,y_train)

y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:,1] # probability estimates for ROC-AUC

# Evaluate the model
print("XGB Accuracy:", accuracy_score(y_test,y_pred))
print("XGB Classification Report:\n", classification_report(y_test,y_pred))
print("XGB ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
print("XGB Confusion Matrix:\n", confusion_matrix(y_test,y_pred))

# Understanding features importance; uncomment to see the result
# Feature Importance if using tree-based model

#print()
# importances = rf_model.feature_importances_
# indices = np.argsort(importances)[::-1]

# for f in range(X.shape[1]):
#   print(f"{X.columns[indices[f]]}: {importances[indices[f]]}")

# print()

RF Accuracy: 0.9944434486467109
RF Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.04      0.06        28
           1       1.00      1.00      1.00      5551

    accuracy                           0.99      5579
   macro avg       0.60      0.52      0.53      5579
weighted avg       0.99      0.99      0.99      5579

RF ROC-AUC: 0.9027105798183083
RF Confusion Matrix:
 [[   1   27]
 [   4 5547]]
Tuned RF Model Accuracy: 0.9951604230148772
Tuned RF Model ROC_AUC: 0.9432148647605323
Tuned RF Model Confusion Matrix:
 [[   1   27]
 [   4 5547]]


Parameters: { "use_label_encoder" } are not used.



XGB Accuracy: 0.8759634343072236
XGB Classification Report:
               precision    recall  f1-score   support

           0       0.04      0.96      0.07        28
           1       1.00      0.88      0.93      5551

    accuracy                           0.88      5579
   macro avg       0.52      0.92      0.50      5579
weighted avg       0.99      0.88      0.93      5579

XGB ROC-AUC: 0.9603353321151915
XGB Confusion Matrix:
 [[  27    1]
 [ 691 4860]]


###**Result**
###**-Random Forest Classifier (without cross validation)** - We could see that the imbalance data might be the reason for the high false negative that may led to bias in prediction. The model may not generalize well to unseen data. Also, the confusion matrix shows that the model might tends to classify a person as readmitted even when it is not always the case.

###**-Random Forest Classifier (with cross validation)** - There's no changes in the confusion matrix, but the accuracy and ROC-AUC score increases instead

###**-XGBoost Classifier** - The confusion matrix observed was the best among three models. It exhibits high true positive and high true negative when comparing to its counterparts(false positive, false negative)

###-A **follow up experimentation** was done next in order to tackle the imbalance in dataset

### **Retrain using Balanced Random Forest Classifier**

In [10]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Apply the function to the entire 'description' column
dim = 100
df2['glove_embedding'] = df2['DESCRIPTION'].apply(lambda x: text_to_glove(x, glove_model, dim))

# Convert the embeddings into a DataFrame
glove_embeddings_df = pd.DataFrame(df2['glove_embedding'].tolist(), columns=[f'glove_feature_{i}' for i in range(dim)])

# Combine with the original DataFrame
df2_combined = pd.concat([df2.drop(columns=['DESCRIPTION','glove_embedding']), glove_embeddings_df], axis = 1)

# Encode categorical features
df2_combined = pd.get_dummies(df2_combined, columns = ['ENCOUNTERCLASS'], drop_first = True)

#Handle missing data
df2_combined.dropna()

# Labelling
y = df2_combined['readmitted']
X = df2_combined.drop(columns = ['Id', 'START', 'STOP', 'PATIENT', 'ORGANIZATION', 'PAYER',
       'CODE', 'REASONCODE',
       'REASONDESCRIPTION','readmitted'])

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

# Model Selection and Training

# Balanced Random Forest without cross validation
rf_model = BalancedRandomForestClassifier(random_state=42, sampling_strategy = 'all', replacement = True, bootstrap = False)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("RF Accuracy:",accuracy_score(y_test,y_pred))
print("RF Classification Report:\n", classification_report(y_test,y_pred))
print("RF ROC-AUC:", roc_auc_score(y_test,rf_model.predict_proba(X_test)[:, 1]))
print("RF Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

#Hyperparameter tuning
param_grid = {
    'n_estimators': [100,200],
    'max_depth': [10,20],
    'min_samples_split': [2,5],
    'min_samples_leaf': [1,2]
}

# Balanced Random Forest with cross validation
grid_search = GridSearchCV(estimator = rf_model, param_grid = param_grid, cv= 5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Final Evaluation
y_pred_best = best_model.predict(X_test)
print("Tuned RF Model Accuracy:", accuracy_score(y_test, y_pred_best))

print("Tuned RF Model ROC_AUC:", roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1]))
print("Tuned RF Model Confusion Matrix:\n", confusion_matrix(y_test,y_pred))


ratio = y_train.value_counts()[0]/y_train.value_counts()[1]

# XGBoost classifier
xgb_model = XGBClassifier(
      objective = 'binary:logistic', # For binary classification
      eval_metric = 'logloss',  # Log loss is commonly used for binary classification
      use_label_encoder = False, # New versions of XGBoost require this to avoid warnings
      random_state = 42,
      scale_pos_weight = ratio # ratio of class distribution of majority to minority
)

xgb_model.fit(X_train,y_train)

y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:,1] # probability estimates for ROC-AUC

# Evaluate the model
print("XGB Accuracy:", accuracy_score(y_test,y_pred))
print("XGB Classification Report:\n", classification_report(y_test,y_pred))
print("XGB ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
print("XGB Confusion Matrix:\n", confusion_matrix(y_test,y_pred))


# Understanding features importance; uncomment to see the result
# Feature Importance if using tree-based model

# print()
# importances = rf_model.feature_importances_
# indices = np.argsort(importances)[::-1]

# for f in range(X.shape[1]):
#   print(f"{X.columns[indices[f]]}: {importances[indices[f]]}")

# print()

RF Accuracy: 0.8539164724861086
RF Classification Report:
               precision    recall  f1-score   support

           0       0.03      0.96      0.06        28
           1       1.00      0.85      0.92      5551

    accuracy                           0.85      5579
   macro avg       0.52      0.91      0.49      5579
weighted avg       0.99      0.85      0.92      5579

RF ROC-AUC: 0.9623330416655944
RF Confusion Matrix:
 [[  27    1]
 [ 814 4737]]
Tuned RF Model Accuracy: 0.8548126904463166
Tuned RF Model ROC_AUC: 0.965675425277299
Tuned RF Model Confusion Matrix:
 [[  27    1]
 [ 814 4737]]


Parameters: { "use_label_encoder" } are not used.



XGB Accuracy: 0.8759634343072236
XGB Classification Report:
               precision    recall  f1-score   support

           0       0.04      0.96      0.07        28
           1       1.00      0.88      0.93      5551

    accuracy                           0.88      5579
   macro avg       0.52      0.92      0.50      5579
weighted avg       0.99      0.88      0.93      5579

XGB ROC-AUC: 0.9603353321151915
XGB Confusion Matrix:
 [[  27    1]
 [ 691 4860]]


###**Result**
###**-Balance Random Forest Classifier (without cross validation)** - The model performed well with high true positive and high true negative with a high accuracy and ROC-AUC score.

###**-Balance Random Forest Classifier (with cross validation)** - There's no changes in the confusion matrix, but the accuracy and ROC-AUC score increases instead.

###**-XGBoost Classifier** - XGBoost classifier still outperformed Balanced Random Forest Classifier in terms of high true positive, high true negative and accuracy. It XGBoost just fell short in ROC-AUC by ~ 0.05
