In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier


In [8]:
df_model = pd.read_csv('/Users/femiafolabi/Documents/Projects Data Analytics/Hospital_Readmission_Prediction/Data/encoded_cleaned_data.csv')

In [9]:
df_model.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,6,25,1,1,41,0,1,0,0,0,...,False,True,False,False,False,False,False,False,True,False
1,1,1,7,3,59,0,18,0,0,0,...,True,True,False,False,False,False,False,False,False,True
2,1,1,7,2,11,5,13,2,0,1,...,False,True,False,False,False,False,False,False,True,True
3,1,1,7,2,44,1,16,0,0,0,...,True,True,False,False,False,False,False,False,False,True
4,1,1,7,1,51,0,8,0,0,0,...,False,True,False,False,False,False,False,False,False,True


In [10]:
df_model.shape

(101766, 2426)

In [None]:
x = df_model.drop(columns='readmitted_binary')
y = df_model['readmitted_binary']

x.shape, y.shape

((101766, 2425), (101766,))

In [12]:
x_train, x_test, y_train, y_test = train_test_split(
    x,y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

x_train.shape, x_test.shape

((81412, 2425), (20354, 2425))

In [13]:
log_reg = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

log_reg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
y_pred = log_reg.predict(x_test)
y_prob = log_reg.predict_proba(x_test)[:, 1]

In [15]:
print('Accuracy:', accuracy_score(y_test, y_pred))
print('ROC-AUC:', roc_auc_score(y_test, y_prob))

Accuracy: 0.8884740100226
ROC-AUC: 0.644178552086247


In [16]:
confusion_matrix(y_test, y_pred)

array([[18039,    44],
       [ 2226,    45]])

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.51      0.02      0.04      2271

    accuracy                           0.89     20354
   macro avg       0.70      0.51      0.49     20354
weighted avg       0.85      0.89      0.84     20354



In [19]:
feature_importance = pd.DataFrame({
    "feature": x.columns,
    "coefficient": log_reg.coef_[0]
})

feature_importance["abs_coef"] = np.abs(feature_importance["coefficient"])
feature_importance.sort_values("abs_coef", ascending=False).head(15)

Unnamed: 0,feature,coefficient,abs_coef
829,diag_1_V58,1.286163,1.286163
1948,diag_3_553,-1.198804,1.198804
964,diag_2_272,-1.185863,1.185863
1130,diag_2_465,1.15991,1.15991
1158,diag_2_507,-1.140399,1.140399
410,diag_1_443,1.117996,1.117996
363,diag_1_386,-1.106494,1.106494
1560,diag_2_V43,-1.102707,1.102707
86,medical_specialty_Pediatrics-Endocrinology,-1.096863,1.096863
232,diag_1_250.41,1.095244,1.095244


In [22]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

In [23]:
rf.fit(x_train, y_train)

In [24]:
y_pred_rf = rf.predict(x_test)
y_prob_rf = rf.predict_proba(x_test)[:, 1]

In [25]:
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest ROC-AUC:", roc_auc_score(y_test, y_prob_rf))

Random Forest Accuracy: 0.8884248796305394
Random Forest ROC-AUC: 0.6766010187429445


In [27]:
confusion_matrix(y_test, y_pred_rf)

array([[18083,     0],
       [ 2271,     0]])

In [28]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.00      0.00      0.00      2271

    accuracy                           0.89     20354
   macro avg       0.44      0.50      0.47     20354
weighted avg       0.79      0.89      0.84     20354



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
rf_importance = pd.DataFrame({
    "feature": x.columns,
    "importance": rf.feature_importances_
})

rf_importance = rf_importance.sort_values(
    by="importance",
    ascending=False
)

rf_importance.head(15)

Unnamed: 0,feature,importance
9,number_inpatient,0.129638
1,discharge_disposition_id,0.049174
6,num_medications,0.035547
4,num_lab_procedures,0.03502
8,number_emergency,0.02918
3,time_in_hospital,0.02765
10,number_diagnoses,0.020464
5,num_procedures,0.015825
7,number_outpatient,0.013302
0,admission_type_id,0.012982


In [31]:
model_comparison = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest"],
    "ROC-AUC": [
        roc_auc_score(y_test, y_prob),
        roc_auc_score(y_test, y_prob_rf)
    ]
})

model_comparison

Unnamed: 0,Model,ROC-AUC
0,Logistic Regression,0.644179
1,Random Forest,0.676601


In [32]:
y_prob_rf = rf.predict_proba(x_test)[:, 1]


In [34]:
thresholds = np.arange(0.2, 0.7, 0.05)

results = []

for t in thresholds:
    y_pred_t = (y_prob_rf >= t).astype(int)
    
    results.append({
        "threshold": t,
        "precision": precision_score(y_test, y_pred_t),
        "recall": recall_score(y_test, y_pred_t),
        "f1": f1_score(y_test, y_pred_t)
    })

threshold_df = pd.DataFrame(results)
threshold_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,threshold,precision,recall,f1
0,0.2,0.375242,0.085425,0.139168
1,0.25,0.602041,0.02598,0.04981
2,0.3,0.615385,0.003523,0.007005
3,0.35,0.0,0.0,0.0
4,0.4,0.0,0.0,0.0
5,0.45,0.0,0.0,0.0
6,0.5,0.0,0.0,0.0
7,0.55,0.0,0.0,0.0
8,0.6,0.0,0.0,0.0
9,0.65,0.0,0.0,0.0


In [35]:
chosen_threshold = 0.35
y_pred_rf_adj = (y_prob_rf >= chosen_threshold).astype(int)


In [37]:
confusion_matrix(y_test, y_pred_rf_adj)

array([[18083,     0],
       [ 2271,     0]])

In [38]:
print(classification_report(y_test, y_pred_rf_adj))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.00      0.00      0.00      2271

    accuracy                           0.89     20354
   macro avg       0.44      0.50      0.47     20354
weighted avg       0.79      0.89      0.84     20354



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
final_predictions = x_test.copy()

final_predictions["actual_readmitted"] = y_test.values
final_predictions["predicted_probability"] = y_prob_rf
final_predictions["predicted_readmitted"] = y_pred_rf_adj

final_predictions.to_csv(
    "/Users/femiafolabi/Documents/Projects Data Analytics/Hospital_Readmission_Prediction/Data/readmission_prediction_data.csv",
    index=False
)