In [7]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [9]:
# Load cleaned dataset
df = pd.read_csv("C:/Users/Vigneshwaran/OneDrive/Desktop/patient-readmission-prediction/data/processed/diabetic_data_cleaned.csv")
df.head()


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


##  Summary of Previous Notebook
The dataset has already been cleaned, and exploratory analysis was performed. This notebook will clearly focus on Feature Engineering, Machine Learning Modeling, Hyperparameter Tuning, and evaluation.


In [11]:
# Simplify target variable
df['readmitted_binary'] = df['readmitted'].apply(lambda x: 0 if x == 'NO' else 1)

# Verify distribution
print(df['readmitted_binary'].value_counts())

readmitted_binary
0    54864
1    46902
Name: count, dtype: int64


In [13]:
df.drop(['encounter_id', 'patient_nbr', 'readmitted'], axis=1, inplace=True)


In [15]:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Verify encoded dataframe
df_encoded.head()


Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,6,25,1,1,41,0,1,0,0,0,...,False,True,False,False,False,False,False,False,True,False
1,1,1,7,3,59,0,18,0,0,0,...,True,True,False,False,False,False,False,False,False,True
2,1,1,7,2,11,5,13,2,0,1,...,False,True,False,False,False,False,False,False,True,True
3,1,1,7,2,44,1,16,0,0,0,...,True,True,False,False,False,False,False,False,False,True
4,1,1,7,1,51,0,8,0,0,0,...,False,True,False,False,False,False,False,False,False,True


In [17]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop('readmitted_binary', axis=1)
y = df_encoded['readmitted_binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (71236, 2428)
Testing set size: (30530, 2428)


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

# Evaluate Logistic Regression
y_pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.6312151981657386
              precision    recall  f1-score   support

           0       0.63      0.75      0.69     16459
           1       0.63      0.49      0.55     14071

    accuracy                           0.63     30530
   macro avg       0.63      0.62      0.62     30530
weighted avg       0.63      0.63      0.62     30530



In [21]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate Random Forest
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.6414674091057976
              precision    recall  f1-score   support

           0       0.64      0.76      0.69     16459
           1       0.64      0.51      0.57     14071

    accuracy                           0.64     30530
   macro avg       0.64      0.63      0.63     30530
weighted avg       0.64      0.64      0.64     30530



In [23]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}



In [25]:
import numpy as np
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Select Top 50 or 100 important features
top_n = 100
top_features = X_train.columns[indices][:top_n]

# Reduce your dataset clearly:
X_train_reduced = X_train[top_features]
X_test_reduced = X_test[top_features]


In [27]:
# Reduced grid search
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1,
                           scoring='accuracy',
                           verbose=2)

grid_search.fit(X_train_reduced, y_train)

# Evaluate the best model
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test_reduced)

print("Tuned Random Forest Accuracy (Reduced Features):", accuracy_score(y_test, y_pred_best_rf))
print(classification_report(y_test, y_pred_best_rf))


Fitting 3 folds for each of 18 candidates, totalling 54 fits
Tuned Random Forest Accuracy (Reduced Features): 0.64736324926302
              precision    recall  f1-score   support

           0       0.65      0.76      0.70     16459
           1       0.65      0.51      0.57     14071

    accuracy                           0.65     30530
   macro avg       0.65      0.64      0.64     30530
weighted avg       0.65      0.65      0.64     30530



In [29]:
import joblib

joblib.dump(best_rf,"C:/Users/Vigneshwaran/OneDrive/Desktop/patient-readmission-prediction/models/random_forest_readmission_final.pkl")

['C:/Users/Vigneshwaran/OneDrive/Desktop/patient-readmission-prediction/models/random_forest_readmission_final.pkl']

In [31]:
# Save training and testing datasets for reproducibility
X_train_reduced.to_csv("C:/Users/Vigneshwaran/OneDrive/Desktop/patient-readmission-prediction/data/processed/X_train_reduced.csv", index=False)

X_test_reduced.to_csv("C:/Users/Vigneshwaran/OneDrive/Desktop/patient-readmission-prediction/data/processed/X_test_reduced.csv", index=False)

y_train.to_csv("C:/Users/Vigneshwaran/OneDrive/Desktop/patient-readmission-prediction/data/processed/y_train.csv", index=False)

y_test.to_csv("C:/Users/Vigneshwaran/OneDrive/Desktop/patient-readmission-prediction/data/processed/y_test.csv", index=False)
