In [1]:
import pandas as pd

df = pd.read_csv("diabetic_data_cleaned.csv")

In [2]:
df.shape

(99329, 21)

In [3]:
features = [
    'num_medications', 'diag_1', 'time_in_hospital', 'diag_2', 'diag_3',
    'age', 'discharge_disposition_id', 'number_diagnoses',
    'number_preceding_year_visits', 'num_procedures', 'num_lab_procedures',
    'number_diabetes_meds', 'race', 'insulin', 'admission_type_id'
]

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class FeatureTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def _map_diag(self, val):
        try:
            val = float(val)
        except:
            return 0
        if 1 <= val < 140: return 1
        elif 140 <= val < 240: return 2
        elif 240 <= val < 280: return 3
        elif 280 <= val < 290: return 4
        elif 290 <= val < 320: return 5
        elif 320 <= val < 390: return 6
        elif 390 <= val < 460: return 7
        elif 460 <= val < 520: return 8
        elif 520 <= val < 580: return 9
        elif 580 <= val < 630: return 10
        elif 630 <= val < 680: return 11
        elif 680 <= val < 710: return 12
        elif 710 <= val < 740: return 13
        elif 740 <= val < 760: return 14
        elif 760 <= val < 780: return 15
        elif 780 <= val < 800: return 16
        elif 800 <= val < 1000: return 17
        return 0

    def _map_age(self, val):
        # Accept both [60-70) or raw numeric like 63
        if isinstance(val, str) and "[" in val:
            val = val.strip("[]").split("-")
            return (int(val[0]) + int(val[1])) // 2
        else:
            val = float(val)
            val = np.clip(val, 0, 100)
            return int(((val // 10) * 10) + 5)

    def _map_insulin(self, val):
        return {'No': -2, 'Down': -1, 'Steady': 0, 'Up': 1}.get(val, -2)

    def _map_race(self, val):
        return {
            'Caucasian': 0, 'AfricanAmerican': 1,
            'Asian': 2, 'Hispanic': 3, 'Other': 4
        }.get(val, 4)

    def transform(self, X):
        X = X.copy()
        X['diag_1'] = X['diag_1'].apply(self._map_diag)
        X['diag_2'] = X['diag_2'].apply(self._map_diag)
        X['diag_3'] = X['diag_3'].apply(self._map_diag)
        X['age'] = X['age'].apply(self._map_age)
        X['insulin'] = X['insulin'].apply(self._map_insulin)
        X['race'] = X['race'].apply(self._map_race)
        return X

In [5]:
import joblib
import time
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    accuracy_score, roc_auc_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

X = df[features].copy()
y = df['readmitted'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# ----------------------------
# Define and Evaluate Models
# ----------------------------
models = []

models.append(('LR', Pipeline([
    ("Transformer", PowerTransformer()),
    ("Scaler", StandardScaler()),
    ("LogReg", LogisticRegression(random_state=0, solver='liblinear'))
])))

models.append(('DT', DecisionTreeClassifier(random_state=0)))
models.append(('RF', RandomForestClassifier(random_state=0, n_jobs=-1)))
models.append(('ADA', AdaBoostClassifier(random_state=0)))
models.append(('XGB', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0, n_jobs=-1)))
models.append(('LGB', LGBMClassifier(random_state=0, n_jobs=-1))) 

precision, recall, f1, accuracy, roc_auc = [], [], [], [], []
Training_Time, Prediction_Time, names = [], [], []

for name, model in models:
    start = time.time()
    model.fit(X_train, y_train)
    Training_Time.append(time.time() - start)
    
    start = time.time()
    y_pred = model.predict(X_test)
    Prediction_Time.append(time.time() - start)
    
    y_proba = model.predict_proba(X_test)[:, 1]
    
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred))
    roc_auc.append(roc_auc_score(y_test, y_proba))
    names.append(name)

# ----------------------------
# Results Summary
# ----------------------------
results_df = pd.DataFrame({
    'Model': names,
    'Precision': precision,
    'Recall': recall,
    'F1': f1,
    'Accuracy': accuracy,
    'ROC_AUC': roc_auc,
    'Training_Time (s)': Training_Time,
    'Prediction_Time (s)': Prediction_Time
}).sort_values(by='ROC_AUC', ascending=False).reset_index(drop=True)

print(results_df.round(4))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 9050, number of negative: 70413
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014597 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 240
[LightGBM] [Info] Number of data points in the train set: 79463, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113889 -> initscore=-2.051613
[LightGBM] [Info] Start training from score -2.051613
  Model  Precision  Recall      F1  Accuracy  ROC_AUC  Training_Time (s)  \
0   LGB     0.5714  0.0053  0.0105    0.8862   0.6534             1.0280   
1   ADA     0.0000  0.0000  0.0000    0.8861   0.6419             3.3094   
2   XGB     0.4432  0.0172  0.0332    0.8856   0.6398             0.5181   
3    RF     0.5882  0.0044  0.0088    0.8862   0.6282             4.8135   
4    LR     0.0000  0.0000  0.0000    0.8861   0.6277             1.3484   
5    DT     0.1496  0.1887  0.1669    0.7854   0.525

In [6]:
import joblib
import time
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    accuracy_score, roc_auc_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

X = df[features].copy()
y = df['readmitted'].copy()

# Class weight calculations
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# ----------------------------
# Define Models with Weighting
# ----------------------------
models_weighted = [
    ('LR', Pipeline([
        ("Transformer", PowerTransformer()),
        ("Scaler", StandardScaler()),
        ("LogReg", LogisticRegression(random_state=0, solver='liblinear', class_weight='balanced'))
    ])),
    ('DT', DecisionTreeClassifier(random_state=0, class_weight='balanced')),
    ('RF', RandomForestClassifier(random_state=0, n_jobs=-1, class_weight='balanced')),
    ('ADA', AdaBoostClassifier(random_state=0)),  # No class_weight param
    ('XGB', XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                          random_state=0, n_jobs=-1, scale_pos_weight=scale_pos_weight)),
    ('LGB', LGBMClassifier(random_state=0, n_jobs=-1, class_weight='balanced'))
]

# ----------------------------
# Model Evaluation
# ----------------------------
results = []
for name, model in models_weighted:
    t0 = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - t0

    t0 = time.time()
    y_pred = model.predict(X_test)
    prediction_time = time.time() - t0

    y_proba = model.predict_proba(X_test)[:, 1]

    results.append({
        'Model': name,
        'Precision': precision_score(y_test, y_pred)*100,
        'Recall': recall_score(y_test, y_pred)*100,
        'F1': f1_score(y_test, y_pred)*100,
        'Accuracy': accuracy_score(y_test, y_pred)*100,
        'ROC_AUC': roc_auc_score(y_test, y_proba)*100,
        'Training_Time (s)': training_time,
        'Prediction_Time (s)': prediction_time
    })

results_weighted_df = pd.DataFrame(results).sort_values(by='ROC_AUC', ascending=False).reset_index(drop=True)
print(results_weighted_df.round(4))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 9050, number of negative: 70413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 242
[LightGBM] [Info] Number of data points in the train set: 79463, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
  Model  Precision   Recall       F1  Accuracy  ROC_AUC  Training_Time (s)  \
0   LGB    16.9936  58.9483  26.3819   62.5239  65.2943             0.3841   
1   XGB    17.1306  49.4918  25.4517   66.9737  63.6206             0.3380   
2   ADA     0.0000   0.0000   0.0000   88.6087  63.5678             3.2069   
3    RF    71.4286   0.4419   0.8783   88.6389  62.6235             4.1059   
4    LR    15.4121  57.1807  24.2800   59.3728  61.7511 

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

X = df[features].copy()
y = df['readmitted'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Build the pipeline
pipeline = Pipeline([
    ("transformer", FeatureTransformer()),
    ("scaler", StandardScaler()),
    ("classifier", LGBMClassifier(
        class_weight='balanced',
        n_estimators=200,
        random_state=42
    ))
])

pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
print(cm)

[LightGBM] [Info] Number of positive: 9050, number of negative: 70413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 194
[LightGBM] [Info] Number of data points in the train set: 79463, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
              precision    recall  f1-score   support

           0       0.92      0.64      0.76     17603
           1       0.17      0.56      0.26      2263

    accuracy                           0.63     19866
   macro avg       0.54      0.60      0.51     19866
weighted avg       0.83      0.63      0.70     19866

Confusion Matrix
[[11322  6281]
 [  994  1269]]




In [8]:
import joblib
joblib.dump(pipeline, "hospital_readmission_pipeline_v3.0.pkl")

['hospital_readmission_pipeline_v3.0.pkl']

In [9]:
import pandas as pd
import joblib

pipeline = joblib.load("hospital_readmission_pipeline_v3.0.pkl")

# New record
input_data = {
    'num_medications': 15,
    'diag_1': '296',
    'time_in_hospital': 2,
    'diag_2': '427',
    'diag_3': '250.02',
    'age': 35,
    'discharge_disposition_id': 1,
    'number_diagnoses': 3,
    'number_preceding_year_visits': 6,
    'num_procedures': 0,
    'num_lab_procedures': 18,
    'number_diabetes_meds': 1,
    'race': 'Caucasian',
    'insulin': 'Steady',
    'admission_type_id': 1
}

df_input = pd.DataFrame([input_data])

prediction = pipeline.predict(df_input)[0]
proba = pipeline.predict_proba(df_input)[0][1]

print("=================================")
print(prediction)
print(f"Prediction: {'🔴 Readmitted' if prediction==1 else '🟢 Not Readmitted'}")
print(f"Probability of Readmission: {proba:.2%}")
print("=================================")



1
Prediction: 🔴 Readmitted
Probability of Readmission: 75.89%
