Import necessary libraries for Training the model

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, f1_score,precision_score, recall_score, roc_auc_score, confusion_matrix)
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold,RandomizedSearchCV
from sklearn.metrics import classification_report

Loading the preprocessed data 

In [2]:
df = pd.read_csv(r"C:\Users\Portia\Documents\Wandiles Document\diabetic_data_cleaned.csv")

Preparing the data for training

In [3]:
df['target'] = df['readmitted_30'].apply(lambda x: 1 if x == 1 else 0)

In [4]:
#Drops unnecessary columns
cols_to_drop = ['patient_nbr', 'readmitted', 'weight', 'payer_code', 'medical_specialty', 'readmitted_30', 'readmitted_any']
cols_to_drop = [col for col in cols_to_drop if col in df.columns]
df = df.drop(cols_to_drop, axis=1)

In [5]:
#converts categorical variables to numerical
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

Training the model using Train-Test split of (70-30)

In [6]:
X = df.drop('target', axis=1)
y = df['target']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"Class distribution in training set: \n{y_train.value_counts(normalize=True)}")

Training samples: 71236
Testing samples: 30530
Class distribution in training set: 
target
0    0.888399
1    0.111601
Name: proportion, dtype: float64


In [8]:
#   Initializing the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

In [9]:
#handling class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [None]:
param_grid = {
    'n_estimators': [100],       # Skip 200 (faster training)
    'max_depth': [3, 5],         # Depth 7 is too slow
    'learning_rate': [0.1],      # Skip 0.01 (requires more trees)
    'subsample': [0.8]           # Remove 1.0
}

In [11]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:


random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=5,           # Test only 5 random combos (vs 32)
    cv=3,               # Reduce from 5-fold to 3-fold
    n_jobs=1,           # Avoid overloading your Core i3
    verbose=3
)

In [None]:
#Performing grid search
print("Starting grid search")
grid_search.fit(X_train_res, y_train_res)

Starting grid search
Fitting 5 folds for each of 972 candidates, totalling 4860 fits


In [None]:
best_model = grid_search.best_estimator_