In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


df = pd.read_csv(r"C:\Users\shraw\Downloads\cleaned_healthcare_noshow_dataset.csv")

target_col = "target"  
X = df.drop(target_col, axis=1)
y = df[target_col]

categorical_cols = X.select_dtypes(include=["object"]).columns
print("Encoding categorical columns:", list(categorical_cols))

le = LabelEncoder()
for col in categorical_cols:
    X[col] = le.fit_transform(X[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1)

param_grid = {
    'n_estimators': [100, 200],         
    'max_depth': [10, 20, None],        
    'min_samples_split': [2, 5, 10],    
    'min_samples_leaf': [1, 2, 4],      
    'max_features': ['sqrt', 'log2']    
}

grid_search = GridSearchCV(
    rf, param_grid,
    scoring='roc_auc',  
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)


print("\n=== Random Forest Results with Balancing ===")
print(classification_report(y_test, y_pred, digits=4))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)

roc_auc = roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1])
print("\nROC-AUC:", roc_auc)

Encoding categorical columns: ['Gender', 'Neighbourhood', 'DayOfWeek']
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

=== Random Forest Results with Balancing ===
              precision    recall  f1-score   support

           0     0.3500    0.6132    0.4457      4336
           1     0.8785    0.7106    0.7857     17062

    accuracy                         0.6909     21398
   macro avg     0.6142    0.6619    0.6157     21398
weighted avg     0.7714    0.6909    0.7168     21398


Confusion Matrix:
 [[ 2659  1677]
 [ 4938 12124]]

ROC-AUC: 0.7445945255116893


In [2]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

y_proba = grid_search.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

def evaluate_threshold(threshold):
    y_pred_thresh = (y_proba >= threshold).astype(int)
    cm = confusion_matrix(y_test, y_pred_thresh)
    tn, fp, fn, tp = cm.ravel()
    
    sensitivity = tp / (tp + fn)  
    specificity = tn / (tn + fp)  
    
    print(f"\n=== Threshold: {threshold:.2f} ===")
    print("Confusion Matrix:")
    print(cm)
    print(f"Sensitivity (Recall 1): {sensitivity:.4f}")
    print(f"Specificity (Recall 0): {specificity:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred_thresh, digits=4))
    print("ROC-AUC:", roc_auc_score(y_test, y_pred_thresh))

evaluate_threshold(0.5)
for thr in [0.3, 0.4, 0.6, 0.7]:
    evaluate_threshold(thr)

j_scores = tpr - fpr
best_idx = np.argmax(j_scores)
best_threshold = thresholds[best_idx]
print(f"\nBest Threshold by Youden’s J: {best_threshold:.4f}")
evaluate_threshold(best_threshold)




=== Threshold: 0.50 ===
Confusion Matrix:
[[ 2659  1677]
 [ 4938 12124]]
Sensitivity (Recall 1): 0.7106
Specificity (Recall 0): 0.6132
Classification Report:
              precision    recall  f1-score   support

           0     0.3500    0.6132    0.4457      4336
           1     0.8785    0.7106    0.7857     17062

    accuracy                         0.6909     21398
   macro avg     0.6142    0.6619    0.6157     21398
weighted avg     0.7714    0.6909    0.7168     21398

ROC-AUC: 0.6619114664728292

=== Threshold: 0.30 ===
Confusion Matrix:
[[  388  3948]
 [  384 16678]]
Sensitivity (Recall 1): 0.9775
Specificity (Recall 0): 0.0895
Classification Report:
              precision    recall  f1-score   support

           0     0.5026    0.0895    0.1519      4336
           1     0.8086    0.9775    0.8851     17062

    accuracy                         0.7976     21398
   macro avg     0.6556    0.5335    0.5185     21398
weighted avg     0.7466    0.7976    0.7365     21398

