In [None]:
import joblib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Rebuild the held-out test split from the raw CSV and score it with the
# saved XGBoost model, so later cells can compute per-subgroup metrics.
#
# NOTE(review): `train_test_split` is not imported in this cell; it is
# expected to already be in scope from elsewhere in the notebook.
# ---------------------------------------------------------------------------

# Artifact locations (relative paths, resolved against the working directory)
MODEL_PATH = "../Models/final_xgboost_model.pkl"
PREPROCESS_PATH = "../Data/processed/preprocessor_finsecure.pkl"
RAW_PATH = "../Data/loan_data.csv"

# Fitted preprocessor and final model.
# NOTE(review): joblib.load unpickles the file — only load artifacts you trust.
preprocessor = joblib.load(PREPROCESS_PATH)
model = joblib.load(MODEL_PATH)

# Raw data plus the engineered columns the preprocessor is expected to see.
# The columns are appended in the order: grade, subgrade_num, income_to_loan,
# credit_bucket, interest_bin.
df_raw = (
    pd.read_csv(RAW_PATH)
    .drop(columns=["id"])  # id is dropped to match earlier steps
    .assign(
        # grade_subgrade -> first character (grade) + remaining digits (subgrade_num)
        grade=lambda frame: frame["grade_subgrade"].str[0],
        subgrade_num=lambda frame: frame["grade_subgrade"].str[1:].astype(int),
        # ratio of annual income to the requested loan amount
        income_to_loan=lambda frame: frame["annual_income"] / frame["loan_amount"],
        # Left-closed credit-score bins: [0, 580), [580, 670), ... [800, 900).
        # NOTE(review): these thresholds are example FICO-style values — confirm
        # they match the bins used when the preprocessor was originally fitted.
        credit_bucket=lambda frame: pd.cut(
            frame["credit_score"],
            bins=[0, 580, 670, 740, 800, 900],
            labels=["Poor", "Fair", "Good", "Very Good", "Excellent"],
            right=False,
        ),
        # Quartile bins of the interest rate, computed over the full raw dataset
        interest_bin=lambda frame: pd.qcut(
            frame["interest_rate"],
            q=4,
            labels=["ir_q1", "ir_q2", "ir_q3", "ir_q4"],
        ),
    )
)

# Separate the label column from the feature columns
target = df_raw["loan_paid_back"]
X_raw = df_raw.drop(columns=["loan_paid_back"])

# Stratified 80/20 split with a fixed seed
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

# Apply the saved (already fitted) preprocessor to the raw test features
X_test_processed = preprocessor.transform(X_test_raw)

# Keep the second column of predict_proba
# (presumably P(loan_paid_back == 1) — confirm against model.classes_)
probs_xgb = model.predict_proba(X_test_processed)[:, 1]

# Quick sanity check: first five predicted probabilities
probs_xgb[:5]


array([0.9150377 , 0.95054173, 0.84969133, 0.8609459 , 0.9556765 ],
      dtype=float32)

In [None]:
# Subgroup evaluation keyed on the "education_level" column of X_test_raw,
# using the XGBoost test-set probabilities computed above.
# NOTE(review): subgroup_auc is not defined in the visible cells. It presumably
# returns a {category: AUC} dict (the plotting cell below reads .keys() and
# .values() from the result). Only features and probabilities are passed in,
# so confirm where it obtains the true labels.
edu_auc = subgroup_auc(X_test_raw, probs_xgb, "education_level")
edu_auc

{"Bachelor's": 0.9199360779283328,
 "Master's": 0.9170396099035953,
 'High School': 0.9168195519937318,
 'PhD': 0.9152566146940704,
 'Other': 0.929931303193424}

In [None]:
# Bar chart with one bar per key of edu_auc, bar height = the stored value.
edu_levels = list(edu_auc)
edu_scores = [edu_auc[level] for level in edu_levels]

# Explicit figure/axes instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(x=edu_levels, y=edu_scores, ax=ax)
ax.set_title("AUC by Education Level")
ax.set_ylabel("AUC")
ax.set_xlabel("Education Level")
ax.tick_params(axis="x", labelrotation=25)  # tilt category labels
ax.grid(axis="y")  # horizontal grid lines only
plt.show()


In [None]:
# Subgroup evaluation keyed on the "loan_purpose" column of X_test_raw,
# using the same XGBoost test-set probabilities as the other subgroup cells.
# NOTE(review): subgroup_auc is not defined in the visible cells. It presumably
# returns a {category: AUC} dict (later cells call .keys(), .values() and
# .items() on the result) — confirm how it obtains the true labels.
purpose_auc = subgroup_auc(X_test_raw, probs_xgb, "loan_purpose")
purpose_auc

{'Business': 0.9212403394082078,
 'Education': 0.9298687899136321,
 'Debt consolidation': 0.9181339697314073,
 'Home': 0.9215477250870172,
 'Medical': 0.924700076908362,
 'Vacation': 0.9239895833333335,
 'Car': 0.9091018072336512,
 'Other': 0.9201589598062335}

In [None]:
# Bar chart with one bar per key of purpose_auc, bar height = the stored value.
purpose_names = list(purpose_auc)
purpose_scores = [purpose_auc[name] for name in purpose_names]

# Explicit figure/axes instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=purpose_names, y=purpose_scores, ax=ax)
ax.set_title("AUC by Loan Purpose")
ax.set_ylabel("AUC")
ax.set_xlabel("Loan Purpose")
ax.tick_params(axis="x", labelrotation=45)  # tilt category labels
ax.grid(axis="y")  # horizontal grid lines only
plt.show()


In [None]:
# Rank loan purposes from highest to lowest stored value.
# Keys are ordered by their value (ties keep dict insertion order), then
# re-paired with that value to give a list of (purpose, value) tuples.
ranked_names = sorted(purpose_auc, key=purpose_auc.get, reverse=True)
sorted_purpose = [(name, purpose_auc[name]) for name in ranked_names]

# First three = best-scoring purposes, last three = worst-scoring purposes
top3_purpose, bottom3_purpose = sorted_purpose[:3], sorted_purpose[-3:]

top3_purpose, bottom3_purpose


([('Education', 0.9298687899136321),
  ('Medical', 0.924700076908362),
  ('Vacation', 0.9239895833333335)],
 [('Other', 0.9201589598062335),
  ('Debt consolidation', 0.9181339697314073),
  ('Car', 0.9091018072336512)])