In [2]:

# Installing  XGBoost:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.2



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Cell 0: Environment Check

import sys
print("Python version:\n",sys.version)
print("Python executable:\n",sys.executable)

Python version:
 3.12.2 (tags/v3.12.2:6abddd9, Feb  6 2024, 21:26:36) [MSC v.1937 64 bit (AMD64)]
Python executable:
 c:\Python312\python.exe


In [4]:
# Cell 1: Imports (Milestone 4)

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix
)

from xgboost import XGBClassifier
import joblib
import warnings

warnings.filterwarnings("ignore")


In [5]:
# Cell 2: Load data safely for advanced models

try:
    DATA_PATH = "C://Users//Rachit//OneDrive//Documents//CreditPathAI//data//processed//final_features.csv"

    df = pd.read_csv(DATA_PATH)
    print("Full dataset shape:", df.shape)

    # Sample aggressively for advanced models
    if len(df) > 40000:
        df = df.sample(40000, random_state=42)
        print("Sampled dataset shape:", df.shape)

    # Normalize column names
    df.columns = df.columns.str.upper()

    # Check target
    if "TARGET" not in df.columns:
        raise ValueError("TARGET column not found")

    display(df.head())

except Exception as e:
    print("Data loading error:", e)


Full dataset shape: (307511, 130)
Sampled dataset shape: (40000, 130)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CREDIT_ACTIVE,CREDIT_TYPE,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,BB_MONTH_BAL_MEAN,BB_STATUS_COUNT
245895,384575,0,Cash loans,M,Y,N,2,207000.0,465457.5,52641.0,...,0.0,1.0,2.0,2.0,970276.5,276988.5,27000.0,0.0,-26.571429,2.0
98194,214010,0,Cash loans,F,Y,Y,0,247500.0,1281712.5,48946.5,...,0.0,3.0,2.0,2.0,4330867.5,0.0,0.0,0.0,,
36463,142232,0,Cash loans,F,Y,N,0,202500.0,495000.0,39109.5,...,0.0,3.0,2.0,2.0,2804085.0,1431166.5,0.0,0.0,-18.25,2.833333
249923,389171,0,Cash loans,F,N,Y,0,247500.0,254700.0,24939.0,...,0.0,0.0,1.0,1.0,252517.05,0.0,0.0,0.0,,
158389,283617,0,Cash loans,M,N,Y,0,112500.0,308133.0,15862.5,...,0.0,4.0,2.0,1.0,1360575.0,487476.0,0.0,0.0,,


In [6]:
# Cell 3: Feature / Target separation and safe preprocessing

try:
    # Separate target
    y = df["TARGET"]
    X = df.drop(columns=["TARGET"])

    # Drop ID columns if present
    id_cols = [col for col in X.columns if "SK_ID" in col]
    if id_cols:
        X = X.drop(columns=id_cols)
        print("Dropped ID columns:", id_cols)

    # Identify column types
    numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

    print("Numeric columns :", len(numeric_cols))
    print("Categorical columns :", len(categorical_cols))
    print("Final feature shape:", X.shape)

    # Impute missing values
    num_imputer = SimpleImputer(strategy="median")
    cat_imputer = SimpleImputer(strategy="most_frequent")

    X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])
    X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

    print("Missing values handled successfully")

except Exception as e:
    print("Feature preparation error:", e)


Dropped ID columns: ['SK_ID_CURR']
Numeric columns : 112
Categorical columns : 16
Final feature shape: (40000, 128)
Feature preparation error: name 'SimpleImputer' is not defined


In [7]:
# Cell 4: Train-Test Split

try:
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.25,
        random_state=42,
        stratify=y
    )

    print("Train shape :", X_train.shape)
    print("Test shape  :", X_test.shape)

    print("\nTrain target distribution:")
    print(y_train.value_counts(normalize=True))

    print("\nTest target distribution:")
    print(y_test.value_counts(normalize=True))

except Exception as e:
    print("Train-test split error:", e)


Train shape : (30000, 128)
Test shape  : (10000, 128)

Train target distribution:
TARGET
0    0.918933
1    0.081067
Name: proportion, dtype: float64

Test target distribution:
TARGET
0    0.9189
1    0.0811
Name: proportion, dtype: float64


In [None]:
# Cell 5: Advanced Model – XGBoost Initialization

try:
    xgb_model = XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        eval_metric="auc",
        scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),
        random_state=42,
        n_jobs=-1
    )

    print("XGBoost model initialized successfully")

except Exception as e:
    print("Model initialization error:", e)


XGBoost model initialized successfully


In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [12]:
# Encoding Pipeline

try:
    # Column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", "passthrough", numeric_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols)
        ]
    )

    print("Preprocessing pipeline created successfully")

except Exception as e:
    print("Preprocessor error:", e)


Preprocessing pipeline created successfully


In [13]:
# Applying Encoding to Train/Test Data

try:
    X_train_enc = preprocessor.fit_transform(X_train)
    X_test_enc = preprocessor.transform(X_test)

    print("Encoded Train Shape:", X_train_enc.shape)
    print("Encoded Test Shape :", X_test_enc.shape)

except Exception as e:
    print("Encoding error:", e)


Encoded Train Shape: (30000, 256)
Encoded Test Shape : (10000, 256)


In [15]:
# Cell 6: Training XGBoost Model

try:
    xgb_model.fit(X_train_enc, y_train)
    print("XGBoost training completed successfully")

except Exception as e:
    print("Model training error:", e)


XGBoost training completed successfully


In [17]:
# Cell 7: Model Evaluation

try:
    y_pred = xgb_model.predict(X_test_enc)
    y_prob = xgb_model.predict_proba(X_test_enc)[:, 1]

    auc_score = roc_auc_score(y_test, y_prob)

    print("ROC-AUC Score:", round(auc_score, 4))
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred))

except Exception as e:
    print("Model evaluation error:", e)


ROC-AUC Score: 0.7333

Classification Report:

              precision    recall  f1-score   support

           0       0.95      0.84      0.89      9189
           1       0.20      0.47      0.28       811

    accuracy                           0.81     10000
   macro avg       0.58      0.65      0.59     10000
weighted avg       0.89      0.81      0.84     10000



In [18]:
# Cell 8: Confusion Matrix

try:
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

except Exception as e:
    print("Confusion matrix error:", e)


Confusion Matrix:
 [[7700 1489]
 [ 430  381]]


In [19]:
# Cell 9: Risk Category Logic

def assign_risk_category(prob):
    if prob < 0.30:
        return "Low"
    elif prob < 0.60:
        return "Medium"
    else:
        return "High"


In [20]:
# Cell 10: Sample Prediction

try:
    sample = X_test.iloc[[0]]
    sample_enc = preprocessor.transform(sample)

    prob = xgb_model.predict_proba(sample_enc)[0][1]
    risk = assign_risk_category(prob)

    print("Sample Prediction")
    print("-----------------")
    print("Recovery Probability:", round(prob, 3))
    print("Risk Category:", risk)

except Exception as e:
    print("Sample prediction error:", e)


Sample Prediction
-----------------
Recovery Probability: 0.068
Risk Category: Low


In [21]:
# Cell 11: Saving Model (.pkl File)

try:
    joblib.dump(xgb_model, "xgboost_credit_risk_model.pkl")
    joblib.dump(preprocessor, "xgboost_preprocessor.pkl")

    print("Model & Preprocessor saved successfully")

except Exception as e:
    print("Saving error:", e)



Model & Preprocessor saved successfully


#### Insights: 
“Overall Performance:

ROC-AUC = 0.7333

Model has good discriminatory power

It can reasonably separate defaulters vs non-defaulters

Class-wise Understanding:
 Non-Defaulters (Class 0)

Precision: 95% → predictions are very reliable

Recall: 84% → most safe customers correctly identified

F1-score: 0.89 → strong performance

 Model is very confident and stable for low-risk borrowers.

  Defaulters (Class 1)

Recall: 47% → model catches nearly half of risky borrowers

Precision: 20% → some false alarms exist (acceptable in recovery use-case)

F1-score: 0.28 → scope for improvement via tuning

This is expected in imbalanced financial datasets.

Confusion Matrix Insight:

Correctly identified defaulters: 381

Missed defaulters (false negatives): 430

False alarms: 1489

Model prefers risk sensitivity over silence, which is good for loan recovery teams.”