In [None]:
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    confusion_matrix,
    roc_auc_score,
    brier_score_loss
)
import joblib
from catboost import CatBoostClassifier
from sklearn.utils import resample
from scipy.spatial.distance import pdist, squareform
from scipy.spatial.distance import pdist, squareform


In [None]:
# Open the pickle file in read-binary mode
with open("../models/needed_variables.pkl", "rb") as f:
    data = joblib.load(f)

print(data)

with open("../models/needed_variables2.pkl", "rb") as f:
    data2 = joblib.load(f)

print(data2)

{'X_test':         num__annual_inc  num__avg_cur_bal  num__bc_open_to_buy  num__bc_util  \
951469        -0.908541         -0.684762            -0.655219      1.311575   
951470        -0.414255          0.993807             0.145469      0.201937   
951471        -0.505971         -0.486715             0.213401     -0.598282   
951472         0.054126          0.329806            -0.415431     -0.971717   
951473        -0.278432         -0.576017            -0.602115      1.293792   
...                 ...               ...                  ...           ...   
985662        -0.418456         -0.612487             0.013196     -2.156042   
985663         3.554732         -0.673893             4.323966     -1.871519   
985664        -0.803522          0.253365            -0.519485      0.312190   
985665        -0.348444          1.316056            -0.565078     -1.611893   
985666         0.596720          0.368027            -0.032593     -1.227787   

        num__delinq_2yrs  nu

In [None]:
# extract variables
y_test = data.get("y_test")
y_pred = data.get("y_pred_test")
y_proba = data.get("y_proba")

# Performance metrics 

## Accronyms used :

some accrocnyms in terms of a confusion matrix (for classification):
TP (True Positive): correctly predicted positive cases

TN (True Negative): correctly predicted negative cases

FP (False Positive): negatives wrongly predicted as positives

FN (False Negative): positives wrongly predicted as negatives

## Accuracy
accuracy = nb of correct predictions / total number of predictions
accuracy = (TP+TN) / (TP + TN + FP + FN)

interpretation:
Out of all predictions, 0.723 were correct (either the model correctly identified a positive case or correctly rejected a negative case).

Only 0.277 were misclassified. This may be considered high if mistakes have a high cost.

Although this score does not guarantee the overall well performance of the model. Accuracy alone can be misleading when the dataset is
imbalanced (e.g., 95% negatives, 5% positives). In such cases, a model predicting only the majority class could have high
accuracy (95%) but fail to detect the minority class. That’s why we also look at sensitivity, specificity, AUC, and Brier score for a fuller picture.

In our case, the data is **. So let's look at other performance metrics

In [13]:
# 1. Accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7227323235276917

## Sensitivity

Sensitivity measures the ability of a model to correctly identify actual positives.
sensitivity = TP / (TP + FN)

High sensitivity = few positives are missed.
Low sensitivity = the model fails to catch many true positives.


How does it complement accuracy? :
Accuracy looks at overall correctness (both positives and negatives). Sensitivity zooms in on how well positives are captured.
We can have high accuracy but low sensitivity in imbalanced cases.
Example: If only 1% of patients have a rare disease, a model predicting “healthy” for everyone is 99% accurate but has 0% sensitivity.

We use sensitivity when missing positives is costly.

Of all the actual positive cases, our model correctly detected 52%. The model seems to do fairly well overall (72.3%), but its performance on the positive class is weak. This usually happens in imbalanced datasets. It might be acceptable if false positives are more costly than false negatives, such as in credit scoring cases.

Sensitivity tells you how many actual positives caught, but it ignores how many false alarms (false positives) generated.
Hence sensitivity alone can be misleading: a model that predicts everything as positive has 100% sensitivity but terrible precision.
Accuracy + Sensitivity still leave gaps : Even if a model shows high accuracy and high sensitivity, it can still be misleading because neither metric tells you anything about false positives.



In [14]:
# 2. Sensitivity (Recall : the true positive rate)
sensitivity = recall_score(y_test, y_pred)
sensitivity

0.5189917731489585

## Specificity

High specificity = very few false alarms (few false positives).
Low specificity = many false alarms (lots of false positives).

sensitivity vs specificity:
Sensitivity (Recall) = how well positives are caught (minimizing false negatives).
Specificity = how well negatives are caught (minimizing false positives).

Of all the risky applicants, the model correctly recognized as negative 76.4%. When the true condition is negative, the model correctly identifies about 3 out of 4. This is substantially better than its performance on positives.

This tells us that the model is biased toward predicting negatives (high specificity compared to sensitivity). It performs much better at ruling out negatives than at detecting positives. In the case of credit scoring, this might be more acceptable if false positives are costlier than false negatives.


In [15]:
# 3. Specificity (true negative rate)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
specificity = tn / (tn + fp)
specificity

0.7635948744953485

## AUC

It is one of the most widely used model evaluation metrics.

AUC stands for Area Under the Curve.
The “curve” here usually means the ROC curve (Receiver Operating Characteristic curve).
So, ROC-AUC measures the area under the ROC curve.

The ROC curve plots two things at various decision thresholds:
True Positive Rate (TPR) = Sensitivity in an axis (usually y axis)
False Positive Rate (FPR) = 1 – Specificity in the other axis
Each point on the curve corresponds to a threshold used to classify probabilities into positive/negative.
A perfect model under ROC, which at some threshold has a TPR of 1.0 and a FPR of 0.0, can be represented by either a point at (0, 1)
The curve shows the trade-off between sensitivity and specificity as you move the threshold.

The area under the ROC curve (AUC) represents the probability that the model, if given a randomly chosen positive and negative example, will rank the positive higher than the negative. While the ROC curve gives a visual representation of the trade-off between sensitivity and specificity for different thresholds, AUC provides a numerical summary of the performance of the model across all threshold values. 

Interpreting AUC:
AUC = 1.0 : perfect model (separates positives and negatives flawlessly).
    The perfect model under ROC described above, containing a square with sides of length 1, has an area under the curve (AUC) of 1.0. This means there is a 100% probability that the model will correctly rank a randomly chosen positive example higher than a randomly chosen negative example.
    It is the probability that a randomly chosen positive gets a higher score than a randomly chosen negative.
AUC = 0.5 : worthless model (random guessing).
AUC < 0.5 : actively misleading (worse than random; could be flipped).

Our model has an AUC of 0.71. This indicates moderate ability to separate positives from negatives (better than random guessing = 0.5, but far from excellent). An AUC of 0.71 means that, on average, if we randomly take a positive and a negative example, the model assigns a higher score to the positive 71% of the time.

Sensitivity = 52% is low, while specificity is much higher (76.4%). That imbalance suggests the threshold is set in a way that favors negatives. Since the AUC is moderate (0.71, not close to 0.5), the model has the ability to trade some specificity for more sensitivity.


In [19]:
# 4. AUC (Area under ROC curve)
auc = roc_auc_score(y_test, y_proba)
auc

0.7098773799495443

## Brier score

Brier score loss is another important metric, but unlike accuracy, sensitivity, or AUC, it evaluates probabilistic predictions rather than just yes/no classifications.
It measures the accuracy of probabilistic predictions.
The Brier score measures the mean squared difference between the predicted probability and the actual outcome (0 or 1). It penalizes both wrong predictions and overconfident predictions.
Interpretation:
0 = perfect probabilistic predictions.
1 = worst possible (completely wrong with absolute certainty).

A score of 0.182 is quite decent — it suggests that the model’s predicted probabilities are reasonably well calibrated, i.e., when it predicts a 70% chance of positive, positives actually happen about 70% of the time.

In [20]:
# 5. Brier Score
brier = brier_score_loss(y_test, y_proba)
brier

0.18218767491983826

## summarry / global interpretation

The model is imbalanced in its performance — it’s moderately strong at identifying negatives but weak at catching positives. The overall accuracy (72.3%) hides this imbalance. This model is moderately good at ranking cases and produces fairly well-calibrated probabilities, we may consider modifying the threashold.

In practice, the AUC suggest we could lower the threshold to classify more cases as positive. This would increase sensitivity (catch more positives). But this comes at the cost of lower specificity (more false alarms). If the cost of missing positives is high, we could consider shifting the threshold to increase sensitivity, even if specificity drops.
If negatives are more important, the current setup may be acceptable.


In [21]:
#summarry / global interpretation

print(f"Accuracy: {accuracy:.3f}")
print(f"Sensitivity: {sensitivity:.3f}")
print(f"Specificity: {specificity:.3f}")
print(f"AUC: {auc:.3f}")
print(f"Brier Score: {brier:.3f}")

Accuracy: 0.723
Sensitivity: 0.519
Specificity: 0.764
AUC: 0.710
Brier Score: 0.182


# Stability

In [23]:
# Open the pickle file in read-binary mode
with open("../models/step2_catboost_timesplit.pkl", "rb") as f:
    model = joblib.load(f)

print(model)

<catboost.core.CatBoostClassifier object at 0x00000267297026C0>


In [None]:
#import needed variables
X_train = data2.get("X_train")
y_train = data2.get("y_train")
X_test = data.get("X_test")

In [30]:
n_bootstrap = 3   # number of resampled datasets to train on
predictions_list = []
scores = []

for i in range(n_bootstrap):
    # 1. Bootstrap sample from training data
    X_resampled, y_resampled = resample(X_train, y_train, replace=True, random_state=i)

    # 2. Train CatBoost
    model_boot = model
    model_boot.fit(X_resampled, y_resampled)

    # 3. Predict on the fixed test set
    preds = model_boot.predict_proba(X_test)[:, 1]  # probability of positive class
    predictions_list.append(preds) # list of 1D arrays of predictions

    # 4. Evaluate performance
    score = accuracy_score(y_test, (preds > 0.5).astype(int))
    scores.append(score)


predictions_array = np.array(predictions_list) # predictions_array has shape (n_bootstrap, n_test)


In [None]:
# compute distances between predictions

# pdist computes all pairwise distances between the rows of the array, so between each bootstrap
dist_matrix = squareform(pdist(predictions_array, metric="euclidean"))
# dist_matrix is a full symmetric matrix of shape (n_bootstrap, n_bootstrap)

# average distance across all pairs
avg_dist = np.mean(dist_matrix[np.triu_indices_from(dist_matrix, k=1)])

print(f"Average pairwise Euclidean distance between models: {avg_dist:.4f}")


# Performance Stability: variance of accuracy
mean_acc = np.mean(scores)

print(f"Average accuracy: {mean_acc:.3f}")


Average pairwise Euclidean distance between models: 10.5912
Average accuracy: 0.735


In [35]:
avg_dist_per_point = avg_dist / len(X_test)
avg_dist_per_point


0.0003097027243613833

That means on average, across bootstrap models, predictions differ by less than 1% in probability per test point. This is stable with respect to resampling the training set.