In [11]:
import pandas as pd
import numpy as np
import joblib


train = pd.read_csv("../data/processed/train_pd.csv")
test = pd.read_csv("../data/processed/test_pd.csv")

model = joblib.load("../models/logistic_pd_model.pkl")

In [19]:
feature_names = [c for c in train.columns if c not in ["default", "PD", "score"]]
coefs = model.coef_[0]

coef_df = pd.DataFrame({
    "feature": feature_names,
    "coef": coefs
}).sort_values(by="coef", ascending=False)
test[feature_names] = test[feature_names].fillna(0)


In [20]:
BASE_SCORE = 600
PDO = 20
B = PDO / np.log(2)
# Base offset: log-odds of default in TRAIN
p = train["default"].mean()
A = BASE_SCORE - B * np.log((1 - p) / p)

In [21]:
scorecard = []

for feature in feature_names:
    for val in train[feature].unique():
        point = - coef_df.loc[coef_df.feature==feature, "coef"].values[0] * val * B
        scorecard.append({
            "feature": feature,
            "bin": val,
            "points": point
        })

scorecard_df = pd.DataFrame(scorecard)

In [22]:
import numpy as np

# Extract features as numpy arrays
X_train = train[feature_names].values
X_test = test[feature_names].values

# Coefficients as numpy array
coefs = coef_df["coef"].values

# Compute scores in one vectorized operation
train["score"] = (-X_train @ coefs * B) + A
test["score"] = (-X_test @ coefs * B) + A

In [23]:
scorecard_df.to_csv("../data/processed/scorecard_points.csv", index=False)
train.to_csv("../data/processed/train_score.csv", index=False)
test.to_csv("../data/processed/test_score.csv", index=False)

In [24]:
print(train[["score","PD"]].head())
print(test[["score","PD"]].head())

        score        PD
0  518.956260  0.324527
1  560.380333  0.171137
2  643.417164  0.019631
3  648.819385  0.037602
4  555.429517  0.201578
        score        PD
0  503.361817  0.446029
1  555.317616  0.187337
2  568.494928  0.142194
3  598.102228  0.109870
4  542.842690  0.210807
