# Libraries

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# import the required libraries
import pandas as pd
from pandasql import sqldf
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import chi2_contingency
import scorecardpy as sc
from scorecardpy.LogisticRegStats import LogisticRegStats
import random as rd
import re
from IPython.display import display
from matplotlib.backends.backend_pdf import PdfPages
from pathlib import Path

# Data

In [None]:
# data prepare ------
# Load the German credit demo data and build a synthetic development sample:
# a binary target, injected data-quality issues, 32 stacked copies and a
# random month-end reporting date per row.
smp_full = sc.germancredit()

# Binary target: 1 when creditability is 'bad', 0 otherwise.
# Cast to float up front: NaN is written into this column below, and writing
# NaN into an integer column via .loc is a deprecated silent upcast
# (FutureWarning in pandas >= 2.1). The resulting dtype is the same float64.
smp_full['target'] = (
    smp_full['creditability'].apply(lambda x: 1 if x == 'bad' else 0).astype(float)
)

# Same reasoning for credit.amount (assumed to be loaded as an integer column).
smp_full['credit.amount'] = smp_full['credit.amount'].astype(float)

# Injected data-quality issues (label-based slices, end points inclusive):
smp_full.loc[0:99, 'credit.amount'] = np.nan        # missing values
smp_full.loc[100:199, 'credit.amount'] = -9999      # special value
# linear transform of credit.amount, i.e. a perfectly correlated variable
smp_full['credit.amount.corr'] = smp_full['credit.amount'] * 2 - 1000
smp_full.loc[0:99, 'purpose'] = np.nan              # missing categorical values
smp_full.loc[100:109, 'target'] = np.nan            # blank target

# Artificially multiplying the dataset: five doublings == 2**5 = 32 stacked copies
# (same rows, same order and same index as concatenating the frame with itself 5 times)
n_doublings = 5
smp_full = pd.concat([smp_full] * 2 ** n_doublings)

# Generate a list of all month-end dates between Jan 2020 and Sep 2025
month_ends = pd.date_range(start="2020-01-31", end="2025-09-30", freq="ME")

# Randomly assign one of these month-end dates to each row (seeded for reproducibility)
np.random.seed(123)
smp_full["RepDate_end"] = np.random.choice(month_ends, size=smp_full.shape[0])
smp_full = smp_full.reset_index(drop=True)

# 1. Preliminary analysis of variables (missing values, outliers, concentration/distribution) - based on smp_full

In [None]:
# name of the good/bad label column (built in the data cell as 1 = bad, 0 = otherwise)
target = "target"

# date column (e.g. snapshot date or application date)
date = "RepDate_end"

# other columns that are not variables (the raw label the target was derived from)
var_skip = ["creditability"]

# special values for numeric variables; passed to the analysis and binning calls below
special_values = [-9999]

In [None]:
# all columns that are not variables: user-defined skips plus the target and the date column
var_skip_all = var_skip + [target, date]

# variables checks summary - output to Excel
# returns summaries for categorical and numeric variables plus var_list,
# which is reused below to select the development columns
var_cat_summary, var_num_summary, var_list = sc.expl_analysis(
    smp_full, var_skip_all, special_values
)

# heatmap for the missing values
# NOTE(review): this call passes var_skip (not var_skip_all), so target and date are not
# skipped here, while fig_height is derived from len(var_list) - confirm this is intended.
sc.miss_heatmap(smp_full, var_skip, fig_width=10, fig_height=len(var_list)/4)

In [None]:
# # variables distribution - TBD - only for numerical vars
# sc.var_distr(smp_full, ['age.in.years'], groupby = target, special_values = special_values)

In [None]:
# analysis of shares of missings and bads in target over time
def nan_rate(target):
    """Return the share of missing values in *target*.

    Used as a groupby aggregation, so the function name becomes the name of
    the resulting column.

    Args:
        target: a pandas Series (or any 1-D array-like) of labels.

    Returns:
        Fraction of elements that are missing (NaN/None), or NaN when
        *target* is empty.
    """
    # Vectorised and dtype-agnostic: works on object columns too, and an empty
    # input yields NaN instead of raising ZeroDivisionError.
    return pd.Series(target).isna().mean()

def bad_rate(target):
    """Return the share of bads (1) among labelled observations (0 or 1).

    Missing and any other values are ignored in both numerator and
    denominator. Used as a groupby aggregation, so the function name becomes
    the name of the resulting column.

    Args:
        target: a pandas Series (or any 1-D array-like) of labels.

    Returns:
        bads / (goods + bads), or NaN when there is no labelled observation.
    """
    values = pd.Series(target)
    bads = (values == 1).sum()
    labelled = bads + (values == 0).sum()
    # A group without any 0/1 label has no defined bad rate; return NaN rather
    # than failing the whole aggregation with a ZeroDivisionError.
    if labelled == 0:
        return np.nan
    return bads / labelled

# share of blank targets and bad rate per reporting date
# (the aggregated columns are named after the functions: "nan_rate", "bad_rate")
target_ot = smp_full.groupby(date)[target].agg([nan_rate, bad_rate])

# bad rate over time
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    target_ot.index,
    target_ot["bad_rate"],
    color="steelblue",
    linestyle="-",
    marker="o",
)
ax.set_title("Bad Rate Over Time", fontsize=14)
ax.set_ylabel("Bad Rate", fontsize=12)
ax.set_xlabel("Date", fontsize=12)
ax.grid(True, alpha=0.6, linestyle="--")
fig.tight_layout()
plt.show()


# dates with blank target (kept as the last expression so the notebook displays it)
target_ot.loc[target_ot["nan_rate"] > 0, ["nan_rate"]]

# 2. Development sample creation

In [None]:
# development window (both end dates inclusive)
in_dev_window = smp_full[date].between('2020-01-31', '2024-06-30')

# keep only rows inside the window and only the columns used for the development:
# the analysed variables plus the target and the date column
smp_dev = smp_full.loc[in_dev_window, var_list + [target, date]]

In [None]:
# target distribution, including blank targets
print(smp_dev.groupby(target, dropna=False).size())
# records with a blank target cannot be used for development - drop them
smp_dev = smp_dev.dropna(subset=[target])

In [None]:
# train/test split as 80/20 (seeded), each part re-indexed from 0
train, test = (
    part.reset_index(drop=True)
    for part in sc.split_df(smp_dev, ratio=0.8, seed=123).values()
)

# train/test sample size: number of bads, number of observations and bad rate
sample_summary = pd.concat(
    [
        pd.Series({"sample": label, "bads": part[target].sum(), "obs": part[target].count()})
        for label, part in (("train", train), ("test", test))
    ],
    axis=1,
).T
sample_summary["BR"] = sample_summary["bads"] / sample_summary["obs"]
sample_summary

# 3. Automated binning

In [None]:
# min bin size for fine classing (passed to sc.woebin as min_perc_fine_bin)
min_perc_fine_bin = 0.05

# min bin size for coarse classing (passed to sc.woebin as count_distr_limit)
count_distr_limit = 0.05

# max number of coarse classes: derived from the minimum bin share
# (1 / 0.05 = 20); int() truncates, so a non-integer ratio is rounded down
bin_num_limit = int(1 / count_distr_limit)

# number of decimals for bin intervals
bin_decimals = 4

In [None]:
# extra variables to leave out of the binning on top of var_skip_all (none by default)
var_inf = []

# shared binning settings, defined in the parameters cell
woebin_settings = dict(
    special_values=special_values,
    min_perc_fine_bin=min_perc_fine_bin,
    count_distr_limit=count_distr_limit,
    bin_num_limit=bin_num_limit,
    print_step=10,
    ignore_datetime_cols=False,
    bin_decimals=bin_decimals,
)

# automated binning on the train sample: returns fine and coarse classing
fine_class, coarse_class = sc.woebin(
    train,
    y=target,
    # x = ["age_in_years", "status_of_existing_checking_account", "foreign_worker"],
    var_skip=var_skip_all + var_inf,
    **woebin_settings,
)

In [None]:
# automated filtering of variables using iv and correlation from the fine classing
var_list, var_rej_fine = sc.vars_filter(
    train, fine_class, corr_threshold=0.6, iv_threshold=0.1
)

# coarse classing restricted to the variables that survived the filter
coarse_class_filt = {
    name: coarse_class[name] for name in coarse_class if name in var_list
}

# binning results stacked into single data frames
fine_class_df = pd.concat(fine_class.values()).reset_index(drop=True)
coarse_class_df = pd.concat(coarse_class.values()).reset_index(drop=True)

# iv for variables after automated binning
fine_class_iv = sc.vars_iv(fine_class)
coarse_class_iv = sc.vars_iv(coarse_class)
coarse_class_filt_iv = sc.vars_iv(coarse_class_filt)

# extracting results to Excel (one sheet per table, written in this order)
sheets_3_1 = [
    ("fine_classing", fine_class_df),
    ("coarse_classing", coarse_class_df),
    ("fine_class_iv", fine_class_iv),
    ("coarse_class_iv", coarse_class_iv),
    ("rejected_vars_fine_class", var_rej_fine),
    ("coarse_class_filt_iv", coarse_class_filt_iv),
]
with pd.ExcelWriter(Path("3_1_automated_binning.xlsx"), engine="xlsxwriter") as writer:
    for sheet_name, table in sheets_3_1:
        table.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# # binning visualization
# var_show = ['status.of.existing.checking.account', 'credit.history','property']
# coarse_class_selected = {}
# # coarse_class_show = {k: v for k, v in coarse_class.items() if k in var_show}
# for k in var_show:
#     coarse_class_selected[k] = coarse_class[k]
# sc.woebin_plot(coarse_class_selected)

# 4. Binning adjustments 

In [None]:
# manual review and adjustment of binning (results are being saved to save_breaks_list and can be loaded from load_breaks_list)
# Both load_breaks_list and save_breaks_list point to the same file here.
# NOTE(review): presumably previously saved adjustments are picked up on re-run and then
# overwritten with the new result - confirm against sc.woebin_adj.
# The returned breaks_list is later passed to eval(), i.e. it is a string of Python source.
breaks_list = sc.woebin_adj(
    train,
    y=target,
    # x = ['N103_1'],
    load_breaks_list="4_1_breaks_list_adj.py",
    save_breaks_list="4_1_breaks_list_adj.py",
    bins=coarse_class_filt,  # used in case load_breaks_list is None or not exists
    init_bins=fine_class,
    adj_all_var=False,  # False - only non-monotonic woe variables
    show_init_bins=True,  # True - to show the table with Fine classing results
    special_values=special_values,
)

In [None]:
# variables passed as var_skip to the re-binning with the adjusted breaks
# NOTE(review): the name suggests exclusion because of an unsuitable WoE trend - confirm
vars_trend_excl = [
    'credit.amount',
]

In [None]:
# coarse classing after manual adjustments
# NOTE(review): breaks_list is a string of Python source that originates from
# 4_1_breaks_list_adj.py; eval() executes it, so that file must be trusted.
adjusted_breaks = eval(breaks_list)
_, coarse_class_adj = sc.woebin(
    train,
    y=target,
    x=list(adjusted_breaks.keys()),
    breaks_list=breaks_list,
    var_skip=vars_trend_excl,
    special_values=special_values,
    min_perc_fine_bin=min_perc_fine_bin,
    count_distr_limit=count_distr_limit,
    bin_num_limit=bin_num_limit,
    print_step=10,
    ignore_datetime_cols=False,
    bin_decimals=bin_decimals,
)

# extracting results to Excel: adjusted binning tables and their IV
coarse_class_adj_df = pd.concat(coarse_class_adj.values()).reset_index(drop=True)
coarse_class_adj_iv = sc.vars_iv(coarse_class_adj)

sheets_4_2 = [
    ("coarse_class_adj", coarse_class_adj_df),
    ("coarse_class_adj_iv", coarse_class_adj_iv),
]
with pd.ExcelWriter(Path("4_2_binning_adjustments.xlsx"), engine="xlsxwriter") as writer:
    for sheet_name, table in sheets_4_2:
        table.to_excel(writer, sheet_name=sheet_name, index=False)

# applying woe transformations on train and test samples
train_woe = sc.woebin_ply(train, bins=coarse_class_adj)
test_woe = sc.woebin_ply(test, bins=coarse_class_adj)

# woe column names: one "<variable>_woe" per adjusted binning
vars_woe = [var + "_woe" for var in coarse_class_adj]

In [None]:
# IV for variables by defined subsamples (period, product etc.)
# sc.iv_group(train_woe,
#             var_list = ["age_in_years_woe"],
#             groupby = "personal_status_and_sex",
#             y = target)

# 5. Correlation analysis

In [None]:
# correlation matrix of the woe-transformed variables on the train sample
train_woe_corr = train_woe[vars_woe].corr()

# # plotting correlation heatmap
# plt.figure(figsize=(40, 24))
# sns.heatmap(train_woe[vars_woe].corr(), cmap="YlGnBu", annot=True)
# plt.show()

# automated filtering of variables using iv and correlation from the adjusted coarse classing
vars_cand_1, var_rej_corr = sc.vars_filter(
    train,
    coarse_class_adj,
    corr_threshold=0.6,
    iv_threshold=0.1
)

with pd.ExcelWriter(Path("5_1_correlation_analysis.xlsx"), engine="xlsxwriter") as writer:
    # the index holds the row variable names of the correlation matrix;
    # it must be written, otherwise the rows of the sheet are unlabelled
    train_woe_corr.to_excel(writer, sheet_name="train_woe_corr", index=True)
    var_rej_corr.to_excel(writer, sheet_name="correlation_rej", index=False)

# re-applying woe transformations on train and test samples, restricted to the
# target and the variables that passed the filter
train_woe = sc.woebin_ply(train[[target] + vars_cand_1], bins=coarse_class_adj)
test_woe = sc.woebin_ply(test[[target] + vars_cand_1], bins=coarse_class_adj)

# check if test_woe contains null values
print(test_woe.isnull().any())

# 6. Logistic regression

## 6.1 Initial candidate

In [None]:
# woe column names of the candidate variables
vars_woe = [var + "_woe" for var in vars_cand_1]

# target and variables
y_train, X_train = train_woe[target], train_woe[vars_woe]
y_test, X_test = test_woe[target], test_woe[vars_woe]

# train and test stacked together
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

# logistic regression ------
# elastic-net regularised model, fitted on the train sample only
lr_settings = dict(
    penalty="elasticnet",
    C=0.03,
    l1_ratio=0.3,
    solver="saga",
    n_jobs=-1,
    max_iter=5000,
)
lr = LogisticRegression(**lr_settings)
lr.fit(X_train, y_train)

# predicted probability of the positive class (second column of predict_proba)
train_pred, test_pred = (
    lr.predict_proba(features)[:, 1] for features in (X_train, X_test)
)
# performance ks & roc ------
train_perf = sc.perf_eva(y_train, train_pred, title="train")
test_perf = sc.perf_eva(y_test, test_pred, title="test")

In [None]:
def _bad_rate_summary(labels):
    """Return observation count, number of bads and bad rate (4 decimals) of *labels*."""
    total = labels.count()
    bads = int(labels.sum())
    return {"Total": total, "Bads": bads, "Bad Rate": round(bads / total, 4)}


# train / test bad rate
train_br = _bad_rate_summary(y_train)
test_br = _bad_rate_summary(y_test)

# combining bad rate with performance: one column per sample, one row per metric;
# the "pic" entry returned by perf_eva is dropped from the table
perf = (
    pd.concat(
        {
            "train": pd.Series({**train_br, **train_perf}),
            "test": pd.Series({**test_br, **test_perf}),
        },
        axis=1,
    )
    .convert_dtypes()
    .loc[lambda table: ~table.index.isin(["pic"])]
)
perf

In [None]:
# score ------
# scorecard built from the adjusted binning and the fitted regression
card = sc.scorecard(coarse_class_adj, lr, X_train.columns, start_zero=True)
# credit score for the train and test samples
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)

# calculating the weights of the variables:
# weight = max points of the variable / sum of max points over all variables
# (the 'basepoints' entry is not a variable and is excluded)
scorecard_points = pd.concat(card, ignore_index=True)
scorecard_points_vars = scorecard_points[scorecard_points['variable'] != 'basepoints']
max_points = scorecard_points_vars.groupby('variable')['points'].max().reset_index(name='max_points')
max_points['weight'] = max_points['max_points'] / max_points['max_points'].sum()

# export to Excel
with pd.ExcelWriter(Path("6_1_initial_candidate.xlsx"), engine="xlsxwriter") as writer:
    # perf keeps the metric names in its index; they must be written,
    # otherwise the rows of the sheet cannot be told apart
    perf.to_excel(writer, sheet_name="perf_train_test", index=True, index_label="metric")
    scorecard_points.to_excel(writer, sheet_name="scorecard_points", index=False)
    max_points.to_excel(writer, sheet_name="variable_weights", index=False)

## 6.2 Excluding useless variables

In [None]:
# exclusion of variables with a low contribution to the scorecard:
# only variables whose maximum points exceed the threshold are kept
min_max_points = 10
is_kept = max_points['max_points'] > min_max_points
vars_filtered = max_points.loc[is_kept, 'variable'].tolist()

# excluded variables - the exact complement of the kept ones, so a variable
# with max_points equal to the threshold is shown here instead of silently vanishing
max_points[~is_kept]

In [None]:
# final candidate variables (a copy of the filtered list) and their woe column names
vars_cand_2 = list(vars_filtered)
vars_woe = [var + "_woe" for var in vars_cand_2]

# target and variables
y_train, X_train = train_woe[target], train_woe[vars_woe]
y_test, X_test = test_woe[target], test_woe[vars_woe]

# logistic regression ------
# same elastic-net settings as for the initial candidate, refitted on the reduced variable set
lr_settings = dict(
    penalty="elasticnet",
    C=0.03,
    l1_ratio=0.3,
    solver="saga",
    n_jobs=-1,
    max_iter=5000,
)
lr = LogisticRegression(**lr_settings)
lr.fit(X_train, y_train)

# predicted probability of the positive class (second column of predict_proba)
train_pred, test_pred = (
    lr.predict_proba(features)[:, 1] for features in (X_train, X_test)
)
# performance ks & roc ------
train_perf = sc.perf_eva(y_train, train_pred, title="train")
test_perf = sc.perf_eva(y_test, test_pred, title="test")

In [None]:
def _bad_rate_summary(labels):
    """Return observation count, number of bads and bad rate (4 decimals) of *labels*."""
    total = labels.count()
    bads = int(labels.sum())
    return {"Total": total, "Bads": bads, "Bad Rate": round(bads / total, 4)}


# train / test bad rate
train_br = _bad_rate_summary(y_train)
test_br = _bad_rate_summary(y_test)

# combining bad rate with performance: one column per sample, one row per metric;
# the "pic" entry returned by perf_eva is dropped from the table
perf = (
    pd.concat(
        {
            "train": pd.Series({**train_br, **train_perf}),
            "test": pd.Series({**test_br, **test_perf}),
        },
        axis=1,
    )
    .convert_dtypes()
    .loc[lambda table: ~table.index.isin(["pic"])]
)
perf

In [None]:
# score ------
# scorecard built from the adjusted binning and the refitted regression
card = sc.scorecard(coarse_class_adj, lr, X_train.columns, start_zero=True)
# credit score for the train and test samples
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)

# calculating the weights of the variables:
# weight = max points of the variable / sum of max points over all variables
# (the 'basepoints' entry is not a variable and is excluded)
scorecard_points = pd.concat(card, ignore_index=True)
scorecard_points_vars = scorecard_points[scorecard_points['variable'] != 'basepoints']
max_points = scorecard_points_vars.groupby('variable')['points'].max().reset_index(name='max_points')
max_points['weight'] = max_points['max_points'] / max_points['max_points'].sum()

# export to Excel
with pd.ExcelWriter(Path("6_2_final_candidate.xlsx"), engine="xlsxwriter") as writer:
    # perf keeps the metric names in its index; they must be written,
    # otherwise the rows of the sheet cannot be told apart
    perf.to_excel(writer, sheet_name="perf_train_test", index=True, index_label="metric")
    # lr_output.to_excel(writer, sheet_name="regr_output", index=False)
    scorecard_points.to_excel(writer, sheet_name="scorecard_points", index=False)
    max_points.to_excel(writer, sheet_name="variable_weights", index=False)

In [None]:
# binning visualization for the variables of the final candidate only
coarse_class_final = {
    name: coarse_class_adj[name]
    for name in coarse_class_adj
    if name in vars_cand_2
}
sc.woebin_plot(coarse_class_final)

In [None]:
# coarse_class_vars = [k for k, v in coarse_class_adj.items() if k + "_woe" in vars_final]

# # manual review and adjustment of binning (results are being saved to save_breaks_list and can be loaded from load_breaks_list)
# breaks_list_final = sc.woebin_adj(
#     train,
#     y=target,
#     x=["agro_flag"],
#     # load_breaks_list="3_5_breaks_list_adj.py",
#     # save_breaks_list="9_9_breaks_list_adj.py",
#     bins=coarse_class_filt,  # used in case load_breaks_list is None or not exists
#     init_bins=fine_class,
#     adj_all_var=True,  # False - only non-monotonic woe variables
#     show_init_bins=True,  # True - to show the table with Fine classing results
#     special_values=special_values,
# )

# 7. Testing

In [None]:
# woe transformation and score for the full sample (all reporting dates)
smp_testing = sc.woebin_ply(smp_full, bins=coarse_class_adj, print_step=1)
smp_testing["score"] = sc.scorecard_ply(smp_full, card, print_step=0)

# check the model inputs, the score and the target for null values
cols_to_check = vars_woe + ['score', 'target']
print(smp_testing[cols_to_check].isnull().any())

In [None]:
# records with a blank target cannot be used for testing - drop them
smp_testing = smp_testing.dropna(subset=[target])

In [None]:
# date column - same value as the one assigned in the configuration cell
date = "RepDate_end"
# testing sample restricted to the same date window as the development sample
# NOTE(review): smp_testing_outcome is not referenced by any later cell shown here;
# the performance_testing call receives smp_testing - confirm which one is intended.
smp_testing_outcome = smp_testing[smp_testing[date].between('2020-01-31', '2024-06-30')]

# adding target
# (assignment aligns on the index; train and test were re-indexed after the split)
train_score[target] = train[target]
test_score[target] = test[target]

In [None]:
# performance testing of the final candidate; results go to output_path
# inputs: the scored full sample, train/test scores (with target), train/test woe
# samples and the woe variables of the final model
# NOTE(review): outcome_period=12 is presumably expressed in months - confirm against
# sc.performance_testing
sc.performance_testing(
    smp_testing=smp_testing,
    train_score=train_score,
    test_score=test_score,
    train_woe=train_woe,
    test_woe=test_woe,
    vars_woe=vars_woe,
    target=target,
    date_col=date,
    groupby_col="housing",
    output_path="7_1_testing_results.xlsx",
    outcome_period=12,
)

# 8. Recalibration

In [None]:
def _scored_sample(sample):
    """Score *sample* with the scorecard and attach its target and the PD implied by the score."""
    scored = sc.scorecard_ply(sample, card, print_step=0)
    scored['target'] = sample['target']
    scored['pd_regr'] = sc.pd_from_score(scored['score'])
    return scored


# preparing sample for recalibration: scored train and test stacked together
train_score = _scored_sample(train)
test_score = _scored_sample(test)
smp_calib_score = pd.concat([train_score, test_score], ignore_index=True)

# assigning ratings: score bands (right-closed, lowest edge included) mapped to rating labels;
# scores outside 0-1000 fall in no band and get no rating
bins = [0,500,540,580,620,660,700,740,780,1000]
labels = ['4.5','4.0','3.5','3.0','2.5','2.0','1.5','1.0','0.5']
smp_calib_score['rating'] = pd.cut(
    smp_calib_score['score'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

In [None]:
# calibration on the combined train+test sample; the returned intercept and slope
# are applied to the score as score*slope + intercept in the next cell
intercept, slope = sc.calibration(smp_calib_score, score='score', target='target')
print(intercept, slope)

In [None]:
# recalibrated score: linear transformation of the original score, stored as integer
# NOTE(review): astype(int) truncates towards zero rather than rounding - confirm intended
recalibrated = smp_calib_score['score'] * slope + intercept
smp_calib_score['score_new'] = recalibrated.astype(int)

# ratings re-assigned on the recalibrated score with the same bands and labels
smp_calib_score['rating_new'] = pd.cut(
    smp_calib_score['score_new'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
smp_calib_score