# Libraries

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# import the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import chi2_contingency
import scorecardpy as sc
import random as rd
import re
from IPython.display import display
from matplotlib.backends.backend_pdf import PdfPages

# Data

In [None]:
# data prepare ------
# load germancredit data
smp_full = sc.germancredit()
# binary target: 1 when creditability is 'bad', 0 otherwise; the source column is then dropped
smp_full['will_default'] = smp_full['creditability'].apply(lambda label: 1 if label == 'bad' else 0)
smp_full = smp_full.drop(columns='creditability')
# blank out two variables on the rows labelled 0-99
# (presumably to give the missing-value steps below something to work on - confirm)
smp_full.loc[0:99, 'credit.amount'] = np.nan
smp_full.loc[0:99, 'purpose'] = np.nan

# stack 32 (= 2**5) copies of the sample - same rows and order as five successive doublings
smp_full = pd.concat([smp_full] * 2 ** 5, ignore_index=True)
# simulated period number in 1..72 for every row; drawn from a seeded generator so that the
# development window (and everything derived from it) is identical on every run
smp_full['RepDate_End'] = np.random.RandomState(123).randint(1, 73, smp_full.shape[0])

smp_full.head()

# 1. Preliminary analysis of variables (missing values, outliers, concentration/distribution) - based on smp_full

In [None]:
# columns that are not variables (the default flag and the RepDate_End period column)
var_skip = ['will_default','RepDate_End']
# special values for numeric variables - TBD (empty for now; passed on to sc.var_pre_analysis)
spl_val = []
# list of variables by type: returns the categorical and the numeric variable names, skipping var_skip
# (numeric variables with fewer than 10 unique values are considered categorical)
var_cat, var_num = sc.var_types(smp_full, var_skip)

In [None]:
# heatmap for the missing values
# share of missing values per variable (in percent), most affected variables first
percent_missing = smp_full.loc[:, var_cat + var_num].isna().sum() * 100 / len(smp_full)
percent_missing = (
    pd.DataFrame({'column': percent_missing.index, 'percent_missing': percent_missing.values})
    .sort_values('percent_missing', ascending=False)
    # reset_index returns a new frame, so its result has to be kept
    .reset_index(drop=True)
)

plt.figure(figsize=(10, 6))
# after the transpose: one heatmap row per variable (ordered as in percent_missing),
# one heatmap column per observation
sns.heatmap(smp_full[percent_missing['column']].isna().transpose(),
            cmap="YlGnBu",
            cbar_kws={'label': 'Missing Data'})
plt.savefig("1_1_missings_heatmap.png", dpi=100, bbox_inches="tight")

In [None]:
# preliminary summary tables for the categorical and the numeric variables
# open points noted by the author: round missings, thresholds as params, warning checks
var_cat_summary, var_num_summary = sc.var_pre_analysis(
    smp_full, var_cat, var_num, spl_val, hhi_low=0.05, hhi_high=0.95, min_share=0.05
)

# the context manager writes and closes the workbook on exit (also when an export fails);
# ExcelWriter.save() is not available any more in pandas 2.x
with pd.ExcelWriter('1_2_preliminary_analysis.xlsx', engine='xlsxwriter') as writer:
    var_cat_summary.to_excel(writer, sheet_name='var_cat_summary')
    var_num_summary.to_excel(writer, sheet_name='var_num_summary')

display(var_cat_summary)
display(var_num_summary)

In [None]:
# treatment of nan:
#   category / object columns -> explicit 'Missing' level
#   every other (numeric) column -> column median
for var, dt in smp_full.dtypes.items():
    if var in var_skip:
        continue
    n_missing = smp_full[var].isna().sum()
    if n_missing == 0:
        continue
    print(var, n_missing)
    # one exclusive if/elif/else chain: a category column must not fall through to the
    # numeric branch, where median() would be called on categorical data
    if dt.name == 'category':
        # the fill value has to be a known category before fillna can use it
        if 'Missing' not in smp_full[var].cat.categories:
            smp_full[var] = smp_full[var].cat.add_categories('Missing')
        smp_full[var] = smp_full[var].fillna('Missing')
        print('Missing')
    elif dt.name == 'object':
        smp_full[var] = smp_full[var].fillna('Missing')
        print('Missing')
    else:
        median = smp_full[var].median()
        print(median)
        smp_full[var] = smp_full[var].fillna(median)

In [None]:
# distribution for categorical variables with extract to pdf
# (third argument is the output file name; groupby='foreign.worker' presumably splits
#  each distribution by that column - confirm against sc.var_cat_distr)
sc.var_cat_distr(smp_full, var_cat, '1_3_categorical_vars_distribution.pdf', groupby='foreign.worker')

In [None]:
# same call pattern for the numeric variables in var_num, written to its own pdf
# (groupby='foreign.worker' presumably splits each distribution by that column - confirm)
sc.var_num_distr(smp_full, var_num, '1_4_numerical_vars_distribution.pdf', groupby='foreign.worker')

# 2. Development sample creation

In [None]:
# selection of the development window:
# all distinct RepDate_End values except the 12 largest (i.e. latest) ones
sorted_date = sorted(smp_full['RepDate_End'].unique())[:-12]
# explicit copy: smp_dev receives new columns later on, and those assignments must not go
# through a slice of smp_full (SettingWithCopyWarning / ambiguous write target)
smp_dev = smp_full.loc[smp_full['RepDate_End'].isin(sorted_date)].copy()




In [None]:
# check target
# 'target' is the column name used by the binning and modelling steps; it mirrors will_default
smp_dev['target'] = smp_dev['will_default']
# number of observations per target value
smp_dev.groupby('target').size()

In [None]:
# keep only what the development needs: the variables, the target and the period column
model_columns = var_cat + var_num + ['target', 'RepDate_End']
smp_dev = smp_dev[model_columns]

#smp_dev = smp_full.loc[smp_dev['prod_grp'] == 'Mortgage']

# 80/20 train/test split with a fixed seed; both parts get a fresh 0..n-1 index
train, test = (
    part.reset_index(drop=True)
    for part in sc.split_df(smp_dev, ratio=0.8, seed=123).values()
)

# 3. Automated binning

In [None]:
# binning on the train sample for all categorical and numeric variables
# returns two collections of per-variable binning tables: fine classing and coarse classing
# (init_count_distr = 0.05 is presumably the minimum share of observations per initial bin - confirm)
fine_class, coarse_class = sc.woebin(train, y = 'target', x = var_cat + var_num, init_count_distr = 0.05)

In [None]:
# export of the binning results: every classing is stacked into one table and written to its own workbook
for classing, xlsx_path in (
    (fine_class, '3_1_fine_classing.xlsx'),
    (coarse_class, '3_2_coarse_classing_auto.xlsx'),
):
    stacked = pd.concat(classing.values())
    stacked = stacked.reset_index(drop=True)
    stacked.to_excel(xlsx_path)

In [None]:
# iv (information value) for variables after automated binning, taken from the coarse classing
coarse_class_iv = sc.vars_iv(var_cat + var_num, coarse_class)
# shown as the cell output
coarse_class_iv

In [None]:
# binning visualization of the automated coarse classing
sc.woebin_plot(coarse_class)

# 4. Binning adjustments 

In [None]:
# manual review and adjustment of binning
# the resulting breaks_list is fed back into sc.woebin below
# (adj_all_var=True presumably walks through every variable rather than a subset - confirm)
breaks_list = sc.woebin_adj(train, y="target", bins=coarse_class, fine_bins=fine_class, adj_all_var=True)

In [None]:
# update of coarse classing table (fine classing is relevant only for automated binning)
# same call as the automated binning, now with the reviewed breaks_list supplied
fine_class_adj, coarse_class_adj = sc.woebin(train, y = 'target', x = var_cat + var_num, breaks_list = breaks_list, init_count_distr = 0.05)

In [None]:
# WOE transformation of the train and the test sample, both based on the adjusted coarse classing
train_woe, test_woe = (
    sc.woebin_ply(sample, bins=coarse_class_adj) for sample in (train, test)
)
# names of the WOE variables: original variable name followed by '_woe'
vars_woe = [name + '_woe' for name in var_cat + var_num]

In [None]:
# presumably the information value of each WOE variable per RepDate_End group on the train sample - confirm
sc.iv_group(train_woe, vars_woe, groupby='RepDate_End')

# 5. Correlation analysis

In [None]:
# pairwise correlation matrix of the WOE variables on the train sample (DataFrame.corr defaults)
train_woe[vars_woe].corr()

In [None]:
# correlation heatmap of the WOE variables, annotated with the coefficients
woe_corr = train_woe[vars_woe].corr()
plt.figure(figsize=(20, 12))
sns.heatmap(woe_corr, annot=True, cmap="YlGnBu")

# render the figure
plt.show()

# 6. Logistic regression

In [None]:
# model inputs (WOE variables) and target, for the train and the test sample
X_train, y_train = train_woe[vars_woe], train_woe['target']
X_test, y_test = test_woe[vars_woe], test_woe['target']

In [None]:
# logistic regression ------
# L1-penalised logistic regression on the WOE variables.
# random_state is fixed because the saga solver shuffles the data while fitting;
# without it the coefficients (and the scorecard built from them) differ between runs.
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1, random_state=123)
lr.fit(X_train, y_train)
# the fitted parameters can be inspected with:
# lr.coef_
# lr.intercept_

In [None]:
# predicted probability of target == 1 (second column of predict_proba)
train_pred = lr.predict_proba(X_train)[:,1]
test_pred = lr.predict_proba(X_test)[:,1]
# performance ks & roc ------
# evaluated separately on the train and the test sample
train_perf = sc.perf_eva(y_train, train_pred, title = "train")
test_perf = sc.perf_eva(y_test, test_pred, title = "test")

# 7. Initial calibration and scorecard points

In [None]:
# score ------
# scorecard built from the adjusted coarse classing and the fitted model, for the model's input columns
# (start_zero=True presumably makes the points scale start at zero - confirm against sc.scorecard)
card = sc.scorecard(coarse_class_adj, lr, X_train.columns, start_zero=True)
# credit score
# applied to the untransformed train / test samples (not the WOE frames)
train_score  = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)

In [None]:
# stack the tables held in card into one frame with a fresh index and export it to excel
scorecard_points = pd.concat(card, ignore_index=True)
scorecard_points.to_excel("scorecard_points.xlsx", sheet_name='scorecard_points')