#### Load required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import shap
from scorecardutils.feature_selection import (shap_feature_selection,
                                              find_correlation_groups,
                                              select_best_features_from_corr_groups,
                                              vsi_check)
from scorecardutils.BivariatePlot import unified_bivariate_analysis

#from feature_engine.selection import SmartCorrelatedSelection


In [None]:
dev_data = pd.read_csv('../data/credit_risk_dataset.csv')

In [None]:
dev_data.shape

In [None]:
dev_data.head()

In [None]:
## Give every feature a proper dtype (float, int or category): text columns
## stored with the generic object dtype are cast to pandas' categorical dtype.
_object_cols = dev_data.select_dtypes(include='object').columns
dev_data = dev_data.astype({name: 'category' for name in _object_cols})

In [None]:
## Name of the outcome column; every other column of dev_data is a candidate feature.
target = 'default'
features = dev_data.columns.drop(target).tolist()

In [None]:
# XGBoost settings handed to shap_feature_selection through model_params.
xgb_params = {
    # Learning task: binary classification scored with AUC.
    "objective": "binary:logistic",
    "eval_metric": "auc",
    # Step size shrinkage; a smaller value generally needs more trees.
    "learning_rate": 0.05,
    # Maximum tree depth, i.e. how complex each tree may get.
    "max_depth": 6,
    # Row and column sampling ratios applied per tree.
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # Regularisation strengths: L2 (lambda) and L1 (alpha).
    "lambda": 1,
    "alpha": 0,
    # Number of boosted trees.
    "n_estimators": 100,
}

In [None]:
# First pass: run SHAP-based selection over every candidate feature with a
# 70/30 split (test_size=0.3).
# NOTE(review): use_train_for_shap=False presumably means SHAP values are
# computed on the held-out part - confirm in scorecardutils.
selected_features, importance_df, _ = shap_feature_selection(
    train_data=dev_data,
    feature_names=features,
    target_name='default',
    verbose=True,
    test_size=0.3,
    random_state=42,
    use_train_for_shap=False,
    model_params=xgb_params,
)

In [None]:
# Second pass: rerun on the selected features only, without splitting the data,
# and ask for the SHAP frame (create_shap_df=True) used by the correlation step.
# importance_df is overwritten with the importances from this run.
_, importance_df, shapDF = shap_feature_selection(
    train_data=dev_data,
    feature_names=selected_features,
    target_name='default',
    verbose=False,
    split_data=False,
    random_state=42,
    model_params=xgb_params,
    create_shap_df=True,
)

In [None]:
# Build groups of correlated features from shapDF using a 0.8 correlation threshold.
# NOTE(review): presumably the correlation is computed between the SHAP columns
# of shapDF - confirm in scorecardutils.
correlated_groups = find_correlation_groups(shapDF, corr_threshold=0.8)

In [None]:
correlated_groups

In [None]:
# Choose features from the correlated groups, ranking them by the
# 'SHAP_Importance' column of importance_df (feature names are in 'Feature').
_, selected_features_corr = select_best_features_from_corr_groups(
    correlated_groups,
    feature_importance_df=importance_df,
    feature_importance_col='SHAP_Importance',
    feature_name_col='Feature',
)

In [None]:
selected_features_corr

### Variable Binning, IV and Stability Check

In [None]:
from optbinning import BinningProcess
from sklearn.model_selection import train_test_split

In [None]:
syndata= pd.read_csv('../data/synthetic_binary_classification_data.csv')

In [None]:
target = 'target'

In [None]:
# Separate the outcome column from the predictors, then hold out 30% of the rows
# as a test set, stratified on the outcome so both parts keep its class mix.
y = syndata[target]
X = syndata.drop(columns=[target])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Re-attach the outcome column to each split so that every frame holds the
# predictors and the target side by side.
train_data = pd.concat(objs=[X_train, y_train], axis='columns')
test_data = pd.concat(objs=[X_test, y_test], axis='columns')
train_data.head()

In [None]:
# Report the dimensions of each split: predictors first, then targets.
for _split in (X_train, X_test, y_train, y_test):
    print(_split.shape)

In [None]:
model_vars= X_train.columns.tolist()

In [None]:
## Partition the predictor names by dtype: text/categorical versus numeric.
_categorical_frame = X_train.select_dtypes(include=['object', 'category'])
_numerical_frame = X_train.select_dtypes(include=['number'])
categorical_columns = _categorical_frame.columns.to_list()
numerical_columns = _numerical_frame.columns.to_list()

In [None]:
X_train.head()

In [None]:
# Special codes are values that should be placed in their own bucket instead of
# the regular bins, e.g.
#     special_codes = [-999, -1, -2]
# To send different codes to separately named special buckets, use a dict:
#     special_codes = {'special_1': -9, "special_2": -8, "special_3": -7}
# Here no special codes are declared.
special_codes = None

In [None]:
# Per-variable optimal-binning fit options; extend this dict to customise
# specific attributes. It may also start out as None (binning_fit_params=None).
#
# Options referred to in this notebook:
#   split_digits      - 0 gives integer split points; otherwise split points are
#                       rounded to this many digits.
#   user_splits       - candidate split points supplied by the user.
#   user_splits_fixed - one boolean per entry of user_splits; True marks a split
#                       that must be kept as given.
#   monotonic_trend   - 'ascending' requires a non-decreasing bad rate over bins.
#   cat_cutoff        - categories whose share of occurrences is below this value
#                       are grouped into an "others" bin (e.g. 0.05 groups every
#                       category seen in fewer than 5% of rows).
#
# Example:
#     binning_fit_params = {
#         "dti": {"monotonic_trend": "ascending", "split_digits": 2,
#                 "user_splits": [8.89, 10.91, 14.68, 16.03, 18.23, 20.8, 22.11, 28.37],
#                 # "user_splits_fixed": [True, True]
#                 }
#     }
binning_fit_params = {
    "loyalty_score": dict(
        split_digits=2,
        user_splits=[
            -3.07959914, -2.46296906, -1.8946799, -1.6182403, -1,
            0, 0.37816253, 0.77244589, 1.2557528, 1.77894938,
        ],
        # Only the -1 and 0 cut points are flagged True; the rest are False.
        user_splits_fixed=[
            False, False, False, False, True,
            True, False, False, False, False,
        ],
    ),
}




In [None]:
# How special and missing values are scored at transform time.
# By default optbinning assigns WoE = 0 (and, per the original note, an event
# rate equal to the overall mean) to special values such as missing values or
# outliers, unless this is overridden per variable.
#
# Strategies for special values:
#   A. Neutralise them: keep WoE = 0, so they contribute nothing to the score.
#   B. Empirical WoE: compute the WoE from the actual event rate of the special
#      values. Often used when the special codes carry predictive power.
#   C. Merge into the closest bin: if a special code behaves like a particular
#      bin (e.g. 999 behaves like bin [30-40]), set its WoE manually to that
#      bin's WoE.
#
# Example:
#     binning_transform_params = {
#         'revol_util': {'metric_special': 'empirical'},
#         'dti': {'metric_special': -0.306345},
#         'inq_last_6mths': {'metric_special': 'empirical'}
#     }
# or
#     binning_transform_params = None
specific_binning_transform_params = dict(
    age=dict(metric_missing='empirical', metric_special='empirical'),
)


In [None]:

# Default WoE values for variables that have no explicit override in
# specific_binning_transform_params. Change them to suit the project.
default_woe_metric_missing = 0
default_woe_metric_special = 0

# Build the complete binning_transform_params dictionary with one entry per
# model variable: the variable-specific override when one is defined, otherwise
# a fresh dict holding the defaults above.
binning_transform_params = {
    var: specific_binning_transform_params.get(
        var,
        {
            'metric_missing': default_woe_metric_missing,
            # Fix: this entry used default_woe_metric_missing before, so changing
            # default_woe_metric_special had no effect.
            'metric_special': default_woe_metric_special,
        },
    )
    for var in model_vars
}

In [None]:
# Variables to be fixed: the binning process retains them even when they do not
# satisfy the selection criteria. None means no variable is forced in.
fixed_variables = None

In [None]:
# Selection criteria applied by the binning process. A fuller example:
#     selection_criteria = {
#         "iv": {"min": 0.01, "max": 0.5, "strategy": "highest", "top": 50},
#         "quality_score": {"min": 0.01}
#     }
# or
#     selection_criteria = None
#
# Here only a minimum IV of 0.01 is required ("strategy": "highest", "top": 11
# could be added to cap the number of variables).
selection_criteria = dict(iv=dict(min=0.01))


In [None]:
# Assemble the binning process from the configuration defined in the cells above.
binning_process = BinningProcess(
    variable_names=model_vars,
    special_codes=special_codes,
    categorical_variables=categorical_columns,
    selection_criteria=selection_criteria,
    binning_fit_params=binning_fit_params,
    binning_transform_params=binning_transform_params,
    fixed_variables=fixed_variables,
)

In [None]:
import warnings
# Installs a blanket "ignore" filter: no category is given, so every warning
# raised by code that runs after this cell is hidden until the filters are reset.
# NOTE(review): this also hides warnings that may matter during fitting -
# consider restricting the filter to a specific category.
warnings.filterwarnings("ignore")


In [None]:
# Fit the binning process on the training predictors (restricted to model_vars).
# The target is passed as a NumPy array (y_train.values), not a pandas Series.
binning_process.fit(X=X_train[model_vars], y=y_train.values)

In [None]:
# Summary table of every variable passed to the binning process, ordered from
# highest to lowest IV, kept for manual screening.
iv_tab = binning_process.summary()
iv_tab = iv_tab.sort_values(by='iv', ascending=False)
# Optional export for offline review:
#iv_tab.to_excel('iv_tab.xlsx', index=False)
iv_tab

In [None]:
# Names of the variables whose information value is above 0.02.
iv_selected_variables = iv_tab.loc[iv_tab['iv'] > 0.02, 'name'].tolist()

In [None]:
## Inspect the binning table of one variable (here 'age'): fetch its fitted
## binning object from the process, then build the table as a DataFrame.
optb = binning_process.get_binned_variable('age')
df = optb.binning_table.build()
df

In [None]:
### CSI reports: the test split is passed as the out-of-time sample (X_oot) and
### compared with the training split, using the fitted binning process.
_vsi_common = dict(
    X_oot=X_test,
    X_train=X_train,
    binning_process=binning_process,
    psi_min_bin_size=0.01,
    max_workers=4,  # adjust to the number of CPU cores available
)

## One row per variable
csi_summ = vsi_check(style='summary', **_vsi_common)

## Bin-level information
csi_det = vsi_check(style='detailed', **_vsi_common)

In [None]:
# Save both CSI reports to a single workbook, one sheet per report.
with pd.ExcelWriter('csi.xlsx', engine='openpyxl', mode='w') as writer:
    for _sheet, _report in (('summary', csi_summ), ('detail', csi_det)):
        _report.to_excel(writer, sheet_name=_sheet, index=False)

In [None]:
# Split the variables on their CSI value from the summary report:
# below 10 is treated as stable, 10 or above as unstable.
_csi_values = csi_summ['CSI']
stable_variables = list(map(str, csi_summ.loc[_csi_values < 10, 'Variable'].tolist()))
unstable_variables = list(map(str, csi_summ.loc[_csi_values >= 10, 'Variable'].tolist()))
print("Unstable Variables:", unstable_variables)

In [None]:
# Keep the IV-selected variables that were not flagged as unstable, preserving
# their IV ordering.
_unstable_lookup = set(unstable_variables)
selected_features_ivs = [name for name in iv_selected_variables if name not in _unstable_lookup]

### Get Bivariate plot on both Train/Test

# Bivariate (WoE) plots for the IV-selected, stable variables.
# No out-of-time frame is supplied (oot_data=None) and compare_data is off.
# NOTE(review): so this presumably plots the training data only - confirm in
# scorecardutils. Pass a more recent data set as oot_data together with
# compare_data=True to compare periods.
_bivariate_options = dict(
    binning_process=binning_process,
    filename='event_rate_train',
    metric='woe',
    oot_data=None,
    target_column=target,
    compare_data=False,
    variables=selected_features_ivs,
    show_bar_values=True,
    verbose=True,
)
unified_bivariate_analysis(**_bivariate_options)

In [None]:
selected_features_ivs

### Transform variables to WoE


In [None]:
# Apply the fitted binning process to both splits with metric='woe'
# (training split first, then test split).
X_train_woe, X_test_woe = (
    binning_process.transform(X=_frame, metric='woe')
    for _frame in (X_train, X_test)
)

In [None]:
print(X_train_woe.shape)
print(X_test_woe.shape)


In [None]:
X_train_woe.columns

In [None]:
X_train_woe.head()


In [None]:
# Final predictor list: the names of the variables selected by the binning
# process (get_support with names=True), converted to a plain list.
final_predictors=list(binning_process.get_support(names=True))
# Alternative: hard-code the predictors instead, e.g.
#final_predictors = ['loyalty_score','credit_score','customer_group']

In [None]:
# Transform the training data with metric='woe' (the metric argument can
# instead request binned values or event rates).
# NOTE(review): this repeats the X_train_woe computation made earlier with the
# same inputs - the two frames should be identical; confirm that both names are needed.
X_train_WOE = binning_process.transform(X_train,metric='woe')

In [None]:
# Single hand-written record used to see how the fitted binning process maps
# raw values to WoE values.
# NOTE(review): age is set to -999, but special_codes is None in this notebook,
# so -999 is presumably binned as an ordinary numeric value rather than as a
# special code - confirm this is the intended demonstration.
_example_record = {
    'age': -999,
    'income': -0.8276048978295478,
    'experience_years': -0.7913895315832409,
    'credit_score': -0.7389923463082454,
    'avg_spend': 0.7331633070112105,
    'loyalty_score': -0.8702525202009143,
    'satisfaction_rating': 2.237864030340072,
    'demographic_index': -0.3085181724416579,
    'financial_status': -0.0191376466062278,
    'work_experience': -1.812865565832083,
    'credit_index': 0.5505331954414358,
    'customer_group': 'group_B',
}
test_exmaple = pd.DataFrame([_example_record])

test_WOE = binning_process.transform(test_exmaple, metric='woe')
test_WOE
