#### Load required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import shap
from scorecardutils.feature_selection import (shap_feature_selection,
                                              find_correlation_groups,
                                              select_best_features_from_corr_groups,
                                              vsi_check)

#from feature_engine.selection import SmartCorrelatedSelection


In [None]:
# Load the credit-risk development sample that feeds the SHAP-based feature selection below.
dev_data = pd.read_csv('../data/credit_risk_dataset.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
dev_data.shape

In [None]:
# Preview the first rows to sanity-check the load.
dev_data.head()

In [None]:
## Make sure each feature has the correct data type -- float, int, category.
## Every object (string) column is cast to pandas' category dtype in one pass.
dev_data = dev_data.astype(
    {name: 'category' for name in dev_data.select_dtypes(include='object').columns}
)

In [None]:
## Name of the target column; every other column is treated as a candidate feature
target = 'default'
features = dev_data.columns.drop(target).tolist()

In [None]:
# Hyper-parameters handed to shap_feature_selection through `model_params`.
xgb_params = {
    # Learning task and evaluation
    'objective': 'binary:logistic',  # binary classification
    'eval_metric': 'auc',
    # Boosting schedule: a smaller learning rate is safer but needs more trees
    'learning_rate': 0.05,
    # Tree complexity and row/column sampling
    'max_depth': 6,
    'subsample': 0.8,                # fraction of samples per tree
    'colsample_bytree': 0.8,         # fraction of features per tree
    # Regularisation
    'lambda': 1,                     # L2
    'alpha': 0,                      # L1
    # Ensemble size
    'n_estimators': 100,             # total trees
}

In [None]:
# First pass: SHAP-based selection over every candidate feature.
# test_size=0.3 with use_train_for_shap=False presumably means SHAP is
# evaluated on the held-out 30% -- confirm against scorecardutils.
selected_features, importance_df, _ = shap_feature_selection(
    train_data=dev_data,
    feature_names=features,
    target_name='default',
    verbose=True,
    test_size=0.3,
    random_state=42,
    use_train_for_shap=False,
    model_params=xgb_params,
)

In [None]:
# Second pass: refit on the already-selected features without splitting the
# data, and request the SHAP frame (shapDF) used for the correlation grouping.
_, importance_df, shapDF = shap_feature_selection(
    train_data=dev_data,
    feature_names=selected_features,
    target_name='default',
    verbose=False,
    split_data=False,
    random_state=42,
    model_params=xgb_params,
    create_shap_df=True,
)

In [None]:
# Group features whose columns in shapDF are correlated above the 0.8 threshold.
# NOTE(review): shapDF comes from create_shap_df=True, so these are presumably
# correlations between SHAP values rather than raw feature values -- confirm.
correlated_groups = find_correlation_groups(shapDF, corr_threshold=0.8)

In [None]:
# Inspect the correlation groups found above.
correlated_groups

In [None]:
# Resolve each correlated group using the SHAP importance table
# (columns 'Feature' / 'SHAP_Importance'); only the second return value is kept.
_, selected_features_corr = select_best_features_from_corr_groups(
    correlated_groups,
    feature_importance_df=importance_df,
    feature_importance_col='SHAP_Importance',
    feature_name_col='Feature',
)

In [None]:
# Features retained after resolving the correlated groups.
selected_features_corr

### Variable Binning, IV and Stability Check

In [None]:
from optbinning import BinningProcess
from sklearn.model_selection import train_test_split

In [None]:
# Load the synthetic binary-classification dataset used for the binning, IV and stability steps.
syndata= pd.read_csv('../data/synthetic_binary_classification_data.csv')

In [None]:
# Target column of the synthetic dataset. This rebinds the notebook-level
# `target`, which was set to 'default' for the credit-risk data earlier.
target = 'target'

In [None]:
# Separate the target from the predictors, then hold out 30% of the rows,
# stratified on the target so both splits keep the same class mix.
y = syndata[target]
X = syndata.drop(columns=[target])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Report the shape of each split, in the order X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Every training column is offered to the binning process as a candidate variable.
model_vars = X_train.columns.to_list()

In [None]:
## Split the predictors by dtype: object/category columns vs. numeric columns
categorical_columns = (
    X_train.select_dtypes(include=['object', 'category']).columns.to_list()
)
numerical_columns = (
    X_train.select_dtypes(include=['number']).columns.to_list()
)

In [None]:
# Columns detected as categorical (object or category dtype).
categorical_columns

In [None]:
# Columns detected as numeric.
numerical_columns

In [None]:
# Preview the first rows of the training predictors.
X_train.head()

In [None]:
"""
Define Any Special Codes to treat them in separate bucket
e.g 
special_codes = [-999,-1,-2]
or if need to treat separate codes with different values in different special buckets
special_codes = {'special_1': -9, "special_2": -8, "special_3": -7}
"""

# No special codes are declared for this run; the value is passed to BinningProcess as-is.
special_codes = None

In [None]:
"""
Define Dictionary with optimal binning fitting options for specific variables. we can update it as per our customization
for specific attributes.
split_digits: If split_digits is set to 0, the split points are integers otherwise the split points are rounded 
to the number of digits specified by split_digits.
user_splits: If user_splits is set, the splits are fixed to the values specified by user_splits.
user_splits_fixed: A list of booleans, one per entry in user_splits. True marks a split point that must be kept; split points marked False may still be merged by the optimizer.
monotonic_trend: If monotonic_trend is set to 'ascending', the bad rate should be non-decreasing.
cat_cutoff:Generate bin others with categories in which the fraction of occurrences is below the cat_cutoff value. 
i.e If cat_cutoff is set to 0.05, the bin will be generated with categories in which the fraction of occurrences is below the cat_cutoff value.

Initially can be passed as None. binning_fit_params=None

binning_fit_params = {
    "dti": {"monotonic_trend": "ascending","split_digits":2 ,
            "user_splits": [ 8.89, 10.91, 14.68, 16.03,18.23, 20.8 , 22.11, 28.37],
           # "user_splits_fixed" :[True,True] 
           }
    }



"""
# Per-variable fitting overrides handed to BinningProcess; only loyalty_score
# is customised in this run.
binning_fit_params = {
    'loyalty_score': {
        'split_digits': 2,
        'user_splits': [
            -3.07959914, -2.46296906, -1.8946799, -1.6182403, -1,
            0, 0.37816253, 0.77244589, 1.2557528, 1.77894938,
        ],
        # One flag per entry in user_splits: only the -1 and 0 cut points are fixed.
        'user_splits_fixed': [
            False, False, False, False, True,
            True, False, False, False, False,
        ],
    },
}




In [None]:
"""
By default, optbinning assigns WoE = 0 and event rate = overall mean to special values (like missing or outliers), unless you override it.
Update Strategy for Special Values:
Strategy A: Neutralize special values, i.e. assign WoE = 0 (effectively no contribution to the score).
Strategy B: Assign empirical WoE to special values, i.e. let the special values’ WoE be calculated from their actual event rate in the data. This is often used when special codes have meaningful predictive power.
Strategy C: Merge the special value into the closest bin. If a special code behaves like a particular bin (e.g., 999 behaves like bin [30–40]), assign its WoE manually to match that bin’s WoE.

binning_transform_params ={
   'revol_util':{'metric_special':'empirical'},
    'dti':{'metric_special': -0.306345},
    'inq_last_6mths':{'metric_special':'empirical'}
}

or 
binning_transform_params = None
"""
# Transform-time overrides: for 'age', both missing and special values use the
# 'empirical' setting instead of the default.
binning_transform_params = {
    'age': {
        'metric_missing': 'empirical',
        'metric_special': 'empirical',
    },
}

In [None]:
"""
List of variables to be fixed. 
The binning process will retain these variables if the selection criteria is not satisfied.
"""
# No variable is force-retained in this run; the value is passed to BinningProcess as-is.
fixed_variables=None

In [None]:
"""
Define the selection criteria for the binning process
selection_criteria = {
    "iv": {"min": 0.01, "max": 0.5,"strategy": "highest", "top": 50},
    "quality_score": {"min": 0.01}
}

or 
selection_criteria = None
"""
# Active selection rule: IV of at least 0.01, keeping the 50 highest-IV variables.
selection_criteria = {
    'iv': {
        'min': 0.01,
        'strategy': 'highest',
        'top': 50,
    },
}

In [None]:
# Assemble the binning process from the settings defined in the cells above.
binning_process = BinningProcess(
    variable_names=model_vars,
    special_codes=special_codes,
    categorical_variables=categorical_columns,
    selection_criteria=selection_criteria,
    binning_fit_params=binning_fit_params,
    binning_transform_params=binning_transform_params,
    fixed_variables=fixed_variables,
)

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings are hidden as well -- consider narrowing it.
warnings.filterwarnings("ignore")


In [None]:
# Fit the binning process on the training split only; the target is passed as a NumPy array.
binning_process.fit(X=X_train[model_vars], y=y_train.values)

In [None]:
# Summary of the variables passed to the binning process, sorted by IV
# (highest first) so it can be screened manually.
iv_tab=binning_process.summary().sort_values(by='iv',ascending=False)
# Optional: export the table for review.
#iv_tab.to_excel('iv_tab.xlsx', index=False)
iv_tab

In [None]:
# Keep the variables whose IV exceeds 0.02 (stricter than the 0.01 minimum
# used in selection_criteria).
iv_selected_variables = iv_tab.loc[iv_tab['iv'] > 0.02, 'name'].tolist()

In [None]:
# Variables that passed the IV > 0.02 screen.
iv_selected_variables

In [None]:
## To see the binning table of any specific variable (here: 'customer_group')
optb = binning_process.get_binned_variable('customer_group')
df = optb.binning_table.build()
df

In [None]:
### CSI summary
# The test split from train_test_split is passed as X_oot, i.e. it stands in
# for an out-of-time sample when comparing against X_train.
csi_summ = vsi_check(
    X_oot=X_test, 
    X_train=X_train,
    binning_process=binning_process,
    style='summary',  # or 'detailed' for bin-level information
    psi_min_bin_size=0.01,
    max_workers=4  # Adjust based on your CPU cores
)

## CSI detailed Summary
csi_det = vsi_check(
    X_oot=X_test, 
    X_train=X_train,
    binning_process=binning_process,
    style='detailed',  # bin-level information; use 'summary' for the table filtered on below
    psi_min_bin_size=0.01,
    max_workers=4  # Adjust based on your CPU cores
)

In [None]:
# Export both CSI tables to csi.xlsx, one sheet per table.
with pd.ExcelWriter('csi.xlsx', engine='openpyxl', mode='w') as writer:
    for sheet_name, frame in (('summary', csi_summ), ('detail', csi_det)):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Stable variables: CSI strictly below 10.
# NOTE(review): the cut-off of 10 presumably assumes CSI is reported in
# percent -- confirm against vsi_check.
stable_variables = list(
    map(str, csi_summ.loc[csi_summ['CSI'] < 10, 'Variable'].tolist())
)

# Unstable variables: CSI of 10 or more.
unstable_variables = list(
    map(str, csi_summ.loc[csi_summ['CSI'] >= 10, 'Variable'].tolist())
)
print("Unstable Variables:", unstable_variables)

In [None]:
# IV-selected variables minus the ones flagged as unstable; order follows iv_selected_variables.
selected_features_ivs = [
    name for name in iv_selected_variables if name not in unstable_variables
]

### Get Bivariate plot on both Train/Test

In [None]:
from openpyxl import Workbook
from openpyxl.drawing.image import Image

In [None]:
# Known limitation: the plot produced by optbinning can come out cropped in the
# workbook; a custom seaborn/matplotlib plot would give more control.
def bivariate_plot(filename='bivariate', binning_process=binning_process, metric_='event_rate'):
    """Export each selected variable's binning table and plot to an Excel workbook.

    One sheet is written per variable returned by
    ``binning_process.get_support(names=True)``. Each sheet holds the binning
    table (without the ``JS`` column when present) and the plot produced by
    ``binning_table.plot`` anchored at cell K1.

    Parameters
    ----------
    filename : str, default='bivariate'
        Output path without extension; ``.xlsx`` is appended.
    binning_process : fitted binning process
        Defaults to the notebook-level ``binning_process``. The default is
        bound when this cell is executed, so re-run the cell after creating a
        new binning process if you rely on it.
    metric_ : str, default='event_rate'
        Forwarded to ``binning_table.plot`` as ``metric`` (the notebook also
        calls it with 'woe').

    Returns
    -------
    None
    """
    # Temporary PNGs written next to the notebook. They must outlive the
    # ExcelWriter block because the images are embedded when the workbook is
    # saved, and they are removed in ``finally`` even when a step fails.
    plot_paths = []
    try:
        with pd.ExcelWriter(f'{filename}.xlsx', engine='openpyxl', mode='w') as writer:
            for var in binning_process.get_support(names=True):
                optb = binning_process.get_binned_variable(var)

                # Binning table as a flat frame; JS is dropped only if present.
                df = optb.binning_table.build()
                df = df.drop(columns='JS', errors='ignore')
                df = df.reset_index()
                df.to_excel(writer, sheet_name=var, index=False)

                # Registered before plotting so a partially written file is
                # cleaned up as well.
                plot_image_path = f'variable_{var}_plot.png'
                plot_paths.append(plot_image_path)
                optb.binning_table.plot(metric=metric_, show_bin_labels=True,
                                        savefig=plot_image_path, figsize=(10, 10))

                # Place the plot to the right of the table.
                img = Image(plot_image_path)
                img.anchor = 'K1'  # Adjust the anchor cell as needed
                writer.sheets[var].add_image(img)
    finally:
        for path in plot_paths:
            if os.path.exists(path):
                os.remove(path)

In [None]:
# Write the WoE version of the workbook to bivariate_woe.xlsx; binning_process
# falls back to the default bound in bivariate_plot's signature.
bivariate_plot(metric_='woe',filename='bivariate_woe')

In [None]:
# Binning table for 'age' as a flat frame: build it, drop the JS column and
# turn the bin index into a regular column.
optb = binning_process.get_binned_variable('age')
df = optb.binning_table.build().drop(columns='JS').reset_index()

In [None]:
# Show the cleaned 'age' binning table.
df

In [None]:
# List the column names of the binning table.
df.columns

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
from openpyxl.drawing.image import Image
from typing import List, Optional, Union


def _bin_kind(bin_value) -> str:
    """Return 'Special' or 'Missing' for those reserved bins, else 'regular'."""
    # Bins of categorical variables can be array-like groups of categories.
    # Comparing such a value with a string is ambiguous, so only plain
    # strings are allowed to match the reserved labels.
    if isinstance(bin_value, str) and bin_value in ('Special', 'Missing'):
        return bin_value
    return 'regular'


def _short_bin_label(bin_value, max_len: int = 15) -> str:
    """Return the bin as text, truncated with '...' when longer than max_len."""
    text = str(bin_value)
    if len(text) > max_len:
        return text[:max_len - 3] + "..."
    return text


def _totals_reference(totals_row, metric_column: str) -> float:
    """Return the Totals value of the metric, or 0.0 when it is not numeric."""
    # The Totals row can hold a blank string for the metric (the case the
    # WoE reference line has to cope with); anything non-numeric maps to 0.0.
    try:
        return float(totals_row[metric_column])
    except (ValueError, TypeError):
        return 0.0


def _annotate_point(ax, x_pos, value) -> None:
    """Write the value, to three decimals, just above the point (x_pos, value)."""
    ax.annotate(f'{value:.3f}',
                xy=(x_pos, value),
                xytext=(0, 5),
                textcoords='offset points',
                ha='center',
                fontsize=7)


def _draw_bivariate_axes(ax1, df, bin_col_name, metric_column, y_axis_label,
                         totals_row, title) -> None:
    """Draw count bars on ax1 and the metric on a twin axis for one variable."""
    kinds = [_bin_kind(bin_value) for bin_value in df[bin_col_name]]
    bin_labels = [_short_bin_label(bin_value) for bin_value in df[bin_col_name]]
    metric_values = df[metric_column].tolist()
    x_indices = np.arange(len(df))

    # Count (%) as bars for every bin, including Special and Missing.
    ax1.bar(x_indices, df['Count (%)'] * 100, color='#0a3f7d', alpha=0.7)
    ax1.set_xticks(x_indices)
    ax1.set_xticklabels(bin_labels, rotation=45, ha='right', fontsize=9)
    ax1.set_xlabel('Bins', fontsize=10)
    ax1.set_ylabel('Count (%)', color='blue', fontsize=10)
    ax1.tick_params(axis='y', labelcolor='blue', labelsize=9)

    # Second y-axis carries the event rate or WoE.
    ax2 = ax1.twinx()
    line_color = 'darkgoldenrod'

    # Regular bins are connected by a line.
    regular_positions = [pos for pos, kind in enumerate(kinds) if kind == 'regular']
    if regular_positions:
        regular_values = [metric_values[pos] for pos in regular_positions]
        ax2.plot(regular_positions, regular_values,
                 marker='o', color=line_color, linewidth=2, label=y_axis_label)
        for pos, value in zip(regular_positions, regular_values):
            _annotate_point(ax2, pos, value)

    # Special and Missing are drawn as stand-alone markers, never connected.
    for kind, marker, color in (('Special', 's', 'red'), ('Missing', 'D', 'purple')):
        if kind not in kinds:
            continue
        pos = kinds.index(kind)
        value = metric_values[pos]
        ax2.plot(pos, value,
                 marker=marker, color=color, markersize=8, linestyle='None', label=kind)
        _annotate_point(ax2, pos, value)

    # Dashed reference line at the Totals value of the metric.
    if totals_row is not None:
        total_metric_value = _totals_reference(totals_row, metric_column)
        ax2.axhline(y=total_metric_value, color='green', linestyle='--',
                    alpha=0.7, label=f'Total {y_axis_label}: {total_metric_value:.4f}')

    ax2.set_ylabel(y_axis_label, color=line_color, fontsize=11)
    ax2.tick_params(axis='y', labelcolor=line_color)
    ax2.set_title(title, fontsize=12)

    handles, labels = ax2.get_legend_handles_labels()
    ax2.legend(handles, labels, loc='best', fontsize=9)


def _autosize_columns(worksheet, frame) -> None:
    """Widen each worksheet column to fit its header and longest cell."""
    # Imported here because only this step needs it.
    from openpyxl.utils import get_column_letter

    for position, column in enumerate(frame.columns, start=1):
        longest_cell = frame[column].astype(str).map(len).max()
        column_width = max(len(str(column)), longest_cell)
        # get_column_letter also covers columns past 'Z' (AA, AB, ...).
        worksheet.column_dimensions[get_column_letter(position)].width = column_width + 2


def enhanced_bivariate_plot(
    binning_process,
    filename: str = 'bivariate',
    metric: str = 'event_rate',
    variables: Optional[List[str]] = None,
    figsize: tuple = (12, 6),  # Reduced figure size for better viewing
    dpi: int = 100,
    style: str = 'whitegrid'
) -> None:
    """
    Create bivariate plots and export them, with the binning tables, to Excel.

    For every selected variable one sheet is written containing the binning
    table (including the Totals row, without the JS column) followed by a
    plot: Count (%) as bars, the chosen metric as a line over the regular
    bins, separate markers for the Special and Missing bins, and a dashed
    reference line at the Totals value.

    Parameters:
    -----------
    binning_process : fitted binning process
        Must provide ``get_support(names=True)`` and ``get_binned_variable``.
    filename : str, default='bivariate'
        Base name for the output Excel file; ``.xlsx`` is appended.
    metric : str, default='event_rate'
        Metric to plot. Options: 'event_rate', 'woe' (case-insensitive).
    variables : list of str, optional
        Variable names to process. If None, every variable in the binning
        process' support is processed.
    figsize : tuple, default=(12, 6)
        Figure size for plots in inches (width, height).
    dpi : int, default=100
        Resolution of the saved images.
    style : str, default='whitegrid'
        Currently unused: the seaborn style call is disabled and plots use
        matplotlib's 'default' style. Kept so existing calls keep working.

    Returns:
    --------
    None
        Saves an Excel file with embedded plots and data tables.

    Raises:
    -------
    ValueError
        If ``metric`` is not supported, if ``variables`` names something that
        is not in the binning process' support, or if there is nothing to plot.
    """
    import tempfile

    # Validate metric parameter
    metric = metric.lower()
    if metric not in ('event_rate', 'woe'):
        raise ValueError("metric must be either 'event_rate' or 'woe'")

    # Resolve which variables to process
    all_variables = binning_process.get_support(names=True)
    if variables is not None:
        invalid_vars = set(variables) - set(all_variables)
        if invalid_vars:
            raise ValueError(f"Variables not found in binning process: {', '.join(invalid_vars)}")
        selected_vars = variables
    else:
        selected_vars = all_variables

    # Fail early with a clear message instead of failing while saving an
    # empty workbook. len() is used because the support may be a NumPy array.
    if len(selected_vars) == 0:
        raise ValueError("No variables to plot")

    plt.style.use('default')  # Clean matplotlib style

    # Column and axis names depend on the metric
    metric_column = 'Event rate' if metric == 'event_rate' else 'WoE'
    y_axis_label = 'Event Rate' if metric == 'event_rate' else 'Weight of Evidence (WoE)'

    # The plot images are embedded when the workbook is saved, so the
    # temporary directory has to outlive the ExcelWriter block. It is removed
    # afterwards even when something fails.
    with tempfile.TemporaryDirectory(prefix='bivariate_plots_') as plot_dir:
        with pd.ExcelWriter(f'{filename}.xlsx', engine='openpyxl', mode='w') as writer:
            for position, var in enumerate(selected_vars):
                optb = binning_process.get_binned_variable(var)

                table = optb.binning_table.build()
                if 'JS' in table.columns:
                    table = table.drop(columns='JS')

                # The Excel sheet keeps the Totals row; the plot does not.
                df_for_excel = table.reset_index()
                totals_row = None
                if 'Totals' in table.index:
                    totals_row = table.loc['Totals']
                    table = table.drop('Totals')
                plot_df = table.reset_index()

                # The bin column is 'Bin'; fall back to the first column otherwise.
                bin_col_name = 'Bin' if 'Bin' in plot_df.columns else plot_df.columns[0]

                df_for_excel.to_excel(writer, sheet_name=var, index=False)

                # File names are numbered rather than built from the variable
                # name, which may contain characters that are invalid in paths.
                plot_image_path = os.path.join(plot_dir, f'plot_{position}_{metric}.png')

                fig, ax1 = plt.subplots(figsize=figsize)
                try:
                    _draw_bivariate_axes(
                        ax1, plot_df, bin_col_name, metric_column, y_axis_label,
                        totals_row, f'{var}: {y_axis_label} by Bin',
                    )
                    fig.tight_layout()
                    fig.savefig(plot_image_path, dpi=dpi, bbox_inches='tight')
                finally:
                    # Always release the figure, even when drawing fails.
                    plt.close(fig)

                # Place the plot below the table (including Totals), with a margin.
                img = Image(plot_image_path)
                row_position = len(df_for_excel) + 4
                img.anchor = f'A{row_position}'
                writer.sheets[var].add_image(img)

                _autosize_columns(writer.sheets[var], df_for_excel)

    print(f"Enhanced bivariate analysis completed! Results saved to {filename}.xlsx")


# Example usage with different metrics and variable selections
# 1. Process all variables with event rate
# enhanced_bivariate_plot(binning_process, filename='bivariate_event_rate', metric='event_rate')

# 2. Process all variables with WoE
# enhanced_bivariate_plot(binning_process, filename='bivariate_woe', metric='woe')

# 3. Process only specific variables
# enhanced_bivariate_plot(
#     binning_process, 
#     filename='selected_variables', 
#     metric='event_rate',
#     variables=['age', 'income', 'credit_score']
# )

In [None]:
# Event-rate workbook for three variables, written to selected_variables.xlsx.
enhanced_bivariate_plot(
    binning_process,
    filename='selected_variables',
    metric='event_rate',
    variables=['age', 'income', 'customer_group'],
    figsize=(12, 6),
    dpi=100,
)

In [None]:
# Names of the variables in the binning process' support, as a plain list
final_predictors=list(binning_process.get_support(names=True))
# Alternative: hard-code the predictors instead
#final_predictors = ['loyalty_score','credit_score','customer_group']

In [None]:
# Transform the training data to WoE values (metric='woe').
# NOTE(review): binned values or event rate are presumably available through
# other `metric` settings -- confirm in the optbinning documentation.
X_train_WOE = binning_process.transform(X_train,metric='woe')

In [None]:
# Single-row example to understand how the fitted binning process transforms raw values to WoE.
# NOTE(review): 'age' is -999 here while special_codes is None above, so it is
# presumably binned as an ordinary number rather than as a special code -- confirm
# that this is the intended illustration.
test_exmaple = pd.DataFrame([{'age': -999,
 'income': -0.8276048978295478,
 'experience_years': -0.7913895315832409,
 'credit_score': -0.7389923463082454,
 'avg_spend': 0.7331633070112105,
 'loyalty_score': -0.8702525202009143,
 'satisfaction_rating': 2.237864030340072,
 'demographic_index': -0.3085181724416579,
 'financial_status': -0.0191376466062278,
 'work_experience': -1.812865565832083,
 'credit_index': 0.5505331954414358,
 'customer_group': 'group_B'}])

# WoE values assigned to the example row.
test_WOE = binning_process.transform(test_exmaple,metric='woe')
test_WOE
