In [3]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.tree import DecisionTreeClassifier

class OptimalBinning:
    def __init__(self, min_bins=3, max_bins=10, min_samples_bin=0.05, monotonic_trend=None, p_value_threshold=0.05):
        """
        Initialize the OptimalBinning class.

        Parameters:
        - min_bins: Minimum number of bins.
        - max_bins: Maximum number of bins.
        - min_samples_bin: Minimum percentage of samples per bin.
        - monotonic_trend: Monotonic trend ('ascending', 'descending', 'peak', 'valley').
        - p_value_threshold: Maximum p-value for statistical significance between bins.
        """
        self.min_bins = min_bins
        self.max_bins = max_bins
        self.min_samples_bin = min_samples_bin
        self.monotonic_trend = monotonic_trend
        self.p_value_threshold = p_value_threshold
        self.bins = None
        self.binning_table = None

    def _pre_binning(self, X, y):
        """
        Pre-binning using a decision tree to create initial split points.
        """
        tree = DecisionTreeClassifier(max_leaf_nodes=self.max_bins, min_samples_leaf=self.min_samples_bin)
        tree.fit(X.reshape(-1, 1), y)
        thresholds = np.sort(tree.tree_.threshold[tree.tree_.threshold != -2])
        return thresholds

    def _calculate_woe_iv(self, bin_data, total_events, total_non_events):
        """
        Calculate Weight of Evidence (WoE) and Information Value (IV) for a bin.
        """
        non_events = len(bin_data[bin_data == 0])
        events = len(bin_data[bin_data == 1])
        event_rate = events / (events + non_events) if (events + non_events) > 0 else 0
        woe = np.log((non_events / total_non_events) / (events / total_events)) if events > 0 and non_events > 0 else 0
        iv = (non_events / total_non_events - events / total_events) * woe
        return woe, iv, event_rate

    def _merge_bins(self, X, y, thresholds):
        """
        Merge bins based on constraints and calculate WoE/IV.
        """
        total_events = sum(y)
        total_non_events = len(y) - total_events
        bins = []
        binning_table = []

        for i in range(len(thresholds) + 1):
            if i == 0:
                bin_data = y[X < thresholds[i]]
            elif i == len(thresholds):
                bin_data = y[X >= thresholds[i - 1]]
            else:
                bin_data = y[(X >= thresholds[i - 1]) & (X < thresholds[i])]

            if len(bin_data) > 0:
                woe, iv, event_rate = self._calculate_woe_iv(bin_data, total_events, total_non_events)
                bins.append((thresholds[i - 1] if i > 0 else -np.inf, thresholds[i] if i < len(thresholds) else np.inf))
                binning_table.append({
                    'Bin': f"[{bins[-1][0]}, {bins[-1][1]})",
                    'Count': len(bin_data),
                    'Non-event': len(bin_data[bin_data == 0]),
                    'Event': len(bin_data[bin_data == 1]),
                    'Event rate': event_rate,
                    'WoE': woe,
                    'IV': iv
                })

        return bins, binning_table

    def _enforce_monotonicity(self, binning_table):
        """
        Enforce monotonicity constraints on the binning table by merging violating bins.
        """
        event_rates = [row['Event rate'] for row in binning_table]
        if self.monotonic_trend == 'ascending':
            for i in range(len(event_rates) - 1):
                if event_rates[i] > event_rates[i + 1]:
                    # Merge bins i and i+1
                    binning_table[i]['Count'] += binning_table[i + 1]['Count']
                    binning_table[i]['Non-event'] += binning_table[i + 1]['Non-event']
                    binning_table[i]['Event'] += binning_table[i + 1]['Event']
                    binning_table[i]['Event rate'] = binning_table[i]['Event'] / binning_table[i]['Count']
                    del binning_table[i + 1]
                    # Recalculate event rates after merging
                    event_rates = [row['Event rate'] for row in binning_table]
                    # Restart the check from the beginning
                    self._enforce_monotonicity(binning_table)
                    break
        elif self.monotonic_trend == 'descending':
            for i in range(len(event_rates) - 1):
                if event_rates[i] < event_rates[i + 1]:
                    # Merge bins i and i+1
                    binning_table[i]['Count'] += binning_table[i + 1]['Count']
                    binning_table[i]['Non-event'] += binning_table[i + 1]['Non-event']
                    binning_table[i]['Event'] += binning_table[i + 1]['Event']
                    binning_table[i]['Event rate'] = binning_table[i]['Event'] / binning_table[i]['Count']
                    del binning_table[i + 1]
                    # Recalculate event rates after merging
                    event_rates = [row['Event rate'] for row in binning_table]
                    # Restart the check from the beginning
                    self._enforce_monotonicity(binning_table)
                    break

    def _check_p_value(self, binning_table):
        """
        Check p-value constraints between consecutive bins.
        """
        for i in range(len(binning_table) - 1):
            bin1 = binning_table[i]
            bin2 = binning_table[i + 1]
            contingency_table = [[bin1['Non-event'], bin1['Event']], [bin2['Non-event'], bin2['Event']]]
            _, p_value, _, _ = chi2_contingency(contingency_table)
            if p_value > self.p_value_threshold:
                raise ValueError(f"P-value constraint violated between bins {i} and {i + 1}.")

    def fit(self, X, y):
        """
        Fit the optimal binning model.
        """
        X = np.array(X)
        y = np.array(y)
        thresholds = self._pre_binning(X, y)
        self.bins, self.binning_table = self._merge_bins(X, y, thresholds)

        # Enforce constraints
        self._enforce_monotonicity(self.binning_table)
        self._check_p_value(self.binning_table)

    def transform(self, X):
        """
        Transform the data into binned values.
        """
        X = np.array(X)
        binned_X = np.zeros_like(X, dtype=object)
        for i, (lower, upper) in enumerate(self.bins):
            if i == 0:
                binned_X[X < upper] = f"[{lower}, {upper})"
            elif i == len(self.bins) - 1:
                binned_X[X >= lower] = f"[{lower}, {upper})"
            else:
                binned_X[(X >= lower) & (X < upper)] = f"[{lower}, {upper})"
        return binned_X

    def get_binning_table(self):
        """
        Return the binning table.
        """
        return pd.DataFrame(self.binning_table)

In [5]:
# Sample data
X = np.array([25, 45, 35, 50, 23, 40, 60, 55, 30, 65])
y = np.array([0, 1, 0, 1, 0, 1, 1, 0, 0, 1])

# Initialize and fit the optimal binning model
opt_bin = OptimalBinning(min_bins=3, max_bins=5, monotonic_trend='descending')
opt_bin.fit(X, y)

# Get the binning table
print(opt_bin.get_binning_table())

# Transform new data
new_X = np.array([20, 35, 45, 55, 70])
binned_X = opt_bin.transform(new_X)
print(binned_X)

            Bin  Count  Non-event  Event  Event rate  WoE   IV
0  [-inf, 37.5)     10          5      5         0.5    0  0.0
['[-inf, 37.5)' '[-inf, 37.5)' '[37.5, 52.5)' '[52.5, 57.5)' '[57.5, inf)']
