In [1]:
from typing import Any

import numpy as np
import pandas as pd

from binning import EqualWidthBinning, IntervalBinningBase

In [2]:
# Round-trip a two-column DataFrame through EqualWidthBinning and verify that
# the output container follows the preserve_dataframe flag.
# NOTE(review): the saved output of this cell is `KeyError: 0`; which call
# raises it is not visible in the notebook — re-run and investigate.
preserve_dataframe = True
X_np = np.array([[0.5, 1.5], [1.0, 2.0]])
X_pd = pd.DataFrame(X_np, columns=["a", "b"])
est = EqualWidthBinning(
    n_bins=2,
    bin_range=(0.0, 2.0),
    preserve_dataframe=preserve_dataframe,
)

est.fit(X_pd)
out_pd = est.transform(X_pd)

# Shape must be preserved regardless of the output container.
assert out_pd.shape == X_pd.shape
if not preserve_dataframe:
    assert isinstance(out_pd, np.ndarray)
else:
    assert hasattr(out_pd, "columns")
    assert isinstance(out_pd, pd.DataFrame)
    assert list(out_pd.columns) == ["a", "b"]

KeyError: 0

In [4]:
# Display the bin_spec attribute side by side with bin_spec_ (as a tuple).
est.bin_spec, est.bin_spec_

(None,
 {'a': [np.float64(0.0), np.float64(1.0), np.float64(2.0)],
  'b': [np.float64(0.0), np.float64(1.0), np.float64(2.0)]})

In [4]:
class DummyIntervalBinning(IntervalBinningBase):
    """Bare-bones IntervalBinningBase subclass used to exercise the base class.

    Every column receives the same bin edges: either the ones handed to the
    constructor or a fixed fallback, which makes below/above range logic easy
    to test.

    Attributes:
        bin_edges (list or None): Custom bin edges shared by all columns.
        _test_bin_edges (list or None): Copy of the same reference, read by
            ``_calculate_bins``.
    """

    def __init__(self, bin_edges=None, **kwargs):
        """Store the optional custom edges, then defer to the base initializer.

        Args:
            bin_edges (list or None): Custom bin edges shared by all columns.
            **kwargs: Forwarded unchanged to ``IntervalBinningBase.__init__``.
        """
        # Both attributes point at the same object passed by the caller.
        self.bin_edges = self._test_bin_edges = bin_edges
        super().__init__(**kwargs)

    def _calculate_bins(self, x_col: np.ndarray, col_id: Any) -> list:
        """Provide bin edges for one column, ignoring the column's data.

        Args:
            x_col (np.ndarray): Data for a single column (unused).
            col_id (Any): Column identifier (unused).

        Returns:
            list: The edges supplied at construction time, or
            ``[-inf, 0.0, 1.0, inf]`` when none were supplied.
        """
        custom_edges = self._test_bin_edges
        if custom_edges is None:
            # Fallback: a fresh list on every call.
            return [-np.inf, 0.0, 1.0, np.inf]
        return custom_edges


In [5]:
def _fit_dummy(est, X):
    """Helper to fit bins and reps for DummyIntervalBinning.

    Args:
        est (DummyIntervalBinning): The estimator to fit.
        X (np.ndarray): Input data.
    """
    est._fit_bins(X)
    est.is_fitted_ = True

In [6]:
# Fit the dummy estimator on explicit finite edges, then sanity-check the
# results of lookup_bin_ranges() and get_params().
finite_edges = [0.0, 1.0, 2.0]
X = np.array([[0.1, 1.5], [0.6, 1.8]])
est = DummyIntervalBinning(
    bin_spec={0: finite_edges, 1: finite_edges},
    bin_edges=finite_edges,
    clip=True,
)
_fit_dummy(est, X)

# One entry per column, each reporting two bins (three edges -> two bins).
bin_ranges = est.lookup_bin_ranges()
assert isinstance(bin_ranges, dict)
assert set(bin_ranges) == {0, 1}
assert all(n_bins == 2 for n_bins in bin_ranges.values())

# get_params() must expose the binning configuration as plain lists.
params = est.get_params()
assert isinstance(params, dict)
assert all(key in params for key in ("bin_spec", "bin_reps", "clip"))
assert all(isinstance(edges, list) for edges in params["bin_spec"].values())
assert params["bin_reps"] is None or all(
    isinstance(reps, list) for reps in params["bin_reps"].values()
)

In [8]:
# Rebuild the estimator with the same configuration as before; nothing in this
# cell fits it. Both columns share the single finite_edges list.
est = DummyIntervalBinning(
    bin_spec=dict.fromkeys((0, 1), finite_edges),
    bin_edges=finite_edges,
    clip=True,
)

In [10]:
# Inspect bin_reps_ on the current `est`.
# NOTE(review): this cell's execution count (10) is out of sequence with its
# neighbours — confirm the displayed value survives Restart & Run All.
est.bin_reps_

{0: [0.5, 1.5], 1: [0.5, 1.5]}

In [7]:
# Display the dict captured earlier from est.get_params().
params

{'bin_edges': [0.0, 1.0, 2.0],
 'fit_jointly': False,
 'preserve_dataframe': False,
 'bin_spec': {0: [0.0, 1.0, 2.0], 1: [0.0, 1.0, 2.0]},
 'bin_reps': None,
 'clip': True}

# Exact values binning

In [None]:
import numpy as np
import pandas as pd

# ExactValuesBinning is instantiated by every demo cell below, so it has to be
# imported here; otherwise a fresh kernel (Restart Kernel -> Run All) stops
# with a NameError at the first demo cell.
from binning import ExactValuesBinning, IntervalBinningBase

# Example data: two numeric columns, one repeated row (20, 200), and one NaN
# in each column to illustrate NaN handling.
data_np = np.array([
    [10, 100],
    [20, 200],
    [20, 200],
    [30, 300],
    [40, 400],
    [np.nan, 500],   # NaN in the first column
    [50, np.nan],    # NaN in the second column
])

# Same values as a DataFrame with named columns.
data_df = pd.DataFrame(data_np, columns=["A", "B"])

print("Original numpy array:")
print(data_np)
print("\nOriginal pandas DataFrame:")
print(data_df)

Original numpy array:
[[ 10. 100.]
 [ 20. 200.]
 [ 20. 200.]
 [ 30. 300.]
 [ 40. 400.]
 [ nan 500.]
 [ 50.  nan]]

Original pandas DataFrame:
      A      B
0  10.0  100.0
1  20.0  200.0
2  20.0  200.0
3  30.0  300.0
4  40.0  400.0
5   NaN  500.0
6  50.0    NaN


In [2]:
# 1. Default binning (clip=True, preserve_dataframe=False)
# NOTE(review): the defaults named above are not visible in this notebook —
# confirm against the ExactValuesBinning signature.
# binner_default is reused by the NaN-handling and outlier cells further down.
binner_default = ExactValuesBinning()
binner_default.fit(data_np)
print("\nDefault binning (numpy array, clip=True):")
print(binner_default.transform(data_np))

# Same default configuration, fitted on and applied to the DataFrame input.
binner_default_df = ExactValuesBinning()
binner_default_df.fit(data_df)
print("\nDefault binning (DataFrame, clip=True):")
print(binner_default_df.transform(data_df))


Default binning (numpy array, clip=True):
[[ 0  0]
 [ 1  1]
 [ 1  1]
 [ 2  2]
 [ 3  3]
 [-2  4]
 [ 4 -2]]

Default binning (DataFrame, clip=True):
[[ 0  0]
 [ 1  1]
 [ 1  1]
 [ 2  2]
 [ 3  3]
 [-2  4]
 [ 4 -2]]


In [3]:
# Inspect the bin_edges_ attribute of the default binner fitted on data_np.
binner_default.bin_edges_

{0: array([-inf,  15.,  25.,  35.,  45.,  inf]),
 1: array([-inf, 150., 250., 350., 450.,  inf])}

In [4]:
# 2. No clipping (clip=False)
# Fit a second binner with clip=False on the same numpy data; it is reused by
# the outlier cell further down.
binner_noclip = ExactValuesBinning(clip=False)
binner_noclip.fit(data_np)
print("\nNo clipping (numpy array, clip=False):")
print(binner_noclip.transform(data_np))


No clipping (numpy array, clip=False):
[[ 0  0]
 [ 1  1]
 [ 1  1]
 [ 2  2]
 [ 3  3]
 [-2  4]
 [ 4 -2]]


In [11]:
# Inspect the bin_edges_ attribute of the clip=False binner.
binner_noclip.bin_edges_

{0: array([-inf,  15.,  25.,  35.,  45.,  inf]),
 1: array([-inf, 150., 250., 350., 450.,  inf])}

In [5]:
# 3. Preserve DataFrame output
# Constructed with preserve_dataframe=True, then fitted on and applied to the
# DataFrame input (data_df) rather than the numpy array.
binner_preserve_df = ExactValuesBinning(preserve_dataframe=True)
binner_preserve_df.fit(data_df)
print("\nPreserve DataFrame output (preserve_dataframe=True):")
print(binner_preserve_df.transform(data_df))


Preserve DataFrame output (preserve_dataframe=True):
   A  B
0  0  0
1  1  1
2  1  1
3  2  2
4  3  3
5 -2  4
6  4 -2


In [6]:
# 4. Custom bin edges and representatives
# Both dicts are keyed by 0 and 1 — presumably positional column indices for
# numpy input; verify against the library.
# Each column gets three edges and two representatives.
custom_edges = {0: [0, 25, 50], 1: [0, 250, 500]}
custom_reps = {0: [12.5, 37.5], 1: [125, 375]}
binner_custom = ExactValuesBinning(bin_edges=custom_edges, bin_representatives=custom_reps)
binner_custom.fit(data_np)
print("\nCustom bin edges and representatives (numpy array):")
print("Transformed:\n", binner_custom.transform(data_np))
print("Bin representatives:", binner_custom.bin_representatives_)


Custom bin edges and representatives (numpy array):
Transformed:
 [[ 0  0]
 [ 0  0]
 [ 0  0]
 [ 1  1]
 [ 1  1]
 [-2  1]
 [ 1 -2]]
Bin representatives: {0: array([12.5, 37.5]), 1: array([125, 375])}


In [7]:
# 5. Joint fitting (fit_jointly=True)
# Same numpy data as the earlier cells; the only difference is the
# fit_jointly=True constructor argument.
binner_joint = ExactValuesBinning(fit_jointly=True)
binner_joint.fit(data_np)
print("\nJoint fitting (fit_jointly=True):")
print(binner_joint.transform(data_np))


Joint fitting (fit_jointly=True):
[[ 0  5]
 [ 1  6]
 [ 1  6]
 [ 2  7]
 [ 3  8]
 [-2  9]
 [ 4 -2]]


In [8]:
# 6. Show how NaN values are handled (should be -2 in output)
# Re-uses binner_default, which was fitted on data_np in an earlier cell.
nan_demo_binned = binner_default.transform(data_np)

# Turn the stated expectation into a check so a regression fails loudly
# instead of relying on someone eyeballing the printed matrix.
assert (np.asarray(nan_demo_binned)[np.isnan(data_np)] == -2).all(), (
    "NaN inputs are expected to map to bin index -2"
)

print("\nNaN handling (should be -2 for NaN locations):")
print(nan_demo_binned)


NaN handling (should be -2 for NaN locations):
[[ 0  0]
 [ 1  1]
 [ 1  1]
 [ 2  2]
 [ 3  3]
 [-2  4]
 [ 4 -2]]


In [9]:
# 7. Outliers below and above the training data
# The rows cover values outside the fitted range (data_np spans 10..50 and
# 100..500), the exact extremes, an interior point, and a NaN.
test_np_outliers = np.array([
    [5, 50],      # Both values below training min
    [60, 600],    # Both values above training max
    [10, 100],    # At training min
    [50, 500],    # At training max
    [25, 250],    # In the middle
    [np.nan, 200] # NaN in first column
])

# The DataFrame version is only printed; both transforms below use the numpy array.
test_df_outliers = pd.DataFrame(test_np_outliers, columns=["A", "B"])

print("\nTest data with outliers (numpy array):")
print(test_np_outliers)
print("\nTest data with outliers (DataFrame):")
print(test_df_outliers)

# Use the default binner trained earlier
print("\nDefault binning (clip=True) on outlier test data (numpy array):")
print(binner_default.transform(test_np_outliers))

# NOTE(review): in the saved outputs the clip=False result is identical to the
# default result above — confirm whether that is intended.
print("\nNo clipping (clip=False) on outlier test data (numpy array):")
print(binner_noclip.transform(test_np_outliers))


Test data with outliers (numpy array):
[[  5.  50.]
 [ 60. 600.]
 [ 10. 100.]
 [ 50. 500.]
 [ 25. 250.]
 [ nan 200.]]

Test data with outliers (DataFrame):
      A      B
0   5.0   50.0
1  60.0  600.0
2  10.0  100.0
3  50.0  500.0
4  25.0  250.0
5   NaN  200.0

Default binning (clip=True) on outlier test data (numpy array):
[[ 0  0]
 [ 4  4]
 [ 0  0]
 [ 4  4]
 [ 2  2]
 [-2  1]]

No clipping (clip=False) on outlier test data (numpy array):
[[ 0  0]
 [ 4  4]
 [ 0  0]
 [ 4  4]
 [ 2  2]
 [-2  1]]


In [10]:
# Re-inspect bin_edges_ of the clip=False binner after the outlier transform.
binner_noclip.bin_edges_

{0: array([-inf,  15.,  25.,  35.,  45.,  inf]),
 1: array([-inf, 150., 250., 350., 450.,  inf])}