# EqualWidthBinning Comprehensive Demo

This notebook demonstrates the main features of EqualWidthBinning:
1. **Array binning** - Working with NumPy arrays
2. **Pandas integration** - DataFrame support
3. **Polars integration** - Modern DataFrame library support  
4. **Sklearn integration** - Pipeline compatibility
5. **Serialization/Deserialization** - Parameter persistence
6. **Advanced features** - Pre-fitted bins, fitting on combined datasets, and clipping

In [2]:
# Import required libraries
import json
import os
import pickle
import sys
import tempfile

import numpy as np
import pandas as pd
try:
    import polars as pl
    POLARS_AVAILABLE = True
except ImportError:
    POLARS_AVAILABLE = False
    print("Polars not available - skipping polars examples")

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Import our binning classes
sys.path.insert(0, '../..')
from binning.methods import EqualWidthBinning
from binning import get_config, set_config

print("All imports successful!")
print(f"Polars available: {POLARS_AVAILABLE}")

All imports successful!
Polars available: True


In [3]:
# Create sample data for demonstrations.
# Everything below (X_array, df_pandas, df_polars) is built from the same
# `data` dict, so the three containers hold identical values.
np.random.seed(42)  # fixed seed keeps the printed outputs reproducible

# Generate sample data with different distributions
n_samples = 1000
# Size the two bimodal modes explicitly: drawing n_samples//2 twice would
# leave the column one row short whenever n_samples is odd, which makes the
# dict ragged and breaks DataFrame construction below.
n_first_mode = n_samples // 2
n_second_mode = n_samples - n_first_mode
data = {
    'uniform': np.random.uniform(0, 100, n_samples),
    'normal': np.random.normal(50, 15, n_samples),
    'exponential': np.random.exponential(2, n_samples) * 10,
    'bimodal': np.concatenate([
        np.random.normal(30, 5, n_first_mode),
        np.random.normal(70, 5, n_second_mode)
    ])
}

# Create NumPy array (2D) from the first three columns only
X_array = np.column_stack([data['uniform'], data['normal'], data['exponential']])
print(f"NumPy array shape: {X_array.shape}")
print(f"Data ranges: uniform [{X_array[:, 0].min():.1f}, {X_array[:, 0].max():.1f}], "
      f"normal [{X_array[:, 1].min():.1f}, {X_array[:, 1].max():.1f}], "
      f"exponential [{X_array[:, 2].min():.1f}, {X_array[:, 2].max():.1f}]")

# Create pandas DataFrame (all four columns, including 'bimodal')
df_pandas = pd.DataFrame(data)
print(f"\nPandas DataFrame shape: {df_pandas.shape}")
print(f"Columns: {list(df_pandas.columns)}")

# Create polars DataFrame (if available)
if POLARS_AVAILABLE:
    df_polars = pl.DataFrame(data)
    print(f"\nPolars DataFrame shape: {df_polars.shape}")
    print(f"Columns: {df_polars.columns}")

NumPy array shape: (1000, 3)
Data ranges: uniform [0.5, 100.0], normal [6.2, 97.9], exponential [0.0, 122.6]

Pandas DataFrame shape: (1000, 4)
Columns: ['uniform', 'normal', 'exponential', 'bimodal']

Polars DataFrame shape: (1000, 4)
Columns: ['uniform', 'normal', 'exponential', 'bimodal']


## 1. Array Binning - NumPy Integration

EqualWidthBinning works seamlessly with NumPy arrays, providing flexible binning options.

In [4]:
# Section 1 demo: bin a 2-D NumPy array column by column.
print("=== Basic Array Binning ===")

# A single integer n_bins is used for every column
binner_basic = EqualWidthBinning(n_bins=5)
print(f"Basic binner: {repr(binner_basic)}")

# Learn the edges from X_array, then replace each value by its bin index
binner_basic.fit(X_array)
X_binned = binner_basic.transform(X_array)

print(f"Original data shape: {X_array.shape}")
print(f"Binned data shape: {X_binned.shape}")
print(f"Bin indices range: [{X_binned.min()}, {X_binned.max()}]")
print(f"First 5 rows original:\n{X_array[:5]}")
print(f"First 5 rows binned:\n{X_binned[:5]}")

# The fitted edges are read from the binner's _bin_edges mapping (column -> edges)
print(f"\nBin edges per column:")
for col in binner_basic._bin_edges:
    edges = binner_basic._bin_edges[col]
    print(f"  Column {col}: {[f'{e:.2f}' for e in edges]}")

# A dict n_bins assigns a separate bin count to each column index
print("\n=== Per-column bin configuration ===")
bins_per_column = {0: 3, 1: 7, 2: 4}
binner_custom = EqualWidthBinning(n_bins=bins_per_column)
print(f"Custom binner: {repr(binner_custom)}")

binner_custom.fit(X_array)
X_binned_custom = binner_custom.transform(X_array)

print(f"Custom binned shape: {X_binned_custom.shape}")
print(f"Unique bins per column:")
# Walk the columns via the transpose instead of indexing by position
for i, column_bins in enumerate(X_binned_custom.T):
    unique_bins = np.unique(column_bins)
    print(f"  Column {i}: {len(unique_bins)} unique bins -> {unique_bins}")

# Map the bin indices back through inverse_transform
X_reconstructed = binner_custom.inverse_transform(X_binned_custom)
print(f"\nInverse transform shape: {X_reconstructed.shape}")
print(f"First 5 rows reconstructed:\n{X_reconstructed[:5]}")

=== Basic Array Binning ===
Basic binner: EqualWidthBinning(n_bins=5, clip=True, preserve_dataframe=False, fit_jointly=False)
Original data shape: (1000, 3)
Binned data shape: (1000, 3)
Bin indices range: [0, 4]
First 5 rows original:
[[37.45401188 52.66551501 71.17997798]
 [95.07143064 29.96983462  8.04980097]
 [73.19939418 55.70296777 13.15718605]
 [59.86584842 59.15878618  4.36555101]
 [15.60186404 58.39685672 18.8722145 ]]
First 5 rows binned:
[[1 2 2]
 [4 1 0]
 [3 2 0]
 [2 2 0]
 [0 2 0]]

Bin edges per column:
  Column 0: ['0.46', '20.36', '40.27', '60.17', '80.07', '99.97']
  Column 1: ['6.18', '24.52', '42.87', '61.21', '79.55', '97.90']
  Column 2: ['0.00', '24.52', '49.03', '73.55', '98.06', '122.58']

=== Per-column bin configuration ===
Custom binner: EqualWidthBinning(n_bins={0: 3, 1: 7, 2: 4}, clip=True, preserve_dataframe=False, fit_jointly=False)
Custom binned shape: (1000, 3)
Unique bins per column:
  Column 0: 3 unique bins -> [0 1 2]
  Column 1: 7 unique bins -> [0 1 

## 2. Pandas Integration

EqualWidthBinning preserves DataFrame structure and column names when `preserve_dataframe=True`.

In [5]:
# Section 2 demo: bin a pandas DataFrame and get a DataFrame back.
print("=== Pandas DataFrame Integration ===")

# preserve_dataframe=True asks the binner to return the input's container type
binner_pandas = EqualWidthBinning(n_bins=4, preserve_dataframe=True, clip=True)
print(f"Pandas binner: {repr(binner_pandas)}")

# Learn edges from every column of df_pandas, then bin the same frame
binner_pandas.fit(df_pandas)
df_binned = binner_pandas.transform(df_pandas)

print(f"Original DataFrame type: {type(df_pandas)}")
print(f"Binned DataFrame type: {type(df_binned)}")
print(f"Preserved columns: {list(df_binned.columns)}")
print(f"Binned DataFrame shape: {df_binned.shape}")

print(f"\nOriginal data (first 5 rows):")
print(df_pandas.head())

print(f"\nBinned data (first 5 rows):")
print(df_binned.head())

# Per-column summary of the bin indices that actually occur
print(f"\nBin statistics per column:")
for col in df_binned:  # iterating a DataFrame yields its column labels
    print(f"  {col}: bins {df_binned[col].min()}-{df_binned[col].max()}, "
          f"unique bins: {df_binned[col].nunique()}")

# Restrict the input to two columns and fit a separate binner on that slice
subset_columns = ['uniform', 'normal']
df_subset = df_pandas.loc[:, subset_columns]

binner_subset = EqualWidthBinning(n_bins=6, preserve_dataframe=True)
binner_subset.fit(df_subset)
df_subset_binned = binner_subset.transform(df_subset)

print(f"\nSubset binning:")
print(f"  Input columns: {list(df_subset.columns)}")
print(f"  Output columns: {list(df_subset_binned.columns)}")
print(f"  Output shape: {df_subset_binned.shape}")

# Turn the bin indices of the full frame back into values via inverse_transform
df_reconstructed = binner_pandas.inverse_transform(df_binned)
print(f"\nInverse transform (first 5 rows):")
print(df_reconstructed.head())

=== Pandas DataFrame Integration ===
Pandas binner: EqualWidthBinning(n_bins=4, clip=True, preserve_dataframe=True, fit_jointly=False)
Original DataFrame type: <class 'pandas.core.frame.DataFrame'>
Binned DataFrame type: <class 'pandas.core.frame.DataFrame'>
Preserved columns: ['uniform', 'normal', 'exponential', 'bimodal']
Binned DataFrame shape: (1000, 4)

Original data (first 5 rows):
     uniform     normal  exponential    bimodal
0  37.454012  52.665515    71.179978  36.187191
1  95.071431  29.969835     8.049801  27.713492
2  73.199394  55.702968    13.157186  29.785883
3  59.865848  59.158786     4.365551  30.290114
4  15.601864  58.396857    18.872215  34.242154

Binned data (first 5 rows):
   uniform  normal  exponential  bimodal
0        1       2            2        1
1        3       1            0        0
2        2       2            0        0
3        2       2            0        0
4        0       2            0        1

Bin statistics per column:
  uniform: bins 0-

## 3. Polars Integration

EqualWidthBinning also supports Polars DataFrames with automatic format detection.

In [6]:
# Section 3 demo: the same workflow on a Polars DataFrame.
# The whole demo is skipped when the optional polars import failed.
if not POLARS_AVAILABLE:
    print("Polars not available - skipping polars demonstration")
else:
    print("=== Polars DataFrame Integration ===")

    # preserve_dataframe=True so the result keeps the input's container type
    binner_polars = EqualWidthBinning(n_bins=5, preserve_dataframe=True)
    print(f"Polars binner: {repr(binner_polars)}")

    # Learn edges from df_polars and bin the same frame
    binner_polars.fit(df_polars)
    df_polars_binned = binner_polars.transform(df_polars)

    print(f"Original Polars DataFrame type: {type(df_polars)}")
    print(f"Binned Polars DataFrame type: {type(df_polars_binned)}")
    print(f"Preserved columns: {df_polars_binned.columns}")
    print(f"Binned DataFrame shape: {df_polars_binned.shape}")

    print(f"\nOriginal data (first 5 rows):")
    print(df_polars.head())

    print(f"\nBinned data (first 5 rows):")
    print(df_polars_binned.head())

    # Summarise which bin indices occur in each column
    print(f"\nUnique bins per column:")
    for col in df_polars_binned.columns:
        binned_column = df_polars_binned[col]
        unique_count = binned_column.n_unique()
        min_val = binned_column.min()
        max_val = binned_column.max()
        print(f"  {col}: {unique_count} unique bins, range [{min_val}, {max_val}]")

    # Map bin indices back to values
    df_polars_reconstructed = binner_polars.inverse_transform(df_polars_binned)
    print(f"\nInverse transform (first 5 rows):")
    print(df_polars_reconstructed.head())

=== Polars DataFrame Integration ===
Polars binner: EqualWidthBinning(n_bins=5, clip=True, preserve_dataframe=True, fit_jointly=False)
Original Polars DataFrame type: <class 'polars.dataframe.frame.DataFrame'>
Binned Polars DataFrame type: <class 'polars.dataframe.frame.DataFrame'>
Preserved columns: ['uniform', 'normal', 'exponential', 'bimodal']
Binned DataFrame shape: (1000, 4)

Original data (first 5 rows):
shape: (5, 4)
┌───────────┬───────────┬─────────────┬───────────┐
│ uniform   ┆ normal    ┆ exponential ┆ bimodal   │
│ ---       ┆ ---       ┆ ---         ┆ ---       │
│ f64       ┆ f64       ┆ f64         ┆ f64       │
╞═══════════╪═══════════╪═════════════╪═══════════╡
│ 37.454012 ┆ 52.665515 ┆ 71.179978   ┆ 36.187191 │
│ 95.071431 ┆ 29.969835 ┆ 8.049801    ┆ 27.713492 │
│ 73.199394 ┆ 55.702968 ┆ 13.157186   ┆ 29.785883 │
│ 59.865848 ┆ 59.158786 ┆ 4.365551    ┆ 30.290114 │
│ 15.601864 ┆ 58.396857 ┆ 18.872215   ┆ 34.242154 │
└───────────┴───────────┴─────────────┴───────────┘

## 4. Sklearn Integration

EqualWidthBinning is fully compatible with sklearn pipelines and follows sklearn conventions.

In [7]:
# Section 4 demo: use the binner as a step inside an sklearn Pipeline.
print("=== Sklearn Pipeline Integration ===")

# Two-step pipeline: bin first, then standardise the resulting bin indices
pipeline_steps = [
    ('binner', EqualWidthBinning(n_bins=8, clip=True)),
    ('scaler', StandardScaler()),
]
pipeline = Pipeline(pipeline_steps)

print(f"Pipeline steps: {[step[0] for step in pipeline.steps]}")

# Hold out 30% of the rows so fit and transform see different data
X_train, X_test = train_test_split(X_array, test_size=0.3, random_state=42)
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Fit on the training rows only
pipeline.fit(X_train)
print("Pipeline fitted successfully")

# Apply the fitted pipeline to both splits
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

print(f"Transformed training shape: {X_train_transformed.shape}")
print(f"Transformed test shape: {X_test_transformed.shape}")

# Pull the binning step back out and inspect its fitted state
binner_from_pipeline = pipeline.named_steps['binner']
print(f"Binner is fitted: {binner_from_pipeline.is_fitted_}")
print(f"Number of features in: {binner_from_pipeline.n_features_in_}")

# Edges learned by the binner inside the pipeline
print(f"\nBin edges from pipeline:")
for col in binner_from_pipeline._bin_edges:
    edges = binner_from_pipeline._bin_edges[col]
    print(f"  Column {col}: {len(edges)-1} bins, range [{edges[0]:.2f}, {edges[-1]:.2f}]")

# Statistics the scaler learned from the binned training data
scaler_from_pipeline = pipeline.named_steps['scaler']
print(f"\nScaler statistics (mean, std):")
for i, mean in enumerate(scaler_from_pipeline.mean_):
    std = scaler_from_pipeline.scale_[i]
    print(f"  Column {i}: mean={mean:.3f}, std={std:.3f}")

# Nested parameters are exposed as '<step>__<param>'; keep the binner's only
print(f"\n=== Pipeline Parameters ===")
params = pipeline.get_params()
binner_params = {}
for param, value in params.items():
    if param.startswith('binner__'):
        binner_params[param] = value
print(f"Binner parameters in pipeline:")
for param, value in binner_params.items():
    print(f"  {param}: {value}")

# Change a nested parameter through the pipeline
pipeline.set_params(binner__n_bins=6)
print(f"Modified binner n_bins to: {pipeline.get_params()['binner__n_bins']}")

# Report the fitted flag after the change; the pipeline has not been refit here
print(f"Binner is still fitted after param change: {pipeline.named_steps['binner'].is_fitted_}")
# Note: Pipeline would need to be refit to use new parameters

=== Sklearn Pipeline Integration ===
Pipeline steps: ['binner', 'scaler']
Training set shape: (700, 3)
Test set shape: (300, 3)
Pipeline fitted successfully
Transformed training shape: (700, 3)
Transformed test shape: (300, 3)
Binner is fitted: True
Number of features in: 3

Bin edges from pipeline:
  Column 0: 8 bins, range [0.46, 99.97]
  Column 1: 8 bins, range [6.92, 97.90]
  Column 2: 8 bins, range [0.00, 122.58]

Scaler statistics (mean, std):
  Column 0: mean=3.451, std=2.351
  Column 1: mean=3.447, std=1.366
  Column 2: mean=0.839, std=1.226

=== Pipeline Parameters ===
Binner parameters in pipeline:
  binner__bin_edges: {0: [np.float64(0.4632023004602859), np.float64(12.901772928979383), np.float64(25.34034355749848), np.float64(37.77891418601758), np.float64(50.217484814536675), np.float64(62.65605544305577), np.float64(75.09462607157487), np.float64(87.53319670009395), np.float64(99.97176732861305)], 1: [np.float64(6.916066800775262), np.float64(18.288635140387466), np.float

In [8]:
# Cross-validation with binning.
# cross_val_score and GridSearchCV are imported in the notebook's import cell
# rather than mid-cell, so all dependencies are visible in one place.
print("\n=== Cross-validation Demonstration ===")

# Create a simple regression target: linear in the first two columns plus noise
np.random.seed(42)
y_regression = X_array[:, 0] * 2 + X_array[:, 1] * -1 + np.random.normal(0, 0.5, X_array.shape[0])

# Create a pipeline for regression on the binned features
reg_pipeline = Pipeline([
    ('binner', EqualWidthBinning(n_bins=5)),
    ('regressor', LinearRegression())
])

# Perform cross-validation.
# The scorer returns negated MSE, so the scores are negated again for display.
cv_scores = cross_val_score(reg_pipeline, X_array, y_regression, cv=5, scoring='neg_mean_squared_error')

print(f"Cross-validation MSE scores: {-cv_scores}")
print(f"Mean CV MSE: {-cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Compare with unbinned data (scaled raw features, same regressor)
unbinned_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

cv_scores_unbinned = cross_val_score(unbinned_pipeline, X_array, y_regression, cv=5, scoring='neg_mean_squared_error')
print(f"Unbinned CV MSE: {-cv_scores_unbinned.mean():.4f} (+/- {cv_scores_unbinned.std() * 2:.4f})")

# Grid search demonstration: tune the binner's parameters through the pipeline
print("\n=== Grid Search for Optimal Bins ===")

# Keys use the '<step>__<param>' form to address the 'binner' step
param_grid = {
    'binner__n_bins': [3, 5, 8, 10, 15],
    'binner__clip': [True, False]
}

grid_search = GridSearchCV(reg_pipeline, param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_array, y_regression)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {-grid_search.best_score_:.4f}")

# Show the configuration of the binner inside the best pipeline
best_binner = grid_search.best_estimator_.named_steps['binner']
print(f"Best binner config: n_bins={best_binner.n_bins}, clip={best_binner.clip}")


=== Cross-validation Demonstration ===
Cross-validation MSE scores: [152.98084025 162.06780471 160.01265241 146.26959162 162.97065581]
Mean CV MSE: 156.8603 (+/- 12.6998)
Unbinned CV MSE: 0.2406 (+/- 0.0298)

=== Grid Search for Optimal Bins ===
Best parameters: {'binner__clip': True, 'binner__n_bins': 15}
Best CV score: 17.4372
Best binner config: n_bins=15, clip=True


## 5. Parameter Serialization and Persistence

The EqualWidthBinning class supports sklearn-style parameter serialization through `get_params()` and `set_params()`. This enables model persistence, hyperparameter tuning, and configuration management.

In [10]:
# Section 5 demo: round-trip a binner's configuration through get_params().
print("=== Parameter Serialization ===")

# Reference binner, fitted on the full array
original_binner = EqualWidthBinning(n_bins=7, clip=True)
original_binner.fit(X_array)

print(f"Original binner: {original_binner}")
print(f"Original fitted: {original_binner.is_fitted_}")

# Dump every parameter the binner exposes
params = original_binner.get_params()
print(f"\nExtracted parameters:")
for param in params:
    value = params[param]
    print(f"  {param}: {value}")

# Build a second binner from exactly those parameters
reconstructed_binner = EqualWidthBinning(**params)
print(f"\nReconstructed binner: {reconstructed_binner}")
print(f"Reconstructed fitted: {reconstructed_binner.is_fitted_}")

# Fit explicitly so the comparison below does not rely on the constructor's state.
# NOTE(review): params carries bin_edges, so the copy may already report as
# fitted before this call - check the "Reconstructed fitted" line above.
reconstructed_binner.fit(X_array)
print(f"After fitting reconstructed: {reconstructed_binner.is_fitted_}")

# Both binners should bin the same rows identically
original_transformed = original_binner.transform(X_array[:5])
reconstructed_transformed = reconstructed_binner.transform(X_array[:5])

print(f"\nTransformation comparison (first 5 rows):")
print(f"Original:      {original_transformed}")
print(f"Reconstructed: {reconstructed_transformed}")
print(f"Arrays equal: {np.array_equal(original_transformed, reconstructed_transformed)}")

# Change a parameter on the already-fitted binner via set_params()
print(f"\n=== Parameter Modification ===")
print(f"Original n_bins: {original_binner.get_params()['n_bins']}")

original_binner.set_params(n_bins=10)
print(f"Modified n_bins: {original_binner.get_params()['n_bins']}")
print(f"Still fitted after param change: {original_binner.is_fitted_}")

# Reminder printed for the reader; no refit is performed in this cell
print("Note: Parameter changes require refitting to take effect")

=== Parameter Serialization ===
Original binner: EqualWidthBinning(n_bins=7, clip=True, preserve_dataframe=False, fit_jointly=False)
Original fitted: True

Extracted parameters:
  bin_edges: {0: [np.float64(0.4632023004602859), np.float64(14.678711590196396), np.float64(28.894220879932504), np.float64(43.109730169668616), np.float64(57.32523945940473), np.float64(71.54074874914083), np.float64(85.75625803887694), np.float64(99.97176732861305)], 1: [np.float64(6.179742747580889), np.float64(19.282152857594035), np.float64(32.38456296760718), np.float64(45.48697307762032), np.float64(58.589383187633466), np.float64(71.69179329764661), np.float64(84.79420340765975), np.float64(97.8966135176729)], 2: [np.float64(0.00023269646100864647), np.float64(17.5111222938354), np.float64(35.02201189120979), np.float64(52.53290148858419), np.float64(70.04379108595857), np.float64(87.55468068333296), np.float64(105.06557028070736), np.float64(122.57645987808174)]}
  bin_range: None
  bin_representative

In [None]:
# Model persistence demonstration.
# pickle, json, os and tempfile come from the notebook's import cell.
print("=== Model Persistence with Pickle ===")


def to_jsonable(value):
    """Recursively convert a parameter value into plain JSON-compatible types.

    NumPy scalars become Python scalars, NumPy arrays become nested lists,
    and dicts/lists/tuples are converted element by element. Dict keys that
    JSON cannot encode are stringified. Anything else falls back to
    ``str(value)`` so the export never raises on an unexpected type.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, (str, int, float, bool, type(None))):
        return value
    if isinstance(value, dict):
        converted = {}
        for key, item in value.items():
            if isinstance(key, np.generic):
                key = key.item()
            if not isinstance(key, (str, int, float, bool, type(None))):
                key = str(key)
            converted[key] = to_jsonable(item)
        return converted
    if isinstance(value, (list, tuple)):
        return [to_jsonable(item) for item in value]
    return str(value)


# Create and fit the binner that will be persisted
persistent_binner = EqualWidthBinning(n_bins=6, clip=True)
persistent_binner.fit(X_array)

# Transform some data to have a reference result
original_result = persistent_binner.transform(X_array[:3])
print(f"Original transformation result:\n{original_result}")

# Save to and load from a pickle inside a temporary directory. The context
# manager removes the directory even if dumping or loading raises, so no
# stray file is left behind.
with tempfile.TemporaryDirectory() as tmp_dir:
    pickle_path = os.path.join(tmp_dir, 'binner.pkl')
    with open(pickle_path, 'wb') as f:
        pickle.dump(persistent_binner, f)

    print(f"Saved binner to: {pickle_path}")

    # Only unpickle files you created yourself: pickle.load can execute
    # arbitrary code from an untrusted file.
    with open(pickle_path, 'rb') as f:
        loaded_binner = pickle.load(f)

print(f"Cleaned up temporary file")

print(f"Loaded binner: {loaded_binner}")
print(f"Loaded binner fitted: {loaded_binner.is_fitted_}")

# Compare parameters
original_params = persistent_binner.get_params()
loaded_params = loaded_binner.get_params()
params_match = all(original_params[k] == loaded_params[k] for k in original_params.keys())
print(f"Parameters match: {params_match}")

# Compare transformations
loaded_result = loaded_binner.transform(X_array[:3])
print(f"Loaded transformation result:\n{loaded_result}")
print(f"Results identical: {np.array_equal(original_result, loaded_result)}")

# Compare bin edges column by column
print(f"\nBin edges comparison:")
for col in range(X_array.shape[1]):
    orig_edges = persistent_binner._bin_edges[col]
    load_edges = loaded_binner._bin_edges[col]
    edges_match = np.allclose(orig_edges, load_edges)
    print(f"  Column {col}: {edges_match} (original: {len(orig_edges)} edges, loaded: {len(load_edges)} edges)")

# JSON serialization of parameters (for configuration files)
print(f"\n=== JSON-serializable Parameters ===")

# Convert nested values structurally instead of stringifying them, so the
# JSON text can be parsed back into real lists and numbers.
json_params = {
    key: to_jsonable(value)
    for key, value in persistent_binner.get_params().items()
}

json_str = json.dumps(json_params, indent=2)
print(f"JSON parameters:\n{json_str}")

# Parse back. JSON object keys are always strings, so per-column dicts come
# back keyed by "0", "1", ... and need their keys converted to the original
# column identifiers before being passed to the constructor again.
parsed_params = json.loads(json_str)
print(f"Parsed back: {parsed_params}")

## 6. Advanced Features

This section demonstrates advanced features of EqualWidthBinning: pre-fitted bins, fitting on combined training and validation data, and clipping of out-of-range values.

In [None]:
# Pre-fitted bins demonstration
print("=== Pre-fitted Bins ===")

# Define custom bin edges for each column (k+1 edges describe k bins)
custom_bin_edges = {
    0: np.array([-2.0, -1.0, 0.0, 1.0, 2.0]),  # 4 bins for column 0
    1: np.array([-1.5, -0.5, 0.5, 1.5]),       # 3 bins for column 1
    2: np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0]) # 5 bins for column 2
}

# Create binner with pre-specified edges; no fit() call is made before transform
prefitted_binner = EqualWidthBinning(bin_edges=custom_bin_edges)
print(f"Pre-fitted binner: {prefitted_binner}")
print(f"Is fitted without calling fit(): {prefitted_binner.is_fitted_}")

# Transform immediately, relying on the supplied edges
test_data = np.array([[0.5, 0.2, 1.2], [-0.8, -0.3, 0.8], [1.5, 1.0, 2.5]])
prefitted_result = prefitted_binner.transform(test_data)
print(f"Pre-fitted transformation:\n{prefitted_result}")

# Show the bin edges that were used
print(f"Bin edges used:")
for col, edges in prefitted_binner._bin_edges.items():
    print(f"  Column {col}: {edges}")

# Joint fitting demonstration: edges learned from train+validation stacked together
print(f"\n=== Joint Fitting ===")

# Create separate train and validation sets with different distributions.
# NOTE: this rebinds X_train, which earlier held the train split from the
# sklearn pipeline section.
np.random.seed(42)
X_train = np.random.normal(0, 1, (100, 3))
X_val = np.random.normal(0.5, 1.2, (50, 3))  # Slightly different distribution

print(f"Training data range: [{X_train.min():.2f}, {X_train.max():.2f}]")
print(f"Validation data range: [{X_val.min():.2f}, {X_val.max():.2f}]")

# Fit on training data only
train_only_binner = EqualWidthBinning(n_bins=5)
train_only_binner.fit(X_train)

# Fit on both datasets stacked row-wise
joint_binner = EqualWidthBinning(n_bins=5)
joint_data = np.vstack([X_train, X_val])
joint_binner.fit(joint_data)

print(f"\nBin ranges comparison:")
for col in range(3):
    train_edges = train_only_binner._bin_edges[col]
    joint_edges = joint_binner._bin_edges[col]
    print(f"Column {col}:")
    print(f"  Train only: [{train_edges[0]:.2f}, {train_edges[-1]:.2f}]")
    print(f"  Joint fit:  [{joint_edges[0]:.2f}, {joint_edges[-1]:.2f}]")

# Show how this affects validation data transformation
val_train_binned = train_only_binner.transform(X_val[:3])
val_joint_binned = joint_binner.transform(X_val[:3])

print(f"\nValidation data binning (first 3 rows):")
print(f"Original:     {X_val[:3]}")
print(f"Train-fitted: {val_train_binned}")
print(f"Joint-fitted: {val_joint_binned}")

# Check for out-of-bounds issues with a train-only binner that clips.
# NOTE(review): clip is configured through the constructor, which is how the
# earlier cells set it; transform() is called with the data alone, as
# everywhere else in this notebook.
train_only_clip_binner = EqualWidthBinning(n_bins=5, clip=True)
train_only_clip_binner.fit(X_train)
val_clipped = train_only_clip_binner.transform(X_val)[:3]
print(f"Train-fitted with clipping: {val_clipped}")

# Clipping demonstration
print(f"\n=== Clipping Behavior ===")

# Create data with extreme outliers
outlier_data = np.array([[-5.0, 3.0, 10.0], [0.0, 0.0, 0.0], [5.0, -3.0, -5.0]])

# Two binners fitted on the same stacked data, differing only in the clip
# flag, so any difference in the results comes from clipping alone.
joint_no_clip_binner = EqualWidthBinning(n_bins=5, clip=False)
joint_no_clip_binner.fit(joint_data)
joint_clip_binner = EqualWidthBinning(n_bins=5, clip=True)
joint_clip_binner.fit(joint_data)

# Transform with and without clipping
no_clip_result = joint_no_clip_binner.transform(outlier_data)
clip_result = joint_clip_binner.transform(outlier_data)

print(f"Outlier data: {outlier_data}")
print(f"Without clipping: {no_clip_result}")
print(f"With clipping: {clip_result}")

# Show what the bin assignments mean
print(f"Bin edge interpretation for column 0:")
col0_edges = joint_binner._bin_edges[0]
print(f"  Edges: {col0_edges}")
print(f"  -5.0 maps to bin: {no_clip_result[0, 0]} (no clip), {clip_result[0, 0]} (clip)")
print(f"  5.0 maps to bin: {no_clip_result[2, 0]} (no clip), {clip_result[2, 0]} (clip)")