In [None]:
# Manual Binning Classes Demonstration

This notebook demonstrates the usage of the two new manual binning classes: `ManualIntervalBinning` and `ManualFlexibleBinning`.

## Import necessary libraries

import numpy as np
import pandas as pd
from binning.methods import ManualIntervalBinning, ManualFlexibleBinning

# Set random seed for reproducibility
# NOTE(review): none of the cells visible in this notebook draw random numbers,
# so the seed only matters if randomized cells are added later — confirm it is
# still needed.
np.random.seed(42)

## 1. ManualIntervalBinning

### 1.1 Basic Usage

The `ManualIntervalBinning` class allows you to specify exact bin edges for each feature. This ensures consistent binning across different datasets.

# Define custom bin edges for different features, keyed by column index.
# Each list of N + 1 edges yields the N groups named in its trailing comment.
edges = {
    0: [0, 18, 35, 50, 65, 100],    # Age groups: child, young adult, adult, middle-aged, senior
    1: [0, 30000, 60000, 100000],   # Income brackets: low, medium, high
    2: [0, 60, 80, 90, 100]         # Score ranges: fail, pass, good, excellent
}

# Create the binning transformer
interval_binner = ManualIntervalBinning(bin_edges=edges)

# Create sample data (columns: age, income, score — same order as `edges`)
X = np.array([
    [25, 45000, 85],   # Young adult, medium income, good score
    [45, 75000, 92],   # Adult, high income, excellent score
    [70, 25000, 55],   # Senior, low income, fail score
    [30, 55000, 78],   # Young adult, medium income, pass score
    [15, 0, 95]        # Child, no income, excellent score
])

print("Original data:")
print(X)

# Fit and transform the data
X_binned = interval_binner.fit_transform(X)
print("\nBinned data:")
print(X_binned)

# Show the edges the transformer holds for each column
print("\nBin edges:")
for col, edges_list in interval_binner.bin_edges.items():
    print(f"  Column {col}: {edges_list}")

### 1.2 DataFrame Support with Named Columns

# Create DataFrame with meaningful column names
df = pd.DataFrame({
    'age': [25, 45, 70, 30, 15],
    'income': [45000, 75000, 25000, 55000, 0],
    'score': [85, 92, 55, 78, 95]
})

# Define edges using column names instead of positional indices
named_edges = {
    'age': [0, 18, 35, 50, 65, 100],
    'income': [0, 30000, 60000, 100000],
    'score': [0, 60, 80, 90, 100]
}

# Create binner that preserves DataFrame format
df_binner = ManualIntervalBinning(
    bin_edges=named_edges,
    preserve_dataframe=True
)

df_binned = df_binner.fit_transform(df)
print("\nOriginal DataFrame:")
print(df)
print("\nBinned DataFrame:")
print(df_binned)

### 1.3 Custom Representatives

You can specify custom representative values for each bin, which is useful for inverse transformations.

# Define custom representatives (e.g., midpoints or meaningful values).
# Each list has one entry per bin, i.e. one fewer than the matching edge list.
custom_reps = {
    'age': [9, 26.5, 42.5, 57.5, 82.5],      # Midpoints of age groups
    'income': [15000, 45000, 80000],           # Representative incomes
    'score': [30, 70, 85, 95]                 # Representative scores
}

custom_binner = ManualIntervalBinning(
    bin_edges=named_edges,
    bin_representatives=custom_reps
)

# Fit on the DataFrame built in section 1.2, then list the stored representatives
custom_binner.fit(df)
print("\nCustom representatives:")
for col, reps in custom_binner.bin_representatives.items():
    print(f"  {col}: {reps}")

## 2. ManualFlexibleBinning

### 2.1 Mixed Singleton and Interval Bins

The `ManualFlexibleBinning` class supports both singleton bins (exact value matches) and interval bins (range matches).

# Define flexible bin specifications.
# A scalar entry is a singleton bin (exact value); a 2-tuple is an interval bin.
flexible_specs = {
    'grade': ['A', 'B', 'C', 'D', (0, 60)],  # Letter grades + failing range
    'category': [1, 2, 3, (10, 20), (20, 50)], # Specific values + ranges
    'measurement': [(0, 10), (10, 25), 50, 100] # Ranges + specific values
}

flexible_binner = ManualFlexibleBinning(bin_spec=flexible_specs)

# Create mixed data that matches the specifications
# NOTE(review): in the last row, 25 is the upper bound of the (10, 25) interval;
# it only counts as a range match if intervals are closed on the right — confirm.
flexible_data = [
    ['A', 1, 5],      # Exact matches and range match
    ['B', 15, 50],    # Exact match, range match, exact match
    ['C', 2, 100],    # All exact matches
    [45, 30, 8],      # Range match for grade (failing), range matches
    ['D', 3, 25]      # Mix of exact and range matches
]

flexible_df = pd.DataFrame(flexible_data, columns=['grade', 'category', 'measurement'])
print("Original flexible data:")
print(flexible_df)

flexible_binned = flexible_binner.fit_transform(flexible_df)
print("\nFlexible binned data:")
print(flexible_binned)

### 2.2 Categorical-like Binning

You can use only singleton bins for categorical-like behavior:

# Singleton-only specifications: every bin is one exact value per column
categorical_specs = {
    'color': ['red', 'green', 'blue', 'yellow'],
    'size': ['small', 'medium', 'large'],
    'priority': [1, 2, 3, 4, 5]
}

categorical_binner = ManualFlexibleBinning(bin_spec=categorical_specs)

# Every value below appears verbatim in the matching spec list above
categorical_data = [
    ['red', 'small', 1],
    ['blue', 'large', 3],
    ['green', 'medium', 2],
    ['yellow', 'small', 5]
]

categorical_df = pd.DataFrame(categorical_data, columns=['color', 'size', 'priority'])
print("\nCategorical data:")
print(categorical_df)

categorical_binned = categorical_binner.fit_transform(categorical_df)
print("\nCategorical binned:")
print(categorical_binned)

### 2.3 Custom Representatives for Flexible Bins

# Define custom representatives for the flexible bins.
# Each list has one entry per bin in the matching `flexible_specs` list.
flexible_custom_reps = {
    'grade': [4.0, 3.0, 2.0, 1.0, 30.0],  # GPA-like values for grades, average for failing
    'category': [1, 2, 3, 15, 35],         # Representative values
    'measurement': [5, 17.5, 50, 100]      # Midpoints for ranges, exact for singletons
}

custom_flexible_binner = ManualFlexibleBinning(
    bin_spec=flexible_specs,
    bin_representatives=flexible_custom_reps
)

# Fit on the DataFrame built in section 2.1, then list specs and representatives
custom_flexible_binner.fit(flexible_df)
print("\nFlexible bin specifications:")
for col, specs in custom_flexible_binner.bin_spec.items():
    print(f"  {col}: {specs}")

print("\nFlexible custom representatives:")
for col, reps in custom_flexible_binner.bin_representatives.items():
    print(f"  {col}: {reps}")

## 3. Comparison and Use Cases

### 3.1 When to Use ManualIntervalBinning

# Banner plus the checklist of situations suited to ManualIntervalBinning
rule = "=" * 60
print("\n" + rule)
print("WHEN TO USE ManualIntervalBinning")
print(rule)
for bullet in (
    "✓ When you have continuous numerical data",
    "✓ When you want traditional interval-based bins",
    "✓ When you need consistent binning across datasets",
    "✓ For age groups, income brackets, score ranges, etc.",
    "✓ When all features can be binned using intervals",
):
    print(bullet)

### 3.2 When to Use ManualFlexibleBinning

# Banner plus the checklist of situations suited to ManualFlexibleBinning
rule = "=" * 60
print("\n" + rule)
print("WHEN TO USE ManualFlexibleBinning")
print(rule)
for bullet in (
    "✓ When you have mixed data types (categorical + numerical)",
    "✓ When you need both exact matches AND ranges",
    "✓ For complex binning rules (e.g., specific outliers + ranges)",
    "✓ When some values need exact matching (categories, special values)",
    "✓ For domain-specific binning with mixed requirements",
):
    print(bullet)

### 3.3 Serialization Example

Both classes support parameter serialization for model persistence:

# Get parameters from the binners fitted in sections 1.1 and 2.1
interval_params = interval_binner.get_params()
flexible_params = flexible_binner.get_params()

print(f"\nInterval binner parameters keys: {list(interval_params)}")
print(f"Flexible binner parameters keys: {list(flexible_params)}")

# Create new instances with same parameters (for model persistence)
restored_interval = ManualIntervalBinning(**interval_params)
restored_flexible = ManualFlexibleBinning(**flexible_params)

# Verify both restored binners work without additional fitting
test_data = np.array([[25, 45000, 85]])
interval_result = restored_interval.transform(test_data)
print(f"\nRestored interval binner result: {interval_result}")

# The flexible specs are keyed by column name, so feed the restored binner a
# one-row DataFrame taken from the data the original binner was fitted on.
# NOTE(review): assumes the flexible binner, like the interval one above, can
# transform straight after reconstruction — confirm against the library.
flexible_result = restored_flexible.transform(flexible_df.head(1))
print(f"Restored flexible binner result: {flexible_result}")

## 4. Summary

# Closing banner followed by the feature recap, emitted one line per entry
rule = "=" * 60
summary_lines = [
    "\n" + rule,
    "SUMMARY",
    rule,
    "✅ ManualIntervalBinning: Traditional interval binning with user-defined edges",
    "✅ ManualFlexibleBinning: Mixed singleton and interval bins for complex scenarios",
    "✅ Both classes support:",
    "   • DataFrame preservation",
    "   • Custom representatives",
    "   • Parameter serialization",
    "   • sklearn-compatible interface",
    "   • Comprehensive error validation",
    "✅ Never infer from data - always use user specifications",
    "✅ Perfect for standardized, reproducible binning across datasets",
]
for line in summary_lines:
    print(line)