# Atlantic - Builder Pattern Examples

This notebook walks through ten `AtlanticBuilder` configurations and their customization options, each tailored to a different use case or data scenario.

**Configurations covered:**
1. Default (Balanced)
2. Fast Prototyping
3. Thorough Optimization
4. High-Null Data
5. High-Cardinality Categorical
6. No H2O Feature Selection
7. Regression Optimized
8. Classification Optimized
9. Minimal Preprocessing
10. Maximum Preprocessing

## Setup

In [None]:
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=Warning)

from atlantic.data import DatasetGenerator
from atlantic.pipeline import AtlanticBuilder

## Data Preparation

Generate various datasets for different scenarios.

In [None]:
# Classification dataset (two target classes)
data_cls, target_cls = DatasetGenerator.generate_classification(
    n_samples=1000,
    n_features=15,
    n_classes=2,
    n_categorical=4,
    null_percentage=0.08,
    random_state=42,
)

# Regression dataset with the same size and null settings
data_reg, target_reg = DatasetGenerator.generate_regression(
    n_samples=1000,
    n_features=15,
    n_categorical=4,
    null_percentage=0.08,
    random_state=42,
)

# Regression dataset that also carries datetime features
data_dt, target_dt = DatasetGenerator.generate_with_datetime(
    n_samples=1000,
    n_numeric=10,
    n_categorical=3,
    task_type="regression",
    null_percentage=0.05,
    random_state=42,
)

# Classification dataset generated with null_percentage=0.25
data_hn, target_hn = DatasetGenerator.generate_high_null(
    n_samples=1000,
    null_percentage=0.25,
    task_type="classification",
    random_state=42,
)

# Classification dataset whose categorical columns use cardinality_range=(50, 150)
data_hc, target_hc = DatasetGenerator.generate_high_cardinality(
    n_samples=1000,
    n_categorical=5,
    cardinality_range=(50, 150),
    task_type="classification",
    random_state=42,
)

# Report the shape of every generated feature set, in generation order.
print("Datasets generated:")
for label, frame in (
    ("Classification", data_cls),
    ("Regression", data_reg),
    ("Datetime", data_dt),
    ("High-Null", data_hn),
    ("High-Cardinality", data_hc),
):
    print(f"  {label}: {frame.shape}")

---
## Configuration 1: Default (Balanced)

In [None]:
# An untouched builder exposes the library defaults through get_config().
default_builder = AtlanticBuilder()
default_config = default_builder.get_config()

# Alias each sub-configuration once so the report lines stay short.
_fs = default_config.feature_selection
_enc = default_config.encoding
_imp = default_config.imputation
_opt = default_config.optimizer

print("Default Configuration:")

# Top-level settings
print(f"  Date Engineering: {default_config.date_engineering}")
print(f"  Null Removal Threshold: {default_config.null_removal_threshold}")

# Feature selection settings
print(f"  Feature Selection: {_fs.method}")
print(f"  H2O Relevance: {_fs.h2o_relevance}")
print(f"  H2O Models: {_fs.h2o_max_models}")
print(f"  Encoding FS: {_fs.encoding_for_fs}")
print(f"  VIF Threshold: {_fs.vif_threshold}")

# Encoding settings
print(f"  Scaler: {_enc.scaler}")
print(f"  Encoder: {_enc.encoder}")
print(f"  Auto-Select Encoding: {_enc.auto_select}")

# Imputation settings
print(f"  Imputer: {_imp.method}")
print(f"  Auto-Select Imputation: {_imp.auto_select}")

# Optimizer settings
print(f"  Optimization Level: {_opt.optimization_level}")
print(f"  Random State: {_opt.random_state}")

---
## Configuration 2: Fast Prototyping

Optimized for quick iterations during development.

In [None]:
# Assemble the fast-prototyping pipeline one builder step at a time.
# Each with_* call is re-bound so the sequence matches a fluent chain.
fast_builder = AtlanticBuilder()
fast_builder = fast_builder.with_date_engineering(enabled=True, drop=True)
fast_builder = fast_builder.with_null_removal(threshold=0.90)
fast_builder = fast_builder.with_feature_selection(
    method="h2o", relevance=0.85, h2o_models=3, encoding_fs=True
)
# auto_select=False for both encoding and imputation
fast_builder = fast_builder.with_encoding(
    scaler="minmax", encoder="label", auto_select=False
)
fast_builder = fast_builder.with_imputation(method="simple", auto_select=False)
fast_builder = fast_builder.with_vif_filtering(threshold=15.0)
fast_builder = fast_builder.with_optimization(
    optimization_level="fast", random_state=42
)
fast_pipeline = fast_builder.build()

# Echo the settings that distinguish this configuration.
fast_cfg = fast_pipeline.config
print("Fast Prototyping Configuration:")
print(f"  Feature Selection: {fast_cfg.feature_selection.h2o_max_models} models")
print(f"  Encoding: {fast_cfg.encoding.scaler} + {fast_cfg.encoding.encoder}")
print(f"  Imputation: {fast_cfg.imputation.method}")
print(f"  Optimization Level: {fast_cfg.optimizer.optimization_level}")

---
## Configuration 3: Thorough Optimization

Aims for the best results at the cost of longer processing time.

In [None]:
# Named option groups for the thorough configuration; unpacked into the chain below.
thorough_fs = dict(method="h2o", relevance=0.98, h2o_models=15, encoding_fs=True)
thorough_encoding = dict(scaler="standard", encoder="ifrequency", auto_select=True)
thorough_imputation = dict(method="iterative", auto_select=True)
thorough_optimization = dict(optimization_level="thorough", random_state=42)

thorough_pipeline = (
    AtlanticBuilder()
    .with_date_engineering(enabled=True, drop=True)
    .with_null_removal(threshold=0.99)
    .with_feature_selection(**thorough_fs)
    .with_encoding(**thorough_encoding)
    .with_imputation(**thorough_imputation)
    .with_vif_filtering(threshold=8.0)
    .with_optimization(**thorough_optimization)
    .build()
)

# Report the resulting settings as stored on the built pipeline.
thorough_cfg = thorough_pipeline.config
print("Thorough Optimization Configuration:")
print(f"  Feature Selection: {thorough_cfg.feature_selection.h2o_max_models} models")
print(f"  Relevance: {thorough_cfg.feature_selection.h2o_relevance}")
print(f"  Auto-Select Encoding: {thorough_cfg.encoding.auto_select}")
print(f"  Auto-Select Imputation: {thorough_cfg.imputation.auto_select}")
print(f"  VIF Threshold: {thorough_cfg.feature_selection.vif_threshold}")

---
## Configuration 4: High-Null Data

Optimized for datasets with >20% missing values.

In [None]:
# Build the high-null pipeline step by step, keeping the original call order.
high_null_builder = AtlanticBuilder()
high_null_builder = high_null_builder.with_date_engineering(enabled=True, drop=True)
# Null-removal threshold of 0.80, the lowest among the H2O-based examples except "Maximum"
high_null_builder = high_null_builder.with_null_removal(threshold=0.80)
high_null_builder = high_null_builder.with_feature_selection(
    method="h2o", relevance=0.90, h2o_models=5, encoding_fs=True
)
high_null_builder = high_null_builder.with_encoding(
    scaler="robust", encoder="ifrequency", auto_select=True
)
high_null_builder = high_null_builder.with_imputation(
    method="iterative", auto_select=True
)
high_null_builder = high_null_builder.with_vif_filtering(threshold=12.0)
high_null_builder = high_null_builder.with_optimization(
    optimization_level="balanced", random_state=42
)
high_null_pipeline = high_null_builder.build()

# Show the null-handling related settings.
high_null_cfg = high_null_pipeline.config
print("High-Null Data Configuration:")
print(f"  Null Removal: {high_null_cfg.null_removal_threshold}")
print(f"  Scaler: {high_null_cfg.encoding.scaler}")
print(f"  Imputation: {high_null_cfg.imputation.method}")

---
## Configuration 5: High-Cardinality Categorical

Optimized for datasets with many categorical levels.

In [None]:
# Option groups for the high-cardinality configuration.
high_card_fs = dict(method="h2o", relevance=0.95, h2o_models=7, encoding_fs=True)
high_card_encoding = dict(scaler="standard", encoder="ifrequency", auto_select=True)
high_card_imputation = dict(method="knn", auto_select=True)
high_card_optimization = dict(optimization_level="balanced", random_state=42)

high_card_pipeline = (
    AtlanticBuilder()
    .with_date_engineering(enabled=True, drop=True)
    .with_null_removal(threshold=0.99)
    .with_feature_selection(**high_card_fs)
    .with_encoding(**high_card_encoding)
    .with_imputation(**high_card_imputation)
    .with_vif_filtering(threshold=10.0)
    .with_optimization(**high_card_optimization)
    .build()
)

# Show the categorical-encoding related settings.
high_card_cfg = high_card_pipeline.config
print("High-Cardinality Configuration:")
print(f"  Encoder: {high_card_cfg.encoding.encoder}")
print(f"  Encoding FS: {high_card_cfg.feature_selection.encoding_for_fs}")

---
## Configuration 6: No H2O Feature Selection

Skips H2O feature selection and relies on VIF filtering only, which is useful when H2O is unavailable.

In [None]:
# Build the pipeline step by step with feature selection method set to "none".
no_h2o_builder = AtlanticBuilder()
no_h2o_builder = no_h2o_builder.with_date_engineering(enabled=True, drop=True)
no_h2o_builder = no_h2o_builder.with_null_removal(threshold=0.99)
no_h2o_builder = no_h2o_builder.with_feature_selection(method="none")
no_h2o_builder = no_h2o_builder.with_encoding(
    scaler="standard", encoder="label", auto_select=True
)
no_h2o_builder = no_h2o_builder.with_imputation(method="simple", auto_select=True)
# A VIF threshold is still configured even though the selection method is "none".
no_h2o_builder = no_h2o_builder.with_vif_filtering(threshold=10.0)
no_h2o_builder = no_h2o_builder.with_optimization(
    optimization_level="fast", random_state=42
)
no_h2o_pipeline = no_h2o_builder.build()

# Show the feature-selection settings stored on the built pipeline.
no_h2o_fs = no_h2o_pipeline.config.feature_selection
print("No H2O Configuration:")
print(f"  Feature Selection: {no_h2o_fs.method}")
print(f"  VIF Filtering: {no_h2o_fs.vif_threshold}")

---
## Configuration 7: Regression Optimized

In [None]:
# Option groups for the regression-oriented configuration.
regression_fs = dict(method="h2o", relevance=0.97, h2o_models=10, encoding_fs=True)
regression_encoding = dict(scaler="robust", encoder="ifrequency", auto_select=True)
regression_imputation = dict(method="iterative", auto_select=True)
regression_optimization = dict(optimization_level="thorough", random_state=42)

regression_pipeline = (
    AtlanticBuilder()
    .with_date_engineering(enabled=True, drop=True)
    .with_null_removal(threshold=0.98)
    .with_feature_selection(**regression_fs)
    .with_encoding(**regression_encoding)
    .with_imputation(**regression_imputation)
    .with_vif_filtering(threshold=7.0)
    .with_optimization(**regression_optimization)
    .build()
)

# Show the scaler, VIF and imputation settings of the built pipeline.
regression_cfg = regression_pipeline.config
print("Regression Optimized Configuration:")
print(f"  Scaler: {regression_cfg.encoding.scaler}")
print(f"  VIF Threshold: {regression_cfg.feature_selection.vif_threshold}")
print(f"  Imputation: {regression_cfg.imputation.method}")

---
## Configuration 8: Classification Optimized

In [None]:
# Build the classification-oriented pipeline step by step.
classification_builder = AtlanticBuilder()
classification_builder = classification_builder.with_date_engineering(
    enabled=True, drop=True
)
classification_builder = classification_builder.with_null_removal(threshold=0.99)
classification_builder = classification_builder.with_feature_selection(
    method="h2o", relevance=0.95, h2o_models=8, encoding_fs=True
)
classification_builder = classification_builder.with_encoding(
    scaler="standard", encoder="ifrequency", auto_select=True
)
classification_builder = classification_builder.with_imputation(
    method="knn", auto_select=True
)
classification_builder = classification_builder.with_vif_filtering(threshold=10.0)
classification_builder = classification_builder.with_optimization(
    optimization_level="balanced", random_state=42
)
classification_pipeline = classification_builder.build()

# Show the scaler and imputation settings of the built pipeline.
classification_cfg = classification_pipeline.config
print("Classification Optimized Configuration:")
print(f"  Scaler: {classification_cfg.encoding.scaler}")
print(f"  Imputation: {classification_cfg.imputation.method}")

---
## Configuration 9: Minimal Preprocessing

Bare minimum transformations.

In [None]:
# Option groups for the minimal configuration: date engineering disabled,
# feature selection method "none", and auto_select turned off everywhere.
minimal_dates = dict(enabled=False, drop=False)
minimal_encoding = dict(scaler="minmax", encoder="label", auto_select=False)
minimal_imputation = dict(method="simple", auto_select=False)
minimal_optimization = dict(optimization_level="fast", random_state=42)

minimal_pipeline = (
    AtlanticBuilder()
    .with_date_engineering(**minimal_dates)
    .with_null_removal(threshold=1.0)
    .with_feature_selection(method="none")
    .with_encoding(**minimal_encoding)
    .with_imputation(**minimal_imputation)
    .with_vif_filtering(threshold=30.0)
    .with_optimization(**minimal_optimization)
    .build()
)

# Show the settings that were switched off or relaxed.
minimal_cfg = minimal_pipeline.config
print("Minimal Preprocessing Configuration:")
print(f"  Date Engineering: {minimal_cfg.date_engineering}")
print(f"  Null Removal: {minimal_cfg.null_removal_threshold}")
print(f"  Feature Selection: {minimal_cfg.feature_selection.method}")
print(f"  VIF Threshold: {minimal_cfg.feature_selection.vif_threshold}")

---
## Configuration 10: Maximum Preprocessing

All the bells and whistles: every preprocessing step is enabled, with aggressive optimization settings.

In [None]:
# Build the maximum-preprocessing pipeline step by step.
maximum_builder = AtlanticBuilder()
maximum_builder = maximum_builder.with_date_engineering(enabled=True, drop=True)
maximum_builder = maximum_builder.with_null_removal(threshold=0.70)
# Largest H2O settings used in this notebook: relevance=0.99 with 20 models
maximum_builder = maximum_builder.with_feature_selection(
    method="h2o", relevance=0.99, h2o_models=20, encoding_fs=True
)
maximum_builder = maximum_builder.with_encoding(
    scaler="robust", encoder="ifrequency", auto_select=True
)
maximum_builder = maximum_builder.with_imputation(
    method="iterative", auto_select=True
)
# Lowest VIF threshold used in this notebook
maximum_builder = maximum_builder.with_vif_filtering(threshold=5.0)
maximum_builder = maximum_builder.with_optimization(
    optimization_level="thorough", random_state=42
)
maximum_pipeline = maximum_builder.build()

# Show the key settings of the built pipeline.
maximum_cfg = maximum_pipeline.config
print("Maximum Preprocessing Configuration:")
print(f"  Null Removal: {maximum_cfg.null_removal_threshold}")
print(f"  H2O Models: {maximum_cfg.feature_selection.h2o_max_models}")
print(f"  VIF Threshold: {maximum_cfg.feature_selection.vif_threshold}")
print(f"  Optimization Level: {maximum_cfg.optimizer.optimization_level}")

---
## Summary Table

In [None]:
# Pair each display label with the configuration object it summarizes,
# in the same order the configurations were introduced.
configs = [
    ("Default", default_config),
    ("Fast", fast_pipeline.config),
    ("Thorough", thorough_pipeline.config),
    ("High-Null", high_null_pipeline.config),
    ("High-Card", high_card_pipeline.config),
    ("No-H2O", no_h2o_pipeline.config),
    ("Regression", regression_pipeline.config),
    ("Classification", classification_pipeline.config),
    ("Minimal", minimal_pipeline.config),
    ("Maximum", maximum_pipeline.config),
]

# Build the header once so the separator rule always matches its width
# (the previous hard-coded 60 dashes was one character short of the header).
header = f"{'Config':<15} {'H2O Models':<12} {'VIF':<8} {'Scaler':<10} {'Imputer':<12}"
print(header)
print("-" * len(header))

for name, cfg in configs:
    fs_cfg = cfg.feature_selection
    # Show "N/A" for the H2O model count when the selection method is not "h2o".
    h2o_models = fs_cfg.h2o_max_models if fs_cfg.method == "h2o" else "N/A"
    print(f"{name:<15} {str(h2o_models):<12} {fs_cfg.vif_threshold:<8} {cfg.encoding.scaler:<10} {cfg.imputation.method:<12}")

---
## Use Case Recommendations

| Use Case | Recommended Configuration |
|----------|---------------------------|
| Quick Prototyping | Fast |
| Production Pipeline | Thorough |
| High Missing Data | High-Null |
| Many Categories | High-Cardinality |
| No H2O Available | No-H2O |
| Regression Tasks | Regression Optimized |
| Classification Tasks | Classification Optimized |
| Minimal Processing | Minimal |
| Best Results | Maximum |