# Atlantic - Usage Examples

This notebook demonstrates the core functionality of the Atlantic automated data preprocessing framework for supervised machine learning.

**Covered scenarios:**
- Binary Classification
- Multi-class Classification
- Regression
- Saving & Loading Fitted Pipelines

> **Note:** Set `relevance=1.0` to skip H2O feature selection. Set `relevance<1.0` (e.g., 0.99) to enable it.

## Setup

In [None]:
from atlantic import Atlantic
from atlantic.data import DatasetGenerator
from sklearn.model_selection import train_test_split
import warnings
# Ignore every warning category (`Warning` is the base class of all of them),
# presumably to keep the notebook output uncluttered. Comment this line out
# while debugging, since deprecation and convergence warnings are hidden too.
warnings.filterwarnings("ignore", category=Warning)

---
## 1. Binary Classification

In [None]:
# Synthetic binary-classification data. The generator hands back the frame
# together with the name of its target column.
binary_params = {
    "n_samples": 10000,
    "n_features": 15,
    "n_classes": 2,
    "n_categorical": 4,
    "null_percentage": 0.08,
    "random_state": 42,
}
data, target_col = DatasetGenerator.generate_classification(**binary_params)

print(f"Dataset shape: {data.shape}", f"Target column: {target_col}", sep="\n")
data.head()

In [None]:
# 80/20 train/test split; each part is re-indexed from zero as it is unpacked.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(data, train_size=0.8, random_state=42)
)

print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")

In [None]:
# Fitting options for the Atlantic pipeline. Per the note at the top of the
# notebook, relevance=1.0 skips H2O feature selection.
fit_settings = dict(
    split_ratio=0.75,
    relevance=1.0,
    h2o_fs_models=7,
    vif_ratio=10.0,
    optimization_level="balanced",
)

# Bind the pipeline to the training frame and its target, then fit it.
atl = Atlantic(X=train, target=target_col)
atl.fit_processing(**fit_settings)

In [None]:
# Run both splits through the fitted pipeline (train first, then test).
# Copies are passed in, so `train` and `test` themselves are not handed over.
train_processed, test_processed = (
    atl.data_processing(X=frame.copy()) for frame in (train, test)
)

print(f"Train processed shape: {train_processed.shape}")
print(f"Test processed shape: {test_processed.shape}")
train_processed.head()

In [None]:
# Optional: save the fitted pipeline and reload it later (uncomment the lines below to run)
# atl.save('atlantic_binary.pkl')
# loaded_atl = Atlantic.load('atlantic_binary.pkl')

---
## 2. Multi-class Classification

In [None]:
# Synthetic five-class dataset; same generator call as the binary example,
# with a smaller sample and n_classes=5.
multiclass_params = dict(
    n_samples=1000,
    n_features=15,
    n_classes=5,
    n_categorical=4,
    null_percentage=0.08,
    random_state=42,
)
data, target_col = DatasetGenerator.generate_classification(**multiclass_params)

print(
    f"Dataset shape: {data.shape}",
    f"Number of classes: {data[target_col].nunique()}",
    sep="\n",
)

In [None]:
# Hold out 20% for testing and renumber the rows of each split from zero.
train, test = [
    split.reset_index(drop=True)
    for split in train_test_split(data, train_size=0.8, random_state=42)
]

In [None]:
# Fit the pipeline on the training split, then transform both splits.
# Per the note at the top of the notebook, relevance=1.0 skips H2O feature selection.
atl = Atlantic(X=train, target=target_col)

atl.fit_processing(
    split_ratio=0.75,
    relevance=1.0,
    h2o_fs_models=7,
    vif_ratio=10.0,
    optimization_level="balanced"
)

# Write the results to new names and pass copies, as Section 1 does.
# Rebinding `train`/`test` here would make a re-run of this cell fit the
# pipeline on already-processed data instead of the raw split.
train_processed = atl.data_processing(X=train.copy())
test_processed = atl.data_processing(X=test.copy())

print(f"Processed train shape: {train_processed.shape}")
print(f"Processed test shape: {test_processed.shape}")

---
## 3. Regression

In [None]:
# Synthetic regression data; the generator returns the frame and the name
# of the (numeric) target column summarised below.
regression_params = {
    "n_samples": 1000,
    "n_features": 15,
    "n_categorical": 4,
    "null_percentage": 0.08,
    "random_state": 42,
}
data, target_col = DatasetGenerator.generate_regression(**regression_params)

print(
    f"Dataset shape: {data.shape}",
    f"Target stats: mean={data[target_col].mean():.2f}, std={data[target_col].std():.2f}",
    sep="\n",
)

In [None]:
# 80% train / 20% test, with a clean zero-based index on both parts.
train, test = map(
    lambda part: part.reset_index(drop=True),
    train_test_split(data, train_size=0.8, random_state=42),
)

In [None]:
# Fit the pipeline on the regression training split, then transform both splits.
# Per the note at the top of the notebook, relevance=1.0 skips H2O feature selection.
atl = Atlantic(X=train, target=target_col)

atl.fit_processing(
    split_ratio=0.75,
    relevance=1.0,
    h2o_fs_models=7,
    vif_ratio=10.0,
    optimization_level="balanced"
)

# Write the results to new names and pass copies, as Section 1 does.
# Overwriting `train`/`test` would make this cell non-idempotent: a second
# run would fit the pipeline on frames that were already processed.
train_processed = atl.data_processing(X=train.copy())
test_processed = atl.data_processing(X=test.copy())

print(f"Processed train shape: {train_processed.shape}")
print(f"Processed test shape: {test_processed.shape}")

In [None]:
# Persist the fitted regression pipeline to a file in the current working
# directory. Section 4 reloads it from this same path.
atl.save('atlantic_regression.pkl')
print("Pipeline saved to 'atlantic_regression.pkl'")

---
## 4. Loading a Saved Fitted Pipeline

In [None]:
# Restore the fitted pipeline from the file written in the regression section.
# NOTE(review): the .pkl extension suggests pickle-based serialization; confirm,
# and if so only load files that come from a trusted source.
loaded_atl = Atlantic.load('atlantic_regression.pkl')
print("Pipeline loaded successfully!")

In [None]:
# Generate a fresh batch of regression data (different seed and null rate)
# to run through the loaded pipeline.
# The returned target-column name is not needed here. It is bound to a named
# throwaway rather than `_`, because in IPython/Jupyter assigning to `_`
# shadows the built-in "last output" variable and stops it from updating.
new_data, _new_target_col = DatasetGenerator.generate_regression(
    n_samples=200,
    n_features=15,
    n_categorical=4,
    null_percentage=0.05,
    random_state=99
)

new_data = new_data.reset_index(drop=True)
print(f"New data shape: {new_data.shape}")

In [None]:
# Apply the loaded pipeline to the new data. No refitting happens here; only
# the already-fitted transformations are applied.
# A copy is passed, as Section 1 does, so `new_data` is not handed to the
# pipeline directly and the cell can be re-run on the same raw frame.
new_data_processed = loaded_atl.data_processing(X=new_data.copy())
print(f"Processed new data shape: {new_data_processed.shape}")

In [None]:
# Inspect attributes stored on the loaded pipeline, one per output line.
print(
    f"Encoding Method: {loaded_atl.enc_method}",
    f"Imputation Method: {loaded_atl.imp_method}",
    f"Selected Columns: {loaded_atl.cols}",
    f"Numerical Columns: {loaded_atl.n_cols}",
    f"Categorical Columns: {loaded_atl.c_cols}",
    sep="\n",
)