In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pipeline import PipelineConfig, PipelineFactory

In [None]:
# Load the EDA-cleaned dataset and declare which column is the target.
df = pd.read_csv("../data/cleaned_data_after_eda.csv")
TARGET = "Percent_Bleached"

# Column lists that are declared by hand.
categorical_cols  = ["Exposure"]  # the only feature treated as categorical
log_cols          = []            # fill if you want log1p on specific numeric cols

# Fail fast with a readable message if the CSV schema does not contain the
# columns this notebook relies on, instead of failing later inside the pipeline.
missing_cols = [c for c in [TARGET, *categorical_cols, *log_cols] if c not in df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from the loaded data: {missing_cols}")

# Separate the target from the features.
# NOTE(review): astype(int) drops any fractional part of the target; if
# Percent_Bleached holds fractional percentages this loses precision --
# confirm the truncation is intended.
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

# Numeric features are inferred from dtype, minus anything explicitly declared
# categorical. Without the exclusion, a numerically-encoded categorical column
# would appear in both lists.
numerical_cols = [
    col
    for col in X.select_dtypes(include="number").columns
    if col not in categorical_cols
]


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts. No stratification is applied.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": None}
X_tr, X_te, y_tr, y_te = train_test_split(X, y, **split_kwargs)

# Describe the pipeline: which columns feed which list, and which model to use.
# selector_kind / selector_k / resampler are not passed here (presumably
# optional PipelineConfig fields -- confirm in pipeline.py).
cfg = PipelineConfig(
    model_name="lasso",
    numeric=numerical_cols,
    categorical=categorical_cols,
    log=log_cols,
)

# Assemble the pipeline from the config, then fit it on the training split only.
factory = PipelineFactory()
pipe = factory.build(cfg)
pipe.fit(X_tr, y_tr)

# Score on the held-out split. Reported as R^2, assuming the built pipeline's
# score() follows the scikit-learn regressor convention -- TODO confirm.
r2 = pipe.score(X_te, y_te)
print(f"Test R^2: {r2:.3f}")
