In [None]:
"""
Bayesian Optimization Pipeline - Modular Implementation
========================================================
Phase 1: Feature Selection and Screening
Phase 2: Bayesian Optimization
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import json
import pickle
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple, Any, Union

warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel, Matern
from scipy.stats import norm
from scipy.optimize import minimize
from itertools import combinations

# Seed shared across the notebook; FeatureRanker.run_lasso passes it to LassoCV
# as `random_state`.
RANDOM_STATE = 42
# Also seed NumPy's legacy global RNG so code that draws from `np.random`
# directly is repeatable after a kernel restart.
np.random.seed(RANDOM_STATE)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Configuration Classes
# ─────────────────────────────────────────────────────────────────────────────

@dataclass
class Phase1Config:
    """Settings for Phase 1 (feature screening and selection).

    Only ``data_file`` and ``response_column`` are required; every other
    field has a default.

    Attributes:
        data_file: Path of the Excel workbook to analyse.
        response_column: Name of the column holding the response variable.
        sheet_name: Worksheet to read.
        header_row: Row index handed to ``pd.read_excel(header=...)``.
        split_keyword: Marker value in the ``Run`` column at which the sheet
            is split; falsy to disable the split.
        stop_feature: Column name passed to ``DataLoader.get_feature_columns``;
            columns before it are treated as candidate features.
        maximize_response: Optimisation direction flag (not read by the
            Phase 1 classes shown in this notebook section).
        target_features: Number of features ``FeatureSelector.auto_select``
            returns by default.
        output_dir: Directory where plots and checkpoints are written.
        correlation_strong / correlation_moderate: |r| cut-offs used to label
            a feature-response correlation 'Strong' / 'Moderate'.
        vip_important / vip_moderate: PLS VIP cut-offs used to label a feature
            'Important' / 'Moderate'.
        multicollinearity_threshold: |r| above which two features are reported
            as collinear.
        interaction_threshold: Minimum change in correlation between the low
            and high halves of a partner feature for an interaction to be
            flagged as significant.
    """

    # --- Input data -------------------------------------------------------
    data_file: str
    response_column: str
    sheet_name: str = "data"
    header_row: int = 5
    split_keyword: Optional[str] = "PREDICTED OPTIMUM RUNS"
    stop_feature: Optional[str] = "Batch ID"

    # --- Selection behaviour and output -----------------------------------
    maximize_response: bool = False
    target_features: int = 4
    output_dir: str = "bo_pipeline_output"

    # --- Ranking thresholds -----------------------------------------------
    correlation_strong: float = 0.4
    correlation_moderate: float = 0.2
    vip_important: float = 1.0
    vip_moderate: float = 0.8
    multicollinearity_threshold: float = 0.7
    interaction_threshold: float = 0.3


@dataclass
class Phase2Config:
    """Settings for Phase 2 (Bayesian optimization).

    ``data_file`` and ``checkpoint_path`` are required; the remaining fields
    have defaults. None of these fields is read by the classes defined in
    this part of the notebook, so the notes below are hedged.
    """

    # --- Inputs -------------------------------------------------------------
    data_file: str
    checkpoint_path: str  # NOTE(review): presumably a Phase 1 checkpoint file - confirm

    # --- Batch proposal -----------------------------------------------------
    batch_size: int = 5
    exploration_weight: float = 2.0  # NOTE(review): looks like an acquisition trade-off weight - confirm
    n_optimizer_restarts: int = 20
    min_distance_between_points: float = 0.3

    # --- Stopping criteria --------------------------------------------------
    max_iterations: int = 10
    improvement_threshold: float = 0.01
    patience: int = 3

    # --- I/O ----------------------------------------------------------------
    output_dir: str = "bo_pipeline_output"
    sheet_name: str = "Sheet1"

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# DataLoader Class
# ─────────────────────────────────────────────────────────────────────────────

class DataLoader:
    """Handles data loading, cleaning, and feature classification."""

    def load_excel(self, file_path: str, sheet_name: str, header_row: int) -> pd.DataFrame:
        """Read one worksheet of an Excel workbook into a DataFrame.

        Args:
            file_path: Path of the workbook (opened with the openpyxl engine).
            sheet_name: Worksheet to read.
            header_row: Row index passed to ``pd.read_excel(header=...)``.

        Returns:
            The parsed worksheet.
        """
        # The context manager releases the workbook handle even if parsing fails.
        with pd.ExcelFile(file_path, engine='openpyxl') as xls:
            return pd.read_excel(xls, sheet_name=sheet_name, header=header_row)

    def split_at_keyword(self, df: pd.DataFrame, keyword: str,
                         column: str = 'Run') -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split a DataFrame at the first row whose ``column`` equals ``keyword``.

        The marker row itself belongs to neither part. The split is positional,
        so it is correct for any index (not only a default RangeIndex).

        Args:
            df: Frame to split.
            keyword: Marker value to look for.
            column: Column searched for the marker.

        Returns:
            ``(rows_before_marker, rows_after_marker)``, both independent copies.
            When the marker is absent: ``(df.copy(), empty DataFrame)``.

        Raises:
            KeyError: If ``column`` is not a column of ``df``.
        """
        if column not in df.columns:
            raise KeyError(f"Split column '{column}' not found in DataFrame")

        # Positions (not index labels) of the matching rows, because the
        # slicing below is done with iloc.
        hits = np.flatnonzero((df[column] == keyword).to_numpy(dtype=bool))

        if hits.size:
            pos = int(hits[0])
            # Copies keep later in-place edits (e.g. classify_features) from
            # touching the caller's frame.
            return df.iloc[:pos].copy(), df.iloc[pos + 1:].copy()
        return df.copy(), pd.DataFrame()

    def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop the row labelled 0 and all-empty rows, then renumber the index.

        Args:
            df: Frame to clean (not modified).

        Returns:
            A new frame with a fresh 0..n-1 index.
        """
        # NOTE(review): the row with index label 0 is presumably a units /
        # description row directly under the header - confirm against the sheet.
        df = df.drop(index=0, errors='ignore')
        df = df.dropna(how='all')
        return df.reset_index(drop=True)

    def classify_features(self, df: pd.DataFrame, feature_cols: List[str]
                         ) -> Tuple[List[str], List[str], Dict]:
        """
        Classify numeric features as binary or continuous.

        A numeric column with exactly two distinct non-null values is binary;
        its values are recoded IN PLACE in ``df`` to 0 (smaller value) and
        1 (larger value). Every other numeric column is continuous.
        Non-numeric columns in ``feature_cols`` are ignored.

        Returns:
            binary_list: List of binary feature names
            continuous_list: List of continuous feature names
            mappings: Dictionary of binary feature value mappings
                (``{column: {original_value: 0 or 1}}``)
        """
        numeric_features = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

        binary_cols = []
        continuous_cols = []
        binary_mappings = {}

        for col in numeric_features:
            # nunique() ignores NaN, matching the dropna() used for the mapping.
            if df[col].nunique() == 2:
                low, high = sorted(df[col].dropna().unique())
                mapping = {low: 0, high: 1}
                df[col] = df[col].map(mapping)
                binary_cols.append(col)
                binary_mappings[col] = mapping
            else:
                continuous_cols.append(col)

        return binary_cols, continuous_cols, binary_mappings

    def get_feature_columns(self, df: pd.DataFrame, stop_feature: Optional[str],
                           response_column: str) -> List[str]:
        """Return the candidate feature column names of ``df``.

        Args:
            df: Source frame.
            stop_feature: If given and present, only columns located before it
                are considered; otherwise all columns are considered.
            response_column: Name of the response; never returned as a feature.

        Returns:
            Column names in their original order, without the response column
            and without index-like columns ('run', 'index', 'unnamed: 0',
            compared case-insensitively).
        """
        columns = df.columns.tolist()

        if stop_feature and stop_feature in columns:
            candidates = columns[:columns.index(stop_feature)]
        else:
            candidates = columns

        index_like = {'run', 'index', 'unnamed: 0'}
        # str() guards against non-string headers (e.g. numeric column labels).
        return [
            c for c in candidates
            if c != response_column and str(c).lower() not in index_like
        ]

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# FeatureRanker Class
# ─────────────────────────────────────────────────────────────────────────────

class FeatureRanker:
    """Ranks features against the response using several methods.

    Methods available: Pearson correlation, Lasso coefficients, PLS VIP
    scores, a median-split interaction screen and a multicollinearity check.
    Each ``run_*`` method caches its result on the instance;
    ``get_consensus_ranking`` merges the three rankings.
    """

    def __init__(self, config: "Phase1Config"):
        """Store the configuration and initialise empty result caches."""
        self.config = config
        self._corr_df = None
        self._lasso_df = None
        self._pls_df = None
        self._interaction_df = None
        self._multicollinearity_pairs = []
        self._features_with_interactions = []
        self._optimal_pls_components = None
        # Mean CV R² per tested component count, filled by run_pls_vip.
        self._pls_cv_scores = []

    def run_correlation(self, X: pd.DataFrame, y: pd.Series,
                       binary_features: List[str]) -> pd.DataFrame:
        """Compute Pearson correlation rankings.

        Args:
            X: Feature matrix.
            y: Response values.
            binary_features: Names of the binary columns of ``X``.

        Returns:
            One row per feature, sorted by descending |correlation|, with
            columns feature, correlation, abs_corr, type, direction,
            rank_corr and strength.
        """
        correlations = X.corrwith(y)

        self._corr_df = pd.DataFrame({
            'feature': X.columns.tolist(),
            'correlation': correlations.values,
            'abs_corr': np.abs(correlations.values),
            'type': ['binary' if f in binary_features else 'continuous' for f in X.columns],
            # Anything that is not strictly positive (including 0 and NaN) is
            # labelled 'Negative'.
            'direction': ['Positive' if c > 0 else 'Negative' for c in correlations.values]
        }).sort_values('abs_corr', ascending=False).reset_index(drop=True)

        self._corr_df['rank_corr'] = range(1, len(self._corr_df) + 1)
        self._corr_df['strength'] = self._corr_df['correlation'].apply(self._corr_strength)

        return self._corr_df

    def _corr_strength(self, r: float) -> str:
        """Label |r| as 'Strong', 'Moderate' or 'Weak' using the config cut-offs."""
        if abs(r) >= self.config.correlation_strong:
            return 'Strong'
        if abs(r) >= self.config.correlation_moderate:
            return 'Moderate'
        return 'Weak'

    def run_lasso(self, X: pd.DataFrame, y: pd.Series,
                  binary_features: List[str]) -> Tuple[pd.DataFrame, List[str]]:
        """Run Lasso regression for feature selection.

        Features are prepared with ``_prepare_for_modeling`` and fitted with a
        5-fold ``LassoCV``.

        Returns:
            ``(lasso_df, selected_features)``: ``lasso_df`` has one row per
            feature sorted by descending |coefficient|; ``selected_features``
            lists the features with a non-zero coefficient.
        """
        X_model = self._prepare_for_modeling(X, binary_features)

        lasso = LassoCV(cv=5, max_iter=10000, random_state=RANDOM_STATE)
        lasso.fit(X_model, y)

        self._lasso_df = pd.DataFrame({
            'feature': X.columns.tolist(),
            'coefficient': lasso.coef_,
            'abs_coef': np.abs(lasso.coef_),
            'selected': lasso.coef_ != 0,
            'type': ['binary' if f in binary_features else 'continuous' for f in X.columns]
        }).sort_values('abs_coef', ascending=False).reset_index(drop=True)

        self._lasso_df['rank_lasso'] = range(1, len(self._lasso_df) + 1)
        selected_features = self._lasso_df[self._lasso_df['selected']]['feature'].tolist()

        return self._lasso_df, selected_features

    def run_pls_vip(self, X: pd.DataFrame, y: pd.Series,
                    binary_features: List[str]) -> Tuple[pd.DataFrame, int]:
        """Run PLS regression and compute VIP scores.

        The number of components (1 up to ``min(5, n_features, n_samples - 1)``)
        is chosen by 5-fold cross-validated R². The per-component mean scores
        are kept and exposed through ``pls_cv_scores``.

        Returns:
            ``(pls_df, n_components)``: ``pls_df`` has one row per feature
            sorted by descending VIP; ``n_components`` is a plain ``int``.

        Raises:
            ValueError: If there are too few samples or features to fit even
                one component.
        """
        X_model = self._prepare_for_modeling(X, binary_features)

        max_comp = min(5, len(X.columns), len(X) - 1)
        if max_comp < 1:
            raise ValueError("PLS needs at least one feature and two samples")

        cv_scores = []
        for n in range(1, max_comp + 1):
            scores = cross_val_score(PLSRegression(n_components=n), X_model, y, cv=5, scoring='r2')
            cv_scores.append(float(scores.mean()))
        self._pls_cv_scores = cv_scores

        # A failed or undefined fold makes the mean NaN, and np.argmax would
        # then pick that NaN entry. Ignore NaNs; fall back to one component
        # when every score is NaN.
        if np.all(np.isnan(cv_scores)):
            self._optimal_pls_components = 1
        else:
            # int(): keep the annotated return type and JSON-friendliness.
            self._optimal_pls_components = int(np.nanargmax(cv_scores)) + 1

        pls = PLSRegression(n_components=self._optimal_pls_components)
        pls.fit(X_model, y)
        vip_scores = self._calc_vip(pls)

        self._pls_df = pd.DataFrame({
            'feature': X.columns.tolist(),
            'VIP': vip_scores,
            'type': ['binary' if f in binary_features else 'continuous' for f in X.columns]
        }).sort_values('VIP', ascending=False).reset_index(drop=True)

        self._pls_df['rank_pls'] = range(1, len(self._pls_df) + 1)
        self._pls_df['category'] = self._pls_df['VIP'].apply(self._vip_category)

        return self._pls_df, self._optimal_pls_components

    def _calc_vip(self, model) -> np.ndarray:
        """Calculate VIP scores for a fitted PLS model.

        Args:
            model: Object exposing ``x_scores_``, ``x_weights_`` (features x
                components) and ``y_loadings_``.

        Returns:
            One VIP score per feature. All zeros when the components explain
            no response variance (avoids a 0/0 NaN result).
        """
        t, w, q = model.x_scores_, model.x_weights_, model.y_loadings_
        n_features = w.shape[0]

        # Response sum of squares explained by each component.
        ss = np.sum(t ** 2, axis=0) * q.flatten() ** 2
        total_ss = np.sum(ss)
        if total_ss == 0:
            return np.zeros(n_features)

        # Squared weights normalised per component, then weighted by ss.
        w_sq_norm = w ** 2 / np.sum(w ** 2, axis=0)
        return np.sqrt(n_features * (w_sq_norm @ ss) / total_ss)

    def _vip_category(self, v: float) -> str:
        """Label a VIP score 'Important', 'Moderate' or 'Less Important'."""
        if v >= self.config.vip_important:
            return 'Important'
        if v >= self.config.vip_moderate:
            return 'Moderate'
        return 'Less Important'

    def _prepare_for_modeling(self, X: pd.DataFrame, binary_features: List[str]) -> pd.DataFrame:
        """Scale continuous features and encode binary as [-1, +1].

        Continuous columns are standardised with a ``StandardScaler`` fitted on
        ``X`` itself; binary columns (expected to hold 0/1) become -1/+1.
        ``X`` is not modified.
        """
        X_model = X.copy()
        continuous = [c for c in X.columns if c not in binary_features]

        if continuous:
            scaler = StandardScaler()
            X_model[continuous] = scaler.fit_transform(X[continuous])

        for col in binary_features:
            if col in X_model.columns:
                X_model[col] = X_model[col] * 2 - 1

        return X_model

    def run_interaction_screening(self, X: pd.DataFrame, y: pd.Series,
                                   top_n: int = 6) -> pd.DataFrame:
        """Screen for feature interactions among the top correlated features.

        For each pair (f1, f2) of the ``top_n`` features by |correlation|, the
        data is split at the median of f2 and the correlation of f1 with ``y``
        is computed in each half (each half needs at least 3 rows). The
        interaction strength is the absolute difference of the two
        correlations; it is 'significant' above ``config.interaction_threshold``.

        Returns:
            One row per evaluated pair sorted by descending strength, or an
            empty DataFrame when no pair could be evaluated.

        Raises:
            ValueError: If ``run_correlation`` has not been called yet.
        """
        if self._corr_df is None:
            raise ValueError("Run correlation analysis first")

        top_features = self._corr_df.head(min(top_n, len(X.columns)))['feature'].tolist()
        results = []

        for f1, f2 in combinations(top_features, 2):
            median_f2 = X[f2].median()
            low_f2 = X[f2] <= median_f2
            high_f2 = X[f2] > median_f2

            if low_f2.sum() >= 3 and high_f2.sum() >= 3:
                corr_low = X.loc[low_f2, f1].corr(y[low_f2])
                corr_high = X.loc[high_f2, f1].corr(y[high_f2])

                # NaN arises when f1 or y is constant within a half.
                if not np.isnan(corr_low) and not np.isnan(corr_high):
                    strength = abs(corr_high - corr_low)
                    results.append({
                        'interaction': f'{f1} × {f2}',
                        'feature_1': f1, 'feature_2': f2,
                        'corr_low_f2': corr_low, 'corr_high_f2': corr_high,
                        'strength': strength,
                        'significant': strength > self.config.interaction_threshold
                    })

        if results:
            self._interaction_df = pd.DataFrame(results).sort_values('strength', ascending=False)
            strong = self._interaction_df[self._interaction_df['significant']]
            self._features_with_interactions = list(set(
                strong['feature_1'].tolist() + strong['feature_2'].tolist()
            ))
        else:
            self._interaction_df = pd.DataFrame()
            self._features_with_interactions = []

        return self._interaction_df

    def check_multicollinearity(self, X: pd.DataFrame,
                                 threshold: Optional[float] = None) -> List[Dict]:
        """Check for highly correlated feature pairs.

        Args:
            X: Feature matrix.
            threshold: |r| cut-off; ``None`` means
                ``config.multicollinearity_threshold``. An explicit ``0.0`` is
                honoured.

        Returns:
            A list of ``{'feature_1', 'feature_2', 'correlation'}`` dicts for
            every pair with |r| strictly above the threshold.
        """
        if threshold is None:
            threshold = self.config.multicollinearity_threshold

        feature_corr = X.corr()
        # Label pairs from the correlation matrix itself so names and values
        # always refer to the same columns.
        names = feature_corr.columns
        pairs = []

        for i, j in combinations(range(len(names)), 2):
            r = feature_corr.iloc[i, j]
            if abs(r) > threshold:
                pairs.append({
                    'feature_1': names[i],
                    'feature_2': names[j],
                    'correlation': r
                })

        self._multicollinearity_pairs = pairs
        return pairs

    def get_consensus_ranking(self) -> pd.DataFrame:
        """Combine all ranking methods into consensus ranking.

        Returns:
            One row per feature sorted by the mean of the three ranks, with
            ``avg_rank``, ``final_rank``, ``methods_top3`` (how many methods
            rank the feature in their top 3) and ``has_interaction``.

        Raises:
            ValueError: If correlation, Lasso or PLS has not been run.
        """
        if any(df is None for df in [self._corr_df, self._lasso_df, self._pls_df]):
            raise ValueError("Run all ranking methods first")

        consensus = self._corr_df[['feature', 'rank_corr', 'correlation', 'direction',
                                    'type', 'strength']].merge(
            self._lasso_df[['feature', 'rank_lasso', 'coefficient', 'selected']], on='feature'
        ).merge(
            self._pls_df[['feature', 'rank_pls', 'VIP', 'category']], on='feature'
        )

        rank_cols = ['rank_corr', 'rank_lasso', 'rank_pls']
        consensus['avg_rank'] = consensus[rank_cols].mean(axis=1)
        consensus = consensus.sort_values('avg_rank').reset_index(drop=True)
        consensus['final_rank'] = range(1, len(consensus) + 1)
        consensus['methods_top3'] = (consensus[rank_cols] <= 3).sum(axis=1)
        consensus['has_interaction'] = consensus['feature'].isin(self._features_with_interactions)

        return consensus

    @property
    def features_with_interactions(self) -> List[str]:
        """Features taking part in at least one significant interaction."""
        return self._features_with_interactions

    @property
    def multicollinearity_pairs(self) -> List[Dict]:
        """Result of the most recent ``check_multicollinearity`` call."""
        return self._multicollinearity_pairs

    @property
    def pls_cv_scores(self) -> List[float]:
        """Mean CV R² per component count from the last ``run_pls_vip`` call."""
        return self._pls_cv_scores

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# FeatureSelector Class
# ─────────────────────────────────────────────────────────────────────────────

class FeatureSelector:
    """Handles automatic and manual feature selection with validation."""

    # Upper bound on auto-selected candidates before trimming to the requested
    # number, and the minimum number of features auto_select tries to supply.
    _MAX_CANDIDATES = 6
    _MIN_SELECTED = 3

    def __init__(self, config: "Phase1Config", ranker: "FeatureRanker"):
        """Keep the configuration and the ranker whose results drive selection."""
        self.config = config
        self.ranker = ranker
        self._consensus = None
        self._selected_features = []

    def auto_select(self, n_features: Optional[int] = None) -> List[str]:
        """Automatically select top features based on scoring.

        Each feature of the ranker's consensus table gets a score
        (``_score_feature``). A feature qualifies if its score is >= 6, its
        |correlation| or VIP reaches the 'strong'/'important' cut-off, or it
        has an interaction and a score >= 4. If fewer than 3 qualify, the list
        is topped up in score order. The result is trimmed to ``n_features``.

        Args:
            n_features: Maximum number of features to return; ``None`` means
                ``config.target_features``.

        Returns:
            Selected feature names, best score first.
        """
        if n_features is None:
            n_features = self.config.target_features
        self._consensus = self.ranker.get_consensus_ranking()

        # Score each feature
        self._consensus['score'] = self._consensus.apply(self._score_feature, axis=1)
        self._consensus = self._consensus.sort_values('score', ascending=False).reset_index(drop=True)

        # The candidate cap never undercuts the number of features requested.
        candidate_cap = max(self._MAX_CANDIDATES, n_features)

        recommended = []
        for _, row in self._consensus.iterrows():
            qualifies = (
                row['score'] >= 6
                or abs(row['correlation']) >= self.config.correlation_strong
                or row['VIP'] >= self.config.vip_important
                or (row['has_interaction'] and row['score'] >= 4)
            )
            if qualifies and len(recommended) < candidate_cap:
                recommended.append(row['feature'])

        # Ensure minimum
        if len(recommended) < self._MIN_SELECTED:
            for _, row in self._consensus.iterrows():
                if row['feature'] not in recommended:
                    recommended.append(row['feature'])
                if len(recommended) >= self._MIN_SELECTED:
                    break

        self._selected_features = recommended[:n_features]
        return self._selected_features

    def _score_feature(self, row: pd.Series) -> int:
        """Score one consensus row.

        Points: correlation strong 3 / moderate 2; VIP important 3 /
        moderate 2; Lasso-selected 2; one per method ranking it top-3;
        interaction 2.
        """
        score = 0
        if abs(row['correlation']) >= self.config.correlation_strong:
            score += 3
        elif abs(row['correlation']) >= self.config.correlation_moderate:
            score += 2
        if row['VIP'] >= self.config.vip_important:
            score += 3
        elif row['VIP'] >= self.config.vip_moderate:
            score += 2
        if row['selected']:
            score += 2
        score += row['methods_top3']
        if row['has_interaction']:
            score += 2
        return int(score)

    def manual_override(self, feature_list: List[str],
                        all_features: List[str]) -> Tuple[List[str], List[str]]:
        """Validate and apply manual feature selection.

        Args:
            feature_list: Features requested by the user.
            all_features: Features that actually exist.

        Returns:
            ``(validated, messages)``: the requested features that exist (in
            request order) and one warning message per problem found. The
            validated list also becomes ``selected_features``.
        """
        # Named `messages` so the module-level `warnings` import is not shadowed.
        messages = []
        validated = []

        for feat in feature_list:
            if feat in all_features:
                validated.append(feat)
            else:
                messages.append(f"⚠ Feature '{feat}' not found in available features")

        if not validated:
            messages.append("⚠ No valid features provided")

        self._selected_features = validated
        return validated, messages

    def validate_selection(self, X: pd.DataFrame, y: pd.Series, features: List[str],
                           binary_features: List[str], continuous_features: List[str]
                          ) -> Dict[str, float]:
        """Validate feature selection with leakage-free LOO-CV.

        For every leave-one-out split the scaler is fitted on the training
        rows only, binary columns are recoded from 0/1 to -1/+1, and a
        ``RidgeCV`` model predicts the held-out row.

        Returns:
            ``{'r2', 'rmse', 'mae'}`` computed over all held-out predictions.
        """
        X_sel = X[features].copy()
        sel_continuous = [f for f in features if f in continuous_features]
        sel_binary = [f for f in features if f in binary_features]

        loo_preds, loo_actual = [], []

        for train_idx, test_idx in LeaveOneOut().split(X_sel):
            X_train = X_sel.iloc[train_idx].copy()
            X_test = X_sel.iloc[test_idx].copy()
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Scale only on training data
            if sel_continuous:
                scaler = StandardScaler()
                X_train[sel_continuous] = scaler.fit_transform(X_train[sel_continuous])
                X_test[sel_continuous] = scaler.transform(X_test[sel_continuous])

            for col in sel_binary:
                X_train[col] = X_train[col] * 2 - 1
                X_test[col] = X_test[col] * 2 - 1

            model = RidgeCV(alphas=[0.1, 1, 10, 100], cv=3)
            model.fit(X_train, y_train)

            loo_preds.append(model.predict(X_test)[0])
            loo_actual.append(y_test.values[0])

        loo_preds = np.array(loo_preds)
        loo_actual = np.array(loo_actual)

        return {
            'r2': r2_score(loo_actual, loo_preds),
            'rmse': np.sqrt(mean_squared_error(loo_actual, loo_preds)),
            'mae': np.mean(np.abs(loo_actual - loo_preds))
        }

    def check_selection_issues(self, X: pd.DataFrame, features: List[str]
                               ) -> Dict[str, List]:
        """Check for multicollinearity and broken interactions in selection.

        Returns:
            ``{'multicollinearity': [...], 'broken_interactions': [...]}``.
            The first list describes selected pairs whose |r| exceeds
            ``config.multicollinearity_threshold``; the second lists
            significant interactions with exactly one partner selected.
            If the ranker has no interaction results, the second list is empty.
        """
        issues = {'multicollinearity': [], 'broken_interactions': []}

        if len(features) > 1:
            sel_corr = X[features].corr()
            # Take labels from the matrix so names and values stay aligned.
            names = sel_corr.columns.tolist()
            for i in range(len(names)):
                for j in range(i + 1, len(names)):
                    r = sel_corr.iloc[i, j]
                    if abs(r) > self.config.multicollinearity_threshold:
                        issues['multicollinearity'].append(
                            f"{names[i]} ↔ {names[j]}: r={r:.2f}"
                        )

        # Check broken interactions. The ranker initialises `_interaction_df`
        # to None, so test for None explicitly before taking its length.
        interaction_df = getattr(self.ranker, '_interaction_df', None)
        if interaction_df is not None and len(interaction_df) > 0:
            strong = interaction_df[interaction_df['significant']]
            for _, row in strong.iterrows():
                f1, f2 = row['feature_1'], row['feature_2']
                if (f1 in features) != (f2 in features):
                    in1 = "IN" if f1 in features else "OUT"
                    in2 = "IN" if f2 in features else "OUT"
                    issues['broken_interactions'].append(f"{f1} ({in1}) × {f2} ({in2})")

        return issues

    @property
    def selected_features(self) -> List[str]:
        """Features chosen by the last ``auto_select`` / ``manual_override``."""
        return self._selected_features

    @property
    def consensus(self) -> pd.DataFrame:
        """Scored consensus table from the last ``auto_select`` (else None)."""
        return self._consensus

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Phase1Checkpoint Class
# ─────────────────────────────────────────────────────────────────────────────

class Phase1Checkpoint:
    """Handles saving and loading Phase 1 results."""

    # Key types the json module accepts for object keys.
    _JSON_KEY_TYPES = (str, int, float, bool, type(None))

    @staticmethod
    def _with_json_safe_keys(obj: Any) -> Any:
        """Return a copy of ``obj`` whose dict keys are all JSON-serialisable.

        ``json.dump(default=...)`` only converts values, never keys, so keys
        such as NumPy integers would raise ``TypeError``. Such keys are
        converted with ``.item()`` when available, otherwise with ``str()``.
        Values other than dicts, lists and tuples are returned unchanged.
        """
        if isinstance(obj, dict):
            safe = {}
            for key, value in obj.items():
                if not isinstance(key, Phase1Checkpoint._JSON_KEY_TYPES):
                    if hasattr(key, 'item'):
                        key = key.item()
                    if not isinstance(key, Phase1Checkpoint._JSON_KEY_TYPES):
                        key = str(key)
                safe[key] = Phase1Checkpoint._with_json_safe_keys(value)
            return safe
        if isinstance(obj, (list, tuple)):
            return [Phase1Checkpoint._with_json_safe_keys(item) for item in obj]
        return obj

    def save(self, output_dir: str, results_dict: Dict[str, Any]) -> None:
        """Save checkpoint to disk.

        Writes ``phase1_checkpoint.json`` (human-readable; values json cannot
        encode are stringified) and ``phase1_checkpoint.pkl`` (the original
        objects) into ``output_dir``, creating the directory and any missing
        parents.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save JSON (human-readable)
        with open(output_path / 'phase1_checkpoint.json', 'w', encoding='utf-8') as f:
            json.dump(self._with_json_safe_keys(results_dict), f, indent=2, default=str)

        # Save pickle (preserves all objects)
        with open(output_path / 'phase1_checkpoint.pkl', 'wb') as f:
            pickle.dump(results_dict, f)

        print(f"✓ Checkpoint saved to {output_path}")

    def load(self, checkpoint_path: str) -> Dict[str, Any]:
        """Load checkpoint from disk.

        Files ending in ``.pkl`` are unpickled; anything else is parsed as JSON.

        SECURITY: unpickling executes code embedded in the file. Only load
        ``.pkl`` checkpoints that this pipeline wrote itself; use the JSON
        file for anything from an untrusted source.
        """
        path = Path(checkpoint_path)

        if path.suffix == '.pkl':
            with open(path, 'rb') as f:
                return pickle.load(f)

        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def export_bounds(self, features: List[str], X: pd.DataFrame,
                      binary_features: List[str], margin: float = 0.1) -> pd.DataFrame:
        """Export search space bounds for selected features.

        Args:
            features: Features to describe, in output order.
            X: Data used to read the observed min/max of continuous features.
            binary_features: Features reported with fixed bounds 0..1.
            margin: Fraction of the observed range added on each side of a
                continuous feature's bounds.

        Returns:
            One row per feature with columns feature, type, min, max,
            observed_min and observed_max.
        """
        bounds = []

        for feat in features:
            if feat in binary_features:
                bounds.append({
                    'feature': feat, 'type': 'binary',
                    'min': 0, 'max': 1,
                    'observed_min': 0, 'observed_max': 1
                })
            else:
                feat_min, feat_max = X[feat].min(), X[feat].max()
                pad = margin * (feat_max - feat_min)
                bounds.append({
                    'feature': feat, 'type': 'continuous',
                    'min': round(feat_min - pad, 4),
                    'max': round(feat_max + pad, 4),
                    'observed_min': feat_min, 'observed_max': feat_max
                })

        return pd.DataFrame(bounds)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Phase1Plotter Class
# ─────────────────────────────────────────────────────────────────────────────

class Phase1Plotter:
    """Plotting utilities for Phase 1 analysis.

    Every public method builds one figure, saves it as a PNG (into
    ``output_dir`` unless ``save_path`` is given), shows it and returns it.
    """

    def __init__(self, output_dir: str):
        """Remember the output directory, creating it (and parents) if needed."""
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _save_and_show(self, fig, filename: str, save_path: Optional[str] = None):
        """Save ``fig`` to ``save_path`` or ``output_dir / filename``, show it, return it."""
        path = save_path or (self.output_dir / filename)
        # Save the figure that was passed in, not whatever pyplot currently
        # considers the "current" figure.
        fig.savefig(path, dpi=150, bbox_inches='tight')
        plt.show()
        return fig

    def scatter_plots(self, X: pd.DataFrame, y: pd.Series, binary_features: List[str],
                      response_name: str, save_path: Optional[str] = None):
        """Plot feature vs response scatter plots.

        Binary features get one box plot per level (0 and 1); other features
        get a scatter plot with a least-squares trend line. Each panel title
        shows the Pearson correlation with ``y``.

        Raises:
            ValueError: If ``X`` has no columns.
        """
        n_feat = len(X.columns)
        if n_feat == 0:
            raise ValueError("scatter_plots requires at least one feature column")
        n_cols = min(4, n_feat)
        n_rows = int(np.ceil(n_feat / n_cols))

        # squeeze=False always yields a 2-D array, so no single-axes special case.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3.5*n_rows), squeeze=False)
        axes = axes.ravel()

        for i, col in enumerate(X.columns):
            ax = axes[i]
            corr = X[col].corr(y)

            if col in binary_features:
                for val in [0, 1]:
                    data = y[X[col] == val]
                    if len(data) > 0:
                        ax.boxplot([data], positions=[val], widths=0.6)
                ax.set_xticks([0, 1])
                ax.set_xlabel(f'{col} (binary)')
            else:
                ax.scatter(X[col], y, alpha=0.6, edgecolors='black', linewidth=0.5)
                # Fit the trend line on finite pairs only; np.polyfit cannot
                # handle NaN, and a line needs at least two points.
                x_vals = X[col].to_numpy(dtype=float)
                y_vals = np.asarray(y, dtype=float)
                finite = np.isfinite(x_vals) & np.isfinite(y_vals)
                if finite.sum() >= 2:
                    trend = np.poly1d(np.polyfit(x_vals[finite], y_vals[finite], 1))
                    x_line = np.linspace(x_vals[finite].min(), x_vals[finite].max(), 100)
                    ax.plot(x_line, trend(x_line), 'r--', linewidth=2)
                ax.set_xlabel(col)

            ax.set_ylabel(response_name)
            ax.set_title(f'r = {corr:.3f}', fontsize=10)

        # Hide the unused panels of the grid.
        for j in range(n_feat, len(axes)):
            axes[j].set_visible(False)

        fig.suptitle(f'Features vs {response_name}', fontsize=12, y=1.02)
        fig.tight_layout()
        return self._save_and_show(fig, 'phase1_scatter_plots.png', save_path)

    def correlation_plot(self, corr_df: pd.DataFrame, config: "Phase1Config",
                         save_path: Optional[str] = None):
        """Plot correlation analysis results.

        Left panel: |correlation| per feature with the strong/moderate
        cut-offs; right panel: signed correlation. Bars are green for
        positive and red for non-positive correlations.
        """
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        colors = ['forestgreen' if c > 0 else 'crimson' for c in corr_df['correlation']]
        # [::-1] puts the top-ranked feature at the top of the horizontal bars.
        axes[0].barh(corr_df['feature'][::-1], corr_df['abs_corr'][::-1], color=colors[::-1])
        axes[0].axvline(x=config.correlation_strong, color='green', linestyle='--',
                       linewidth=2, label=f'Strong ({config.correlation_strong})')
        axes[0].axvline(x=config.correlation_moderate, color='orange', linestyle='--',
                       linewidth=1.5, label=f'Moderate ({config.correlation_moderate})')
        axes[0].set_xlabel('|Correlation|')
        axes[0].set_title('Feature-Response Correlation')
        axes[0].legend(loc='lower right')

        axes[1].barh(corr_df['feature'][::-1], corr_df['correlation'][::-1], color=colors[::-1])
        axes[1].axvline(x=0, color='black', linewidth=1)
        axes[1].set_xlabel('Correlation (with sign)')
        axes[1].set_title('Direction of Effect')

        fig.tight_layout()
        return self._save_and_show(fig, 'phase1_correlation_analysis.png', save_path)

    def lasso_plot(self, lasso_df: pd.DataFrame, save_path: Optional[str] = None):
        """Plot Lasso regression results (|coefficient| per feature, selected in green)."""
        fig, ax = plt.subplots(figsize=(10, 6))

        colors = ['forestgreen' if s else 'lightgray' for s in lasso_df['selected']]
        ax.barh(lasso_df['feature'][::-1], lasso_df['abs_coef'][::-1], color=colors[::-1])
        ax.set_xlabel('|Coefficient|')
        ax.set_title('Lasso Coefficients (Green = Selected)')

        fig.tight_layout()
        return self._save_and_show(fig, 'phase1_lasso_analysis.png', save_path)

    def pls_plot(self, pls_df: pd.DataFrame, config: "Phase1Config",
                 cv_scores: Optional[List[float]] = None, save_path: Optional[str] = None):
        """Plot PLS VIP scores.

        Left panel: VIP per feature with the important/moderate cut-offs.
        Right panel: CV R² per number of components, drawn only when
        ``cv_scores`` (list or array) is given and non-empty; otherwise the
        panel is hidden.
        """
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        colors = ['darkgreen' if v >= config.vip_important else
                 'orange' if v >= config.vip_moderate else 'lightcoral'
                 for v in pls_df['VIP']]
        axes[0].barh(pls_df['feature'][::-1], pls_df['VIP'][::-1], color=colors[::-1])
        axes[0].axvline(x=config.vip_important, color='green', linestyle='--',
                       linewidth=2, label=f'Important ({config.vip_important})')
        axes[0].axvline(x=config.vip_moderate, color='orange', linestyle='--',
                       linewidth=1.5, label=f'Moderate ({config.vip_moderate})')
        axes[0].set_xlabel('VIP Score')
        axes[0].set_title('PLS Variable Importance')
        axes[0].legend()

        # Explicit None/length test: `if cv_scores:` raises for NumPy arrays.
        if cv_scores is not None and len(cv_scores) > 0:
            scores = np.asarray(cv_scores, dtype=float)
            axes[1].plot(range(1, len(scores) + 1), scores, 'bo-', linewidth=2, markersize=8)
            # Mark the best component count, ignoring NaN scores.
            if not np.all(np.isnan(scores)):
                optimal = int(np.nanargmax(scores)) + 1
                axes[1].axvline(x=optimal, color='red', linestyle='--', label=f'Optimal = {optimal}')
                axes[1].legend()
            axes[1].set_xlabel('Number of Components')
            axes[1].set_ylabel('CV R²')
            axes[1].set_title('PLS Component Selection')
            axes[1].grid(True, alpha=0.3)
        else:
            axes[1].set_visible(False)

        fig.tight_layout()
        return self._save_and_show(fig, 'phase1_pls_analysis.png', save_path)

    def multicollinearity_heatmap(self, X: pd.DataFrame, save_path: Optional[str] = None):
        """Plot feature correlation heatmap (lower triangle only)."""
        fig, ax = plt.subplots(figsize=(10, 8))

        feature_corr = X.corr()
        # k=0 masks the diagonal together with the redundant upper triangle.
        mask = np.triu(np.ones_like(feature_corr, dtype=bool), k=0)
        sns.heatmap(feature_corr, annot=True, cmap='RdBu_r', center=0, fmt='.2f',
                    mask=mask, square=True, linewidths=0.5, ax=ax)
        ax.set_title('Feature-Feature Correlations')

        fig.tight_layout()
        return self._save_and_show(fig, 'phase1_multicollinearity.png', save_path)

    def validation_plot(self, loo_actual: np.ndarray, loo_preds: np.ndarray,
                        r2: float, save_path: Optional[str] = None):
        """Plot LOO-CV validation results.

        Panels: predicted vs actual with the identity line, residual
        histogram, residuals vs predicted.
        """
        fig, axes = plt.subplots(1, 3, figsize=(15, 4))

        axes[0].scatter(loo_actual, loo_preds, alpha=0.7, edgecolors='black', linewidth=0.5)
        lims = [min(loo_actual.min(), loo_preds.min()), max(loo_actual.max(), loo_preds.max())]
        axes[0].plot(lims, lims, 'r--', linewidth=2, label='Perfect')
        axes[0].set_xlabel('Actual')
        axes[0].set_ylabel('Predicted')
        axes[0].set_title(f'LOO-CV: R² = {r2:.4f}')
        axes[0].legend()

        residuals = loo_actual - loo_preds
        axes[1].hist(residuals, bins=12, edgecolor='black', alpha=0.7)
        axes[1].axvline(x=0, color='red', linestyle='--', linewidth=2)
        axes[1].set_xlabel('Residual')
        axes[1].set_ylabel('Frequency')
        axes[1].set_title(f'Residuals: Mean={residuals.mean():.3f}')

        axes[2].scatter(loo_preds, residuals, alpha=0.7, edgecolors='black', linewidth=0.5)
        axes[2].axhline(y=0, color='red', linestyle='--', linewidth=2)
        axes[2].set_xlabel('Predicted')
        axes[2].set_ylabel('Residual')
        axes[2].set_title('Residuals vs Predicted')

        fig.tight_layout()
        return self._save_and_show(fig, 'phase1_validation.png', save_path)

    def consensus_plot(self, consensus: pd.DataFrame, config: "Phase1Config",
                       save_path: Optional[str] = None):
        """Plot consensus ranking visualization.

        Four panels: rank heatmap across methods, number of methods ranking
        each feature in the top 3, average rank, and |correlation| vs VIP
        (squares mark features with an interaction; blue = binary,
        green = continuous).
        """
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        # Rankings heatmap
        heatmap_data = consensus.set_index('feature')[['rank_corr', 'rank_lasso', 'rank_pls']]
        heatmap_data.columns = ['Correlation', 'Lasso', 'PLS']
        sns.heatmap(heatmap_data, annot=True, fmt='.0f', cmap='RdYlGn_r', ax=axes[0, 0],
                    cbar_kws={'label': 'Rank (lower=better)'})
        axes[0, 0].set_title('Rankings Across Methods')

        # Method agreement
        colors = ['darkgreen' if a >= 3 else 'orange' if a >= 2 else 'lightcoral'
                  for a in consensus['methods_top3']]
        axes[0, 1].barh(consensus['feature'][::-1], consensus['methods_top3'][::-1], color=colors[::-1])
        axes[0, 1].axvline(x=2, color='orange', linestyle='--', linewidth=2)
        axes[0, 1].set_xlabel('Methods Ranking Feature in Top 3')
        axes[0, 1].set_title('Method Agreement')

        # Average rank
        colors = ['steelblue' if t == 'binary' else 'forestgreen' for t in consensus['type']]
        axes[1, 0].barh(consensus['feature'][::-1], consensus['avg_rank'][::-1], color=colors[::-1])
        axes[1, 0].set_xlabel('Average Rank (lower = better)')
        axes[1, 0].set_title('Consensus Ranking')
        axes[1, 0].invert_xaxis()

        # Correlation vs VIP
        for _, row in consensus.iterrows():
            color = 'steelblue' if row['type'] == 'binary' else 'forestgreen'
            marker = 's' if row['has_interaction'] else 'o'
            axes[1, 1].scatter(abs(row['correlation']), row['VIP'], c=color, s=100,
                              marker=marker, edgecolors='black', linewidth=0.5)
            axes[1, 1].annotate(row['feature'], (abs(row['correlation']), row['VIP']),
                               fontsize=8, ha='left', va='bottom')

        axes[1, 1].axhline(y=config.vip_important, color='green', linestyle='--', alpha=0.7)
        axes[1, 1].axvline(x=config.correlation_strong, color='blue', linestyle='--', alpha=0.7)
        axes[1, 1].set_xlabel('|Correlation|')
        axes[1, 1].set_ylabel('VIP Score')
        axes[1, 1].set_title('Correlation vs VIP')

        fig.tight_layout()
        return self._save_and_show(fig, 'phase1_consensus.png', save_path)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Phase1Pipeline Class
# ─────────────────────────────────────────────────────────────────────────────

class Phase1Pipeline:
    """Complete Phase 1 pipeline for feature screening and selection.

    ``run()`` loads the data, runs every ranking method, auto-selects
    ``config.target_features`` features, validates the selection, draws the
    diagnostic plots and prints a summary. ``select_features()`` and
    ``save_checkpoint()`` operate on the state produced by ``run()`` and raise
    ``RuntimeError`` when called before it.
    """
    
    def __init__(self, config: Phase1Config):
        """Create the collaborating helpers and the output directory.

        Args:
            config: Phase 1 settings (data file, response column, thresholds,
                output directory, ...).
        """
        self.config = config
        self._output_dir = Path(config.output_dir)
        # parents=True so a nested output path works on a fresh working tree.
        self._output_dir.mkdir(parents=True, exist_ok=True)
        
        self._loader = DataLoader()
        self._ranker = FeatureRanker(config)
        self._selector = None  # created in run(), once rankings are available
        self._checkpoint_mgr = Phase1Checkpoint()
        self._plotter = Phase1Plotter(config.output_dir)
        
        # Data attributes
        self._df = None
        self._X = None
        self._y = None
        self._feature_cols = []
        self._binary_features = []
        self._continuous_features = []
        self._binary_mappings = {}
        self._validation_metrics = {}
        self._bounds_df = None
    
    def run(self, data_path: Optional[str] = None) -> 'Phase1Pipeline':
        """Run complete Phase 1 pipeline.

        Args:
            data_path: Data file to load; defaults to ``config.data_file``.

        Returns:
            ``self``, so calls can be chained.
        """
        data_path = data_path or self.config.data_file
        
        print("─" * 60)
        print("Phase 1: Feature Screening")
        print("─" * 60)
        
        # Load and prepare data
        self._load_data(data_path)
        
        # Run all ranking methods
        self._run_ranking()
        
        # Auto-select features
        self._selector = FeatureSelector(self.config, self._ranker)
        self._selector.auto_select(self.config.target_features)
        
        # Validate selection
        self._validate()
        
        # Generate plots
        self._generate_plots()
        
        self._print_summary()
        return self
    
    def _require_run(self, caller: str) -> None:
        """Raise a clear error when ``caller`` is used before ``run()``."""
        if self._selector is None:
            raise RuntimeError(f"Phase1Pipeline.{caller}() requires run() to be called first.")
    
    def _load_data(self, data_path: str):
        """Load and prepare data.

        Populates ``_df``, the feature lists, the binary mappings and the
        model inputs ``_X`` / ``_y`` (rows with a missing response dropped).
        """
        print("  Loading data...")
        df = self._loader.load_excel(data_path, self.config.sheet_name, self.config.header_row)
        
        # Only the first part returned by the keyword split is used for screening.
        if self.config.split_keyword:
            df_initial, _ = self._loader.split_at_keyword(df, self.config.split_keyword)
        else:
            df_initial = df.copy()
        
        self._df = self._loader.clean_data(df_initial)
        self._feature_cols = self._loader.get_feature_columns(
            self._df, self.config.stop_feature, self.config.response_column
        )
        
        self._binary_features, self._continuous_features, self._binary_mappings = \
            self._loader.classify_features(self._df, self._feature_cols)
        
        # Column order of X is binary features first, then continuous ones.
        feature_cols = self._binary_features + self._continuous_features
        self._X = self._df[feature_cols].copy()
        self._y = self._df[self.config.response_column].copy()
        
        # Drop missing response
        valid = ~self._y.isnull()
        self._X = self._X[valid].reset_index(drop=True)
        self._y = self._y[valid].reset_index(drop=True)
        
        print(f"  ✓ {len(self._X)} samples, {len(feature_cols)} features")
        print(f"    Binary: {len(self._binary_features)}, Continuous: {len(self._continuous_features)}")
    
    def _run_ranking(self):
        """Run all feature ranking methods on the loaded data."""
        print("  Running feature ranking...")
        
        self._ranker.run_correlation(self._X, self._y, self._binary_features)
        self._ranker.run_lasso(self._X, self._y, self._binary_features)
        self._ranker.run_pls_vip(self._X, self._y, self._binary_features)
        self._ranker.run_interaction_screening(self._X, self._y)
        self._ranker.check_multicollinearity(self._X)
        
        print("  ✓ Completed: Correlation, Lasso, PLS-VIP, Interactions, Multicollinearity")
    
    def _validate(self):
        """Validate the current selection and store the resulting metrics."""
        features = self._selector.selected_features
        self._validation_metrics = self._selector.validate_selection(
            self._X, self._y, features, self._binary_features, self._continuous_features
        )
        print(f"  ✓ Validation R²: {self._validation_metrics['r2']:.4f}")
    
    def _generate_plots(self):
        """Generate all Phase 1 plots."""
        print("  Generating plots...")
        self._plotter.scatter_plots(self._X, self._y, self._binary_features, 
                                    self.config.response_column)
        self._plotter.correlation_plot(self._ranker._corr_df, self.config)
        self._plotter.lasso_plot(self._ranker._lasso_df)
        self._plotter.pls_plot(self._ranker._pls_df, self.config)
        self._plotter.multicollinearity_heatmap(self._X)
        self._plotter.consensus_plot(self._selector.consensus, self.config)
    
    def _print_summary(self):
        """Print each selected feature with its type, correlation and VIP."""
        print("\n" + "─" * 60)
        print("Selected Features:")
        print("─" * 60)
        
        consensus = self._selector.consensus
        for feat in self._selector.selected_features:
            row = consensus[consensus['feature'] == feat].iloc[0]
            ftype = "binary" if feat in self._binary_features else "continuous"
            print(f"  ✓ {feat} ({ftype})")
            print(f"      Correlation: {row['correlation']:+.3f}, VIP: {row['VIP']:.2f}")
    
    def select_features(self, n: Optional[int] = None, 
                        manual_list: Optional[List[str]] = None) -> List[str]:
        """Select features (auto or manual override).

        Args:
            n: Number of features for auto-selection; defaults to
                ``config.target_features``. Ignored when ``manual_list`` is given.
            manual_list: Explicit feature names that override auto-selection.

        Returns:
            The selected feature names.

        Raises:
            RuntimeError: If ``run()`` has not been called yet.
        """
        self._require_run('select_features')
        
        if manual_list:
            all_features = self._binary_features + self._continuous_features
            # Named override_warnings so the imported `warnings` module is not shadowed.
            features, override_warnings = self._selector.manual_override(manual_list, all_features)
            for message in override_warnings:
                print(f"  {message}")
        else:
            features = self._selector.auto_select(n or self.config.target_features)
        
        # Re-validate
        self._validation_metrics = self._selector.validate_selection(
            self._X, self._y, features, self._binary_features, self._continuous_features
        )
        
        # Check issues
        issues = self._selector.check_selection_issues(self._X, features)
        if issues['multicollinearity']:
            print("  ⚠ Multicollinearity detected:")
            for issue in issues['multicollinearity']:
                print(f"      {issue}")
        if issues['broken_interactions']:
            print("  ⚠ Broken interactions:")
            for issue in issues['broken_interactions']:
                print(f"      {issue}")
        
        return features
    
    def save_checkpoint(self, output_dir: Optional[str] = None) -> None:
        """Save Phase 1 checkpoint.

        Writes ``phase1_bounds.csv``, the checkpoint itself (through the
        checkpoint manager), ``phase1_consensus_ranking.csv`` and
        ``phase1_original_data.csv`` into the target directory.

        Args:
            output_dir: Target directory; defaults to ``config.output_dir``.
                It is created when missing.

        Raises:
            RuntimeError: If ``run()`` has not been called yet.
        """
        self._require_run('save_checkpoint')
        
        output_dir = output_dir or self.config.output_dir
        target_dir = Path(output_dir)
        # A caller-supplied directory may not exist yet; to_csv would fail otherwise.
        target_dir.mkdir(parents=True, exist_ok=True)
        
        # Export bounds
        self._bounds_df = self._checkpoint_mgr.export_bounds(
            self._selector.selected_features, self._X, self._binary_features
        )
        self._bounds_df.to_csv(target_dir / 'phase1_bounds.csv', index=False)
        
        # Build checkpoint dict
        checkpoint = {
            'metadata': {
                'checkpoint_time': datetime.now().strftime("%Y%m%d_%H%M%S"),
                'phase1_data_file': self.config.data_file,
                'phase1_sheet_name': self.config.sheet_name,
                'response_column': self.config.response_column,
                'maximize_response': self.config.maximize_response,
                'n_initial_experiments': len(self._X),
                'n_total_features': len(self._feature_cols),
                'n_selected_features': len(self._selector.selected_features),
                'target_features': self.config.target_features,
            },
            'thresholds': {
                'correlation_strong': self.config.correlation_strong,
                'correlation_moderate': self.config.correlation_moderate,
                'vip_important': self.config.vip_important,
                'vip_moderate': self.config.vip_moderate,
                'multicollinearity_threshold': self.config.multicollinearity_threshold,
                'interaction_threshold': self.config.interaction_threshold,
            },
            'selected_features': self._selector.selected_features,
            'all_features': self._binary_features + self._continuous_features,
            'binary_features': self._binary_features,
            'continuous_features': self._continuous_features,
            'selected_binary': [f for f in self._selector.selected_features if f in self._binary_features],
            'selected_continuous': [f for f in self._selector.selected_features if f in self._continuous_features],
            'binary_mappings': self._binary_mappings,
            'validation_metrics': self._validation_metrics,
            'response_stats': {
                'min': float(self._y.min()), 'max': float(self._y.max()),
                'mean': float(self._y.mean()), 'median': float(self._y.median()),
                'std': float(self._y.std())
            },
            'interactions': {
                'features_with_interactions': self._ranker.features_with_interactions,
                # Only rows flagged significant are persisted.
                'strong_interactions': self._ranker._interaction_df[
                    self._ranker._interaction_df['significant']
                ].to_dict('records') if len(self._ranker._interaction_df) > 0 else [],
            },
            'multicollinearity': {
                'high_correlation_pairs': self._ranker.multicollinearity_pairs,
            },
        }
        
        self._checkpoint_mgr.save(output_dir, checkpoint)
        
        # Save additional files
        self._selector.consensus.to_csv(target_dir / 'phase1_consensus_ranking.csv', index=False)
        self._df.to_csv(target_dir / 'phase1_original_data.csv', index=False)
    
    @property
    def data(self) -> Tuple[pd.DataFrame, pd.Series]:
        """Feature matrix and response used for screening (``None`` before loading)."""
        return self._X, self._y
    
    @property
    def rankings(self) -> Optional[pd.DataFrame]:
        """Consensus ranking table, or ``None`` before ``run()``."""
        return self._selector.consensus if self._selector else None
    
    @property
    def selected_features(self) -> List[str]:
        """Currently selected feature names (empty before ``run()``)."""
        return self._selector.selected_features if self._selector else []
    
    @property
    def validation_metrics(self) -> Dict[str, float]:
        """Metrics from the most recent validation (empty before any)."""
        return self._validation_metrics
    
    @property
    def bounds(self) -> Optional[pd.DataFrame]:
        """Bounds table exported by ``save_checkpoint()``, or ``None`` before it."""
        return self._bounds_df

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Phase2Scaler Class
# ─────────────────────────────────────────────────────────────────────────────

class Phase2Scaler:
    """
    Scaler for Phase 2 that handles continuous standardization and binary encoding.
    Ensures no data leakage by fitting only on training data.

    Continuous features are standardized with a ``StandardScaler``; binary
    features are mapped from {0, 1} to {-1, +1}. The single-point and bounds
    helpers address features positionally, following ``all_features`` order.
    """
    
    def __init__(self, continuous_features: List[str], binary_features: List[str], 
                 all_features: List[str]):
        """
        Args:
            continuous_features: Names of the columns to standardize.
            binary_features: Names of the 0/1 columns to encode as -1/+1.
            all_features: Every feature name, in the positional order used by
                the point/bounds helpers.
        """
        self.continuous_features = continuous_features
        self.binary_features = binary_features
        self.all_features = all_features
        self._scaler = StandardScaler()
        self._is_fitted = False
    
    def _require_statistics(self) -> None:
        """Raise if continuous statistics are needed but ``fit()`` was never called."""
        if self.continuous_features and not self._is_fitted:
            raise ValueError("Scaler not fitted. Call fit() first.")
    
    def fit(self, X: pd.DataFrame) -> 'Phase2Scaler':
        """Fit scaler on training data (only continuous columns carry statistics)."""
        if self.continuous_features:
            self._scaler.fit(X[self.continuous_features])
        self._is_fitted = True
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform data using fitted scaler.

        Returns:
            A copy of ``X`` with continuous columns standardized and binary
            columns mapped to -1/+1. Binary columns absent from ``X`` are skipped.

        Raises:
            ValueError: If ``fit()`` has not been called.
        """
        if not self._is_fitted:
            raise ValueError("Scaler not fitted. Call fit() first.")
        
        X_out = X.copy()
        
        if self.continuous_features:
            X_out[self.continuous_features] = self._scaler.transform(X[self.continuous_features])
        
        for col in self.binary_features:
            if col in X_out.columns:
                X_out[col] = X_out[col] * 2 - 1
        
        return X_out
    
    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Fit and transform in one step."""
        return self.fit(X).transform(X)
    
    def inverse_transform_point(self, x_scaled: np.ndarray) -> np.ndarray:
        """Convert a single scaled point back to original space.

        Args:
            x_scaled: One value per entry of ``all_features``, in that order.

        Returns:
            A new float array; the input is not modified.

        Raises:
            ValueError: If continuous features exist and ``fit()`` was not called.
        """
        self._require_statistics()
        # Force float: an integer input would otherwise truncate the results
        # when they are written back into the array.
        x_orig = np.array(x_scaled, dtype=float)
        
        for i, feat in enumerate(self.all_features):
            if feat in self.binary_features:
                x_orig[i] = (x_orig[i] + 1) / 2
            elif feat in self.continuous_features:
                cont_idx = self.continuous_features.index(feat)
                x_orig[i] = x_orig[i] * self._scaler.scale_[cont_idx] + self._scaler.mean_[cont_idx]
        
        return x_orig
    
    def transform_point(self, x_original: np.ndarray) -> np.ndarray:
        """Convert a single original point to scaled space.

        Args:
            x_original: One value per entry of ``all_features``, in that order.

        Returns:
            A new float array; the input is not modified.

        Raises:
            ValueError: If continuous features exist and ``fit()`` was not called.
        """
        self._require_statistics()
        # Force float so integer inputs (e.g. [2, 1]) are not truncated.
        x_scaled = np.array(x_original, dtype=float)
        
        for i, feat in enumerate(self.all_features):
            if feat in self.binary_features:
                x_scaled[i] = x_scaled[i] * 2 - 1
            elif feat in self.continuous_features:
                cont_idx = self.continuous_features.index(feat)
                x_scaled[i] = (x_scaled[i] - self._scaler.mean_[cont_idx]) / self._scaler.scale_[cont_idx]
        
        return x_scaled
    
    def get_scaled_bounds(self, bounds_array: np.ndarray) -> np.ndarray:
        """Convert bounds to scaled space.

        Args:
            bounds_array: Array indexed as ``[feature, 0]`` = lower and
                ``[feature, 1]`` = upper, rows in ``all_features`` order.

        Returns:
            Array of the same layout: binary rows are always ``[-1, 1]``; every
            other row is treated as continuous and standardized.

        Raises:
            ValueError: If continuous features exist and ``fit()`` was not
                called, or a non-binary feature is not in ``continuous_features``.
        """
        self._require_statistics()
        scaled_bounds = []
        
        for i, feat in enumerate(self.all_features):
            if feat in self.binary_features:
                scaled_bounds.append([-1, 1])
            else:
                cont_idx = self.continuous_features.index(feat)
                mean = self._scaler.mean_[cont_idx]
                scale = self._scaler.scale_[cont_idx]
                scaled_bounds.append([
                    (bounds_array[i, 0] - mean) / scale,
                    (bounds_array[i, 1] - mean) / scale,
                ])
        
        return np.array(scaled_bounds)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# GPModel Class
# ─────────────────────────────────────────────────────────────────────────────

class GPModel:
    """Gaussian Process regressor wrapper with leave-one-out validation.

    The kernel is ``Constant * Matern(nu=2.5) + White`` with one Matern length
    scale per input feature.
    """
    
    def __init__(self, n_features: int, n_restarts: int = 10):
        """
        Args:
            n_features: Number of input columns (sets the length-scale vector size).
            n_restarts: Hyper-parameter optimizer restarts for the main model.
        """
        signal_variance = ConstantKernel(1.0, constant_value_bounds=(1e-3, 1e3))
        matern = Matern(
            length_scale=np.ones(n_features),
            length_scale_bounds=(1e-2, 1e2),
            nu=2.5,
        )
        observation_noise = WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-5, 1e1))
        self._kernel = signal_variance * matern + observation_noise
        
        self._gp = GaussianProcessRegressor(
            kernel=self._kernel,
            n_restarts_optimizer=n_restarts,
            normalize_y=True,
            random_state=RANDOM_STATE,
        )
        self._is_fitted = False
    
    def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> 'GPModel':
        """Train the regressor; pandas inputs are unwrapped to their arrays.

        Returns:
            ``self``.
        """
        features = getattr(X, 'values', X)
        targets = getattr(y, 'values', y)
        self._gp.fit(features, targets)
        self._is_fitted = True
        return self
    
    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
        """Return the predictive mean and standard deviation.

        A 1-D input is treated as a single sample.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if not self._is_fitted:
            raise ValueError("Model not fitted")
        samples = np.atleast_2d(getattr(X, 'values', X))
        return self._gp.predict(samples, return_std=True)
    
    def validate_loo(self, X: pd.DataFrame, y: pd.Series, 
                     scaler_class: type, continuous_features: List[str],
                     binary_features: List[str], all_features: List[str]
                    ) -> Dict[str, float]:
        """Leave-one-out cross-validation without scaling leakage.

        For each held-out row a fresh ``scaler_class`` instance is fitted on
        the remaining rows only, and a fresh GP (same initial kernel
        hyper-parameters, 5 optimizer restarts) is trained on them.

        Args:
            X: Unscaled feature frame.
            y: Response aligned positionally with ``X``.
            scaler_class: Class constructed as
                ``scaler_class(continuous_features, binary_features, all_features)``
                and providing ``fit_transform`` / ``transform``.
            continuous_features: Forwarded to ``scaler_class``.
            binary_features: Forwarded to ``scaler_class``.
            all_features: Forwarded to ``scaler_class``.

        Returns:
            Dict with ``r2``, ``rmse``, ``coverage`` (share of held-out values
            within 1.96 predicted std of the prediction) and the per-fold
            arrays ``predictions``, ``actual`` and ``stds``.
        """
        fold_results = []
        
        for train_idx, test_idx in LeaveOneOut().split(X):
            # Scaling statistics come from the training rows only.
            fold_scaler = scaler_class(continuous_features, binary_features, all_features)
            train_scaled = fold_scaler.fit_transform(X.iloc[train_idx])
            held_out_scaled = fold_scaler.transform(X.iloc[test_idx])
            
            fold_gp = GaussianProcessRegressor(
                kernel=self._kernel.clone_with_theta(self._kernel.theta),
                n_restarts_optimizer=5,
                normalize_y=True,
                random_state=RANDOM_STATE,
            )
            fold_gp.fit(train_scaled.values, y.iloc[train_idx].values)
            
            mean, std = fold_gp.predict(held_out_scaled.values, return_std=True)
            fold_results.append((mean[0], std[0], y.iloc[test_idx].values[0]))
        
        predictions = np.array([fold[0] for fold in fold_results])
        stds = np.array([fold[1] for fold in fold_results])
        actual = np.array([fold[2] for fold in fold_results])
        
        # Empirical coverage of the nominal 95% interval.
        inside_interval = np.abs(actual - predictions) <= 1.96 * stds
        coverage = np.sum(inside_interval) / len(actual)
        
        return {
            'r2': r2_score(actual, predictions),
            'rmse': np.sqrt(mean_squared_error(actual, predictions)),
            'coverage': coverage,
            'predictions': predictions,
            'actual': actual,
            'stds': stds
        }
    
    @property
    def log_marginal_likelihood(self) -> float:
        """Log marginal likelihood of the fitted model, or ``None`` if unfitted."""
        if not self._is_fitted:
            return None
        return self._gp.log_marginal_likelihood_value_
    
    @property
    def kernel_params(self) -> str:
        """String form of the optimized kernel, or ``None`` if unfitted."""
        if not self._is_fitted:
            return None
        return str(self._gp.kernel_)
    
    @property
    def gp(self):
        """The wrapped regressor object."""
        return self._gp

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# AcquisitionOptimizer Class
# ─────────────────────────────────────────────────────────────────────────────

class AcquisitionOptimizer:
    """Acquisition functions plus a greedy, distance-constrained batch optimizer."""
    
    @staticmethod
    def expected_improvement(X_new: np.ndarray, gp, y_best: float, 
                             xi: float = 0.01, minimize: bool = True) -> np.ndarray:
        """Expected Improvement acquisition function.

        Args:
            X_new: Candidate point(s); a 1-D input is treated as one point.
            gp: Object whose ``predict(X, return_std=True)`` returns ``(mu, sigma)``.
            y_best: Best response observed so far.
            xi: Improvement margin subtracted before scoring.
            minimize: ``True`` when lower responses are better.

        Returns:
            One EI value per candidate.
        """
        X_new = np.atleast_2d(X_new)
        mu, sigma = gp.predict(X_new, return_std=True)
        # Clamp to avoid division by zero. With a vanishing sigma the formula
        # below tends to max(improvement, 0), so no special case is needed.
        sigma = np.maximum(sigma, 1e-8)
        
        improvement = (y_best - mu - xi) if minimize else (mu - y_best - xi)
        z = improvement / sigma
        return improvement * norm.cdf(z) + sigma * norm.pdf(z)
    
    @staticmethod
    def lower_confidence_bound(X_new: np.ndarray, gp, kappa: float = 2.0) -> np.ndarray:
        """Lower Confidence Bound for minimization: ``mu - kappa * sigma``."""
        X_new = np.atleast_2d(X_new)
        mu, sigma = gp.predict(X_new, return_std=True)
        return mu - kappa * sigma
    
    @staticmethod
    def upper_confidence_bound(X_new: np.ndarray, gp, kappa: float = 2.0) -> np.ndarray:
        """Upper Confidence Bound for maximization: ``mu + kappa * sigma``."""
        X_new = np.atleast_2d(X_new)
        mu, sigma = gp.predict(X_new, return_std=True)
        return mu + kappa * sigma
    
    def optimize_batch(self, gp, bounds_scaled: np.ndarray, y_best: float, 
                       batch_size: int, minimize: bool = True,
                       kappa: float = 2.0, n_restarts: int = 20,
                       min_distance: float = 0.3) -> np.ndarray:
        """Find batch of diverse points optimizing acquisition function.

        The first point maximizes Expected Improvement; later points optimize
        the confidence bound (LCB when minimizing, UCB when maximizing). A
        candidate is kept only if it lies at least ``min_distance`` (Euclidean,
        scaled space) from every point already chosen. When no optimizer run
        yields an admissible candidate, up to 100 uniform random points are
        tried instead.

        Args:
            gp: Wrapper exposing the regressor as ``gp.gp``.
            bounds_scaled: ``(n_features, 2)`` array of ``[lower, upper]`` rows.
            y_best: Best response observed so far.
            batch_size: Number of points requested.
            minimize: ``True`` when lower responses are better.
            kappa: Exploration weight of the confidence bound.
            n_restarts: Random starts per batch point.
            min_distance: Minimum distance between batch points.

        Returns:
            Array with one row per selected point. It can hold fewer than
            ``batch_size`` rows if the random fallback also finds no
            admissible point.
        """
        # The boolean `minimize` parameter shadows scipy.optimize.minimize in
        # this scope, so the solver is bound under a distinct local name.
        from scipy.optimize import minimize as scipy_minimize
        
        def confidence_bound_objective(x):
            x = x.reshape(1, -1)
            if minimize:
                return self.lower_confidence_bound(x, gp.gp, kappa)[0]
            return -self.upper_confidence_bound(x, gp.gp, kappa)[0]
        
        def negative_ei(x):
            x = x.reshape(1, -1)
            return -self.expected_improvement(x, gp.gp, y_best, minimize=minimize)[0]
        
        def is_far_enough(point):
            return all(np.linalg.norm(point - p) >= min_distance for p in selected_points)
        
        bounds_scaled = np.asarray(bounds_scaled, dtype=float)
        lower, upper = bounds_scaled[:, 0], bounds_scaled[:, 1]
        solver_bounds = list(zip(lower, upper))
        selected_points = []
        
        for batch_idx in range(batch_size):
            objective = negative_ei if batch_idx == 0 else confidence_bound_objective
            candidates = []
            
            for _ in range(n_restarts):
                x0 = np.random.uniform(lower, upper)
                
                try:
                    result = scipy_minimize(objective, x0, method='L-BFGS-B', bounds=solver_bounds)
                except Exception:
                    # Best effort: a failed restart is skipped, not fatal.
                    continue
                
                if result.success and is_far_enough(result.x):
                    candidates.append((result.fun, result.x))
            
            if candidates:
                selected_points.append(min(candidates, key=lambda c: c[0])[1])
            else:
                # Fallback: random point with distance constraint
                for _ in range(100):
                    x_rand = np.random.uniform(lower, upper)
                    if is_far_enough(x_rand):
                        selected_points.append(x_rand)
                        break
        
        return np.array(selected_points)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# BOIterationManager Class
# ─────────────────────────────────────────────────────────────────────────────

class BOIterationManager:
    """Manages individual Bayesian Optimization iterations.

    Each iteration fits a fresh scaler and GP on the current data, validates
    the GP by leave-one-out, optimizes the acquisition function and returns
    the proposed experiments in original units.
    """
    
    def __init__(self, config: Phase2Config, checkpoint: Dict[str, Any]):
        """
        Args:
            config: Phase 2 settings (batch size, exploration weight, ...).
            checkpoint: Phase 1 checkpoint dict; the selected feature lists,
                binary mappings and response metadata are read from it.
        """
        self.config = config
        self.checkpoint = checkpoint
        self._selected_features = checkpoint['selected_features']
        self._binary_features = checkpoint['selected_binary']
        self._continuous_features = checkpoint['selected_continuous']
        self._binary_mappings = checkpoint['binary_mappings']
        self._maximize = checkpoint['metadata']['maximize_response']
        self._response_column = checkpoint['metadata']['response_column']
        
        self._optimizer = AcquisitionOptimizer()
        self._current_gp = None
        self._current_scaler = None
        self._validation_metrics = None
    
    def run_iteration(self, X: pd.DataFrame, y: pd.Series, 
                      bounds_array: np.ndarray, iteration: int) -> pd.DataFrame:
        """Run single BO iteration and return proposed experiments.

        Args:
            X: Observed feature values; must contain every selected feature.
            y: Observed responses aligned with ``X``.
            bounds_array: ``[lower, upper]`` rows in selected-feature order.
            iteration: Iteration number used in the experiment IDs.

        Returns:
            One row per proposed experiment (see ``_build_experiments_df``).
        """
        # The scaler's point/bounds helpers and the GP address features by
        # position in selected-feature order, so align the columns explicitly
        # instead of trusting the caller's column order.
        X = X[self._selected_features]
        
        # Fit scaler
        self._current_scaler = Phase2Scaler(
            self._continuous_features, self._binary_features, self._selected_features
        )
        X_scaled = self._current_scaler.fit_transform(X)
        scaled_bounds = self._current_scaler.get_scaled_bounds(bounds_array)
        
        # Fit GP
        self._current_gp = GPModel(len(self._selected_features))
        self._current_gp.fit(X_scaled, y)
        
        # Validate (on unscaled X: validate_loo re-scales inside each fold)
        self._validation_metrics = self._current_gp.validate_loo(
            X, y, Phase2Scaler, self._continuous_features, 
            self._binary_features, self._selected_features
        )
        
        # Get current best
        y_best = y.max() if self._maximize else y.min()
        
        # Optimize acquisition
        next_points_scaled = self._optimizer.optimize_batch(
            self._current_gp, scaled_bounds, y_best, self.config.batch_size,
            minimize=not self._maximize, kappa=self.config.exploration_weight,
            n_restarts=self.config.n_optimizer_restarts,
            min_distance=self.config.min_distance_between_points
        )
        
        # Convert to original scale and build DataFrame
        return self._build_experiments_df(next_points_scaled, iteration)
    
    def _build_experiments_df(self, points_scaled: np.ndarray, iteration: int) -> pd.DataFrame:
        """Build DataFrame of proposed experiments.

        Binary features are clipped to [0, 1] and rounded to 0/1; continuous
        features are rounded to 4 decimals. The GP mean/std columns are
        evaluated at that realized point, so they describe the experiment that
        is actually proposed rather than the optimizer's relaxed point.

        Args:
            points_scaled: Proposed points in scaled space, one per row.
            iteration: Iteration number used in ``Experiment_ID``.
        """
        experiments = []
        
        for i, x_scaled in enumerate(points_scaled):
            x_orig = self._current_scaler.inverse_transform_point(x_scaled)
            
            row = {'Experiment_ID': f'Iter{iteration}_Exp{i+1}'}
            
            for j, feat in enumerate(self._selected_features):
                if feat in self._binary_features:
                    row[feat] = int(round(np.clip(x_orig[j], 0, 1)))
                else:
                    row[feat] = round(x_orig[j], 4)
            
            # Re-encode the realized (snapped/rounded) settings before predicting.
            realized = np.array([row[feat] for feat in self._selected_features], dtype=float)
            x_realized = self._current_scaler.transform_point(realized)
            
            mu, sigma = self._current_gp.predict(x_realized.reshape(1, -1))
            row['GP_Predicted_Mean'] = round(mu[0], 4)
            row['GP_Predicted_Std'] = round(sigma[0], 4)
            row['Acquisition_Rank'] = i + 1
            
            experiments.append(row)
        
        return pd.DataFrame(experiments)
    
    def input_results(self, results_df: pd.DataFrame, X: pd.DataFrame, 
                      y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """Add new experimental results to dataset.

        Args:
            results_df: Frame holding the selected features and the response column.
            X: Existing feature frame.
            y: Existing responses.

        Returns:
            New ``(X, y)`` with the results appended and the index reset; the
            inputs are not modified.
        """
        X_new = results_df[self._selected_features].copy()
        y_new = results_df[self._response_column].copy()
        
        X_updated = pd.concat([X, X_new], ignore_index=True)
        y_updated = pd.concat([y, y_new], ignore_index=True)
        
        return X_updated, y_updated
    
    def check_convergence(self, history: Dict, threshold: float, 
                         patience: int) -> Tuple[bool, str]:
        """Check if optimization has converged.

        Convergence means the largest per-iteration improvement of the best
        value over the last ``patience`` iterations is below ``threshold``.
        When fewer than ``patience`` improvements exist, all of them are used.

        Args:
            history: Dict whose ``'best_values'`` lists the best response per iteration.
            threshold: Improvement below which an iteration counts as stalled.
            patience: Number of most recent improvements to inspect.

        Returns:
            ``(converged, report)`` where ``report`` is a human-readable summary.
        """
        best_values = history['best_values']
        
        if len(best_values) < 2:
            return False, "Not enough iterations"
        
        # Signed so that a positive number is always an improvement.
        sign = 1 if self._maximize else -1
        improvements = [
            sign * (current - previous)
            for previous, current in zip(best_values[:-1], best_values[1:])
        ]
        
        recent = improvements[-patience:] if len(improvements) >= patience else improvements
        max_recent = max(recent) if recent else 0
        
        converged = max_recent < threshold
        
        report = f"Max improvement (last {patience}): {max_recent:.6f}, Threshold: {threshold}"
        return converged, report
    
    @property
    def gp(self) -> GPModel:
        """GP fitted in the most recent iteration (``None`` before the first)."""
        return self._current_gp
    
    @property
    def scaler(self) -> Phase2Scaler:
        """Scaler fitted in the most recent iteration (``None`` before the first)."""
        return self._current_scaler
    
    @property
    def validation(self) -> Dict[str, float]:
        """LOO validation results of the most recent iteration (``None`` before the first)."""
        return self._validation_metrics

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Phase2Plotter Class
# ─────────────────────────────────────────────────────────────────────────────

class Phase2Plotter:
    """Plotting utilities for Phase 2 Bayesian Optimization.

    Every public method builds one figure, saves it as a PNG (into
    ``output_dir`` unless ``save_path`` is given), shows it and returns it.
    """
    
    def __init__(self, output_dir: str):
        """
        Args:
            output_dir: Directory for the PNG files; created (including
                missing parents) when it does not exist.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
    
    def _save_and_show(self, fig, filename: str, save_path: Optional[str] = None):
        """Save ``fig`` to ``save_path`` (or ``output_dir/filename``), show it, return it."""
        path = save_path or (self.output_dir / filename)
        # Save through the figure itself so the right figure is written even
        # if another one has become pyplot's "current" figure.
        fig.savefig(path, dpi=150, bbox_inches='tight')
        plt.show()
        return fig
    
    @staticmethod
    def _draw_best_values(ax, history: Dict, response_name: str) -> None:
        """Draw best response per iteration on ``ax``."""
        ax.plot(history['iterations'], history['best_values'], 'bo-', linewidth=2, markersize=10)
        ax.set_xlabel('Iteration')
        ax.set_ylabel(f'Best {response_name}')
        ax.set_title('Optimization Progress')
        ax.grid(True, alpha=0.3)
    
    @staticmethod
    def _draw_loo_r2(ax, history: Dict, fix_range: bool = False) -> None:
        """Draw LOO-CV R² per iteration on ``ax``.

        With ``fix_range`` the y-axis ends at 1 and starts at 0, or lower when
        any R² is negative so that poor fits stay visible.
        """
        r2_values = [p['loo_r2'] for p in history['gp_params']]
        ax.plot(history['iterations'], r2_values, 'go-', linewidth=2, markersize=10)
        ax.set_xlabel('Iteration')
        ax.set_ylabel('LOO-CV R²')
        ax.set_title('GP Model Quality')
        if fix_range:
            finite = [v for v in r2_values if v is not None and np.isfinite(v)]
            lowest = min(finite, default=0.0)
            lower = 0.0 if lowest >= 0 else lowest - 0.05 * abs(lowest)
            ax.set_ylim([lower, 1])
        ax.grid(True, alpha=0.3)
    
    def gp_slices(self, gp, scaler, X: pd.DataFrame, y: pd.Series,
                  bounds_df: pd.DataFrame, selected_features: List[str],
                  binary_features: List[str], experiments_df: pd.DataFrame,
                  iteration: int, save_path: Optional[str] = None):
        """Plot 1D GP slices for each feature.

        Each panel varies one feature while the others stay at the mean of the
        scaled data. Binary features get a two-bar chart (encoded -1 / +1);
        continuous features get the mean curve with a ±1.96·std band, the
        observed data and vertical lines at the proposed experiments.

        Args:
            gp: Model whose ``predict(X)`` returns ``(mean, std)`` arrays.
            scaler: Fitted scaler providing ``transform`` and ``transform_point``.
            X: Observed features; column order is expected to match
                ``selected_features``, since points are addressed by position.
            y: Observed responses.
            bounds_df: Table with ``feature``, ``min`` and ``max`` columns.
            selected_features: Features to plot, one panel each.
            binary_features: Subset of features drawn as bars.
            experiments_df: Proposed experiments (one column per feature).
            iteration: Iteration number used in the default file name.
            save_path: Optional explicit output path.
        """
        n_features = len(selected_features)
        fig, axes = plt.subplots(1, n_features, figsize=(4*n_features, 4))
        if n_features == 1:
            axes = [axes]
        
        X_scaled = scaler.transform(X)
        x_base = X_scaled.mean().values.copy()
        # Loop-invariant: original-space means used to build points for transform_point.
        feature_means = [X[name].mean() for name in selected_features]
        
        for i, feat in enumerate(selected_features):
            ax = axes[i]
            bounds_row = bounds_df[bounds_df['feature'] == feat].iloc[0]
            
            if feat in binary_features:
                preds, stds = [], []
                for val in [-1, 1]:
                    x_test = x_base.copy()
                    x_test[i] = val
                    mu, sigma = gp.predict(x_test.reshape(1, -1))
                    preds.append(mu[0])
                    stds.append(sigma[0])
                ax.bar([0, 1], preds, yerr=[1.96*s for s in stds], capsize=5, alpha=0.7)
                ax.set_xticks([0, 1])
            else:
                x_grid = np.linspace(bounds_row['min'], bounds_row['max'], 50)
                preds, stds = [], []
                
                for val in x_grid:
                    point = list(feature_means)
                    point[i] = val
                    x_test = x_base.copy()
                    # Only coordinate i of the transformed point is used.
                    x_test[i] = scaler.transform_point(point)[i]
                    mu, sigma = gp.predict(x_test.reshape(1, -1))
                    preds.append(mu[0])
                    stds.append(sigma[0])
                
                preds, stds = np.array(preds), np.array(stds)
                ax.plot(x_grid, preds, 'b-', linewidth=2)
                ax.fill_between(x_grid, preds - 1.96*stds, preds + 1.96*stds, alpha=0.3)
                ax.scatter(X[feat], y, c='red', s=50, zorder=5)
                
                for _, row in experiments_df.iterrows():
                    ax.axvline(row[feat], color='green', linestyle='--', alpha=0.5)
            
            ax.set_xlabel(feat)
            ax.set_title(f'GP: {feat}')
        
        plt.tight_layout()
        return self._save_and_show(fig, f'phase2_iteration{iteration}_gp.png', save_path)
    
    def validation_plot(self, validation: Dict, iteration: int, 
                        save_path: Optional[str] = None):
        """Plot GP validation results.

        Left: predicted vs. actual with ±1.96·std error bars and the identity
        line. Right: histogram of residuals (actual - predicted).

        Args:
            validation: Dict with ``actual``, ``predictions``, ``stds`` arrays and ``r2``.
            iteration: Iteration number used in the default file name.
            save_path: Optional explicit output path.
        """
        fig, axes = plt.subplots(1, 2, figsize=(12, 4))
        
        actual = validation['actual']
        preds = validation['predictions']
        stds = validation['stds']
        
        axes[0].errorbar(actual, preds, yerr=1.96*stds, fmt='o', alpha=0.7, capsize=3)
        lims = [min(actual.min(), preds.min()), max(actual.max(), preds.max())]
        axes[0].plot(lims, lims, 'r--', linewidth=2)
        axes[0].set_xlabel('Actual')
        axes[0].set_ylabel('Predicted')
        axes[0].set_title(f'LOO-CV: R² = {validation["r2"]:.3f}')
        
        residuals = actual - preds
        axes[1].hist(residuals, bins=15, edgecolor='black', alpha=0.7)
        axes[1].axvline(0, color='red', linestyle='--')
        axes[1].set_xlabel('Residual')
        axes[1].set_title('Residual Distribution')
        
        plt.tight_layout()
        return self._save_and_show(fig, f'phase2_iteration{iteration}_validation.png', save_path)
    
    def optimization_progress(self, history: Dict, response_name: str,
                              save_path: Optional[str] = None):
        """Plot optimization progress over iterations.

        Args:
            history: Dict with ``iterations``, ``best_values`` and ``gp_params``
                (each entry holding ``loo_r2``).
            response_name: Label for the response axis.
            save_path: Optional explicit output path.
        """
        fig, axes = plt.subplots(1, 2, figsize=(12, 4))
        
        self._draw_best_values(axes[0], history, response_name)
        self._draw_loo_r2(axes[1], history, fix_range=True)
        
        plt.tight_layout()
        return self._save_and_show(fig, 'phase2_progress.png', save_path)
    
    def final_summary(self, X: pd.DataFrame, y: pd.Series, 
                      history: Dict, current_best: float,
                      response_name: str, save_path: Optional[str] = None):
        """Plot final optimization summary.

        Four panels: best value per iteration, LOO-CV R² per iteration,
        response histogram with the best value marked, and absolute
        feature-response correlations (green = positive, red = otherwise).

        Args:
            X: All observed features.
            y: All observed responses.
            history: Dict with ``iterations``, ``best_values`` and ``gp_params``.
            current_best: Best response, drawn as a vertical line.
            response_name: Label for the response.
            save_path: Optional explicit output path.
        """
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        
        # Progress
        self._draw_best_values(axes[0, 0], history, response_name)
        
        # GP Quality (autoscaled y-axis)
        self._draw_loo_r2(axes[0, 1], history)
        
        # Response distribution
        axes[1, 0].hist(y, bins=20, edgecolor='black', alpha=0.7, color='steelblue')
        axes[1, 0].axvline(current_best, color='red', linewidth=2, linestyle='--', label=f'Best: {current_best:.4f}')
        axes[1, 0].set_xlabel(response_name)
        axes[1, 0].set_title(f'Distribution of {response_name}')
        axes[1, 0].legend()
        
        # Feature correlations
        correlations = X.corrwith(y)
        colors = ['forestgreen' if c > 0 else 'crimson' for c in correlations]
        axes[1, 1].barh(X.columns, np.abs(correlations), color=colors)
        axes[1, 1].set_xlabel('|Correlation|')
        axes[1, 1].set_title('Feature-Response Correlations')
        
        plt.tight_layout()
        return self._save_and_show(fig, 'phase2_final_summary.png', save_path)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Phase2Pipeline Class
# ─────────────────────────────────────────────────────────────────────────────

class Phase2Pipeline:
    """Complete Phase 2 pipeline for Bayesian Optimization.

    Orchestrates one optimization campaign on top of a Phase 1 checkpoint.
    Expected call order::

        pipeline = Phase2Pipeline(config)
        pipeline.load_checkpoint().load_data()
        experiments = pipeline.run_iteration()   # propose a batch
        pipeline.add_results(results)            # feed lab results back
        pipeline.get_optimal()

    Files written under ``config.output_dir``:
    ``phase2_iteration{N}_experiments.csv``, ``phase2_bo_history.pkl`` /
    ``phase2_bo_history.json``, ``phase2_all_data.csv`` and
    ``phase2_confirmation_runs.csv``.
    """

    def __init__(self, config: Phase2Config):
        """Create the output directory and initialise empty pipeline state.

        Args:
            config: Phase 2 settings; ``output_dir`` is read here, the other
                fields are read by the individual methods.
        """
        self.config = config
        self._output_dir = Path(config.output_dir)
        # parents=True so that a nested output_dir (e.g. 'runs/exp1') works.
        self._output_dir.mkdir(parents=True, exist_ok=True)

        self._checkpoint = None
        self._bounds_df = None
        self._bounds_array = None
        self._X = None
        self._y = None
        self._iteration = 0
        self._is_converged = False

        self._iteration_mgr = None
        self._plotter = Phase2Plotter(config.output_dir)

        # One entry per run_iteration() call, except 'observed_responses',
        # which gets one entry per successful add_results() call.
        self._history = {
            'iterations': [],
            'best_values': [],
            'proposed_points': [],
            'observed_responses': [],
            'gp_params': []
        }

    def load_checkpoint(self, path: Optional[str] = None) -> 'Phase2Pipeline':
        """Load the Phase 1 checkpoint and the feature bounds stored next to it.

        Args:
            path: Checkpoint file; defaults to ``config.checkpoint_path``.
                ``phase1_bounds.csv`` is read from the same directory and must
                provide ``min`` and ``max`` columns.

        Returns:
            ``self``, to allow call chaining.
        """
        path = path or self.config.checkpoint_path

        # NOTE(review): the default checkpoint is a .pkl file and is presumably
        # unpickled by Phase1Checkpoint.load - only load files you created.
        checkpoint = Phase1Checkpoint().load(path)

        bounds_df = pd.read_csv(Path(path).parent / 'phase1_bounds.csv')
        # NOTE(review): rows are assumed to be in the same order as
        # checkpoint['selected_features'] - confirm against the Phase 1 writer.
        bounds_array = bounds_df[['min', 'max']].to_numpy()

        iteration_mgr = BOIterationManager(self.config, checkpoint)

        # Assign only after every step succeeded, so a failed load does not
        # leave the pipeline half-initialised.
        self._checkpoint = checkpoint
        self._bounds_df = bounds_df
        self._bounds_array = bounds_array
        self._iteration_mgr = iteration_mgr

        print("✓ Loaded checkpoint")
        print(f"  Features: {self._checkpoint['selected_features']}")
        print(f"  Response: {self._checkpoint['metadata']['response_column']}")

        return self

    def load_data(self, data_path: Optional[Union[str, Path]] = None) -> 'Phase2Pipeline':
        """Load Phase 2 experimental data and keep only complete rows.

        Args:
            data_path: ``.csv`` file (read with ``pd.read_csv``) or any other
                file (read with ``pd.read_excel`` using ``config.sheet_name``);
                defaults to ``config.data_file``.

        Returns:
            ``self``, to allow call chaining.

        Raises:
            RuntimeError: If ``load_checkpoint()`` has not been called.
            ValueError: If a selected feature or the response column is
                missing from the file.
        """
        checkpoint = self._require_checkpoint()
        data_path = data_path or self.config.data_file
        selected_features = list(checkpoint['selected_features'])
        response_col = checkpoint['metadata']['response_column']

        # Path(...).suffix accepts both str and Path and ignores letter case.
        if Path(data_path).suffix.lower() == '.csv':
            df = pd.read_csv(data_path)
        else:
            df = pd.read_excel(data_path, sheet_name=self.config.sheet_name)

        # Validate columns
        missing = [c for c in selected_features + [response_col] if c not in df.columns]
        if missing:
            raise ValueError(f"Missing columns: {missing}")

        features = df[selected_features]
        response = df[response_col]

        # Keep only rows where the response and every feature are present.
        valid = response.notna() & features.notna().all(axis=1)
        self._X = features[valid].reset_index(drop=True)
        self._y = response[valid].reset_index(drop=True)

        print(f"✓ Loaded {len(self._X)} experiments")
        print(f"  Current best: {self._format_best(self.current_best)}")

        return self

    def run_iteration(self) -> pd.DataFrame:
        """Run a single BO iteration and return the proposed experiments.

        Besides returning the proposals, this appends to the history, saves
        the history, writes ``phase2_iteration{N}_experiments.csv`` (the
        proposals plus an empty response column) and draws the validation plot.

        Raises:
            RuntimeError: If the checkpoint or the data has not been loaded.
        """
        checkpoint = self._require_data()
        iteration = self._iteration + 1

        print(f"\n─ Iteration {iteration}")

        experiments_df = self._iteration_mgr.run_iteration(
            self._X, self._y, self._bounds_array, iteration
        )
        # Advance the counter only once the manager succeeded, so a failed
        # attempt does not consume an iteration number.
        self._iteration = iteration

        validation = self._iteration_mgr.validation
        print(f"  GP R²: {validation['r2']:.4f}, Coverage: {validation['coverage']:.1%}")
        print(f"  ✓ Proposed {len(experiments_df)} experiments")

        # Update history
        self._history['iterations'].append(self._iteration)
        self._history['best_values'].append(self.current_best)
        self._history['proposed_points'].append(experiments_df.to_dict('records'))
        self._history['gp_params'].append({
            'log_marginal_likelihood': self._iteration_mgr.gp.log_marginal_likelihood,
            'kernel_params': self._iteration_mgr.gp.kernel_params,
            'loo_r2': validation['r2'],
            'loo_rmse': validation['rmse'],
            'ci_coverage': validation['coverage']
        })

        # Save history
        self._save_history()

        # Export experiments with a blank response column for the lab to fill in.
        response_col = checkpoint['metadata']['response_column']
        export_df = experiments_df.copy()
        export_df[response_col] = ''
        export_path = self._output_dir / f'phase2_iteration{self._iteration}_experiments.csv'
        export_df.to_csv(export_path, index=False)

        # Generate plots
        self._plotter.validation_plot(validation, self._iteration)

        return experiments_df

    def add_results(self, results: Union[str, Path, pd.DataFrame]) -> 'Phase2Pipeline':
        """Add new experimental results and re-check convergence.

        Args:
            results: CSV path or DataFrame containing the response column.
                Rows whose response is blank or non-numeric are ignored. A
                DataFrame argument is not modified.

        Returns:
            ``self``, to allow call chaining.

        Raises:
            RuntimeError: If the checkpoint has not been loaded, or if valid
                results are supplied before ``load_data()`` was called.
        """
        checkpoint = self._require_checkpoint()

        if isinstance(results, (str, Path)):
            results_df = pd.read_csv(results)
        else:
            # Copy so the numeric coercion below never touches the caller's frame.
            results_df = results.copy()

        response_col = checkpoint['metadata']['response_column']

        # Convert response to numeric; unparseable entries become NaN and are dropped.
        results_df[response_col] = pd.to_numeric(results_df[response_col], errors='coerce')
        valid_results = results_df[results_df[response_col].notna()]

        if len(valid_results) == 0:
            print("  ⚠ No valid results found")
            return self

        self._require_data()

        # Store previous best (None when no complete rows were loaded so far).
        prev_best = self.current_best

        # Update data
        self._X, self._y = self._iteration_mgr.input_results(valid_results, self._X, self._y)

        self._history['observed_responses'].append(
            valid_results[response_col].tolist()
        )

        new_best = self.current_best
        maximize = checkpoint['metadata']['maximize_response']

        print(f"  ✓ Added {len(valid_results)} results")
        print(f"  Previous best: {self._format_best(prev_best)} → "
              f"New best: {self._format_best(new_best)}")
        # An improvement is only defined when both values exist.
        if prev_best is not None and new_best is not None:
            improvement = (new_best - prev_best) if maximize else (prev_best - new_best)
            if improvement > 0:
                print(f"  ✓ Improvement: {improvement:.4f}")

        # Save updated data, and the history so the observed responses just
        # recorded are persisted as well.
        self._save_all_data()
        self._save_history()

        # Check convergence
        converged, report = self._iteration_mgr.check_convergence(
            self._history, self.config.improvement_threshold, self.config.patience
        )
        self._is_converged = converged
        if converged:
            print(f"  ✓ Converged: {report}")

        return self

    def get_optimal(self) -> Dict[str, Any]:
        """Return the best observed experiment.

        Returns:
            ``{'response': float, 'conditions': {feature: float, ...}}`` for
            the row with the largest (``maximize_response``) or smallest
            response.

        Raises:
            RuntimeError: If the checkpoint or the data has not been loaded.
            ValueError: If no experiments are available.
        """
        checkpoint = self._require_data()
        if len(self._y) == 0:
            raise ValueError("No experimental data available to determine an optimum")

        maximize = checkpoint['metadata']['maximize_response']
        best_idx = self._y.idxmax() if maximize else self._y.idxmin()

        return {
            'response': float(self._y.loc[best_idx]),
            'conditions': {feat: float(self._X.loc[best_idx, feat])
                           for feat in checkpoint['selected_features']}
        }

    def generate_confirmation_runs(self, n: int = 5) -> pd.DataFrame:
        """Generate ``n`` identical confirmation runs at the optimal conditions.

        The table is also written to ``phase2_confirmation_runs.csv`` in the
        output directory; the ``Actual_<response>`` column is left blank.
        """
        optimal = self.get_optimal()
        response_col = self._checkpoint['metadata']['response_column']

        runs = [
            {
                'Run_ID': f'Confirm_{i + 1}',
                **optimal['conditions'],
                f'Expected_{response_col}': optimal['response'],
                f'Actual_{response_col}': '',
            }
            for i in range(n)
        ]

        df = pd.DataFrame(runs)
        df.to_csv(self._output_dir / 'phase2_confirmation_runs.csv', index=False)

        return df

    def _require_checkpoint(self) -> Dict[str, Any]:
        """Return the loaded checkpoint, or raise if load_checkpoint() was skipped."""
        if self._checkpoint is None or self._iteration_mgr is None:
            raise RuntimeError("No checkpoint loaded - call load_checkpoint() first")
        return self._checkpoint

    def _require_data(self) -> Dict[str, Any]:
        """Return the checkpoint, or raise if checkpoint or data is not loaded."""
        checkpoint = self._require_checkpoint()
        if self._X is None or self._y is None:
            raise RuntimeError("No experimental data loaded - call load_data() first")
        return checkpoint

    @staticmethod
    def _format_best(value: Optional[float]) -> str:
        """Format a best-response value for printing; ``None`` becomes 'n/a'."""
        return 'n/a' if value is None else f"{value:.4f}"

    def _save_history(self):
        """Save the optimization history as both pickle and JSON."""
        with open(self._output_dir / 'phase2_bo_history.pkl', 'wb') as f:
            pickle.dump(self._history, f)
        with open(self._output_dir / 'phase2_bo_history.json', 'w') as f:
            # default=str stringifies anything json cannot encode natively.
            json.dump(self._history, f, indent=2, default=str)

    def _save_all_data(self):
        """Save all experimental data (features plus response) to CSV."""
        data = self._X.copy()
        data[self._checkpoint['metadata']['response_column']] = self._y.values
        data.to_csv(self._output_dir / 'phase2_all_data.csv', index=False)

    def plot_progress(self, save_path: Optional[str] = None):
        """Plot optimization progress from the recorded history."""
        checkpoint = self._require_checkpoint()
        return self._plotter.optimization_progress(
            self._history, checkpoint['metadata']['response_column'], save_path
        )

    def plot_final_summary(self, save_path: Optional[str] = None):
        """Plot the final optimization summary for the data collected so far."""
        checkpoint = self._require_data()
        return self._plotter.final_summary(
            self._X, self._y, self._history, self.current_best,
            checkpoint['metadata']['response_column'], save_path
        )

    @property
    def current_best(self) -> Optional[float]:
        """Best response observed so far, or ``None`` when there is no data.

        "Best" is the maximum when the checkpoint's ``maximize_response`` flag
        is set and the minimum otherwise.
        """
        if self._y is None or len(self._y) == 0:
            return None
        maximize = self._checkpoint['metadata']['maximize_response']
        return float(self._y.max() if maximize else self._y.min())

    @property
    def iteration(self) -> int:
        """Number of BO iterations completed so far."""
        return self._iteration

    @property
    def is_converged(self) -> bool:
        """Convergence flag from the most recent ``add_results()`` call."""
        return self._is_converged

    @property
    def history(self) -> Dict:
        """The live optimization history dict (not a copy)."""
        return self._history

    @property
    def data(self) -> Tuple[pd.DataFrame, pd.Series]:
        """Current ``(X, y)`` experimental data; both ``None`` before loading."""
        return self._X, self._y

---
# Usage Examples
---

The following cells demonstrate how to use the modular pipeline classes.

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Phase 1 Example Usage
# ─────────────────────────────────────────────────────────────────────────────

# Configure Phase 1
# Only data_file and response_column are required; every other argument below
# repeats the Phase1Config default and is spelled out so it is easy to adjust.
config = Phase1Config(
    data_file='data.xlsx',
    response_column='Downy Leak',
    sheet_name='data',
    header_row=5,
    split_keyword='PREDICTED OPTIMUM RUNS',
    stop_feature='Batch ID',
    maximize_response=False,  # False = the response is to be minimized
    target_features=4
)

# Run Phase 1 pipeline
# `p1` is reused by the following cells, so run this cell before them.
p1 = Phase1Pipeline(config)
p1.run()

# Inspect results: first 10 rows of the ranking table, selected columns only
print("\nRankings:")
print(p1.rankings[['feature', 'correlation', 'VIP', 'avg_rank']].head(10))

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Manual Feature Override (Optional)
# ─────────────────────────────────────────────────────────────────────────────

# Option 1: Use automatic selection (already done in run())
print("Auto-selected features:", p1.selected_features)

# Option 2: Manual override - uncomment to use.
# 'Feature_A' ... 'Feature_D' are placeholders; replace them with real column
# names from the data file.
# p1.select_features(manual_list=['Feature_A', 'Feature_B', 'Feature_C', 'Feature_D'])

# Validation metrics (keys read from p1.validation_metrics: 'r2', 'rmse', 'mae')
print("\nValidation Metrics:")
print(f"  R²:   {p1.validation_metrics['r2']:.4f}")
print(f"  RMSE: {p1.validation_metrics['rmse']:.4f}")
print(f"  MAE:  {p1.validation_metrics['mae']:.4f}")

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Save Phase 1 Checkpoint
# ─────────────────────────────────────────────────────────────────────────────

# Persist the Phase 1 result. Phase 2 loads the checkpoint file and reads
# phase1_bounds.csv from the same directory, so keep these files together.
p1.save_checkpoint()

# NOTE: the list below is a fixed message, not derived from what
# save_checkpoint() actually wrote - presumably under config.output_dir; verify.
print("\nCheckpoint saved. Files created:")
print("  ✓ phase1_checkpoint.json")
print("  ✓ phase1_checkpoint.pkl")
print("  ✓ phase1_bounds.csv")
print("  ✓ phase1_consensus_ranking.csv")

---
## Phase 2: Bayesian Optimization

**Run these cells after:**
1. You have selected final features and saved the Phase 1 checkpoint (Phase 1)
2. You have run new experiments with those features
3. You have created a new data file with results

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Phase 2 Configuration and Setup
# ─────────────────────────────────────────────────────────────────────────────

# Configure Phase 2
p2_config = Phase2Config(
    data_file='phase2_experiments.xlsx',  # new experiments; a '.csv' path is read with read_csv instead
    checkpoint_path='bo_pipeline_output/phase1_checkpoint.pkl',  # phase1_bounds.csv must sit in the same folder
    batch_size=5,
    exploration_weight=2.0,
    max_iterations=10,
    improvement_threshold=0.01,  # passed to the convergence check in add_results()
    patience=3                   # passed to the convergence check in add_results()
)

# Initialize Phase 2 pipeline
# Order matters: load_data() reads the feature list and response column from
# the checkpoint, so load_checkpoint() has to run first.
p2 = Phase2Pipeline(p2_config)
p2.load_checkpoint()
p2.load_data()

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Run BO Iteration
# ─────────────────────────────────────────────────────────────────────────────

# Run single iteration
experiments = p2.run_iteration()

# View proposed experiments
print("\nProposed Experiments:")
print(experiments.to_string(index=False))

# Save to CSV for lab. The path is built from the configured output directory
# (which Phase2Pipeline creates) rather than a hard-coded folder name, so this
# cell keeps working when Phase2Config.output_dir is changed.
next_batch_path = Path(p2_config.output_dir) / 'next_batch.csv'
experiments.to_csv(next_batch_path, index=False)
print(f"\n✓ Saved to {next_batch_path} - Run these experiments in lab")

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Add Results and Continue (Run after lab experiments)
# ─────────────────────────────────────────────────────────────────────────────

# Load completed results.
# The CSV must contain the response column filled in with measured values;
# rows whose response is blank or non-numeric are skipped by add_results().
# Tip: run_iteration() also writes phase2_iteration<N>_experiments.csv to the
# output directory with an empty response column ready to be filled in.
# p2.add_results('bo_pipeline_output/next_batch_completed.csv')

# Or continue in a loop (bounded by max_iterations so it always terminates):
# while not p2.is_converged and p2.iteration < p2_config.max_iterations:
#     experiments = p2.run_iteration()
#     experiments.to_csv('next_batch.csv', index=False)  # index=False: no extra index column on re-read
#     # [Run experiments in lab]
#     # [Add the response column and fill in results in next_batch.csv]
#     p2.add_results('next_batch.csv')

print("Iteration complete. Waiting for experimental results...")

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# Final Results
# ─────────────────────────────────────────────────────────────────────────────

# Get optimal conditions
# get_optimal() returns {'response': float, 'conditions': {feature: value}}
# for the best experiment observed so far.
optimal = p2.get_optimal()
print("Optimal Conditions Found:")
print(f"  Response: {optimal['response']:.4f}")
for feat, val in optimal['conditions'].items():
    print(f"  {feat}: {val:.4f}")

# Generate confirmation runs
# Five identical runs at the optimal conditions; the table is also written to
# phase2_confirmation_runs.csv with the Actual_<response> column left blank.
confirmation = p2.generate_confirmation_runs(5)
print("\nConfirmation Runs:")
print(confirmation.to_string(index=False))

# Plot final summary (2x2 dashboard: progress, GP quality, response
# distribution, feature correlations)
p2.plot_final_summary()

---
## Quick Reference

### Phase 1 Classes
| Class | Purpose |
|-------|---------|
| `Phase1Config` | Configuration dataclass for Phase 1 |
| `DataLoader` | Load, clean, and classify data |
| `FeatureRanker` | Correlation, Lasso, PLS-VIP, interactions |
| `FeatureSelector` | Auto/manual feature selection with validation |
| `Phase1Checkpoint` | Save/load checkpoint files |
| `Phase1Pipeline` | Full Phase 1 workflow |

### Phase 2 Classes
| Class | Purpose |
|-------|---------|
| `Phase2Config` | Configuration dataclass for Phase 2 |
| `Phase2Scaler` | Leakage-free scaling |
| `GPModel` | Gaussian Process with LOO-CV validation |
| `AcquisitionOptimizer` | EI, LCB, UCB optimization |
| `BOIterationManager` | Single iteration management |
| `Phase2Pipeline` | Full Phase 2 workflow |

### Key Properties
- `p1.rankings` - Feature consensus ranking DataFrame
- `p1.selected_features` - List of selected feature names
- `p1.validation_metrics` - Dict with r2, rmse, mae
- `p2.current_best` - Best response value observed so far (maximum or minimum, depending on `maximize_response`); `None` until data is loaded
- `p2.is_converged` - Boolean convergence status
- `p2.history` - Optimization history dict