In [None]:
# =============================================================================
# ENCAPSULATED FEATURE SCREENING MODULE
# =============================================================================

from sklearn.model_selection import train_test_split

class FeatureScreener:
    """Feature screening pipeline for Bayesian Optimization preparation."""
    
    def __init__(self, response_column, target_features=4, maximize_response=False,
                 correlation_strong=0.4, correlation_moderate=0.2,
                 vip_important=1.0, vip_moderate=0.8,
                 multicollinearity_threshold=0.7, interaction_threshold=0.3,
                 random_state=42, test_size=0.2):
        """Record the screening configuration and create empty result holders.

        Parameters:
        -----------
        response_column : name of the response column; it is excluded from the
            feature set and used as ``y``.
        target_features : maximum number of features kept by
            ``recommend_features``.
        maximize_response : True when a larger response is better; used when
            picking the best observation and suggesting effect directions.
        correlation_strong, correlation_moderate : |r| cut-offs for labelling a
            feature/response correlation 'Strong' or 'Moderate'.
        vip_important, vip_moderate : VIP cut-offs for the 'Important' and
            'Moderate' PLS categories.
        multicollinearity_threshold : |r| between two features above which the
            pair is flagged.
        interaction_threshold : interaction strength above which a feature pair
            is reported as interacting.
        random_state : seed handed to the train/test split and to LassoCV.
        test_size : test fraction handed to the train/test split.
        """
        # --- Objective -------------------------------------------------------
        self.response_column = response_column
        self.maximize_response = maximize_response
        self.target_features = target_features

        # --- Screening thresholds --------------------------------------------
        self.correlation_strong = correlation_strong
        self.correlation_moderate = correlation_moderate
        self.vip_important = vip_important
        self.vip_moderate = vip_moderate
        self.multicollinearity_threshold = multicollinearity_threshold
        self.interaction_threshold = interaction_threshold

        # --- Data splitting --------------------------------------------------
        self.test_size = test_size
        self.random_state = random_state

        # --- State filled in by classify_features / prepare_data -------------
        self.feature_cols = []
        self.binary_features = []
        self.continuous_features = []
        self.binary_mappings = {}
        self.scaler = StandardScaler()

        # --- State filled in by the screening and recommendation steps -------
        self.strong_interactions = pd.DataFrame()
        self.features_with_interactions = []
        self.consensus = None
        self.selected_features = []
        self.bo_bounds_df = None
        
    def classify_features(self, df, feature_list, verbose=True):
        """Classify features as binary or continuous and encode binary features."""
        if verbose:
            print("=" * 60)
            print("FEATURE TYPE CLASSIFICATION")
            print("=" * 60)
            print("Rule: Binary (2 unique) → 0/1 | Continuous (3+) → standardized\n")
        
        numeric_features = df[feature_list].select_dtypes(include=[np.number]).columns.tolist()
        numeric_features = [c for c in numeric_features if c != self.response_column]
        
        for col in numeric_features:
            n_unique = df[col].nunique()
            if n_unique == 2:
                self.binary_features.append(col)
                unique_vals = df[col].dropna().unique()
                mapping = {unique_vals[0]: 0, unique_vals[1]: 1}
                df[col] = df[col].map(mapping)
                self.binary_mappings[col] = mapping
                if verbose:
                    print(f"  {col}: {n_unique} unique → BINARY ({unique_vals[0]}→0, {unique_vals[1]}→1)")
            else:
                self.continuous_features.append(col)
                if verbose:
                    print(f"  {col}: {n_unique} unique → CONTINUOUS")
        
        self.feature_cols = self.binary_features + self.continuous_features
        
        if verbose:
            print(f"\n  Summary: {len(self.binary_features)} binary, {len(self.continuous_features)} continuous")
        
        return df
    
    def prepare_data(self, df, verbose=True):
        """Build X and y, split into train/test, then scale for modelling.

        Rows with a missing response are dropped. The split happens before any
        scaling, and the scaler is fitted on the training rows only. Unscaled
        copies of both splits are kept on the instance as ``original_X_train``,
        ``original_y_train``, ``original_X_test`` and ``original_y_test``.

        Parameters:
        -----------
        df : DataFrame containing ``self.feature_cols`` and the response column
        verbose : print a preparation report when True

        Returns:
        --------
        (X_train, X_test, X_train_model, X_test_model, y_train, y_test) where
        the first two are unscaled and the ``*_model`` frames have continuous
        features standardized and binary features mapped from 0/1 to -1/+1.
        """
        if verbose:
            print("\n" + "=" * 60)
            print("DATA PREPARATION")
            print("=" * 60)

        X = df[self.feature_cols].copy()
        y = df[self.response_column].copy()

        valid = ~y.isnull()
        n_dropped = (~valid).sum()
        X, y = X[valid].reset_index(drop=True), y[valid].reset_index(drop=True)

        if verbose and n_dropped > 0:
            print(f"  Dropped {n_dropped} rows with missing response")

        # Train-test split BEFORE standardization
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )

        if verbose:
            print(f"  Train/Test split: {len(X_train)}/{len(X_test)} samples ({1-self.test_size:.0%}/{self.test_size:.0%})")

        # Store originals before scaling
        self.original_X_train, self.original_y_train = X_train.copy(), y_train.copy()
        self.original_X_test, self.original_y_test = X_test.copy(), y_test.copy()

        # Standardize continuous features using ONLY training data
        X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
        if self.continuous_features:
            X_train_scaled[self.continuous_features] = self.scaler.fit_transform(X_train[self.continuous_features])
            X_test_scaled[self.continuous_features] = self.scaler.transform(X_test[self.continuous_features])
            if verbose:
                print(f"  Standardized {len(self.continuous_features)} continuous features (fit on train only)")

        # Scale binary to [-1, +1] for modeling
        X_train_model, X_test_model = X_train_scaled.copy(), X_test_scaled.copy()
        for col in self.binary_features:
            X_train_model[col] = X_train_model[col] * 2 - 1
            X_test_model[col] = X_test_model[col] * 2 - 1

        if verbose:
            if self.binary_features:
                print(f"  Binary features scaled to [-1, +1] for modeling")
            # The ratio describes the whole design, so report it whether or not
            # binary features exist; skip it when there are no features to
            # avoid dividing by zero.
            if self.feature_cols:
                print(f"\n  Samples/Features ratio: {len(X_train)/len(self.feature_cols):.1f}")

        return X_train, X_test, X_train_model, X_test_model, y_train, y_test
    
    def compute_correlations(self, X, y, verbose=True):
        """Compute Pearson correlations."""
        if verbose:
            print("\n" + "=" * 60)
            print("METHOD 1: PEARSON CORRELATION")
            print("=" * 60)
        
        correlations = X.corrwith(y)
        corr_df = pd.DataFrame({
            'feature': self.feature_cols,
            'correlation': correlations.values,
            'abs_corr': np.abs(correlations.values),
            'type': ['binary' if f in self.binary_features else 'continuous' for f in self.feature_cols],
            'direction': ['Positive' if c > 0 else 'Negative' for c in correlations.values]
        }).sort_values('abs_corr', ascending=False).reset_index(drop=True)
        corr_df['rank_corr'] = range(1, len(corr_df) + 1)
        corr_df['strength'] = corr_df['correlation'].apply(
            lambda r: 'Strong' if abs(r) >= self.correlation_strong else 'Moderate' if abs(r) >= self.correlation_moderate else 'Weak'
        )
        
        if verbose:
            print("\nResults:")
            print(corr_df[['rank_corr', 'feature', 'correlation', 'direction', 'strength', 'type']].to_string(index=False))
            
            # Binary interpretation
            binary_rows = corr_df[corr_df['type'] == 'binary']
            if len(binary_rows) > 0:
                print("\nBinary Feature Effects:")
                for _, row in binary_rows.iterrows():
                    effect = "INCREASES" if row['correlation'] > 0 else "DECREASES"
                    print(f"  {row['feature']}: When=1, response {effect} (r={row['correlation']:.3f})")
        
        return corr_df
    
    def run_lasso(self, X_model, y, verbose=True):
        """Fit a cross-validated Lasso and rank features by |coefficient|.

        Parameters:
        -----------
        X_model : scaled feature matrix; its columns are assumed to follow
            ``self.feature_cols`` order
        y : response values
        verbose : print the fit summary when True

        Returns:
        --------
        (lasso_df, alpha) where ``lasso_df`` has columns ``feature``,
        ``coefficient``, ``abs_coef``, ``selected`` (non-zero coefficient),
        ``type`` and ``rank_lasso``, and ``alpha`` is the penalty chosen by
        cross-validation.
        """
        if verbose:
            rule = "=" * 60
            print("\n" + rule)
            print("METHOD 2: LASSO REGRESSION")
            print(rule)

        model = LassoCV(cv=5, max_iter=10000, random_state=self.random_state)
        model.fit(X_model, y)
        coefs = model.coef_

        feature_types = [
            'binary' if name in self.binary_features else 'continuous'
            for name in self.feature_cols
        ]
        table = pd.DataFrame({
            'feature': self.feature_cols,
            'coefficient': coefs,
            'abs_coef': np.abs(coefs),
            'selected': coefs != 0,
            'type': feature_types
        })
        table = table.sort_values('abs_coef', ascending=False).reset_index(drop=True)
        table['rank_lasso'] = range(1, len(table) + 1)

        if verbose:
            kept = table.loc[table['selected'], 'feature'].tolist()
            print(f"\nOptimal alpha: {model.alpha_:.4f}")
            print(f"Features selected: {table['selected'].sum()}/{len(table)}")
            print("\nResults:")
            print(table[['rank_lasso', 'feature', 'coefficient', 'selected', 'type']].to_string(index=False))
            print(f"\n✓ Lasso selected: {kept if kept else 'None'}")

        return table, model.alpha_
    
    def run_pls(self, X_model, y, verbose=True):
        """Fit a PLS regression and score features by VIP.

        The number of components (1 up to at most 5) is chosen by the best mean
        5-fold cross-validated R²; the model is then refitted on all of
        ``X_model`` and VIP scores are derived from its scores, weights and
        y-loadings.

        Parameters:
        -----------
        X_model : scaled feature matrix; its columns are assumed to follow
            ``self.feature_cols`` order
        y : response values
        verbose : print progress and results when True

        Returns:
        --------
        (pls_df, optimal_comp, cv_scores) where ``pls_df`` has columns
        ``feature``, ``VIP``, ``type``, ``rank_pls`` and ``category``.
        """
        if verbose:
            rule = "=" * 60
            print("\n" + rule)
            print("METHOD 3: PLS (VIP Scores)")
            print(rule)
            print("\nFinding optimal components...")

        # Never ask for more components than features or (samples - 1).
        max_comp = min(5, len(self.feature_cols), len(X_model) - 1)
        cv_scores = []
        for n_comp in range(1, max_comp + 1):
            candidate = PLSRegression(n_components=n_comp)
            mean_r2 = cross_val_score(candidate, X_model, y, cv=5, scoring='r2').mean()
            cv_scores.append(mean_r2)
            if verbose:
                print(f"  {n_comp} components: CV R² = {mean_r2:.4f}")

        optimal_comp = np.argmax(cv_scores) + 1

        if verbose:
            print(f"\n✓ Optimal: {optimal_comp} components (CV R² = {max(cv_scores):.4f})")

        pls = PLSRegression(n_components=optimal_comp)
        pls.fit(X_model, y)

        # VIP calculation
        scores, weights, y_load = pls.x_scores_, pls.x_weights_, pls.y_loadings_
        n_feat, n_comp = weights.shape
        # Per-component weight: sum of squared scores times squared y-loading.
        ss = np.sum(scores**2, axis=0) * y_load.flatten()**2
        total_ss = np.sum(ss)
        vip_values = []
        for i in range(n_feat):
            # Feature i's normalised squared weight in each component,
            # weighted by that component's ss.
            weighted = sum(
                (weights[i, j]**2) * ss[j] / np.sum(weights[:, j]**2)
                for j in range(n_comp)
            )
            vip_values.append(np.sqrt(n_feat * weighted / total_ss))
        vip = np.array(vip_values)

        feature_types = [
            'binary' if name in self.binary_features else 'continuous'
            for name in self.feature_cols
        ]
        pls_df = pd.DataFrame({'feature': self.feature_cols, 'VIP': vip, 'type': feature_types})
        pls_df = pls_df.sort_values('VIP', ascending=False).reset_index(drop=True)
        pls_df['rank_pls'] = range(1, len(pls_df) + 1)

        def categorize(v):
            if v >= self.vip_important:
                return 'Important'
            if v >= self.vip_moderate:
                return 'Moderate'
            return 'Less Important'

        pls_df['category'] = pls_df['VIP'].apply(categorize)

        if verbose:
            print("\nResults:")
            print(pls_df[['rank_pls', 'feature', 'VIP', 'category', 'type']].to_string(index=False))
            important = pls_df.loc[pls_df['category'] == 'Important', 'feature'].tolist()
            moderate = pls_df.loc[pls_df['category'] == 'Moderate', 'feature'].tolist()
            print(f"\n✓ Important (VIP ≥ {self.vip_important}): {important if important else 'None'}")
            print(f"⚠ Moderate (VIP ≥ {self.vip_moderate}): {moderate if moderate else 'None'}")

        return pls_df, optimal_comp, cv_scores
    
    def screen_interactions(self, X, y, corr_df, verbose=True):
        """Screen for feature interactions."""
        if verbose:
            print("\n" + "=" * 60)
            print("INTERACTION SCREENING")
            print("=" * 60)
            print(f"Checking if effect of Feature A depends on level of Feature B.")
            print(f"Threshold: {self.interaction_threshold}\n")
        
        top_features = corr_df.head(min(6, len(self.feature_cols)))['feature'].tolist()
        results = []
        
        for f1, f2 in combinations(top_features, 2):
            median_f2 = X[f2].median()
            low_f2, high_f2 = X[f2] <= median_f2, X[f2] > median_f2
            if low_f2.sum() >= 3 and high_f2.sum() >= 3:
                corr_low = X.loc[low_f2, f1].corr(y[low_f2])
                corr_high = X.loc[high_f2, f1].corr(y[high_f2])
                if not np.isnan(corr_low) and not np.isnan(corr_high):
                    strength = abs(corr_high - corr_low)
                    results.append({
                        'interaction': f'{f1} × {f2}', 'feature_1': f1, 'feature_2': f2,
                        'corr_low_f2': corr_low, 'corr_high_f2': corr_high, 'strength': strength,
                        'interpretation': 'Effect changes' if strength > self.interaction_threshold else 'No interaction'
                    })
        
        if results:
            interaction_df = pd.DataFrame(results).sort_values('strength', ascending=False)
            strong = interaction_df[interaction_df['strength'] > self.interaction_threshold]
            self.features_with_interactions = list(set(strong['feature_1'].tolist() + strong['feature_2'].tolist())) if len(strong) > 0 else []
            self.strong_interactions = strong
            
            if verbose:
                print("Interaction Analysis Results:")
                print(interaction_df[['interaction', 'corr_low_f2', 'corr_high_f2', 'strength', 'interpretation']].to_string(index=False))
                
                if len(strong) > 0:
                    print(f"\n⚠️ POTENTIAL INTERACTIONS DETECTED:")
                    for _, row in strong.iterrows():
                        print(f"\n  {row['interaction']}: strength = {row['strength']:.3f}")
                        print(f"    When {row['feature_2']} LOW:  r = {row['corr_low_f2']:.3f}")
                        print(f"    When {row['feature_2']} HIGH: r = {row['corr_high_f2']:.3f}")
                        print(f"    → Include BOTH features in BO!")
                else:
                    print(f"\n✓ No strong interactions detected (threshold: {self.interaction_threshold})")
        else:
            interaction_df = pd.DataFrame()
            self.strong_interactions = pd.DataFrame()
            self.features_with_interactions = []
            if verbose:
                print("✓ No interactions to analyze (insufficient data or features)")
        
        return interaction_df, self.strong_interactions, self.features_with_interactions
    
    def check_multicollinearity(self, X, verbose=True):
        """Check for multicollinearity between features."""
        if verbose:
            print("\n" + "=" * 60)
            print("MULTICOLLINEARITY CHECK")
            print("=" * 60)
            print(f"Checking correlations BETWEEN features.")
            print(f"Threshold: |r| > {self.multicollinearity_threshold}\n")
        
        feature_corr = X.corr()
        pairs = []
        for i in range(len(self.feature_cols)):
            for j in range(i+1, len(self.feature_cols)):
                r = feature_corr.iloc[i, j]
                if abs(r) > self.multicollinearity_threshold:
                    pairs.append({'feature_1': self.feature_cols[i], 'feature_2': self.feature_cols[j], 'correlation': r})
        
        if verbose:
            if pairs:
                print("⚠️ HIGHLY CORRELATED PAIRS:")
                for pair in pairs:
                    print(f"  {pair['feature_1']} ↔ {pair['feature_2']}: r = {pair['correlation']:.3f}")
                    print(f"    → Consider keeping only ONE in BO")
            else:
                print("✓ No highly correlated feature pairs found")
        
        return pairs, feature_corr
    
    def build_consensus(self, corr_df, lasso_df, pls_df, verbose=True):
        """Build consensus ranking from all methods."""
        if verbose:
            print("\n" + "=" * 60)
            print("CONSENSUS RANKING (All Methods)")
            print("=" * 60)
        
        consensus = corr_df[['feature', 'rank_corr', 'correlation', 'direction', 'type', 'strength']].merge(
            lasso_df[['feature', 'rank_lasso', 'selected']], on='feature'
        ).merge(pls_df[['feature', 'rank_pls', 'VIP', 'category']], on='feature')
        
        consensus['avg_rank'] = consensus[['rank_corr', 'rank_lasso', 'rank_pls']].mean(axis=1)
        consensus = consensus.sort_values('avg_rank').reset_index(drop=True)
        consensus['final_rank'] = range(1, len(consensus) + 1)
        consensus['methods_top3'] = consensus.apply(
            lambda r: sum([r['rank_corr'] <= 3, r['rank_lasso'] <= 3, r['rank_pls'] <= 3]), axis=1
        )
        consensus['has_interaction'] = consensus['feature'].isin(self.features_with_interactions)
        
        # Score features
        def score(row):
            s = 0
            if abs(row['correlation']) >= self.correlation_strong: s += 3
            elif abs(row['correlation']) >= self.correlation_moderate: s += 2
            if row['VIP'] >= self.vip_important: s += 3
            elif row['VIP'] >= self.vip_moderate: s += 2
            if row['selected']: s += 2
            s += row['methods_top3']
            if row['has_interaction']: s += 2
            return s
        
        consensus['score'] = consensus.apply(score, axis=1)
        self.consensus = consensus.sort_values('score', ascending=False).reset_index(drop=True)
        
        if verbose:
            print("\nConsensus Ranking:")
            display_cols = ['final_rank', 'feature', 'type', 'correlation', 'VIP', 'selected', 
                            'avg_rank', 'methods_top3', 'has_interaction']
            print(self.consensus[display_cols].to_string(index=False))
            
            high_agreement = self.consensus[self.consensus['methods_top3'] >= 3]['feature'].tolist()
            moderate_agreement = self.consensus[self.consensus['methods_top3'] == 2]['feature'].tolist()
            print(f"\n✓ High agreement (3/3 methods in top 3): {high_agreement if high_agreement else 'None'}")
            print(f"⚠ Moderate agreement (2/3 methods): {moderate_agreement if moderate_agreement else 'None'}")
        
        return self.consensus
    
    def recommend_features(self, verbose=True):
        """Generate feature recommendations."""
        if verbose:
            print("\n" + "=" * 60)
            print("FEATURE RECOMMENDATION")
            print("=" * 60)
            print("\nFeature Scores:")
            print(self.consensus[['feature', 'type', 'correlation', 'VIP', 'selected', 'has_interaction', 'score']].to_string(index=False))
        
        recommended, reasons = [], {}
        
        for _, row in self.consensus.iterrows():
            include, reason = False, []
            if row['score'] >= 6: include, reason = True, [f"High score ({row['score']})"]
            if abs(row['correlation']) >= self.correlation_strong: 
                include = True
                if "High score" not in str(reason): reason.append("Strong correlation")
            if row['VIP'] >= self.vip_important: 
                include = True
                if "High score" not in str(reason): reason.append(f"VIP ≥ {self.vip_important}")
            if row['has_interaction'] and row['score'] >= 4: 
                include = True
                reason.append("Part of interaction")
            
            if include and len(recommended) < 6:
                recommended.append(row['feature'])
                reasons[row['feature']] = ', '.join(reason) if reason else "High overall score"
        
        # Ensure minimum
        while len(recommended) < 3:
            for _, row in self.consensus.iterrows():
                if row['feature'] not in recommended:
                    recommended.append(row['feature'])
                    reasons[row['feature']] = "Added to meet minimum"
                    break
        
        self.selected_features = recommended[:self.target_features]
        self._recommendation_reasons = reasons
        
        if verbose:
            print(f"\n{'='*50}")
            print(f"RECOMMENDED ({len(self.selected_features)} features):")
            print(f"{'='*50}")
            for feat in self.selected_features:
                row = self.consensus[self.consensus['feature'] == feat].iloc[0]
                feat_type = "[binary]" if row['type'] == 'binary' else "[continuous]"
                print(f"\n  ✓ {feat} {feat_type}")
                print(f"      Correlation: {row['correlation']:.3f}")
                print(f"      VIP: {row['VIP']:.2f}")
                print(f"      Reason: {reasons.get(feat, 'N/A')}")
        
        return self.selected_features, reasons
    
    def set_features_manual(self, feature_list, verbose=True):
        """
        Manually override feature selection.
        
        Parameters:
        -----------
        feature_list : list - List of feature names to use
        """
        if verbose:
            print("\n" + "=" * 60)
            print("MANUAL FEATURE OVERRIDE")
            print("=" * 60)
        
        # Validate features exist
        invalid = [f for f in feature_list if f not in self.feature_cols]
        if invalid:
            raise ValueError(f"Invalid features: {invalid}. Available: {self.feature_cols}")
        
        self.selected_features = feature_list
        
        if verbose:
            print(f"\n✓ Manually selected {len(feature_list)} features:")
            for i, feat in enumerate(feature_list, 1):
                if self.consensus is not None:
                    row = self.consensus[self.consensus['feature'] == feat].iloc[0]
                    print(f"  {i}. {feat} ({row['type']}) - corr: {row['correlation']:.3f}, VIP: {row['VIP']:.2f}")
                else:
                    ftype = 'binary' if feat in self.binary_features else 'continuous'
                    print(f"  {i}. {feat} ({ftype})")
        
        return self.selected_features
    
    def validate_selection(self, X_train, X_model, y, verbose=True):
        """Validate selected features for issues and run LOO-CV.

        Three checks run on ``self.selected_features``:
          1. pairwise correlation within the selection (from ``X_train``)
             against ``multicollinearity_threshold``;
          2. strong interactions where only one of the two features was selected;
          3. leave-one-out cross-validation of a RidgeCV model on ``X_model``.

        Parameters:
        -----------
        X_train : feature DataFrame used for the multicollinearity check
        X_model : scaled feature DataFrame used for LOO-CV
        y : response Series, positionally aligned with ``X_model``
        verbose : print the validation report when True

        Returns:
        --------
        (metrics, loo_preds, loo_actual, issues) where ``metrics`` has keys
        'r2', 'rmse' and 'mae', the two arrays hold the LOO predictions and
        actual values, and ``issues`` is a list of human-readable warnings.
        """
        if verbose:
            print("\n" + "=" * 60)
            print("FEATURE SELECTION VALIDATION")
            print("=" * 60)
            print(f"\nSelected features ({len(self.selected_features)}):")
            for i, feat in enumerate(self.selected_features, 1):
                if self.consensus is not None:
                    row = self.consensus[self.consensus['feature'] == feat].iloc[0]
                    print(f"  {i}. {feat} ({row['type']}) - corr: {row['correlation']:.3f}, VIP: {row['VIP']:.2f}")
                else:
                    # Features may have been set manually before any consensus
                    # table exists; fall back to the type-only listing that
                    # set_features_manual prints.
                    ftype = 'binary' if feat in self.binary_features else 'continuous'
                    print(f"  {i}. {feat} ({ftype})")

        issues = []

        # Check multicollinearity in selection
        if verbose:
            print("\n" + "-" * 40)
            print("VALIDATION CHECKS:")

        if len(self.selected_features) > 1:
            sel_corr = X_train[self.selected_features].corr()
            collinear_issues = []
            for i in range(len(self.selected_features)):
                for j in range(i+1, len(self.selected_features)):
                    r = sel_corr.iloc[i, j]
                    if abs(r) > self.multicollinearity_threshold:
                        collinear_issues.append(f"{self.selected_features[i]} ↔ {self.selected_features[j]}: r={r:.2f}")

            if collinear_issues:
                issues.extend(collinear_issues)
                if verbose:
                    print(f"  ⚠️ Multicollinearity in selection:")
                    for issue in collinear_issues:
                        print(f"      {issue}")
            elif verbose:
                print(f"  ✓ No multicollinearity issues")

        # Check broken interactions: an interaction is "broken" when exactly
        # one of its two features made it into the selection.
        if len(self.strong_interactions) > 0:
            broken = []
            for _, row in self.strong_interactions.iterrows():
                f1, f2 = row['feature_1'], row['feature_2']
                if (f1 in self.selected_features) != (f2 in self.selected_features):
                    in1 = "IN" if f1 in self.selected_features else "OUT"
                    in2 = "IN" if f2 in self.selected_features else "OUT"
                    broken.append(f"{f1} ({in1}) × {f2} ({in2})")

            if broken:
                issues.extend(broken)
                if verbose:
                    print(f"  ⚠️ Broken interactions:")
                    for b in broken:
                        print(f"      {b}")
            elif verbose:
                print(f"  ✓ No broken interactions")
        elif verbose:
            print(f"  ✓ No interactions to check")

        # LOO-CV
        if verbose:
            print("\n" + "=" * 60)
            print("VALIDATION: Leave-One-Out Cross-Validation")
            print("=" * 60)

        X_sel = X_model[self.selected_features]
        loo_preds, loo_actual = [], []

        for train_idx, test_idx in LeaveOneOut().split(X_sel):
            # A fresh model per fold; RidgeCV picks its alpha by inner 3-fold CV.
            model = RidgeCV(alphas=[0.1, 1, 10, 100], cv=3)
            model.fit(X_sel.iloc[train_idx], y.iloc[train_idx])
            loo_preds.append(model.predict(X_sel.iloc[test_idx])[0])
            loo_actual.append(y.iloc[test_idx].values[0])

        loo_preds, loo_actual = np.array(loo_preds), np.array(loo_actual)
        metrics = {
            'r2': r2_score(loo_actual, loo_preds),
            'rmse': np.sqrt(np.mean((loo_actual - loo_preds)**2)),
            'mae': np.mean(np.abs(loo_actual - loo_preds))
        }

        if verbose:
            print(f"\nLOO-CV Results ({len(self.selected_features)} features):")
            print(f"  R²:   {metrics['r2']:.4f}")
            print(f"  RMSE: {metrics['rmse']:.4f}")
            print(f"  MAE:  {metrics['mae']:.4f}")

            print(f"\nInterpretation:")
            if metrics['r2'] > 0.5:
                print("  ✓ Good signal - features are predictive")
            elif metrics['r2'] > 0.2:
                print("  ⚠️ Moderate signal - GP in BO can likely improve")
            elif metrics['r2'] > 0:
                print("  ⚠️ Weak linear signal - may be non-linear")
            else:
                print("  ⚠️ No linear signal - check data or feature selection")

        return metrics, loo_preds, loo_actual, issues
    
    def generate_bo_bounds(self, original_X, verbose=True):
        """Generate Bayesian Optimization bounds."""
        if verbose:
            print("\n" + "=" * 60)
            print("BAYESIAN OPTIMIZATION SEARCH SPACE")
            print("=" * 60)
        
        bounds = []
        for feat in self.selected_features:
            if feat in self.binary_features:
                bounds.append({'feature': feat, 'type': 'binary', 'min': 0, 'max': 1, 
                               'observed_min': 0, 'observed_max': 1})
            else:
                fmin, fmax = original_X[feat].min(), original_X[feat].max()
                margin = 0.1 * (fmax - fmin)
                bounds.append({'feature': feat, 'type': 'continuous', 'min': fmin - margin, 
                               'max': fmax + margin, 'observed_min': fmin, 'observed_max': fmax})
        
        self.bo_bounds_df = pd.DataFrame(bounds)
        
        if verbose:
            print("\nSearch Space Bounds:")
            print(self.bo_bounds_df.to_string(index=False))
            
            print("\n" + "-" * 40)
            print("EFFECT DIRECTIONS:")
            for feat in self.selected_features:
                row = self.consensus[self.consensus['feature'] == feat].iloc[0]
                direction = row['correlation']
                if self.maximize_response:
                    suggest = "HIGH" if direction > 0 else "LOW"
                else:
                    suggest = "LOW" if direction > 0 else "HIGH"
                print(f"  {feat}: {'Positive' if direction > 0 else 'Negative'} effect → Suggest {suggest}")
        
        return self.bo_bounds_df
    
    def export_for_bo(self, original_X, original_y, output_prefix='bo', verbose=True):
        """Export data for Bayesian Optimization."""
        if verbose:
            print("\n" + "=" * 60)
            print("EXPORT FOR BAYESIAN OPTIMIZATION")
            print("=" * 60)
        
        bo_data = original_X[self.selected_features].copy()
        bo_data[self.response_column] = original_y.values
        
        best_idx = original_y.idxmax() if self.maximize_response else original_y.idxmin()
        best_val = original_y.max() if self.maximize_response else original_y.min()
        
        if verbose:
            print(f"\nInitial Data:")
            print(f"  Samples: {len(bo_data)}")
            print(f"  Features: {len(self.selected_features)}")
            
            print(f"\nBest Observed {'Maximum' if self.maximize_response else 'Minimum'}:")
            print(f"  {self.response_column} = {best_val:.4f}")
            print(f"  Conditions:")
            for feat in self.selected_features:
                print(f"    {feat}: {original_X.loc[best_idx, feat]:.4f}")
        
        self.bo_bounds_df.to_csv(f'{output_prefix}_bounds.csv', index=False)
        bo_data.to_csv(f'{output_prefix}_initial_data.csv', index=False)
        
        feature_info = self.consensus[self.consensus['feature'].isin(self.selected_features)][
            ['feature', 'type', 'correlation', 'VIP', 'has_interaction']
        ].reset_index(drop=True)
        feature_info.to_csv(f'{output_prefix}_feature_info.csv', index=False)
        
        selected_binary = [f for f in self.selected_features if f in self.binary_features]
        if selected_binary:
            mappings = [{'feature': f, 'value_0': list(self.binary_mappings[f].keys())[0],
                         'value_1': list(self.binary_mappings[f].keys())[1]} for f in selected_binary]
            pd.DataFrame(mappings).to_csv(f'{output_prefix}_binary_mappings.csv', index=False)
        
        if verbose:
            print(f"\n✓ Saved: {output_prefix}_bounds.csv")
            print(f"✓ Saved: {output_prefix}_initial_data.csv")
            print(f"✓ Saved: {output_prefix}_feature_info.csv")
            if selected_binary:
                print(f"✓ Saved: {output_prefix}_binary_mappings.csv")
        
        return bo_data, best_idx, best_val
    
    # =========================================================================
    # PLOTTING METHODS
    # =========================================================================
    
    def plot_feature_vs_response(self, X, y):
        """Draw one panel per feature showing how it relates to the response.

        Binary features get a box plot of the response at each level (0 and 1);
        continuous features get a scatter plot with a fitted straight line.
        Each panel title shows the feature/response Pearson correlation.
        Panels are laid out at most four per row and unused ones are hidden.

        Parameters:
        -----------
        X : feature DataFrame; binary columns are expected to hold 0/1
        y : response Series aligned with X
        """
        rule = "=" * 60
        print("\n" + rule)
        print("VISUAL INSPECTION: Feature vs Response")
        print(rule)

        n_feat = len(self.feature_cols)
        n_cols = min(4, n_feat)
        n_rows = int(np.ceil(n_feat / n_cols))

        fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3.5*n_rows))
        # With a single feature plt.subplots returns one Axes, not an array.
        panels = [axes] if n_feat <= 1 else axes.flatten()

        for col, ax in zip(self.feature_cols, panels):
            values = X[col]
            corr = values.corr(y)

            if col in self.binary_features:
                for level in [0, 1]:
                    ax.boxplot([y[values == level]], positions=[level], widths=0.6)
                ax.set_xticks([0, 1])
                ax.set_xticklabels(['0', '1'])
                ax.set_xlabel(f'{col} (binary)')
            else:
                ax.scatter(values, y, alpha=0.6, edgecolors='black', linewidth=0.5)
                # Degree-1 least-squares fit drawn across the observed range.
                trend = np.poly1d(np.polyfit(values, y, 1))
                x_line = np.linspace(values.min(), values.max(), 100)
                ax.plot(x_line, trend(x_line), 'r--', linewidth=2)
                ax.set_xlabel(col)

            ax.set_ylabel(self.response_column)
            ax.set_title(f'r = {corr:.3f}', fontsize=10)

        # Hide grid cells beyond the last feature.
        for spare in panels[n_feat:]:
            spare.set_visible(False)

        plt.suptitle(f'Features vs {self.response_column}', fontsize=12, y=1.02)
        plt.tight_layout()
        plt.show()
        print("Look for: linear trends, non-linear patterns, outliers")
    
    def plot_response_distribution(self, y):
        """Show a histogram and a box plot of the response, then print statistics.

        The histogram marks the mean and the median with dashed lines. The
        printed statistics are min, max, mean, median and standard deviation.

        Parameters:
        -----------
        y : response Series
        """
        rule = "=" * 60
        print("\n" + rule)
        print("RESPONSE DISTRIBUTION")
        print(rule)

        mean_val, median_val = y.mean(), y.median()

        fig, axes = plt.subplots(1, 2, figsize=(12, 4))
        hist_ax, box_ax = axes[0], axes[1]

        # Left panel: histogram with mean/median markers.
        hist_ax.hist(y, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
        hist_ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')
        hist_ax.axvline(median_val, color='orange', linestyle='--', linewidth=2, label=f'Median: {median_val:.2f}')
        hist_ax.set_xlabel(self.response_column)
        hist_ax.set_ylabel('Frequency')
        hist_ax.set_title('Response Distribution')
        hist_ax.legend()

        # Right panel: box plot of the same values.
        box_ax.boxplot(y, vert=True)
        box_ax.set_ylabel(self.response_column)
        box_ax.set_title('Response Box Plot')

        plt.tight_layout()
        plt.show()

        low, high, spread = y.min(), y.max(), y.std()
        print(f"Response statistics:")
        print(f"  Min:    {low:.4f}")
        print(f"  Max:    {high:.4f}")
        print(f"  Mean:   {mean_val:.4f}")
        print(f"  Median: {median_val:.4f}")
        print(f"  Std:    {spread:.4f}")
    
    def plot_correlation(self, corr_df):
        """
        Plot feature-response correlations as two horizontal bar charts.

        Left panel: absolute correlation, coloured by sign and feature type,
        with the strong/moderate thresholds drawn as dashed lines.
        Right panel: signed correlation, coloured by sign only.

        Parameters:
        -----------
        corr_df : pd.DataFrame - reads columns 'feature', 'correlation',
            'abs_corr' and 'type'
        """
        fig, (magnitude_ax, signed_ax) = plt.subplots(1, 2, figsize=(14, 5))

        # Key: (correlation > 0, type == 'continuous').
        palette = {
            (True, True): 'forestgreen',
            (True, False): 'steelblue',
            (False, True): 'crimson',
            (False, False): 'darkorange',
        }
        magnitude_colors = [
            palette[(bool(row['correlation'] > 0), bool(row['type'] == 'continuous'))]
            for _, row in corr_df.iterrows()
        ]

        # Everything is reversed so the first row of corr_df is the top bar.
        magnitude_ax.barh(corr_df['feature'][::-1], corr_df['abs_corr'][::-1],
                          color=magnitude_colors[::-1])
        thresholds = (
            (self.correlation_strong, 'green', 2, 'Strong'),
            (self.correlation_moderate, 'orange', 1.5, 'Moderate'),
        )
        for cutoff, line_color, width, label in thresholds:
            magnitude_ax.axvline(x=cutoff, color=line_color, linestyle='--',
                                 linewidth=width, label=f'{label} ({cutoff})')
        magnitude_ax.set_xlabel('|Correlation|')
        magnitude_ax.set_title('Feature-Response Correlation\n(Green/Blue=Positive, Red/Orange=Negative)')
        magnitude_ax.legend(loc='lower right')

        sign_palette = {True: 'forestgreen', False: 'crimson'}
        signed_colors = [sign_palette[bool(c > 0)] for c in corr_df['correlation']]
        signed_ax.barh(corr_df['feature'][::-1], corr_df['correlation'][::-1],
                       color=signed_colors[::-1])
        signed_ax.axvline(x=0, color='black', linewidth=1)
        signed_ax.set_xlabel('Correlation (with sign)')
        signed_ax.set_title('Direction of Effect\n(Green=Positive, Red=Negative)')

        plt.tight_layout()
        plt.show()
    
    def plot_lasso(self, lasso_df):
        """
        Plot absolute Lasso coefficients as a horizontal bar chart.

        Bars are green where the 'selected' column is truthy and gray
        otherwise.

        Parameters:
        -----------
        lasso_df : pd.DataFrame - reads columns 'feature', 'abs_coef' and
            'selected'
        """
        status_color = {True: 'forestgreen', False: 'lightgray'}
        bar_colors = [status_color[bool(flag)] for flag in lasso_df['selected']]

        fig, ax = plt.subplots(figsize=(10, 6))
        # Reversed so the first row of lasso_df is drawn as the top bar.
        ax.barh(lasso_df['feature'][::-1], lasso_df['abs_coef'][::-1],
                color=bar_colors[::-1])
        ax.set_xlabel('|Coefficient|')
        ax.set_title('Lasso Coefficients\n(Green = Selected, Gray = Eliminated)')
        plt.tight_layout()
        plt.show()
    
    def plot_pls(self, pls_df, cv_scores):
        """
        Plot PLS results: VIP scores and the component-count CV curve.

        Left panel: VIP score per feature, coloured against the
        ``vip_important`` / ``vip_moderate`` thresholds.
        Right panel: CV score versus number of components, with a vertical
        line at the position of the highest score.

        Parameters:
        -----------
        pls_df : pd.DataFrame - reads columns 'feature' and 'VIP'
        cv_scores : sequence of CV scores; entry i belongs to i+1 components
        """
        def vip_color(score):
            # Highest threshold first, so each score gets its best band.
            if score >= self.vip_important:
                return 'darkgreen'
            if score >= self.vip_moderate:
                return 'orange'
            return 'lightcoral'

        fig, (vip_ax, cv_ax) = plt.subplots(1, 2, figsize=(14, 5))

        bar_colors = [vip_color(score) for score in pls_df['VIP']]
        vip_ax.barh(pls_df['feature'][::-1], pls_df['VIP'][::-1], color=bar_colors[::-1])
        for cutoff, line_color, width, label in (
                (self.vip_important, 'green', 2, 'Important'),
                (self.vip_moderate, 'orange', 1.5, 'Moderate')):
            vip_ax.axvline(x=cutoff, color=line_color, linestyle='--',
                           linewidth=width, label=f'{label} ({cutoff})')
        vip_ax.set_xlabel('VIP Score')
        vip_ax.set_title('PLS Variable Importance in Projection')
        vip_ax.legend()

        component_counts = range(1, len(cv_scores) + 1)
        cv_ax.plot(component_counts, cv_scores, 'bo-', linewidth=2, markersize=8)
        # argmax is 0-based while component counts start at 1.
        best_count = np.argmax(cv_scores) + 1
        cv_ax.axvline(x=best_count, color='red', linestyle='--',
                      label=f'Optimal = {best_count}')
        cv_ax.set_xlabel('Number of Components')
        cv_ax.set_ylabel('CV R²')
        cv_ax.set_title('PLS Component Selection')
        cv_ax.legend()
        cv_ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()
    
    def plot_interactions(self, interaction_df):
        """
        Plot interaction strengths as a horizontal bar chart.

        Bars strictly above ``self.interaction_threshold`` are red, the rest
        blue. Prints a message and returns without plotting when the table
        has no rows.

        Parameters:
        -----------
        interaction_df : pd.DataFrame - reads columns 'interaction' and
            'strength'
        """
        if not len(interaction_df):
            print("No interactions to plot")
            return

        cutoff = self.interaction_threshold
        flag_color = {True: 'crimson', False: 'steelblue'}
        bar_colors = [flag_color[bool(value > cutoff)]
                      for value in interaction_df['strength']]

        fig, ax = plt.subplots(figsize=(10, 5))
        # Reversed so the first row of interaction_df is drawn as the top bar.
        ax.barh(interaction_df['interaction'][::-1], interaction_df['strength'][::-1],
                color=bar_colors[::-1])
        ax.axvline(x=cutoff, color='red', linestyle='--', linewidth=2,
                   label=f'Threshold ({cutoff})')
        ax.set_xlabel('Interaction Strength')
        ax.set_title('Interaction Screening\n(Red = Potential Interaction)')
        ax.legend()
        plt.tight_layout()
        plt.show()
    
    def plot_multicollinearity(self, feature_corr):
        """
        Draw an annotated heatmap of the feature-feature correlation matrix.

        Only the strictly lower triangle is shown; the diagonal and the
        upper triangle are masked out.

        Parameters:
        -----------
        feature_corr : square correlation matrix, passed to seaborn's heatmap
        """
        # True marks a hidden cell: the diagonal (k=0) and everything above it.
        hidden = np.triu(np.ones(np.shape(feature_corr), dtype=bool))

        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(feature_corr, annot=True, cmap='RdBu_r', center=0, fmt='.2f',
                    mask=hidden, square=True, linewidths=0.5, ax=ax)
        ax.set_title('Feature-Feature Correlations\n(Check for multicollinearity)')
        plt.tight_layout()
        plt.show()
    
    def plot_consensus(self):
        """
        Visualise the consensus table stored in ``self.consensus``.

        Prints a section banner and draws a 2x2 figure:
          top-left     - heatmap of the per-method ranks
          top-right    - value of 'methods_top3' per feature
          bottom-left  - average rank, coloured by feature type
          bottom-right - |correlation| vs VIP scatter with threshold lines

        Reads consensus columns 'feature', 'rank_corr', 'rank_lasso',
        'rank_pls', 'methods_top3', 'avg_rank', 'type', 'correlation',
        'VIP' and 'has_interaction'.
        """
        rule = "=" * 60
        print("\n" + rule)
        print("CONSENSUS VISUALIZATION")
        print(rule)

        consensus = self.consensus
        # Reversed so the first consensus row is drawn as the top bar.
        names_top_down = consensus['feature'][::-1]
        type_color = {True: 'steelblue', False: 'forestgreen'}  # key: is binary

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        (rank_ax, agree_ax), (avg_ax, scatter_ax) = axes

        # 1. Heatmap of rankings
        rank_table = (
            consensus.set_index('feature')[['rank_corr', 'rank_lasso', 'rank_pls']]
            .rename(columns={'rank_corr': 'Correlation',
                             'rank_lasso': 'Lasso',
                             'rank_pls': 'PLS'})
        )
        sns.heatmap(rank_table, annot=True, fmt='.0f', cmap='RdYlGn_r', ax=rank_ax,
                    cbar_kws={'label': 'Rank (lower=better)'})
        rank_ax.set_title('Rankings Across Methods')

        # 2. Method agreement
        def agreement_color(count):
            if count >= 3:
                return 'darkgreen'
            if count >= 2:
                return 'orange'
            return 'lightcoral'

        agreement_colors = [agreement_color(n) for n in consensus['methods_top3']]
        agree_ax.barh(names_top_down, consensus['methods_top3'][::-1],
                      color=agreement_colors[::-1])
        agree_ax.axvline(x=2, color='orange', linestyle='--', linewidth=2)
        agree_ax.set_xlabel('Methods Ranking Feature in Top 3')
        agree_ax.set_title('Method Agreement\n(Green=3/3, Orange=2/3, Red=1/3 or less)')

        # 3. Average rank
        rank_colors = [type_color[kind == 'binary'] for kind in consensus['type']]
        avg_ax.barh(names_top_down, consensus['avg_rank'][::-1], color=rank_colors[::-1])
        avg_ax.set_xlabel('Average Rank (lower = better)')
        avg_ax.set_title('Consensus Ranking\n(Green=Continuous, Blue=Binary)')
        avg_ax.invert_xaxis()

        # 4. Correlation vs VIP
        marker_for = {True: 's', False: 'o'}  # key: has an interaction
        for _, entry in consensus.iterrows():
            point = (abs(entry['correlation']), entry['VIP'])
            scatter_ax.scatter(point[0], point[1],
                               c=type_color[entry['type'] == 'binary'], s=100,
                               marker=marker_for[bool(entry['has_interaction'])],
                               edgecolors='black', linewidth=0.5)
            scatter_ax.annotate(entry['feature'], point,
                                fontsize=8, ha='left', va='bottom')

        scatter_ax.axhline(y=self.vip_important, color='green', linestyle='--', alpha=0.7,
                           label=f'VIP={self.vip_important}')
        scatter_ax.axvline(x=self.correlation_strong, color='blue', linestyle='--', alpha=0.7,
                           label=f'|r|={self.correlation_strong}')
        scatter_ax.set_xlabel('|Correlation|')
        scatter_ax.set_ylabel('VIP Score')
        scatter_ax.set_title('Correlation vs VIP\n(Square=Has Interaction)')
        scatter_ax.legend(loc='lower right')

        plt.tight_layout()
        plt.show()
    
    def plot_validation(self, loo_preds, loo_actual, metrics):
        """
        Plot LOO-CV validation diagnostics.

        Draws three panels: predicted vs actual with the identity line, a
        histogram of residuals (actual - predicted), and residuals vs
        predicted.

        Parameters:
        -----------
        loo_preds : array-like - predicted responses
        loo_actual : array-like - observed responses, in the same order as
            loo_preds
        metrics : dict - must contain key 'r2' (shown in the first title)
        """
        # Pair values strictly by position. Subtracting two pandas Series
        # aligns on index labels, so a shuffled training index on one side
        # and a default index on the other would give mispaired or NaN
        # residuals, while the scatter below always pairs by position.
        # Converting to flat arrays keeps all three panels consistent and
        # also accepts plain lists.
        actual = np.asarray(loo_actual, dtype=float).ravel()
        predicted = np.asarray(loo_preds, dtype=float).ravel()
        residuals = actual - predicted

        fig, axes = plt.subplots(1, 3, figsize=(15, 4))

        ax1 = axes[0]
        ax1.scatter(actual, predicted, alpha=0.7, edgecolors='black', linewidth=0.5)
        # The identity line spans the combined range of both value sets.
        min_val = min(actual.min(), predicted.min())
        max_val = max(actual.max(), predicted.max())
        ax1.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='Perfect')
        ax1.set_xlabel('Actual')
        ax1.set_ylabel('Predicted')
        ax1.set_title(f'LOO-CV: R² = {metrics["r2"]:.4f}')
        ax1.legend()

        ax2 = axes[1]
        ax2.hist(residuals, bins=12, edgecolor='black', alpha=0.7)
        ax2.axvline(x=0, color='red', linestyle='--', linewidth=2)
        ax2.set_xlabel('Residual (Actual - Predicted)')
        ax2.set_ylabel('Frequency')
        ax2.set_title(f'Residuals: Mean={residuals.mean():.3f}')

        ax3 = axes[2]
        ax3.scatter(predicted, residuals, alpha=0.7, edgecolors='black', linewidth=0.5)
        ax3.axhline(y=0, color='red', linestyle='--', linewidth=2)
        ax3.set_xlabel('Predicted')
        ax3.set_ylabel('Residual')
        ax3.set_title('Residuals vs Predicted')

        plt.tight_layout()
        plt.show()
    
    def plot_all(self, results):
        """
        Render every diagnostic plot, in pipeline order.

        Parameters:
        -----------
        results : dict - reads keys 'X_train', 'y_train', 'corr_df',
            'lasso_df', 'pls_df', 'cv_scores', 'interaction_df',
            'feature_corr', 'loo_preds', 'loo_actual' and 'metrics'
        """
        self.plot_feature_vs_response(results['X_train'], results['y_train'])
        self.plot_response_distribution(results['y_train'])
        self.plot_correlation(results['corr_df'])
        self.plot_lasso(results['lasso_df'])
        self.plot_pls(results['pls_df'], results['cv_scores'])

        # The interaction plot is skipped entirely when the table has no rows.
        interaction_table = results['interaction_df']
        if len(interaction_table) > 0:
            self.plot_interactions(interaction_table)

        self.plot_multicollinearity(results['feature_corr'])
        self.plot_consensus()
        self.plot_validation(results['loo_preds'], results['loo_actual'],
                             results['metrics'])
    
    # =========================================================================
    # MAIN FIT METHOD
    # =========================================================================
    
    def fit(self, df, feature_list, manual_features=None, verbose=True, plot=True):
        """
        Execute the full Phase 1 screening workflow.

        Steps, in order: classify features, prepare the train/test data, run
        the correlation, Lasso, PLS, interaction and multicollinearity
        analyses on the training data, build the consensus table, choose
        features (manual override or automatic recommendation), validate the
        choice and generate the BO bounds.

        Parameters:
        -----------
        df : pd.DataFrame - Input data (a copy is passed on to the pipeline)
        feature_list : list - List of feature column names
        manual_features : list or None - If provided, override automatic selection
        verbose : bool - Print detailed output
        plot : bool - Generate plots

        Returns:
        --------
        results : dict holding the analysis tables, the train/test data, the
            selected features, the validation metrics/predictions/issues and
            the fitted Lasso alpha / PLS component count
        """
        if verbose:
            rule = "=" * 70
            print(rule)
            print("PHASE 1: FEATURE SCREENING FOR BAYESIAN OPTIMIZATION")
            print(rule)

        # --- Classification and data preparation (on a copy of df) ---
        encoded_df = self.classify_features(df.copy(), feature_list, verbose)
        (X_train, X_test, X_train_model, X_test_model,
         y_train, y_test) = self.prepare_data(encoded_df, verbose)

        # --- Screening methods: training data only ---
        corr_df = self.compute_correlations(X_train, y_train, verbose)
        lasso_df, lasso_alpha = self.run_lasso(X_train_model, y_train, verbose)
        pls_df, optimal_comp, cv_scores = self.run_pls(X_train_model, y_train, verbose)
        interaction_outcome = self.screen_interactions(X_train, y_train, corr_df, verbose)
        interaction_df, strong_interactions, features_with_interactions = interaction_outcome
        high_corr_pairs, feature_corr = self.check_multicollinearity(X_train, verbose)

        # --- Consensus, then manual or automatic feature choice ---
        self.build_consensus(corr_df, lasso_df, pls_df, verbose)
        if manual_features is None:
            self.recommend_features(verbose)
        else:
            self.set_features_manual(manual_features, verbose)

        # --- Validation of the chosen features and BO bounds ---
        metrics, loo_preds, loo_actual, issues = self.validate_selection(
            X_train, X_train_model, y_train, verbose)
        self.generate_bo_bounds(self.original_X_train, verbose)

        results = dict(
            corr_df=corr_df,
            lasso_df=lasso_df,
            pls_df=pls_df,
            interaction_df=interaction_df,
            strong_interactions=strong_interactions,
            features_with_interactions=features_with_interactions,
            high_corr_pairs=high_corr_pairs,
            feature_corr=feature_corr,
            consensus=self.consensus,
            selected_features=self.selected_features,
            metrics=metrics,
            loo_preds=loo_preds,
            loo_actual=loo_actual,
            validation_issues=issues,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            X_train_model=X_train_model,
            X_test_model=X_test_model,
            lasso_alpha=lasso_alpha,
            pls_optimal_comp=optimal_comp,
            cv_scores=cv_scores,
        )

        if plot:
            self.plot_all(results)

        if verbose:
            self._print_final_summary(results)

        return results
    
    def rerun_with_features(self, results, new_features, verbose=True, plot=True):
        """
        Re-run validation with different feature selection.
        
        Parameters:
        -----------
        results : dict - Results from previous fit()
        new_features : list - New list of features to use
        verbose : bool - Print detailed output
        plot : bool - Generate validation plot
        
        Returns:
        --------
        Updated results dict
        """
        self.set_features_manual(new_features, verbose)
        
        metrics, loo_preds, loo_actual, issues = self.validate_selection(
            results['X_train'], results['X_train_model'], results['y_train'], verbose)
        self.generate_bo_bounds(self.original_X_train, verbose)
        
        results['selected_features'] = self.selected_features
        results['metrics'] = metrics
        results['loo_preds'] = loo_preds
        results['loo_actual'] = loo_actual
        results['validation_issues'] = issues
        
        if plot:
            self.plot_validation(loo_preds, loo_actual, metrics)
        
        if verbose:
            self._print_final_summary(results)
        
        return results
    
    def _print_final_summary(self, results):
        """Print final summary."""
        print("\n" + "=" * 70)
        print("PHASE 1 COMPLETE: FEATURE SCREENING SUMMARY")
        print("=" * 70)
        
        print(f"""
DATA:
  Train samples: {len(results['X_train'])}
  Test samples: {len(results['X_test'])}
  Total features: {len(self.feature_cols)}
    - Binary: {len(self.binary_features)}
    - Continuous: {len(self.continuous_features)}

METHODS USED:
  1. Pearson Correlation
  2. Lasso Regression (alpha={results['lasso_alpha']:.4f})
  3. PLS with VIP Scores ({results['pls_optimal_comp']} components)
  4. Interaction Screening
  5. Multicollinearity Check

SELECTED FOR BAYESIAN OPTIMIZATION ({len(self.selected_features)}):
""")
        
        for feat in self.selected_features:
            row = self.consensus[self.consensus['feature'] == feat].iloc[0]
            bounds = self.bo_bounds_df[self.bo_bounds_df['feature'] == feat].iloc[0]
            int_flag = " ⚡" if row['has_interaction'] else ""
            print(f"  • {feat} ({row['type']}){int_flag}")
            print(f"      Correlation: {row['correlation']:+.3f}")
            print(f"      VIP Score: {row['VIP']:.2f}")
            print(f"      Bounds: [{bounds['min']:.2f}, {bounds['max']:.2f}]")
        
        if self.features_with_interactions:
            print(f"\n  ⚡ = Part of detected interaction")
        
        if results['validation_issues']:
            print(f"\n⚠️ VALIDATION ISSUES:")
            for issue in results['validation_issues']:
                print(f"    • {issue}")
        
        print(f"""
VALIDATION:
  LOO-CV R²: {results['metrics']['r2']:.4f}
  LOO-CV RMSE: {results['metrics']['rmse']:.4f}
  LOO-CV MAE: {results['metrics']['mae']:.4f}

NEXT STEPS (Phase 2: Bayesian Optimization):
  1. Load bo_initial_data.csv as initial training data
  2. Use bo_bounds.csv for search space
  3. Fit GP surrogate model
  4. Run acquisition function loop (EI or UCB)
  5. Expected iterations: {20*len(self.selected_features)}-{40*len(self.selected_features)}
""")
        
        print("=" * 70)
        print("READY FOR PHASE 2: BAYESIAN OPTIMIZATION")
        print("=" * 70)


# =============================================================================
# CONVENIENCE FUNCTION
# =============================================================================

def run_feature_screening(df, feature_list, response_column, target_features=4, 
                          maximize_response=False, test_size=0.2, 
                          manual_features=None, export=True, plot=True,
                          verbose=True, **kwargs):
    """
    One-call wrapper: build a FeatureScreener, fit it, optionally export.

    Parameters:
    -----------
    df : pd.DataFrame - Input data
    feature_list : list - List of feature column names
    response_column : str - Name of response variable
    target_features : int - Number of features to select (default: 4)
    maximize_response : bool - True if higher response is better (default: False)
    test_size : float - Proportion for test set (default: 0.2)
    manual_features : list or None - Override automatic selection with these features
    export : bool - Whether to call export_for_bo() after fitting (default: True)
    plot : bool - Whether to generate plots (default: True)
    verbose : bool - Whether to print detailed output (default: True)
    **kwargs : Forwarded unchanged to the FeatureScreener constructor:
        - correlation_strong (default: 0.4)
        - correlation_moderate (default: 0.2)
        - vip_important (default: 1.0)
        - vip_moderate (default: 0.8)
        - multicollinearity_threshold (default: 0.7)
        - interaction_threshold (default: 0.3)
        - random_state (default: 42)

    Returns:
    --------
    screener : the fitted FeatureScreener object
    results : dict returned by FeatureScreener.fit()

    Examples:
    ---------
    # Automatic selection
    screener, results = run_feature_screening(
        df=df_total,
        feature_list=feature_list,
        response_column="Downy Leak",
        target_features=4
    )

    # Manual override
    screener, results = run_feature_screening(
        df=df_total,
        feature_list=feature_list,
        response_column="Downy Leak",
        manual_features=['Feature_A', 'Feature_B', 'Feature_C']
    )

    # Re-run with different features after initial fit
    results = screener.rerun_with_features(results, ['New_Feature_1', 'New_Feature_2'])
    """
    screener = FeatureScreener(
        response_column=response_column,
        target_features=target_features,
        maximize_response=maximize_response,
        test_size=test_size,
        **kwargs
    )
    results = screener.fit(
        df,
        feature_list,
        manual_features=manual_features,
        verbose=verbose,
        plot=plot,
    )

    if not export:
        return screener, results

    # Export uses train rows followed by test rows, renumbered from zero.
    full_X = pd.concat(
        [screener.original_X_train, screener.original_X_test], ignore_index=True)
    full_y = pd.concat(
        [screener.original_y_train, screener.original_y_test], ignore_index=True)
    screener.export_for_bo(full_X, full_y, verbose=verbose)

    return screener, results

In [None]:
# Basic usage: automatic feature selection.
# Run after Cells 1-6 (data loading and configuration), which are expected to
# define df_total, feature_list, RESPONSE_COLUMN, TARGET_FEATURES and
# MAXIMIZE_RESPONSE. export keeps its default (True), so export_for_bo() is
# called after fitting.

screener, results = run_feature_screening(
    df=df_total,
    feature_list=feature_list,
    response_column=RESPONSE_COLUMN,
    target_features=TARGET_FEATURES,
    maximize_response=MAXIMIZE_RESPONSE
)

In [None]:
# Manual override: manual_features replaces the automatic recommendation.
screener, results = run_feature_screening(
    df=df_total,
    feature_list=feature_list,
    response_column=RESPONSE_COLUMN,
    manual_features=['example']  # Placeholder: replace with your chosen feature names
)

In [None]:
# Workflow: run automatically, review, re-validate a manual selection, export.

# First run automatic selection to see the recommendations
screener, results = run_feature_screening(
    df=df_total,
    feature_list=feature_list,
    response_column=RESPONSE_COLUMN,
    target_features=4,
    export=False  # Don't export yet
)

# Review results, then manually select different features
# (the names below are placeholders)
results = screener.rerun_with_features(
    results, 
    new_features=['Feature_A', 'Feature_B', 'Feature_C', 'Feature_D']
)

# Now export with your manual selection; train and test rows are recombined
# the same way run_feature_screening does when export=True
original_X = pd.concat([screener.original_X_train, screener.original_X_test]).reset_index(drop=True)
original_y = pd.concat([screener.original_y_train, screener.original_y_test]).reset_index(drop=True)
screener.export_for_bo(original_X, original_y)

In [None]:
# Quiet mode: no printed report and no plots (export still defaults to True)
screener, results = run_feature_screening(
    df=df_total,
    feature_list=feature_list,
    response_column=RESPONSE_COLUMN,
    target_features=4,
    verbose=False,
    plot=False
)

In [None]:
# Custom thresholds: the extra keyword arguments are forwarded to FeatureScreener.
screener, results = run_feature_screening(
    df=df_total,
    feature_list=feature_list,
    response_column=RESPONSE_COLUMN,
    target_features=4,
    correlation_strong=0.5,        # Stricter correlation threshold (default 0.4)
    correlation_moderate=0.3,      # (default 0.2)
    vip_important=1.2,             # Stricter VIP threshold (default 1.0)
    vip_moderate=0.9,              # (default 0.8)
    multicollinearity_threshold=0.6,  # Stricter multicollinearity (default 0.7)
    interaction_threshold=0.4,     # (default 0.3)
    test_size=0.25                 # Larger test set (default 0.2)
)

In [None]:
# Access results after a screening run.
# State lives both on the screener object and in the results dict.

# Selected features
print(screener.selected_features)

# BO bounds
print(screener.bo_bounds_df)

# Consensus ranking
print(results['consensus'])

# Validation metrics
print(f"LOO R²: {results['metrics']['r2']:.4f}")

# Features with interactions
print(screener.features_with_interactions)

# Binary feature mappings
print(screener.binary_mappings)

# Check for validation issues (empty/falsy when none were found)
if results['validation_issues']:
    print("Issues found:", results['validation_issues'])

In [None]:
# Generate individual plots
# Run without plots first
screener, results = run_feature_screening(
    df=df_total,
    feature_list=feature_list,
    response_column=RESPONSE_COLUMN,
    plot=False
)

# Then generate specific plots as needed; each takes its inputs from results
screener.plot_correlation(results['corr_df'])
screener.plot_lasso(results['lasso_df'])
screener.plot_pls(results['pls_df'], results['cv_scores'])
screener.plot_consensus()  # reads screener.consensus, so it takes no argument
screener.plot_validation(results['loo_preds'], results['loo_actual'], results['metrics'])

# Or generate all plots at once
screener.plot_all(results)

In [None]:
# Workflow: compare several manual feature combinations before exporting.
# Feature names below ('Feat_A', ...) are placeholders.

# Step 1: Run initial screening (no export until a combination is chosen)
screener, results = run_feature_screening(
    df=df_total,
    feature_list=feature_list,
    response_column=RESPONSE_COLUMN,
    target_features=5,
    export=False
)

# Step 2: View consensus to decide
# NOTE(review): assumes the consensus table has 'selected' and 'score'
# columns - confirm against build_consensus / recommend_features.
print("\nConsensus ranking:")
print(results['consensus'][['feature', 'correlation', 'VIP', 'selected', 'score']])

# Step 3: Try different combinations
# Each call re-validates and overwrites the metrics stored in results.
print("\n--- Trying combination 1 ---")
results = screener.rerun_with_features(results, ['Feat_A', 'Feat_B', 'Feat_C'], plot=False)
print(f"R² = {results['metrics']['r2']:.4f}")

print("\n--- Trying combination 2 ---")
results = screener.rerun_with_features(results, ['Feat_A', 'Feat_D', 'Feat_E'], plot=False)
print(f"R² = {results['metrics']['r2']:.4f}")

# Step 4: Finalize with best combination
# Re-run it last so the screener state and results reflect the final choice.
results = screener.rerun_with_features(results, ['Feat_A', 'Feat_B', 'Feat_C'], plot=True)

# Step 5: Export (train rows followed by test rows, index renumbered)
original_X = pd.concat([screener.original_X_train, screener.original_X_test]).reset_index(drop=True)
original_y = pd.concat([screener.original_y_train, screener.original_y_test]).reset_index(drop=True)
screener.export_for_bo(original_X, original_y)