In [None]:
# =============================================================================
# CELL 1: IMPORTS
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.metrics import r2_score

# Seed reused by later cells; also seeds NumPy's legacy global RNG here.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Title banner: rule, title, rule — emitted by one print call.
print("=" * 70, "ML FEATURE SCREENING FOR BAYESIAN OPTIMIZATION", "=" * 70, sep="\n")

# Scope statement: what this notebook produces and when it is appropriate.
print("""
Purpose: Screen features from observational data before BO

Workflow:
  1. This pipeline → Select 3-5 important features
  2. Bayesian Optimization → Find optimal values for selected features

Appropriate when:
  - Data is observational (not designed factorial)
  - ~20-50 initial experiments
  - Want quick screening without formal DOE inference
""")

In [None]:
# =============================================================================
# CELL 2: CONFIGURATION
# =============================================================================

# Input data: Excel file to read, the column to optimise, and columns that
# must never be treated as features (identifiers, bookkeeping, ...).
FILE_PATH = "your_data.xlsx"
RESPONSE_COLUMN = "yield"
EXCLUDE_COLUMNS = ["experiment_id"]

# Feature selection targets
TARGET_FEATURES_FOR_BO = 4         # Aim for 3-5 features
MIN_FEATURES = 3                   # Don't go below this
MAX_FEATURES = 6                   # Don't exceed this

# Thresholds
CORRELATION_STRONG = 0.4           # |r| at or above this is labelled 'Strong'
CORRELATION_MODERATE = 0.2         # |r| at or above this is labelled 'Moderate'
VIP_IMPORTANT = 1.0                # PLS VIP at or above this is 'Important'
VIP_MODERATE = 0.8                 # PLS VIP at or above this is 'Moderate'
MULTICOLLINEARITY_THRESHOLD = 0.7  # |r| between two features above this is flagged

# BO Configuration
MAXIMIZE_RESPONSE = True           # True if higher response is better

# Fail fast on inconsistent settings: later cells compare against these values
# without re-checking them, so a bad ordering would only show up as odd results.
if not (1 <= MIN_FEATURES <= TARGET_FEATURES_FOR_BO <= MAX_FEATURES):
    raise ValueError(
        "Expected 1 <= MIN_FEATURES <= TARGET_FEATURES_FOR_BO <= MAX_FEATURES, got "
        f"{MIN_FEATURES}, {TARGET_FEATURES_FOR_BO}, {MAX_FEATURES}"
    )
if not (0 < CORRELATION_MODERATE <= CORRELATION_STRONG <= 1):
    raise ValueError("Expected 0 < CORRELATION_MODERATE <= CORRELATION_STRONG <= 1")
if not (0 < VIP_MODERATE <= VIP_IMPORTANT):
    raise ValueError("Expected 0 < VIP_MODERATE <= VIP_IMPORTANT")
if not (0 < MULTICOLLINEARITY_THRESHOLD <= 1):
    raise ValueError("Expected 0 < MULTICOLLINEARITY_THRESHOLD <= 1")

print("Configuration:")
print(f"  Response: {RESPONSE_COLUMN}")
print(f"  Target features for BO: {TARGET_FEATURES_FOR_BO}")
print(f"  Optimization direction: {'Maximize' if MAXIMIZE_RESPONSE else 'Minimize'}")

In [None]:
# =============================================================================
# CELL 3: LOAD DATA
# =============================================================================

# Load the experiment table from FILE_PATH. If the file does not exist, build
# a small synthetic data set instead so the rest of the notebook can be
# demonstrated end to end.
try:
    df = pd.read_excel(FILE_PATH)
    print(f"✓ Loaded: {df.shape[0]} samples, {df.shape[1]} columns")
except FileNotFoundError:
    # Make the fallback unmistakable: a mistyped path must not be confused
    # with a successful load of real data.
    print(f"⚠️  File not found: {FILE_PATH!r}")
    print("Creating example observational data...")
    print("   (every result below describes SYNTHETIC data, not your experiments)")

    # Reuse the notebook-wide seed rather than a second hard-coded literal.
    np.random.seed(RANDOM_STATE)
    n = 30

    # Ten uniformly sampled process variables plus an identifier column.
    df = pd.DataFrame({
        'experiment_id': range(1, n+1),
        'temperature_C': np.random.uniform(50, 90, n),
        'pressure_bar': np.random.uniform(1, 5, n),
        'pH': np.random.uniform(5, 9, n),
        'concentration_M': np.random.uniform(0.1, 1.0, n),
        'reaction_time_min': np.random.uniform(15, 60, n),
        'catalyst_loading_g': np.random.uniform(0.5, 3.0, n),
        'stirring_rpm': np.random.uniform(200, 600, n),
        'solvent_ratio': np.random.uniform(0.3, 0.7, n),
        'humidity_pct': np.random.uniform(40, 60, n),
        'particle_size_um': np.random.uniform(20, 80, n),
    })

    # True relationships (unknown to experimenter): four main effects, one
    # small temperature × catalyst interaction, Gaussian noise (sd = 4), and
    # the result clipped to the interval [20, 95].
    df['yield'] = (
        35 +
        0.4 * df['temperature_C'] +
        3.0 * df['catalyst_loading_g'] +
        -2.5 * df['pH'] +
        0.15 * df['reaction_time_min'] +
        0.05 * df['temperature_C'] * df['catalyst_loading_g'] / 10 +  # Interaction
        np.random.normal(0, 4, n)
    ).clip(20, 95)

    print(f"✓ Example data created: {n} samples")

print(f"\nColumns: {list(df.columns)}")
df.head()

In [None]:
# =============================================================================
# CELL 4: PREPARE DATA
# =============================================================================

# Split the loaded table into a numeric feature matrix X and the response y.

# Give a readable error instead of a bare KeyError when the configured
# response column is absent from the loaded table.
if RESPONSE_COLUMN not in df.columns:
    raise KeyError(
        f"Response column {RESPONSE_COLUMN!r} not found. "
        f"Available columns: {list(df.columns)}"
    )

# Only numeric columns can be screened; the response and the explicitly
# excluded columns are removed from the candidate list.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in numeric_cols if c not in EXCLUDE_COLUMNS + [RESPONSE_COLUMN]]

# Without this guard the ratio below fails with ZeroDivisionError.
if not feature_cols:
    raise ValueError(
        "No numeric feature columns left after removing the response and "
        f"EXCLUDE_COLUMNS={EXCLUDE_COLUMNS}"
    )

X = df[feature_cols].copy()
y = df[RESPONSE_COLUMN].copy()

# Row/column counts as loaded (taken before any cleaning in later cells).
n_samples = len(X)
n_features = len(feature_cols)

print(f"Features ({n_features}): {feature_cols}")
print(f"Response: {RESPONSE_COLUMN}")
print(f"Samples: {n_samples}")
print(f"Samples/Features ratio: {n_samples/n_features:.1f}")

if n_samples / n_features < 5:
    print("\n⚠️  Low sample-to-feature ratio")
    print("   Using regularized methods only (correlation, Lasso, Ridge, PLS)")

In [None]:
# =============================================================================
# CELL 5: HANDLE MISSING VALUES
# =============================================================================

print("=" * 70)
print("DATA CLEANING")
print("=" * 70)

# Step 1 - drop rows without a response BEFORE imputing, so that rows which
# are about to be discarded cannot influence the feature medians.
missing_y = int(y.isnull().sum())
if missing_y > 0:
    valid_mask = y.notnull()
    X = X.loc[valid_mask]
    y = y.loc[valid_mask]
    print(f"✓ Removed {missing_y} rows with missing response")

# Step 2 - fill remaining gaps in the features with the column median.
missing_X = X.isnull().sum()

if missing_X.sum() > 0:
    print(f"Missing in features: {missing_X[missing_X > 0].to_dict()}")

    # SimpleImputer drops columns that contain no observed value at all, which
    # would make the DataFrame rebuild below fail with a shape mismatch.
    empty_cols = missing_X[missing_X == len(X)].index.tolist()
    if empty_cols:
        raise ValueError(
            f"Columns with no observed values cannot be imputed: {empty_cols}. "
            "Add them to EXCLUDE_COLUMNS."
        )

    imputer = SimpleImputer(strategy='median')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
    print("✓ Imputed with median")
else:
    print("✓ No missing values in features")

print(f"\nClean dataset: {len(X)} samples")

In [None]:
# =============================================================================
# CELL 6: STORE ORIGINAL RANGES (FOR BO BOUNDS)
# =============================================================================

print("=" * 70, "FEATURE RANGES (Original Scale)", "=" * 70, sep="\n")
print("These will become your BO search bounds\n")

# Per-feature extremes on the original (unscaled) data; reused for the range.
column_min = X.min()
column_max = X.max()

# One row per feature: observed min/max, mean, sample std and span.
feature_ranges = pd.DataFrame({
    'feature': feature_cols,
    'min': column_min.values,
    'max': column_max.values,
    'mean': X.mean().values,
    'std': X.std().values,
    'range': (column_max - column_min).values,
})

print(feature_ranges.to_string(index=False))

# Snapshot of the cleaned, unscaled features for the BO cells further down.
original_X = X.copy()

In [None]:
# =============================================================================
# CELL 7: STANDARDIZE FOR MODELING
# =============================================================================

# Fit the scaler on the cleaned features, then rebuild a DataFrame so the
# scaled matrix keeps the original column names and row index.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

print("✓ Features standardized for modeling\n  (Original ranges preserved for BO bounds)")

In [None]:
# =============================================================================
# CELL 8: VISUAL INSPECTION
# =============================================================================

print("=" * 70, "VISUAL INSPECTION: Feature vs Response", "=" * 70, sep="\n")

# Grid layout: at most four panels per row.
n_cols = min(4, n_features)
n_rows = int(np.ceil(n_features / n_cols))

# squeeze=False always yields a 2-D array of axes, so one ravel() covers the
# single-panel case as well.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3.5*n_rows), squeeze=False)
axes = axes.ravel()

# One scatter panel per feature with a least-squares line and Pearson r.
for ax, col in zip(axes, feature_cols):
    ax.scatter(X[col], y, alpha=0.6, edgecolors='black', linewidth=0.5)

    # Trend line (degree-1 fit evaluated on 100 points across the observed range)
    slope, intercept = np.polyfit(X[col], y, 1)
    x_line = np.linspace(X[col].min(), X[col].max(), 100)
    ax.plot(x_line, slope * x_line + intercept, 'r--', linewidth=2)

    corr = X[col].corr(y)
    ax.set_xlabel(col)
    ax.set_ylabel(RESPONSE_COLUMN)
    ax.set_title(f'r = {corr:.3f}')

# Hide the unused panels at the end of the grid.
for unused_ax in axes[len(feature_cols):]:
    unused_ax.set_visible(False)

plt.suptitle('Feature vs Response (Look for linear and non-linear patterns)', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 9: METHOD 1 - CORRELATION
# =============================================================================

print("=" * 70, "METHOD 1: PEARSON CORRELATION", "=" * 70, sep="\n")

# Pearson r between every feature column and the response.
correlations = X.corrwith(y)

corr_results = pd.DataFrame({
    'feature': X.columns,
    'correlation': correlations.values,
    'abs_correlation': np.abs(correlations.values)
})
# Strongest absolute correlation first; rank 1 is the top row.
corr_results = corr_results.sort_values('abs_correlation', ascending=False).reset_index(drop=True)
corr_results['rank_corr'] = corr_results.index + 1
# Anything that is not strictly positive is labelled 'Negative'.
corr_results['direction'] = [
    'Positive' if value > 0 else 'Negative' for value in corr_results['correlation']
]


def strength(r):
    """Label a correlation by magnitude using the configured cut-offs.

    Returns 'Strong' when |r| >= CORRELATION_STRONG, 'Moderate' when
    |r| >= CORRELATION_MODERATE, and 'Weak' otherwise.
    """
    magnitude = abs(r)
    if magnitude >= CORRELATION_STRONG:
        return 'Strong'
    if magnitude >= CORRELATION_MODERATE:
        return 'Moderate'
    return 'Weak'


corr_results['strength'] = corr_results['correlation'].apply(strength)

print("\nResults:")
print(corr_results[['rank_corr', 'feature', 'correlation', 'direction', 'strength']].to_string(index=False))

# Plot: bars are drawn bottom-to-top, so reverse the table to put rank 1 on top.
plot_rows = corr_results.iloc[::-1]
colors = ['forestgreen' if value > 0 else 'crimson' for value in plot_rows['correlation']]
plt.figure(figsize=(10, 6))
plt.barh(plot_rows['feature'], plot_rows['abs_correlation'], color=colors)
plt.axvline(x=CORRELATION_STRONG, color='green', linestyle='--', label=f'Strong ({CORRELATION_STRONG})')
plt.axvline(x=CORRELATION_MODERATE, color='orange', linestyle='--', label=f'Moderate ({CORRELATION_MODERATE})')
plt.xlabel('|Correlation|')
plt.title('Correlation with Response\n(Green=Positive, Red=Negative)')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 10: METHOD 2 - MUTUAL INFORMATION
# =============================================================================

print("=" * 70, "METHOD 2: MUTUAL INFORMATION", "=" * 70, sep="\n")

# Mutual information between each standardized feature and the response.
mi_scores = mutual_info_regression(X_scaled, y, random_state=RANDOM_STATE)

mi_results = pd.DataFrame({
    'feature': X_scaled.columns,
    'MI_score': mi_scores
})
# Highest score first; rank 1 is the top row.
mi_results = mi_results.sort_values('MI_score', ascending=False).reset_index(drop=True)
mi_results['rank_mi'] = mi_results.index + 1

print("\nResults:")
print(mi_results[['rank_mi', 'feature', 'MI_score']].to_string(index=False))

# Check for non-linear relationships: a feature whose MI rank is at least
# three places better than its correlation rank is flagged.
comparison = pd.merge(
    corr_results[['feature', 'rank_corr']],
    mi_results[['feature', 'rank_mi']],
    on='feature',
)
comparison['rank_diff'] = comparison['rank_corr'] - comparison['rank_mi']
potential_nonlinear = comparison.loc[comparison['rank_diff'] >= 3, 'feature'].tolist()

if potential_nonlinear:
    print(f"\n⚠️  Possible non-linear relationships: {potential_nonlinear}")
    print("   (MI rank much higher than correlation rank)")

# Plot: reverse the table so the best-ranked feature is drawn at the top.
plot_rows = mi_results.iloc[::-1]
plt.figure(figsize=(10, 6))
plt.barh(plot_rows['feature'], plot_rows['MI_score'], color='steelblue')
plt.xlabel('Mutual Information')
plt.title('Mutual Information (Captures Non-linear Relationships)')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 11: METHOD 3 - LASSO
# =============================================================================

print("=" * 70)
print("METHOD 3: LASSO (Automatic Selection)")
print("=" * 70)

# Cross-validated Lasso on the standardized features; the penalty strength is
# chosen by 5-fold CV.
lasso = LassoCV(cv=5, max_iter=10000, random_state=RANDOM_STATE)
lasso.fit(X_scaled, y)

lasso_results = pd.DataFrame({
    'feature': X_scaled.columns,
    'coefficient': lasso.coef_,
    'abs_coefficient': np.abs(lasso.coef_)
}).sort_values('abs_coefficient', ascending=False).reset_index(drop=True)

# Tie-aware ranking. Every eliminated feature has |coefficient| == 0, so a
# positional 1..n rank would hand those tied features distinct ranks in
# arbitrary sort order - and could place an eliminated feature in the "top 3"
# whenever Lasso keeps fewer than three features. method='max' gives all tied
# features the worst rank of their group instead.
lasso_results['rank_lasso'] = (
    lasso_results['abs_coefficient'].rank(method='max', ascending=False).astype(int)
)
lasso_results['selected'] = lasso_results['coefficient'] != 0

print(f"\nLasso alpha: {lasso.alpha_:.4f}")
print(f"Features selected: {lasso_results['selected'].sum()}/{len(lasso_results)}")
print("\nResults:")
print(lasso_results[['rank_lasso', 'feature', 'coefficient', 'selected']].to_string(index=False))

selected_by_lasso = lasso_results[lasso_results['selected']]['feature'].tolist()
print(f"\n✓ Lasso selected: {selected_by_lasso}")

# Plot (table reversed so the largest coefficient is drawn at the top)
plt.figure(figsize=(10, 6))
colors = ['forestgreen' if s else 'lightgray' for s in lasso_results['selected']]
plt.barh(lasso_results['feature'][::-1], lasso_results['abs_coefficient'][::-1], color=colors[::-1])
plt.xlabel('|Coefficient|')
plt.title('Lasso (Green=Selected, Gray=Eliminated)')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 12: METHOD 4 - RIDGE
# =============================================================================

print("=" * 70, "METHOD 4: RIDGE (Stable Coefficients)", "=" * 70, sep="\n")

# Ridge with the penalty picked by 5-fold CV from a log-spaced grid.
ridge = RidgeCV(alphas=[0.01, 0.1, 1, 10, 100, 1000], cv=5)
ridge.fit(X_scaled, y)

ridge_results = pd.DataFrame({
    'feature': X_scaled.columns,
    'coefficient': ridge.coef_,
    'abs_coefficient': np.abs(ridge.coef_)
})
# Largest absolute coefficient first; rank 1 is the top row.
ridge_results = ridge_results.sort_values('abs_coefficient', ascending=False).reset_index(drop=True)
ridge_results['rank_ridge'] = ridge_results.index + 1

print(f"\nRidge alpha: {ridge.alpha_:.4f}")
print("\nResults:")
print(ridge_results[['rank_ridge', 'feature', 'coefficient', 'abs_coefficient']].to_string(index=False))

# Plot: reverse the table so rank 1 is drawn at the top; colour encodes sign.
plot_rows = ridge_results.iloc[::-1]
colors = ['forestgreen' if coef > 0 else 'crimson' for coef in plot_rows['coefficient']]
plt.figure(figsize=(10, 6))
plt.barh(plot_rows['feature'], plot_rows['abs_coefficient'], color=colors)
plt.xlabel('|Coefficient|')
plt.title('Ridge Coefficients (Green=Positive, Red=Negative)')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 13: METHOD 5 - PLS WITH VIP
# =============================================================================

print("=" * 70, "METHOD 5: PLS WITH VIP SCORES", "=" * 70, sep="\n")

# Candidate component counts: capped at 5, the number of features, and
# (number of samples - 1).
max_comp = min(5, X_scaled.shape[1], len(X_scaled) - 1)

# Mean 5-fold CV R² for each candidate number of components.
cv_scores = [
    cross_val_score(
        PLSRegression(n_components=n_comp), X_scaled, y, cv=5, scoring='r2'
    ).mean()
    for n_comp in range(1, max_comp + 1)
]

# cv_scores[0] belongs to one component, hence the +1.
optimal_comp = np.argmax(cv_scores) + 1
print(f"Optimal components: {optimal_comp}")

# Final PLS model, refit on all rows with the chosen number of components.
pls = PLSRegression(n_components=optimal_comp)
pls.fit(X_scaled, y)

# Calculate VIP
def calculate_vip(model):
    """Compute Variable Importance in Projection (VIP) scores for a fitted PLS model.

    Parameters
    ----------
    model : object
        Fitted PLS model exposing ``x_scores_`` (rows = samples, columns =
        components), ``x_weights_`` (rows = features, columns = components)
        and ``y_loadings_`` (rows = targets, columns = components).

    Returns
    -------
    numpy.ndarray
        One VIP score per feature, in the row order of ``x_weights_``. The
        squared scores sum to the number of features.

    Notes
    -----
    Each component's explained response sum of squares is the component's
    score sum of squares times its squared y-loading, summed over targets;
    for a single target this equals the squared loading itself. If the model
    explains no response variance the division is 0/0 and the scores are NaN.
    """
    scores = np.asarray(model.x_scores_)
    weights = np.asarray(model.x_weights_)
    y_loadings = np.atleast_2d(np.asarray(model.y_loadings_))

    n_predictors = weights.shape[0]

    # Response sum of squares explained by each component.
    ss_y = np.sum(scores ** 2, axis=0) * np.sum(y_loadings ** 2, axis=0)
    ss_y_total = np.sum(ss_y)

    # Share of each component's squared weight vector carried by each feature.
    squared_weights = weights ** 2
    weight_share = squared_weights / np.sum(squared_weights, axis=0)

    # Weighted sum over components, rescaled so that mean(VIP**2) == 1.
    return np.sqrt(n_predictors * (weight_share @ ss_y) / ss_y_total)

vip_scores = calculate_vip(pls)

pls_results = pd.DataFrame({
    'feature': X_scaled.columns,
    'VIP': vip_scores,
    'coefficient': np.ravel(pls.coef_)
})
# Highest VIP first; rank 1 is the top row.
pls_results = pls_results.sort_values('VIP', ascending=False).reset_index(drop=True)
pls_results['rank_pls'] = pls_results.index + 1


def vip_category(v):
    """Bucket a VIP score using the configured VIP_IMPORTANT / VIP_MODERATE cut-offs."""
    if v >= VIP_IMPORTANT:
        return 'Important'
    if v >= VIP_MODERATE:
        return 'Moderate'
    return 'Less Important'


pls_results['VIP_category'] = pls_results['VIP'].apply(vip_category)

print("\nResults:")
print(pls_results[['rank_pls', 'feature', 'VIP', 'VIP_category']].to_string(index=False))

# Plot: bar colour follows the category assigned above (same cut-offs).
category_colors = {'Important': 'darkgreen', 'Moderate': 'orange', 'Less Important': 'lightcoral'}
colors = pls_results['VIP_category'].map(category_colors).tolist()
plt.figure(figsize=(10, 6))
plt.barh(pls_results['feature'][::-1], pls_results['VIP'][::-1], color=colors[::-1])
plt.axvline(x=VIP_IMPORTANT, color='green', linestyle='--', label=f'Important ({VIP_IMPORTANT})')
plt.axvline(x=VIP_MODERATE, color='orange', linestyle='--', label=f'Moderate ({VIP_MODERATE})')
plt.xlabel('VIP Score')
plt.title('PLS Variable Importance in Projection')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 14: CHECK FOR INTERACTIONS (NEW - IMPORTANT FOR BO)
# =============================================================================

print("=" * 70)
print("INTERACTION SCREENING (Important for BO)")
print("=" * 70)
print("""
Why this matters:
  If Feature A × Feature B interaction is strong, you need BOTH in BO
  even if individual effects are weak.
  
Method: Check if effect of A depends on level of B
""")

from itertools import combinations

# Screening settings
top_n_for_interactions = 6      # pairs are formed among the top-N features by |correlation|
min_group_size = 4              # a median-split half needs at least this many rows
INTERACTION_THRESHOLD = 0.3     # |r_high - r_low| above this is flagged

# Get top features to check interactions
top_features_for_int = corr_results.head(top_n_for_interactions)['feature'].tolist()

interaction_results = []

for f1, f2 in combinations(top_features_for_int, 2):
    # Split the rows at the median of f2 (ties go to the low group)
    median_f2 = X[f2].median()
    low_f2 = X[f2] <= median_f2
    high_f2 = X[f2] > median_f2

    # Correlation of f1 with y inside each half; NaN when a half is too small
    corr_low = X.loc[low_f2, f1].corr(y[low_f2]) if low_f2.sum() >= min_group_size else np.nan
    corr_high = X.loc[high_f2, f1].corr(y[high_f2]) if high_f2.sum() >= min_group_size else np.nan

    # Skip pairs where either correlation is undefined
    if not np.isnan(corr_low) and not np.isnan(corr_high):
        interaction_results.append({
            'interaction': f'{f1} × {f2}',
            'feature_1': f1,
            'feature_2': f2,
            'corr_at_low_f2': corr_low,
            'corr_at_high_f2': corr_high,
            'interaction_strength': abs(corr_high - corr_low)
        })

# Explicit columns keep the frame well-formed when no pair could be scored;
# otherwise sorting an empty, column-less frame raises KeyError.
interaction_columns = ['interaction', 'feature_1', 'feature_2',
                       'corr_at_low_f2', 'corr_at_high_f2', 'interaction_strength']
interaction_df = (
    pd.DataFrame(interaction_results, columns=interaction_columns)
    .astype({'corr_at_low_f2': float, 'corr_at_high_f2': float, 'interaction_strength': float})
    .sort_values('interaction_strength', ascending=False)
)

print("\nInteraction Screening (Top features):")
if interaction_df.empty:
    print("(no feature pair could be screened)")
else:
    print(interaction_df[['interaction', 'corr_at_low_f2', 'corr_at_high_f2', 'interaction_strength']].head(10).to_string(index=False))

# Flag strong interactions
strong_interactions = interaction_df[interaction_df['interaction_strength'] > INTERACTION_THRESHOLD]

if len(strong_interactions) > 0:
    print(f"\n⚠️  Potential interactions detected (strength > {INTERACTION_THRESHOLD}):")
    for _, row in strong_interactions.iterrows():
        print(f"   {row['interaction']}: strength = {row['interaction_strength']:.3f}")
        print(f"      → Include BOTH {row['feature_1']} and {row['feature_2']} in BO")
else:
    print(f"\n✓ No strong interactions detected (threshold: {INTERACTION_THRESHOLD})")

# Store for later: every feature that takes part in a flagged pair
features_with_interactions = (
    set(strong_interactions['feature_1']) | set(strong_interactions['feature_2'])
)

In [None]:
# =============================================================================
# CELL 15: MULTICOLLINEARITY CHECK
# =============================================================================

print("=" * 70, "MULTICOLLINEARITY CHECK", "=" * 70, sep="\n")

# Pairwise Pearson correlations between the features themselves.
feature_corr = X.corr()

# Walk the strict upper triangle (each unordered pair once, row by row) and
# keep the pairs whose |r| exceeds the configured threshold.
names = feature_corr.columns
upper_rows, upper_cols = np.triu_indices(len(names), k=1)
high_corr_pairs = [
    {
        'feature_1': names[a],
        'feature_2': names[b],
        'correlation': feature_corr.iloc[a, b]
    }
    for a, b in zip(upper_rows, upper_cols)
    if abs(feature_corr.iloc[a, b]) > MULTICOLLINEARITY_THRESHOLD
]

if not high_corr_pairs:
    print("\n✓ No highly correlated feature pairs")
else:
    print(f"\n⚠️  Highly correlated pairs (|r| > {MULTICOLLINEARITY_THRESHOLD}):")
    for pair in high_corr_pairs:
        print(f"   {pair['feature_1']} ↔ {pair['feature_2']}: r = {pair['correlation']:.3f}")
        print("   → Keep only ONE in BO (saves dimensions)")

# Heatmap: mask everything above the diagonal so each pair is shown once.
mask = np.triu(np.ones_like(feature_corr, dtype=bool), k=1)
plt.figure(figsize=(10, 8))
sns.heatmap(feature_corr, annot=True, cmap='RdBu_r', center=0, fmt='.2f', mask=mask)
plt.title('Feature-Feature Correlations')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 16: CONSENSUS RANKING
# =============================================================================

print("=" * 70, "CONSENSUS RANKING", "=" * 70, sep="\n")

# Merge all rankings: start from the correlation table and join the other
# four methods on the feature name, one after another.
consensus = corr_results[['feature', 'rank_corr', 'correlation', 'direction']].copy()
method_tables = [
    (mi_results, ['feature', 'rank_mi']),
    (lasso_results, ['feature', 'rank_lasso', 'selected']),
    (ridge_results, ['feature', 'rank_ridge']),
    (pls_results, ['feature', 'rank_pls', 'VIP']),
]
for table, keep_cols in method_tables:
    consensus = consensus.merge(table[keep_cols], on='feature')

# Average rank across the five methods; lower is better.
rank_cols = ['rank_corr', 'rank_mi', 'rank_lasso', 'rank_ridge', 'rank_pls']
consensus['avg_rank'] = consensus[rank_cols].mean(axis=1)
consensus = consensus.sort_values('avg_rank').reset_index(drop=True)
consensus['final_rank'] = consensus.index + 1


# Method agreement
def count_top_k(row, k=3):
    """Count how many of the rank columns place this row's feature at rank k or better."""
    return len([col for col in rank_cols if row[col] <= k])


consensus['methods_in_top3'] = consensus.apply(lambda r: count_top_k(r, 3), axis=1)

# Add interaction flag
consensus['has_interaction'] = consensus['feature'].isin(features_with_interactions)

print("\nConsensus Ranking:")
display_cols = ['final_rank', 'feature', 'correlation', 'VIP', 'selected',
                'avg_rank', 'methods_in_top3', 'has_interaction']
print(consensus[display_cols].to_string(index=False))

# Visualization: rank heatmap on the left, method agreement on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax1, ax2 = axes[0], axes[1]

# Heatmap
heatmap_data = consensus.set_index('feature')[rank_cols]
heatmap_data.columns = ['Corr', 'MI', 'Lasso', 'Ridge', 'PLS']
sns.heatmap(heatmap_data, annot=True, fmt='.0f', cmap='RdYlGn_r', ax=ax1)
ax1.set_title('Rankings Across Methods\n(Lower = More Important)')

# Agreement: green for 4+ methods, orange for 3, red otherwise.
colors = []
for agreement in consensus['methods_in_top3']:
    if agreement >= 4:
        colors.append('darkgreen')
    elif agreement >= 3:
        colors.append('orange')
    else:
        colors.append('lightcoral')
ax2.barh(consensus['feature'][::-1], consensus['methods_in_top3'][::-1], color=colors[::-1])
ax2.axvline(x=3, color='orange', linestyle='--', linewidth=2)
ax2.set_xlabel('Methods Ranking in Top 3')
ax2.set_title('Method Agreement')

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 17: AUTOMATIC FEATURE RECOMMENDATION
# =============================================================================

print("=" * 70, "AUTOMATIC FEATURE RECOMMENDATION", "=" * 70, sep="\n")


# Score each feature
def score_feature(row):
    """Return the additive evidence score for one row of the consensus table.

    Points: correlation strength (3 strong / 2 moderate), VIP (3 important /
    2 moderate), Lasso kept the feature (2), one point per method ranking it
    in the top 3, and a bonus (2) when it is part of a flagged interaction.
    """
    abs_corr = abs(row['correlation'])
    if abs_corr >= CORRELATION_STRONG:
        corr_points = 3
    elif abs_corr >= CORRELATION_MODERATE:
        corr_points = 2
    else:
        corr_points = 0

    if row['VIP'] >= VIP_IMPORTANT:
        vip_points = 3
    elif row['VIP'] >= VIP_MODERATE:
        vip_points = 2
    else:
        vip_points = 0

    lasso_points = 2 if row['selected'] else 0
    # Interaction bonus (important for BO)
    interaction_points = 2 if row['has_interaction'] else 0

    return corr_points + vip_points + lasso_points + row['methods_in_top3'] + interaction_points


consensus['selection_score'] = consensus.apply(score_feature, axis=1)
consensus = consensus.sort_values('selection_score', ascending=False).reset_index(drop=True)

print("\nFeature Scoring:")
print(consensus[['feature', 'correlation', 'VIP', 'selected', 'has_interaction', 'selection_score']].to_string(index=False))

# Recommend features: walk the table best score first and keep every feature
# that meets at least one criterion, until MAX_FEATURES have been collected.
recommended_features = []
reasons = {}

for _, row in consensus.iterrows():
    reason = []

    if row['selection_score'] >= 6:
        reason.append(f"High score ({row['selection_score']})")
    if abs(row['correlation']) >= CORRELATION_STRONG:
        reason.append(f"Strong correlation ({row['correlation']:.2f})")
    if row['VIP'] >= VIP_IMPORTANT:
        reason.append(f"VIP ≥ {VIP_IMPORTANT}")
    if row['has_interaction'] and row['selection_score'] >= 4:
        reason.append("Part of interaction")

    # A feature qualifies exactly when at least one reason was recorded.
    if reason and len(recommended_features) < MAX_FEATURES:
        recommended_features.append(row['feature'])
        reasons[row['feature']] = '; '.join(reason)

# Ensure minimum features: top up with the best-scoring leftovers.
for candidate in consensus['feature']:
    if len(recommended_features) >= MIN_FEATURES:
        break
    if candidate in recommended_features:
        continue
    recommended_features.append(candidate)
    reasons[candidate] = f"Added to meet minimum ({MIN_FEATURES})"

rule = '=' * 50
print(f"\n{rule}")
print(f"RECOMMENDED FEATURES FOR BO ({len(recommended_features)})")
print(rule)

for feat in recommended_features:
    row = consensus.loc[consensus['feature'] == feat].iloc[0]
    print(f"\n  ✓ {feat}")
    print(f"      Correlation: {row['correlation']:.3f} ({row['direction']})")
    print(f"      VIP: {row['VIP']:.2f}")
    print(f"      Reason: {reasons[feat]}")

In [None]:
# =============================================================================
# CELL 18: MANUAL ADJUSTMENT (YOUR INPUT)
# =============================================================================

print("=" * 70)
print("MANUAL FEATURE SELECTION")
print("=" * 70)
print("""
Review the recommendations above and adjust if needed.

Consider:
  1. Domain knowledge - does this feature make scientific sense?
  2. Controllability - can you actually vary this in experiments?
  3. Cost - is this feature expensive to change?
  4. Interactions - if two features interact, keep both
""")

# Option 1: Accept recommendations
selected_features = recommended_features.copy()

# Option 2: Manual override - uncomment and modify
# selected_features = [
#     'temperature_C',
#     'catalyst_loading_g',
#     'pH',
#     'reaction_time_min',
# ]

# A typo in a manual override would otherwise surface as an opaque KeyError
# in the correlation lookup below.
unknown_features = [f for f in selected_features if f not in X.columns]
if unknown_features:
    raise ValueError(
        f"Unknown feature(s) in selected_features: {unknown_features}. "
        f"Available: {list(X.columns)}"
    )

print(f"\nSelected features ({len(selected_features)}): {selected_features}")

# Validate selection
print("\n" + "=" * 50)
print("SELECTION VALIDATION")
print("=" * 50)

n_warnings = 0  # so a clean selection can be confirmed explicitly

# Check multicollinearity in selected set
selected_corr = X[selected_features].corr()
for i in range(len(selected_features)):
    for j in range(i+1, len(selected_features)):
        r = selected_corr.iloc[i, j]
        if abs(r) > MULTICOLLINEARITY_THRESHOLD:
            n_warnings += 1
            print(f"⚠️  {selected_features[i]} ↔ {selected_features[j]}: r={r:.2f}")
            print(f"   Consider removing one")

# Check for broken interactions: exactly one partner of a flagged pair selected
for _, row in strong_interactions.iterrows():
    f1, f2 = row['feature_1'], row['feature_2']
    if (f1 in selected_features) != (f2 in selected_features):
        n_warnings += 1
        print(f"⚠️  Interaction {f1} × {f2} is broken")
        print(f"   {f1} is {'selected' if f1 in selected_features else 'NOT selected'}")
        print(f"   {f2} is {'selected' if f2 in selected_features else 'NOT selected'}")

# Report the limit that actually triggered the warning (MAX_FEATURES), with
# the target shown for context.
if len(selected_features) > MAX_FEATURES:
    n_warnings += 1
    print(f"⚠️  {len(selected_features)} features selected, exceeding the maximum of "
          f"{MAX_FEATURES} (target is {TARGET_FEATURES_FOR_BO})")
    print(f"   BO may need more experiments to converge")

if n_warnings == 0:
    print("✓ No issues found in the selected feature set")

In [None]:
# =============================================================================
# CELL 19: VALIDATE WITH LOO-CV
# =============================================================================

print("=" * 70)
print("VALIDATION: Leave-One-Out CV")
print("=" * 70)

# Globally standardized view of the selected features (not used by the loop
# below, which rescales inside every fold).
X_selected = X_scaled[selected_features]

# Unscaled selected features: the scaler is refit on each training fold so
# the held-out row never contributes to the mean/std used to scale it.
X_selected_raw = X[selected_features]

# LOO-CV
# NOTE: the features themselves were chosen using all rows, so this estimate
# is still optimistic; it is a sanity check, not an unbiased score.
loo = LeaveOneOut()
loo_predictions = []
loo_actuals = []

for train_idx, test_idx in loo.split(X_selected_raw):
    fold_scaler = StandardScaler().fit(X_selected_raw.iloc[train_idx])
    X_train = fold_scaler.transform(X_selected_raw.iloc[train_idx])
    X_test = fold_scaler.transform(X_selected_raw.iloc[test_idx])

    # Inner 3-fold CV picks the ridge penalty using the training rows only.
    ridge_temp = RidgeCV(alphas=[0.1, 1, 10, 100], cv=3)
    ridge_temp.fit(X_train, y.iloc[train_idx])
    pred = ridge_temp.predict(X_test)[0]
    loo_predictions.append(pred)
    loo_actuals.append(y.iloc[test_idx].values[0])

loo_predictions = np.array(loo_predictions)
loo_actuals = np.array(loo_actuals)

loo_r2 = r2_score(loo_actuals, loo_predictions)
loo_rmse = np.sqrt(np.mean((loo_actuals - loo_predictions)**2))

print(f"\nLOO-CV Results ({len(selected_features)} features):")
print(f"  R²:   {loo_r2:.4f}")
print(f"  RMSE: {loo_rmse:.4f}")

if loo_r2 > 0.5:
    print("\n✓ Good signal - features are predictive")
elif loo_r2 > 0.2:
    print("\n⚠️ Moderate signal - GP in BO can likely capture patterns")
else:
    print("\n⚠️ Weak linear signal - relationships may be non-linear")
    print("   GP surrogate in BO can handle this")

# Plot: predicted vs actual (left) and residual histogram (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

ax1 = axes[0]
ax1.scatter(loo_actuals, loo_predictions, alpha=0.6, edgecolors='black')
min_val, max_val = min(loo_actuals.min(), loo_predictions.min()), max(loo_actuals.max(), loo_predictions.max())
ax1.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2)  # identity line
ax1.set_xlabel('Actual')
ax1.set_ylabel('Predicted')
ax1.set_title(f'LOO-CV: R² = {loo_r2:.4f}')

ax2 = axes[1]
residuals = loo_actuals - loo_predictions
ax2.hist(residuals, bins=12, edgecolor='black', alpha=0.7)
ax2.axvline(x=0, color='red', linestyle='--')
ax2.set_xlabel('Residual')
ax2.set_title('Residual Distribution')

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 20: DEFINE BO SEARCH SPACE
# =============================================================================

print("=" * 70)
print("BAYESIAN OPTIMIZATION SEARCH SPACE")
print("=" * 70)

# Option: Extend bounds slightly beyond observed range.
# Defined once, outside the loop, and reused in the printed note so the text
# cannot drift from the value actually applied.
extend_pct = 0.1  # 10% extension

# Get bounds from original data
bo_bounds = []
sign_warnings = []
for feat in selected_features:
    feat_min = original_X[feat].min()
    feat_max = original_X[feat].max()
    feat_range = feat_max - feat_min

    extended_min = feat_min - extend_pct * feat_range
    extended_max = feat_max + extend_pct * feat_range

    # The extension can push a never-negative quantity below zero; collect
    # these so they can be reported after the table.
    if feat_min >= 0 and extended_min < 0:
        sign_warnings.append((feat, extended_min))

    bo_bounds.append({
        'feature': feat,
        'observed_min': feat_min,
        'observed_max': feat_max,
        'bo_min': extended_min,
        'bo_max': extended_max,
        'type': 'continuous'
    })

bo_bounds_df = pd.DataFrame(bo_bounds)

print("\nSearch Space Bounds:")
print(bo_bounds_df.to_string(index=False))

for feat, extended_min in sign_warnings:
    print(f"⚠️  {feat}: extended lower bound {extended_min:.3g} is negative "
          f"although every observed value is ≥ 0")

print(f"""
\nNotes:
  - BO bounds extended {extend_pct:.0%} beyond observed range
  - Adjust if physical constraints exist (e.g., temperature > 0)
  - Mark categorical features if any
""")

# Effect directions (helps with initial search): uses the sign of the
# Pearson correlation stored in the consensus table.
print("\nEffect Directions (for initial BO region):")
for feat in selected_features:
    row = consensus[consensus['feature'] == feat].iloc[0]
    direction = row['direction']
    if MAXIMIZE_RESPONSE:
        suggest = "HIGH" if direction == 'Positive' else "LOW"
    else:
        suggest = "LOW" if direction == 'Positive' else "HIGH"
    print(f"  {feat}: {direction} effect → Start search at {suggest} values")

In [None]:
# =============================================================================
# CELL 21: PREPARE INITIAL DATA FOR BO
# =============================================================================

print("=" * 70, "INITIAL DATA FOR BAYESIAN OPTIMIZATION", "=" * 70, sep="\n")

# Create initial dataset with selected features + response (unscaled values;
# the response is attached positionally).
bo_initial_data = original_X[selected_features].assign(**{RESPONSE_COLUMN: y.values})

print(f"\nInitial dataset shape: {bo_initial_data.shape}")
print(f"  - {len(bo_initial_data)} experiments")
print(f"  - {len(selected_features)} features")
print(f"  - 1 response ({RESPONSE_COLUMN})")

print(f"\nResponse statistics:")
for label, value in (("Min:", y.min()), ("Max:", y.max()), ("Mean:", y.mean()), ("Std:", y.std())):
    print(f"  {label:<5} {value:.2f}")

# Best point so far: row label and value of the extreme response, in the
# direction being optimised.
best_idx, best_response = (
    (y.idxmax(), y.max()) if MAXIMIZE_RESPONSE else (y.idxmin(), y.min())
)

print(f"\nBest observed point:")
print(f"  Response: {best_response:.2f}")
print(f"  Conditions:")
for feat in selected_features:
    print(f"    {feat}: {original_X.loc[best_idx, feat]:.3f}")

In [None]:
# =============================================================================
# CELL 22: EXPORT FOR BO
# =============================================================================

print("=" * 70)
print("EXPORT FILES")
print("=" * 70)

# All files are written to the current working directory. Text files are
# opened with an explicit UTF-8 encoding so they do not depend on the
# platform default and match the CSVs written by pandas.

# 1. Selected features (one name per line)
with open('bo_selected_features.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(selected_features))
print("✓ bo_selected_features.txt")

# 2. Search space bounds
bo_bounds_df.to_csv('bo_search_bounds.csv', index=False)
print("✓ bo_search_bounds.csv")

# 3. Initial data
bo_initial_data.to_csv('bo_initial_data.csv', index=False)
print("✓ bo_initial_data.csv")

# 4. Feature screening evidence
consensus.to_csv('feature_screening_results.csv', index=False)
print("✓ feature_screening_results.csv")

# 5. Interaction information (only written when at least one pair was flagged)
if len(strong_interactions) > 0:
    strong_interactions.to_csv('detected_interactions.csv', index=False)
    print("✓ detected_interactions.csv")

# 6. BO configuration summary (written as "key: value" lines)
config_summary = {
    'n_features': len(selected_features),
    'n_initial_points': len(bo_initial_data),
    'response_column': RESPONSE_COLUMN,
    'maximize': MAXIMIZE_RESPONSE,
    'best_observed': best_response,
    'loo_cv_r2': loo_r2,
}

with open('bo_config.txt', 'w', encoding='utf-8') as f:
    for key, value in config_summary.items():
        f.write(f"{key}: {value}\n")
print("✓ bo_config.txt")

In [None]:
# =============================================================================
# CELL 23: FINAL SUMMARY & BO RECOMMENDATIONS
# =============================================================================

print("=" * 70)
print("FEATURE SCREENING SUMMARY")
print("=" * 70)

print(f"""
DATA:
  Initial experiments: {len(original_X)}
  Features screened: {len(feature_cols)}
  
SCREENING METHODS USED:
  1. Pearson Correlation
  2. Mutual Information
  3. Lasso (automatic selection)
  4. Ridge (stable coefficients)
  5. PLS with VIP scores
  
SELECTED FEATURES FOR BO ({len(selected_features)}):
""")

# One entry per selected feature: screening evidence and BO bounds.
for i, feat in enumerate(selected_features, 1):
    row = consensus[consensus['feature'] == feat].iloc[0]
    bounds = bo_bounds_df[bo_bounds_df['feature'] == feat].iloc[0]
    print(f"  {i}. {feat}")
    print(f"       Correlation: {row['correlation']:.3f}, VIP: {row['VIP']:.2f}")
    print(f"       BO bounds: [{bounds['bo_min']:.2f}, {bounds['bo_max']:.2f}]")

if len(strong_interactions) > 0:
    print(f"\nDETECTED INTERACTIONS:")
    for _, row in strong_interactions.iterrows():
        print(f"  - {row['interaction']}")

# Experiment budget: 20 to 40 BO iterations per selected feature, computed
# once instead of repeating the arithmetic inside the text below.
n_screening = len(original_X)
bo_iters_low = 20 * len(selected_features)
bo_iters_high = 40 * len(selected_features)

# Files written by the export cell. The interactions file is listed only
# when it was written, i.e. under the same condition the export cell uses.
exported_files = [
    'bo_selected_features.txt',
    'bo_search_bounds.csv',
    'bo_initial_data.csv',
    'feature_screening_results.csv',
    'bo_config.txt',
]
if len(strong_interactions) > 0:
    exported_files.append('detected_interactions.csv')

print(f"""
VALIDATION:
  LOO-CV R²: {loo_r2:.4f}
  LOO-CV RMSE: {loo_rmse:.4f}
  
BAYESIAN OPTIMIZATION RECOMMENDATIONS:
  1. Use GP surrogate with Matérn kernel
  2. Acquisition function: Expected Improvement (EI) or UCB
  3. Initial points: {len(bo_initial_data)} (your screening data)
  4. Expected iterations: {bo_iters_low}-{bo_iters_high} for convergence
  
  Estimated total experiments:
    Screening: {n_screening} (done)
    BO phase:  {bo_iters_low}-{bo_iters_high} (planned)
    Total:     {n_screening + bo_iters_low}-{n_screening + bo_iters_high}

FILES EXPORTED:""")
for exported_name in exported_files:
    print(f"  - {exported_name}")
print()

print("=" * 70)
print("READY FOR BAYESIAN OPTIMIZATION")
print("=" * 70)