In [None]:
# =============================================================================
# CELL 1: IMPORTS
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.metrics import r2_score
from itertools import combinations

# One seed for NumPy's global RNG; the Lasso cell also passes it as random_state.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Notebook banner, emitted in a single call (same four lines of output).
print("\n".join([
    "=" * 70,
    "PHASE 1: FEATURE SCREENING FOR BAYESIAN OPTIMIZATION",
    "=" * 70,
    "✓ Imports complete",
]))

In [None]:
# =============================================================================
# CELL 2: LOAD AND CLEAN DATA
# =============================================================================

# Open the workbook once; `xls` stays bound so further sheets can be read from it.
xls = pd.ExcelFile('../data/Encapsys Historic DOE data sets.xlsx', engine='openpyxl')
sheet_name = 'Gelatin_PU_DSD'

# header=5 -> the row at 0-based position 5 supplies the column names.
df = xls.parse(sheet_name=sheet_name, header=5)

print(f"✓ Loaded sheet: '{sheet_name}'\n  Initial shape: {df.shape}")

In [None]:
# =============================================================================
# CELL 3: SPLIT DATAFRAME AT KEYWORD
# =============================================================================

# Marker text that separates the initial runs from the rows listed after it.
split_word = "PREDICTED OPTIMUM RUNS"

# Find the marker by POSITION, because the result is fed to `iloc` below.
# (Index labels only coincide with positions while `df` has a default
# RangeIndex.) Cell text is stripped so stray spaces around the keyword in the
# spreadsheet do not hide the marker.
marker_mask = (df['Run'].astype(str).str.strip() == split_word).to_numpy()
split_index = np.flatnonzero(marker_mask).tolist()

if split_index:
    # First occurrence wins; the marker row itself belongs to neither part.
    split_index = split_index[0]
    df_initial = df.iloc[:split_index]
    df_optimum = df.iloc[split_index+1:]
    print(f"✓ Split at '{split_word}'")
    print(f"  df_initial: {len(df_initial)} rows")
    print(f"  df_optimum: {len(df_optimum)} rows")
else:
    # No marker: treat the whole sheet as initial runs.
    print(f"⚠️ '{split_word}' not found")
    df_initial = df.copy()
    df_optimum = pd.DataFrame()

In [None]:
# =============================================================================
# CELL 4: CLEAN DATAFRAMES
# =============================================================================

# Initial runs: remove the row labelled 0 (the units row), then blank rows,
# and renumber from 0.
# NOTE(review): not idempotent — after reset_index the first data row carries
# label 0, so executing this cell a second time would drop a real run.
df_initial = (
    df_initial
    .drop(index=0)
    .dropna(how='all')
    .reset_index(drop=True)
)

# Optimum runs (when any were split off): remove blank rows and renumber.
if len(df_optimum) > 0:
    df_optimum = df_optimum.dropna(how='all').reset_index(drop=True)

# Build the working table. The length is re-checked because the clean-up
# above may have removed every optimum row.
if len(df_optimum) == 0:
    df_total = df_initial.copy()
    print(f"✓ Using df_initial: {len(df_total)} rows")
else:
    df_total = pd.concat([df_initial, df_optimum], axis=0, ignore_index=True)
    print(f"✓ Combined: {len(df_total)} total rows")

In [None]:
# =============================================================================
# CELL 5: DEFINE FEATURE COLUMNS
# =============================================================================

# Every column to the left of `stop_feature` is treated as a candidate feature.
stop_feature = "Batch ID"
columns = df_total.columns.tolist()

try:
    stop_position = columns.index(stop_feature)
except ValueError:
    # Sentinel column absent: fall back to the full column list.
    feature_list = columns
    print(f"⚠️ '{stop_feature}' not found, using all columns")
else:
    feature_list = columns[:stop_position]
    print(f"✓ Features up to '{stop_feature}':")

# Numbered listing (1-based) of the candidate features.
for position, name in enumerate(feature_list, start=1):
    print(f"    {position}. {name}")

In [None]:
# =============================================================================
# CELL 6: CONFIGURATION
# =============================================================================

# --- What is being optimised -------------------------------------------------
RESPONSE_COLUMN = "Downy Leak"   # response variable column
TARGET_FEATURES = 4              # aim for 3-5 features
MAXIMIZE_RESPONSE = False        # True if higher is better

# --- Screening thresholds ----------------------------------------------------
CORRELATION_STRONG = 0.4             # |r| at or above this is labelled 'Strong'
CORRELATION_MODERATE = 0.2           # |r| at or above this is labelled 'Moderate'
VIP_IMPORTANT = 1.0                  # VIP at or above this is labelled 'Important'
VIP_MODERATE = 0.8                   # VIP at or above this is labelled 'Moderate'
MULTICOLLINEARITY_THRESHOLD = 0.7    # feature-feature |r| above this is flagged
INTERACTION_THRESHOLD = 0.3          # interaction strength above this is flagged

print("=" * 60)
print("CONFIGURATION")
print("=" * 60)
for label, value in (
    ("Response", RESPONSE_COLUMN),
    ("Target features", TARGET_FEATURES),
    ("Optimization", 'Maximize' if MAXIMIZE_RESPONSE else 'Minimize'),
):
    print(f"  {label}: {value}")

In [None]:
# =============================================================================
# CELL 7: IDENTIFY FEATURE TYPES (Binary vs Continuous)
# =============================================================================

print("=" * 60)
print("FEATURE TYPE CLASSIFICATION")
print("=" * 60)
print("""
Classification rule:
  - Binary: Exactly 2 unique values → encoded as 0/1
  - Continuous: 3+ unique values → standardized
""")

# Numeric candidate features, excluding the response if it sits among them.
numeric_features = df_total[feature_list].select_dtypes(include=[np.number]).columns.tolist()
numeric_features = [c for c in numeric_features if c != RESPONSE_COLUMN]

binary_cols = []
continuous_cols = []
constant_cols = []      # fewer than 2 distinct non-missing values: nothing to screen
binary_mappings = {}    # column -> {original value: 0/1 code}

print("Feature Classification:")
print("-" * 60)

for col in numeric_features:
    n_unique = df_total[col].nunique()  # NaN is not counted

    if n_unique < 2:
        # A constant (or entirely missing) column has zero variance: its
        # correlation with the response is undefined and it cannot be
        # standardised meaningfully, so it is excluded from screening.
        constant_cols.append(col)
        print(f"  {col}: {n_unique} unique → CONSTANT (skipped)")
    elif n_unique == 2:
        # Sort the two levels so the encoding is deterministic (lower → 0,
        # higher → 1) instead of depending on which level appears first.
        # This also keeps the cell safe to re-run: already-encoded 0/1 data
        # maps onto itself. (On a re-run `binary_mappings` records that
        # identity mapping, not the original levels.)
        low_val, high_val = np.sort(df_total[col].dropna().unique())
        mapping = {low_val: 0, high_val: 1}
        df_total[col] = df_total[col].map(mapping)
        binary_cols.append(col)
        binary_mappings[col] = mapping
        print(f"  {col}: {n_unique} unique → BINARY")
        print(f"      Encoding: {low_val} → 0, {high_val} → 1")
    else:
        # Continuous (3+ unique values)
        continuous_cols.append(col)
        print(f"  {col}: {n_unique} unique → CONTINUOUS")

# Summary
print("\n" + "-" * 60)
print("SUMMARY:")
print(f"  Binary: {len(binary_cols)} features")
print(f"  Continuous: {len(continuous_cols)} features")
if constant_cols:
    print(f"  Constant (skipped): {len(constant_cols)} features")

# Store for later use
BINARY_FEATURES = binary_cols
CONTINUOUS_FEATURES = continuous_cols

In [None]:
# =============================================================================
# CELL 8: PREPARE X AND y
# =============================================================================

print("=" * 60)
print("DATA PREPARATION")
print("=" * 60)

# Binary features first, then continuous ones.
feature_cols = BINARY_FEATURES + CONTINUOUS_FEATURES

# Create X and y
X = df_total[feature_cols].copy()
y = df_total[RESPONSE_COLUMN].copy()

# Keep only rows that are complete. A missing response cannot be modelled, and
# the scikit-learn estimators fitted later reject NaN in X, so rows with any
# missing feature value are removed here (and reported) instead of failing
# further down the notebook.
has_response = y.notna()
has_all_features = X.notna().all(axis=1)
valid = has_response & has_all_features
n_dropped = (~has_response).sum()
n_dropped_features = (has_response & ~has_all_features).sum()
X = X[valid].reset_index(drop=True)
y = y[valid].reset_index(drop=True)

# Store original
original_X = X.copy()
original_y = y.copy()

# Print summary
print(f"\nDataset:")
print(f"  Samples: {len(X)}")
if n_dropped > 0:
    print(f"  Dropped (missing response): {n_dropped}")
if n_dropped_features > 0:
    print(f"  Dropped (missing feature values): {n_dropped_features}")

print(f"\nFeatures ({len(feature_cols)} total):")

for group_label, group_features in (
    ("Binary", BINARY_FEATURES),
    ("Continuous", CONTINUOUS_FEATURES),
):
    print(f"\n  {group_label} ({len(group_features)}):")
    if group_features:
        for f in group_features:
            print(f"    • {f}")
    else:
        print(f"    (none)")

print(f"\nResponse: {RESPONSE_COLUMN}")

# Guard the ratio: with no usable features the division is undefined.
if feature_cols:
    samples_per_feature = len(X) / len(feature_cols)
    print(f"Samples/Features ratio: {samples_per_feature:.1f}")
    if samples_per_feature < 5:
        print(f"\n⚠️ Low ratio - using regularized methods only")
else:
    print("⚠️ No usable numeric features found - nothing to screen")

In [None]:
# =============================================================================
# CELL 9: STANDARDIZE FEATURES
# =============================================================================

print("=" * 60)
print("STANDARDIZATION")
print("=" * 60)

scaler = StandardScaler()

# X_scaled: continuous columns standardised by `scaler`, binary columns left as 0/1.
X_scaled = X.copy()
n_continuous = len(CONTINUOUS_FEATURES)
if n_continuous > 0:
    standardized = scaler.fit_transform(X[CONTINUOUS_FEATURES])
    X_scaled[CONTINUOUS_FEATURES] = standardized
    print(f"✓ Standardized {n_continuous} continuous features")

# X_model: same as X_scaled, but binary codes moved from {0, 1} to {-1, +1}.
X_model = X_scaled.copy()
if BINARY_FEATURES:
    X_model[BINARY_FEATURES] = 2 * X_model[BINARY_FEATURES] - 1

print(f"✓ Binary features preserved as 0/1 (scaled to [-1,+1] for modeling)")

In [None]:
# =============================================================================
# CELL 10: VISUAL INSPECTION - SCATTER PLOTS
# =============================================================================

print("=" * 60)
print("VISUAL INSPECTION: Feature vs Response")
print("=" * 60)

n_feat = len(feature_cols)

if n_feat == 0:
    # Nothing to draw; also avoids dividing by a zero column count below.
    print("⚠️ No features available to plot")
else:
    # Grid of at most 4 panels per row.
    n_cols = min(4, n_feat)
    n_rows = int(np.ceil(n_feat / n_cols))

    # squeeze=False always yields a 2-D array of Axes, so the single-panel
    # case needs no special handling.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3.5*n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, feature_cols):
        corr = X[col].corr(y)

        if col in BINARY_FEATURES:
            # Box plot of the response at each level of the binary feature.
            groups = [y[X[col] == val] for val in (0, 1)]
            ax.boxplot(groups, positions=[0, 1], widths=0.6)
            ax.set_xticks([0, 1])
            ax.set_xticklabels(['0', '1'])
            ax.set_xlabel(f'{col} (binary)')
        else:
            # Scatter for continuous
            ax.scatter(X[col], y, alpha=0.6, edgecolors='black', linewidth=0.5)

            # Linear trend line, fitted on complete pairs only. It is skipped
            # when a line is not defined (fewer than two points, or a single
            # distinct x value), which would otherwise make polyfit fail or warn.
            complete = X[col].notna() & y.notna()
            x_fit = X.loc[complete, col]
            y_fit = y[complete]
            if len(x_fit) >= 2 and x_fit.nunique() > 1:
                slope, intercept = np.polyfit(x_fit, y_fit, 1)
                x_line = np.linspace(x_fit.min(), x_fit.max(), 100)
                ax.plot(x_line, slope * x_line + intercept, 'r--', linewidth=2)
            ax.set_xlabel(col)

        ax.set_ylabel(RESPONSE_COLUMN)
        ax.set_title(f'r = {corr:.3f}', fontsize=10)

    # Hide the unused panels in the last grid row.
    for unused_ax in axes[n_feat:]:
        unused_ax.set_visible(False)

    plt.suptitle(f'Features vs {RESPONSE_COLUMN}', fontsize=12, y=1.02)
    plt.tight_layout()
    plt.show()

    print("Look for: linear trends, non-linear patterns, outliers")

In [None]:
# =============================================================================
# CELL 11: RESPONSE DISTRIBUTION
# =============================================================================

print("=" * 60)
print("RESPONSE DISTRIBUTION")
print("=" * 60)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax1, ax2 = axes

# Left panel: histogram with dashed reference lines at the mean and median.
ax1.hist(y, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
for stat_name, stat_value, line_color in (
    ('Mean', y.mean(), 'red'),
    ('Median', y.median(), 'orange'),
):
    ax1.axvline(stat_value, color=line_color, linestyle='--', linewidth=2,
                label=f'{stat_name}: {stat_value:.2f}')
ax1.set_xlabel(RESPONSE_COLUMN)
ax1.set_ylabel('Frequency')
ax1.set_title('Response Distribution')
ax1.legend()

# Right panel: vertical box plot of the same values.
ax2.boxplot(y, vert=True)
ax2.set_ylabel(RESPONSE_COLUMN)
ax2.set_title('Response Box Plot')

plt.tight_layout()
plt.show()

# Text summary; labels are padded so the numbers line up in one column.
print("Response statistics:")
for stat_name, stat_value in (
    ('Min', y.min()),
    ('Max', y.max()),
    ('Mean', y.mean()),
    ('Median', y.median()),
    ('Std', y.std()),
):
    print(f"  {stat_name + ':':<7} {stat_value:.4f}")

In [None]:
# =============================================================================
# CELL 12: METHOD 1 - CORRELATION ANALYSIS
# =============================================================================

print("=" * 60)
print("METHOD 1: PEARSON CORRELATION")
print("=" * 60)

# Correlation of every feature column with the response.
correlations = X.corrwith(y)
r_values = correlations.values

# One row per feature, strongest |r| first. Anything that is not strictly
# positive (zero and NaN included) is labelled 'Negative'.
corr_df = (
    pd.DataFrame({
        'feature': feature_cols,
        'correlation': r_values,
        'abs_corr': np.abs(r_values),
        'type': ['binary' if name in BINARY_FEATURES else 'continuous' for name in feature_cols],
        'direction': np.where(r_values > 0, 'Positive', 'Negative'),
    })
    .sort_values('abs_corr', ascending=False)
    .reset_index(drop=True)
)

# After the re-index, row position + 1 is the rank.
corr_df['rank_corr'] = corr_df.index + 1

# Categorize strength
def corr_strength(r):
    """Label a correlation coefficient as 'Strong', 'Moderate' or 'Weak'.

    Only the magnitude of ``r`` matters. The cut-offs are the module-level
    CORRELATION_STRONG and CORRELATION_MODERATE; a NaN fails both comparisons
    and is therefore labelled 'Weak'.
    """
    magnitude = abs(r)
    if magnitude >= CORRELATION_STRONG:
        return 'Strong'
    if magnitude >= CORRELATION_MODERATE:
        return 'Moderate'
    return 'Weak'

corr_df['strength'] = corr_df['correlation'].map(corr_strength)

print("\nResults:")
print(corr_df[['rank_corr', 'feature', 'correlation', 'direction', 'strength', 'type']].to_string(index=False))

# Plain-language reading of the sign for the 0/1-encoded features.
if BINARY_FEATURES:
    print("\n" + "-" * 40)
    print("Binary Feature Effects:")
    for entry in corr_df[corr_df['type'] == 'binary'].itertuples(index=False):
        effect = "INCREASES" if entry.correlation > 0 else "DECREASES"
        print(f"  {entry.feature}: When = 1, response {effect} (r = {entry.correlation:.3f})")

# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Bar colour keyed by (is the correlation positive?, feature type).
bar_palette = {
    (True, 'continuous'): 'forestgreen',
    (True, 'binary'): 'steelblue',
    (False, 'continuous'): 'crimson',
    (False, 'binary'): 'darkorange',
}
colors = [
    bar_palette[(bool(r > 0), kind)]
    for r, kind in zip(corr_df['correlation'], corr_df['type'])
]

# Left panel: magnitude. Rows are reversed so rank 1 is drawn at the top.
ax1.barh(corr_df['feature'][::-1], corr_df['abs_corr'][::-1], color=colors[::-1])
ax1.axvline(x=CORRELATION_STRONG, color='green', linestyle='--', linewidth=2, label=f'Strong ({CORRELATION_STRONG})')
ax1.axvline(x=CORRELATION_MODERATE, color='orange', linestyle='--', linewidth=1.5, label=f'Moderate ({CORRELATION_MODERATE})')
ax1.set_xlabel('|Correlation|')
ax1.set_title('Feature-Response Correlation\n(Green/Blue=Positive, Red/Orange=Negative)')
ax1.legend(loc='lower right')

# Right panel: signed correlation.
colors_signed = ['forestgreen' if r > 0 else 'crimson' for r in corr_df['correlation']]
ax2.barh(corr_df['feature'][::-1], corr_df['correlation'][::-1], color=colors_signed[::-1])
ax2.axvline(x=0, color='black', linewidth=1)
ax2.set_xlabel('Correlation (with sign)')
ax2.set_title('Direction of Effect\n(Green=Positive, Red=Negative)')

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 13: METHOD 2 - LASSO (Automatic Selection)
# =============================================================================

print("=" * 60)
print("METHOD 2: LASSO REGRESSION")
print("=" * 60)

# The penalty strength is chosen by 5-fold cross-validation inside LassoCV.
lasso = LassoCV(cv=5, max_iter=10000, random_state=RANDOM_STATE)
lasso.fit(X_model, y)

# One row per feature, largest |coefficient| first; a feature counts as
# "selected" when its coefficient is non-zero.
lasso_coefs = lasso.coef_
lasso_df = (
    pd.DataFrame({
        'feature': feature_cols,
        'coefficient': lasso_coefs,
        'abs_coef': np.abs(lasso_coefs),
        'selected': lasso_coefs != 0,
        'type': ['binary' if name in BINARY_FEATURES else 'continuous' for name in feature_cols],
    })
    .sort_values('abs_coef', ascending=False)
    .reset_index(drop=True)
)
lasso_df['rank_lasso'] = lasso_df.index + 1

print(f"\nLasso alpha: {lasso.alpha_:.4f}")
print(f"Features selected: {lasso_df['selected'].sum()}/{len(lasso_df)}")
print("\nResults:")
print(lasso_df[['rank_lasso', 'feature', 'coefficient', 'selected', 'type']].to_string(index=False))

selected_by_lasso = lasso_df.loc[lasso_df['selected'], 'feature'].tolist()
print(f"\n✓ Lasso selected: {selected_by_lasso}")

# Horizontal bars, rank 1 at the top; eliminated features are greyed out.
colors = ['forestgreen' if kept else 'lightgray' for kept in lasso_df['selected']]
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(lasso_df['feature'][::-1], lasso_df['abs_coef'][::-1], color=colors[::-1])
ax.set_xlabel('|Coefficient|')
ax.set_title('Lasso Coefficients\n(Green = Selected, Gray = Eliminated)')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 14: METHOD 3 - PLS WITH VIP SCORES
# =============================================================================

print("=" * 60)
print("METHOD 3: PLS (VIP Scores)")
print("=" * 60)

# Candidate component counts run from 1 up to the smallest of: 5, the number
# of features, and the number of samples minus one.
max_comp = min(5, len(feature_cols), len(X) - 1)
cv_scores = []

print("Finding optimal components...")
for n in range(1, max_comp + 1):
    fold_r2 = cross_val_score(PLSRegression(n_components=n), X_model, y, cv=5, scoring='r2')
    mean_r2 = fold_r2.mean()
    cv_scores.append(mean_r2)
    print(f"  {n} components: CV R² = {mean_r2:.4f}")

# cv_scores[0] belongs to 1 component, hence the +1.
optimal_comp = np.argmax(cv_scores) + 1
print(f"\n✓ Optimal: {optimal_comp} components (CV R² = {max(cv_scores):.4f})")

# Final model, fitted on all samples with the chosen component count.
pls = PLSRegression(n_components=optimal_comp)
pls.fit(X_model, y)

# Calculate VIP
def calc_vip(model):
    """Compute Variable Importance in Projection (VIP) scores for a fitted PLS model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``x_scores_``, ``x_weights_`` and ``y_loadings_``.
        Expected shapes (scikit-learn ``PLSRegression`` convention — confirm
        for other model types): scores ``(n_samples, n_components)``, weights
        ``(n_features, n_components)``, loadings ``(n_targets, n_components)``.

    Returns
    -------
    numpy.ndarray
        One VIP score per feature, shape ``(n_features,)``. The squared scores
        sum to ``n_features``.

    Notes
    -----
    The response variance attributed to each component is summed over all
    targets, so multi-target models are supported. For a single target the
    result is the same as the earlier loop-based implementation.
    If the components explain no response variance the division is 0/0 and
    the result is NaN.
    """
    t = np.asarray(model.x_scores_)
    w = np.asarray(model.x_weights_)
    q = np.atleast_2d(np.asarray(model.y_loadings_))
    n_features = w.shape[0]

    # Per-component sum of squares: score energy times squared y-loadings.
    ss = np.sum(t ** 2, axis=0) * np.sum(q ** 2, axis=0)
    total_ss = np.sum(ss)

    # Each weight column is normalised to unit length before weighting by ss.
    w_share = w ** 2 / np.sum(w ** 2, axis=0)

    return np.sqrt(n_features * (w_share @ ss) / total_ss)

vip_scores = calc_vip(pls)

# One row per feature, highest VIP first; row position + 1 is the rank.
pls_df = (
    pd.DataFrame({
        'feature': feature_cols,
        'VIP': vip_scores,
        'type': ['binary' if name in BINARY_FEATURES else 'continuous' for name in feature_cols],
    })
    .sort_values('VIP', ascending=False)
    .reset_index(drop=True)
)
pls_df['rank_pls'] = pls_df.index + 1

# Categorize VIP
def vip_category(v):
    """Label a VIP score as 'Important', 'Moderate' or 'Less Important'.

    The cut-offs are the module-level VIP_IMPORTANT and VIP_MODERATE; a NaN
    fails both comparisons and is labelled 'Less Important'.
    """
    if v >= VIP_IMPORTANT:
        return 'Important'
    if v >= VIP_MODERATE:
        return 'Moderate'
    return 'Less Important'

pls_df['category'] = pls_df['VIP'].map(vip_category)

print("\nResults:")
print(pls_df[['rank_pls', 'feature', 'VIP', 'category', 'type']].to_string(index=False))

# Feature names per category.
important = pls_df.loc[pls_df['category'] == 'Important', 'feature'].tolist()
moderate = pls_df.loc[pls_df['category'] == 'Moderate', 'feature'].tolist()
print(f"\n✓ Important (VIP ≥ {VIP_IMPORTANT}): {important or 'None'}")
print(f"⚠ Moderate (VIP ≥ {VIP_MODERATE}): {moderate or 'None'}")

# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: VIP per feature. The bar colour follows the category assigned
# above, which uses the same two thresholds as the reference lines.
category_colors = {
    'Important': 'darkgreen',
    'Moderate': 'orange',
    'Less Important': 'lightcoral',
}
colors = pls_df['category'].map(category_colors).tolist()
ax1.barh(pls_df['feature'][::-1], pls_df['VIP'][::-1], color=colors[::-1])
ax1.axvline(x=VIP_IMPORTANT, color='green', linestyle='--', linewidth=2, label=f'Important ({VIP_IMPORTANT})')
ax1.axvline(x=VIP_MODERATE, color='orange', linestyle='--', linewidth=1.5, label=f'Moderate ({VIP_MODERATE})')
ax1.set_xlabel('VIP Score')
ax1.set_title('PLS Variable Importance in Projection')
ax1.legend()

# Right panel: cross-validated R² for each candidate component count.
component_counts = range(1, max_comp + 1)
ax2.plot(component_counts, cv_scores, 'bo-', linewidth=2, markersize=8)
ax2.axvline(x=optimal_comp, color='red', linestyle='--', label=f'Optimal = {optimal_comp}')
ax2.set_xlabel('Number of Components')
ax2.set_ylabel('CV R²')
ax2.set_title('PLS Component Selection')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 15: INTERACTION SCREENING
# =============================================================================

print("=" * 60)
print("INTERACTION SCREENING")
print("=" * 60)
print(f"""
Checking if effect of Feature A depends on level of Feature B.

Why it matters:
  - If A × B interaction is strong, include BOTH in BO
  - Even if individual effects are weak, interaction may be important

Threshold: {INTERACTION_THRESHOLD}
""")

# Only the features with the strongest |correlation| are screened (at most 6).
top_n = min(6, len(feature_cols))
top_features = corr_df.head(top_n)['feature'].tolist()

interaction_results = []

for f1, f2 in combinations(top_features, 2):
    # Split the samples into a low and a high group of f2 around its median.
    median_f2 = X[f2].median()
    low_f2 = X[f2] <= median_f2
    high_f2 = X[f2] > median_f2

    # When the median equals the largest level (e.g. a 0/1 feature that is
    # mostly 1, or a factor with many runs at its top setting) nothing lies
    # strictly above it and the pair would be skipped without notice. Split
    # strictly below the median instead so both groups can be populated.
    if not high_f2.any():
        low_f2 = X[f2] < median_f2
        high_f2 = X[f2] >= median_f2

    # Check we have enough samples in each group
    if low_f2.sum() >= 3 and high_f2.sum() >= 3:
        # Correlation of f1 with y in each group
        corr_low = X.loc[low_f2, f1].corr(y[low_f2])
        corr_high = X.loc[high_f2, f1].corr(y[high_f2])

        # A NaN correlation (e.g. f1 constant within a group) gives no evidence.
        if not np.isnan(corr_low) and not np.isnan(corr_high):
            # Strength = how far the f1-response correlation moves between groups.
            interaction_strength = abs(corr_high - corr_low)

            interaction_results.append({
                'interaction': f'{f1} × {f2}',
                'feature_1': f1,
                'feature_2': f2,
                'corr_low_f2': corr_low,
                'corr_high_f2': corr_high,
                'strength': interaction_strength,
                'interpretation': 'Effect changes' if interaction_strength > INTERACTION_THRESHOLD else 'No interaction'
            })

if not interaction_results:
    # No pair passed the group-size / NaN checks: define empty outputs so
    # later cells can run unchanged.
    print("✓ No interactions to analyze (insufficient data or features)")
    interaction_df = pd.DataFrame()
    strong_interactions = pd.DataFrame()
    features_with_interactions = []
else:
    interaction_df = pd.DataFrame(interaction_results).sort_values('strength', ascending=False)

    print("Interaction Analysis Results:")
    print(interaction_df[['interaction', 'corr_low_f2', 'corr_high_f2', 'strength', 'interpretation']].to_string(index=False))

    # Pairs whose strength exceeds the threshold.
    strong_interactions = interaction_df[interaction_df['strength'] > INTERACTION_THRESHOLD]

    if len(strong_interactions) == 0:
        print(f"\n✓ No strong interactions detected (threshold: {INTERACTION_THRESHOLD})")
        features_with_interactions = []
    else:
        print(f"\n⚠️ POTENTIAL INTERACTIONS DETECTED:")
        features_with_interactions = set()
        for pair in strong_interactions.itertuples(index=False):
            print(f"\n  {pair.interaction}: strength = {pair.strength:.3f}")
            print(f"    Correlation of {pair.feature_1} with response:")
            print(f"      When {pair.feature_2} is LOW:  r = {pair.corr_low_f2:.3f}")
            print(f"      When {pair.feature_2} is HIGH: r = {pair.corr_high_f2:.3f}")
            print(f"    → Include BOTH features in BO!")
            features_with_interactions.update((pair.feature_1, pair.feature_2))

        features_with_interactions = list(features_with_interactions)

    # Bar chart of every screened pair (this branch always has at least one),
    # strongest at the top.
    colors = ['crimson' if s > INTERACTION_THRESHOLD else 'steelblue' for s in interaction_df['strength']]
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.barh(interaction_df['interaction'][::-1], interaction_df['strength'][::-1], color=colors[::-1])
    ax.axvline(x=INTERACTION_THRESHOLD, color='red', linestyle='--', linewidth=2,
               label=f'Threshold ({INTERACTION_THRESHOLD})')
    ax.set_xlabel('Interaction Strength')
    ax.set_title('Interaction Screening\n(Red = Potential Interaction)')
    ax.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# =============================================================================
# CELL 16: MULTICOLLINEARITY CHECK
# =============================================================================

print("=" * 60)
print("MULTICOLLINEARITY CHECK")
print("=" * 60)
print(f"""
Checking correlations BETWEEN features.

Why it matters:
  - Highly correlated features contain redundant information
  - Including both wastes BO dimensions
  - Keep only ONE from each correlated pair

Threshold: |r| > {MULTICOLLINEARITY_THRESHOLD}
""")

# Pairwise correlation matrix of the feature columns.
feature_corr = X.corr()

# Visit each unordered feature pair once (upper triangle, diagonal excluded)
# and keep those whose |r| exceeds the threshold.
high_corr_pairs = []
for (i, name_i), (j, name_j) in combinations(enumerate(feature_cols), 2):
    r = feature_corr.iloc[i, j]
    if abs(r) > MULTICOLLINEARITY_THRESHOLD:
        high_corr_pairs.append({
            'feature_1': name_i,
            'feature_2': name_j,
            'correlation': r
        })

if not high_corr_pairs:
    print("✓ No highly correlated feature pairs found")
else:
    print("⚠️ HIGHLY CORRELATED PAIRS:")
    for pair in high_corr_pairs:
        print(f"\n  {pair['feature_1']} ↔ {pair['feature_2']}: r = {pair['correlation']:.3f}")
        print(f"    → Consider keeping only ONE in BO")

# Heatmap of the lower triangle; the diagonal and the upper triangle are masked.
mask = np.triu(np.ones(feature_corr.shape, dtype=bool))
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(feature_corr, annot=True, cmap='RdBu_r', center=0, fmt='.2f',
            mask=mask, square=True, linewidths=0.5, ax=ax)
ax.set_title('Feature-Feature Correlations\n(Check for multicollinearity)')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 17: CONSENSUS RANKING
# =============================================================================

print("=" * 60)
print("CONSENSUS RANKING (All Methods)")
print("=" * 60)

# Columns each method contributes to the combined table.
corr_part = corr_df[['feature', 'rank_corr', 'correlation', 'direction', 'type', 'strength']]
lasso_part = lasso_df[['feature', 'rank_lasso', 'selected']]
pls_part = pls_df[['feature', 'rank_pls', 'VIP', 'category']]

# Join on the feature name.
consensus = corr_part.merge(lasso_part, on='feature').merge(pls_part, on='feature')

# Mean of the three per-method ranks; lower is better.
rank_columns = ['rank_corr', 'rank_lasso', 'rank_pls']
consensus['avg_rank'] = consensus[rank_columns].mean(axis=1)
consensus = consensus.sort_values('avg_rank').reset_index(drop=True)
consensus['final_rank'] = consensus.index + 1

# Method agreement (how many methods rank in top 3)
def count_top_k(row, k=3):
    """Count how many screening methods place a feature in their top ``k``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'rank_corr', 'rank_lasso' and 'rank_pls'. May provide
        'selected' (whether Lasso kept the feature); treated as True when absent.
    k : int, default 3
        Rank cut-off, inclusive.

    Returns
    -------
    int
        Number of methods (0-3) voting for the feature.

    Notes
    -----
    Lasso ranks come from sorting |coefficient|. Features that Lasso eliminated
    all tie at zero, so their ranks only reflect sort order. The Lasso vote is
    therefore counted only when the feature was actually selected; otherwise
    an eliminated feature could be reported as a Lasso top-k pick whenever
    Lasso keeps fewer than ``k`` features.
    """
    votes = 0
    if row['rank_corr'] <= k:
        votes += 1
    if row['rank_lasso'] <= k and row.get('selected', True):
        votes += 1
    if row['rank_pls'] <= k:
        votes += 1
    return votes

# Number of methods that rank each feature in their top 3.
consensus['methods_top3'] = consensus.apply(count_top_k, axis=1, k=3)

# Flag features that took part in a strong interaction.
consensus['has_interaction'] = consensus['feature'].isin(features_with_interactions)

display_cols = ['final_rank', 'feature', 'type', 'correlation', 'VIP', 'selected',
                'avg_rank', 'methods_top3', 'has_interaction']
print("\nConsensus Ranking:")
print(consensus[display_cols].to_string(index=False))

# Features grouped by how many methods agree on them.
high_agreement = consensus.loc[consensus['methods_top3'] >= 3, 'feature'].tolist()
moderate_agreement = consensus.loc[consensus['methods_top3'] == 2, 'feature'].tolist()

print(f"\n✓ High agreement (3/3 methods in top 3): {high_agreement or 'None'}")
print(f"⚠ Moderate agreement (2/3 methods): {moderate_agreement or 'None'}")

In [None]:
# =============================================================================
# CELL 18: CONSENSUS VISUALIZATION
# =============================================================================

print("=" * 60)
print("CONSENSUS VISUALIZATION")
print("=" * 60)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax1, ax2), (ax3, ax4) = axes

# 1. Heatmap of the per-method ranks (rows = features, columns = methods).
heatmap_data = consensus.set_index('feature')[['rank_corr', 'rank_lasso', 'rank_pls']]
heatmap_data.columns = ['Correlation', 'Lasso', 'PLS']
sns.heatmap(heatmap_data, annot=True, fmt='.0f', cmap='RdYlGn_r', ax=ax1,
            cbar_kws={'label': 'Rank (lower=better)'})
ax1.set_title('Rankings Across Methods')

# 2. Method agreement: how many methods put the feature in their top 3.
colors = [
    'darkgreen' if votes >= 3 else 'orange' if votes >= 2 else 'lightcoral'
    for votes in consensus['methods_top3']
]
ax2.barh(consensus['feature'][::-1], consensus['methods_top3'][::-1], color=colors[::-1])
ax2.axvline(x=2, color='orange', linestyle='--', linewidth=2)
ax2.set_xlabel('Methods Ranking Feature in Top 3')
ax2.set_title('Method Agreement\n(Green=3/3, Orange=2/3, Red=1/3 or less)')

# 3. Average rank, coloured by feature type. The x-axis is inverted so the
#    best (lowest) average rank has the bar reaching furthest.
colors = ['steelblue' if kind == 'binary' else 'forestgreen' for kind in consensus['type']]
ax3.barh(consensus['feature'][::-1], consensus['avg_rank'][::-1], color=colors[::-1])
ax3.set_xlabel('Average Rank (lower = better)')
ax3.set_title('Consensus Ranking\n(Green=Continuous, Blue=Binary)')
ax3.invert_xaxis()

# 4. |Correlation| vs VIP, one labelled point per feature. Squares mark
#    features that are part of a strong interaction.
for feat_row in consensus.itertuples(index=False):
    point = (abs(feat_row.correlation), feat_row.VIP)
    color = 'steelblue' if feat_row.type == 'binary' else 'forestgreen'
    marker = 's' if feat_row.has_interaction else 'o'
    ax4.scatter(point[0], point[1], c=color, s=100, marker=marker,
                edgecolors='black', linewidth=0.5)
    ax4.annotate(feat_row.feature, point, fontsize=8, ha='left', va='bottom')

ax4.axhline(y=VIP_IMPORTANT, color='green', linestyle='--', alpha=0.7, label=f'VIP={VIP_IMPORTANT}')
ax4.axvline(x=CORRELATION_STRONG, color='blue', linestyle='--', alpha=0.7, label=f'|r|={CORRELATION_STRONG}')
ax4.set_xlabel('|Correlation|')
ax4.set_ylabel('VIP Score')
ax4.set_title('Correlation vs VIP\n(Square=Has Interaction)')
ax4.legend(loc='lower right')

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 19: AUTOMATIC FEATURE RECOMMENDATION
# =============================================================================

print("=" * 60)
print("FEATURE RECOMMENDATION")
print("=" * 60)

# Score each feature
def score_feature(row):
    """Return the additive screening score for one row of the consensus table.

    Points are summed over five criteria:
      * |correlation|: 3 if >= CORRELATION_STRONG, else 2 if >= CORRELATION_MODERATE
      * VIP:           3 if >= VIP_IMPORTANT, else 2 if >= VIP_MODERATE
      * Lasso:         2 if row['selected'] is truthy
      * agreement:     row['methods_top3'] is added unchanged
      * interaction:   2 if row['has_interaction'] is truthy

    `row` only has to support key lookup for the five fields used above.
    """
    # Correlation tier (sign is ignored)
    corr_strength = abs(row['correlation'])
    if corr_strength >= CORRELATION_STRONG:
        corr_points = 3
    elif corr_strength >= CORRELATION_MODERATE:
        corr_points = 2
    else:
        corr_points = 0

    # VIP tier
    vip = row['VIP']
    if vip >= VIP_IMPORTANT:
        vip_points = 3
    elif vip >= VIP_MODERATE:
        vip_points = 2
    else:
        vip_points = 0

    # Flat bonuses and the raw method-agreement count
    lasso_points = 2 if row['selected'] else 0
    agreement_points = row['methods_top3']
    interaction_points = 2 if row['has_interaction'] else 0

    return corr_points + vip_points + lasso_points + agreement_points + interaction_points

# Attach the per-row score, then order the table from highest to lowest score
# and renumber the index so position reflects that order.
consensus['score'] = consensus.apply(score_feature, axis=1)
consensus = consensus.sort_values(by='score', ascending=False)
consensus = consensus.reset_index(drop=True)

score_table_columns = ['feature', 'type', 'correlation', 'VIP', 'selected', 'has_interaction', 'score']
print("\nFeature Scores:")
print(consensus[score_table_columns].to_string(index=False))

# Generate recommendations
# `consensus` is iterated in its current row order (sorted by descending score
# just above), so when the cap applies the highest-scoring qualifiers are kept.
MIN_FEATURES = 3  # lower bound on the recommendation list
MAX_FEATURES = 6  # upper bound on the recommendation list (not the score threshold below)

recommended = []
reasons = {}

for _, row in consensus.iterrows():
    if len(recommended) >= MAX_FEATURES:
        # Cap reached: no later row could be added, so stop scanning.
        break

    # Collect every criterion this feature satisfies; any single hit qualifies it.
    reason = []

    if row['score'] >= 6:
        reason.append(f"High score ({row['score']})")

    if abs(row['correlation']) >= CORRELATION_STRONG:
        reason.append("Strong correlation")

    if row['VIP'] >= VIP_IMPORTANT:
        reason.append(f"VIP ≥ {VIP_IMPORTANT}")

    if row['has_interaction'] and row['score'] >= 4:
        reason.append("Part of interaction")

    if reason:
        recommended.append(row['feature'])
        reasons[row['feature']] = ', '.join(reason)

# Ensure minimum
# Top up with the next rows of `consensus` that are not already recommended.
if len(recommended) < MIN_FEATURES:
    for _, row in consensus.iterrows():
        if row['feature'] not in recommended:
            recommended.append(row['feature'])
            reasons[row['feature']] = "Added to meet minimum"
        if len(recommended) >= MIN_FEATURES:
            break

# Report the recommended features with the statistics and reasons behind each.
rule = '=' * 50
print(f"\n{rule}")
print(f"RECOMMENDED ({len(recommended)} features):")
print(rule)

for feat in recommended:
    # First consensus row whose feature name matches.
    row = consensus.loc[consensus['feature'] == feat].iloc[0]
    kind_label = 'binary' if row['type'] == 'binary' else 'continuous'
    feat_type = f"[{kind_label}]"
    print(f"\n  ✓ {feat} {feat_type}")
    print(f"      Correlation: {row['correlation']:.3f}")
    print(f"      VIP: {row['VIP']:.2f}")
    print(f"      Reason: {reasons[feat]}")

In [None]:
# =============================================================================
# CELL 20: MANUAL FEATURE SELECTION
# =============================================================================

print("=" * 60, "FEATURE SELECTION", "=" * 60, sep="\n")

# Option 1: Use recommendations (first TARGET_FEATURES entries of the list)
selected_features = recommended[:TARGET_FEATURES]

# Option 2: Manual override - uncomment and modify
# Every name must exist in consensus['feature']; the lookups below take the
# first matching row and fail if there is none.
# selected_features = [
#     'feature_1',
#     'feature_2',
#     'feature_3',
# ]

print(f"\nSelected features ({len(selected_features)}):")
for i, feat in enumerate(selected_features, start=1):
    matches = consensus['feature'] == feat
    row = consensus.loc[matches].iloc[0]
    print(f"  {i}. {feat} ({row['type']}) - corr: {row['correlation']:.3f}, VIP: {row['VIP']:.2f}")

# Validation
print(f"\n{'-' * 40}")
print("VALIDATION CHECKS:")

# Check multicollinearity in selection
# Every unordered pair of selected features is compared once, in list order.
if len(selected_features) > 1:
    sel_corr = X[selected_features].corr()
    issues = []
    for (pos_a, name_a), (pos_b, name_b) in combinations(enumerate(selected_features), 2):
        r = sel_corr.iloc[pos_a, pos_b]
        if abs(r) > MULTICOLLINEARITY_THRESHOLD:
            issues.append(f"{name_a} ↔ {name_b}: r={r:.2f}")

    if not issues:
        print(f"  ✓ No multicollinearity issues")
    else:
        print(f"  ⚠️ Multicollinearity:")
        for issue in issues:
            print(f"      {issue}")

# Check broken interactions
# An interaction is "broken" when exactly one of its two features was selected.
if len(strong_interactions) == 0:
    print(f"  ✓ No interactions to check")
else:
    broken = []
    for f1, f2 in zip(strong_interactions['feature_1'], strong_interactions['feature_2']):
        if (f1 in selected_features) ^ (f2 in selected_features):
            broken.append((f1, f2))

    if not broken:
        print(f"  ✓ No broken interactions")
    else:
        status = {True: "IN", False: "OUT"}
        print(f"  ⚠️ Broken interactions:")
        for f1, f2 in broken:
            in1 = status[f1 in selected_features]
            in2 = status[f2 in selected_features]
            print(f"      {f1} ({in1}) × {f2} ({in2})")

In [None]:
# =============================================================================
# CELL 21: VALIDATE WITH LOO-CV
# =============================================================================

print("=" * 60, "VALIDATION: Leave-One-Out Cross-Validation", "=" * 60, sep="\n")

# Prepare data
# NOTE(review): X_model is built outside this cell; if it was scaled using all
# rows, the held-out row has influenced preprocessing - confirm upstream.
X_sel = X_model[selected_features]

# LOO-CV: refit on all rows but one, then predict the single held-out row.
loo_preds, loo_actual = [], []
for train_idx, test_idx in LeaveOneOut().split(X_sel):
    X_train, y_train = X_sel.iloc[train_idx], y.iloc[train_idx]
    X_held_out, y_held_out = X_sel.iloc[test_idx], y.iloc[test_idx]

    # Ridge penalty is chosen per fold by an inner 3-fold CV over the alpha grid.
    model = RidgeCV(alphas=[0.1, 1, 10, 100], cv=3)
    model.fit(X_train, y_train)

    loo_preds.append(model.predict(X_held_out)[0])
    loo_actual.append(y_held_out.values[0])

loo_preds = np.array(loo_preds)
loo_actual = np.array(loo_actual)

# Metrics
loo_errors = loo_actual - loo_preds
loo_r2 = r2_score(loo_actual, loo_preds)
loo_rmse = np.sqrt(np.mean(loo_errors**2))
loo_mae = np.mean(np.abs(loo_errors))

print(f"\nLOO-CV Results ({len(selected_features)} features):")
print(f"  R²:   {loo_r2:.4f}")
print(f"  RMSE: {loo_rmse:.4f}")
print(f"  MAE:  {loo_mae:.4f}")

# Interpretation
# Thresholds are checked from highest to lowest; the first one exceeded wins,
# and the `else` branch runs only when none is exceeded.
print(f"\nInterpretation:")
signal_levels = [
    (0.5, "  ✓ Good signal - features are predictive"),
    (0.2, "  ⚠️ Moderate signal - GP in BO can likely improve"),
    (0, "  ⚠️ Weak linear signal - may be non-linear"),
]
for threshold, message in signal_levels:
    if loo_r2 > threshold:
        print(message)
        break
else:
    print("  ⚠️ No linear signal - check data or feature selection")

# Plot
# Three diagnostics side by side: parity plot, residual histogram, residuals vs fit.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
ax1, ax2, ax3 = axes[0], axes[1], axes[2]
residuals = loo_actual - loo_preds
point_style = dict(alpha=0.7, edgecolors='black', linewidth=0.5)

# Predicted vs Actual
ax1.scatter(loo_actual, loo_preds, **point_style)
# The y = x reference line spans the combined range of both series.
min_val = min(loo_actual.min(), loo_preds.min())
max_val = max(loo_actual.max(), loo_preds.max())
diagonal = [min_val, max_val]
ax1.plot(diagonal, diagonal, 'r--', linewidth=2, label='Perfect')
ax1.set(xlabel='Actual', ylabel='Predicted', title=f'LOO-CV: R² = {loo_r2:.4f}')
ax1.legend()

# Residuals histogram
ax2.hist(residuals, bins=12, edgecolor='black', alpha=0.7)
ax2.axvline(x=0, color='red', linestyle='--', linewidth=2)
ax2.set(
    xlabel='Residual (Actual - Predicted)',
    ylabel='Frequency',
    title=f'Residuals: Mean={residuals.mean():.3f}',
)

# Residuals vs Predicted
ax3.scatter(loo_preds, residuals, **point_style)
ax3.axhline(y=0, color='red', linestyle='--', linewidth=2)
ax3.set(xlabel='Predicted', ylabel='Residual', title='Residuals vs Predicted')

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 22: DEFINE BO SEARCH SPACE
# =============================================================================

print("=" * 60, "BAYESIAN OPTIMIZATION SEARCH SPACE", "=" * 60, sep="\n")

# Create bounds
# Binary features get the fixed range [0, 1]; every other feature gets its
# observed range in original_X widened by 10% of that range on each side.
bo_bounds = []
for feat in selected_features:
    if feat in BINARY_FEATURES:
        record = dict(
            feature=feat,
            type='binary',
            min=0,
            max=1,
            observed_min=0,
            observed_max=1,
        )
    else:
        feat_min = original_X[feat].min()
        feat_max = original_X[feat].max()
        margin = 0.1 * (feat_max - feat_min)
        record = dict(
            feature=feat,
            type='continuous',
            min=feat_min - margin,
            max=feat_max + margin,
            observed_min=feat_min,
            observed_max=feat_max,
        )
    bo_bounds.append(record)

bo_bounds_df = pd.DataFrame(bo_bounds)

print("\nSearch Space Bounds:")
print(bo_bounds_df.to_string(index=False))

# Effect directions
# The sign of each feature's correlation decides which end of its range to
# favour, flipped when the response is being minimised.
# NOTE(review): a correlation that is not > 0 (including exactly 0 or NaN) is
# reported as "Negative" - confirm that is acceptable.
print(f"\n{'-' * 40}")
print("EFFECT DIRECTIONS:")
for feat in selected_features:
    row = consensus.loc[consensus['feature'] == feat].iloc[0]
    direction = row['correlation']
    is_positive = direction > 0

    if MAXIMIZE_RESPONSE:
        wants_high = is_positive
    else:
        wants_high = not is_positive
    suggest = "HIGH" if wants_high else "LOW"

    print(f"  {feat}: {'Positive' if direction > 0 else 'Negative'} effect → Suggest {suggest}")

In [None]:
# =============================================================================
# CELL 23: EXPORT FOR BO
# =============================================================================

print("=" * 60, "EXPORT FOR BAYESIAN OPTIMIZATION", "=" * 60, sep="\n")

# Initial data
# Selected feature columns plus the response, attached by position.
bo_data = original_X[selected_features].copy()
bo_data[RESPONSE_COLUMN] = original_y.values

# Best point so far
# Index label and value of the extreme response in the optimisation direction.
if MAXIMIZE_RESPONSE:
    best_idx, best_val = original_y.idxmax(), original_y.max()
else:
    best_idx, best_val = original_y.idxmin(), original_y.min()

n_samples = len(bo_data)
n_features = len(selected_features)
print(f"\nInitial Data:")
print(f"  Samples: {n_samples}")
print(f"  Features: {n_features}")

print(f"\nBest Observed {'Maximum' if MAXIMIZE_RESPONSE else 'Minimum'}:")
print(f"  {RESPONSE_COLUMN} = {best_val:.4f}")
print(f"  Conditions:")
for feat in selected_features:
    best_setting = original_X.loc[best_idx, feat]
    print(f"    {feat}: {best_setting:.4f}")

# Save files
# All CSVs are written to the current working directory without the index.
bo_bounds_df.to_csv('bo_bounds.csv', index=False)
bo_data.to_csv('bo_initial_data.csv', index=False)

# Save feature info
info_columns = ['feature', 'type', 'correlation', 'VIP', 'has_interaction']
is_selected = consensus['feature'].isin(selected_features)
feature_info = consensus[is_selected][info_columns].reset_index(drop=True)
feature_info.to_csv('bo_feature_info.csv', index=False)

# Save binary mappings for selected binary features
# NOTE(review): value_0 / value_1 are taken from the first and second key of
# each mapping, so this presumably relies on key order matching the 0/1
# encoding and on two keys being present - verify where binary_mappings is built.
selected_binary = [f for f in selected_features if f in BINARY_FEATURES]
if selected_binary:
    mapping_records = []
    for binary_feat in selected_binary:
        level_keys = list(binary_mappings[binary_feat].keys())
        mapping_records.append({
            'feature': binary_feat,
            'value_0': level_keys[0],
            'value_1': level_keys[1],
        })
    pd.DataFrame(mapping_records).to_csv('bo_binary_mappings.csv', index=False)
    print(f"\n✓ Saved: bo_binary_mappings.csv")

for line in ("\n✓ Saved: bo_bounds.csv", "✓ Saved: bo_initial_data.csv", "✓ Saved: bo_feature_info.csv"):
    print(line)

In [None]:
# =============================================================================
# CELL 24: FINAL SUMMARY
# =============================================================================

# Closing report: data overview, methods used, the selected features with their
# statistics and bounds, validation metrics, and hand-off notes for Phase 2.
# It reads results produced by earlier cells (e.g. loo_r2, loo_rmse,
# bo_bounds_df, selected_binary), so those cells must have run first.
print("=" * 70)
print("PHASE 1 COMPLETE: FEATURE SCREENING SUMMARY")
print("=" * 70)

# Data overview and the fixed list of screening methods.
print(f"""
DATA:
  Total samples: {len(X)}
  Total features: {len(feature_cols)}
    - Binary: {len(BINARY_FEATURES)}
    - Continuous: {len(CONTINUOUS_FEATURES)}

METHODS USED:
  1. Pearson Correlation
  2. Lasso Regression (automatic selection)
  3. PLS with VIP Scores
  4. Interaction Screening
  5. Multicollinearity Check

SELECTED FOR BAYESIAN OPTIMIZATION ({len(selected_features)}):
""")

# One entry per selected feature, combining its consensus row with its bounds row.
for feat in selected_features:
    row = consensus[consensus['feature'] == feat].iloc[0]
    bounds = bo_bounds_df[bo_bounds_df['feature'] == feat].iloc[0]
    # Marker appended to features whose consensus row has has_interaction set.
    int_flag = " ⚡" if row['has_interaction'] else ""
    print(f"  • {feat} ({row['type']}){int_flag}")
    print(f"      Correlation: {row['correlation']:+.3f}")
    print(f"      VIP Score: {row['VIP']:.2f}")
    print(f"      Bounds: [{bounds['min']:.2f}, {bounds['max']:.2f}]")

# Legend for the marker; printed whenever features_with_interactions is truthy.
# NOTE(review): this is not restricted to the selected features, so the legend
# can appear even if no listed feature carries the marker.
if features_with_interactions:
    print(f"\n  ⚡ = Part of detected interaction")

# Validation metrics and next steps; the expected iteration range is 20x to 40x
# the number of selected features.
print(f"""
VALIDATION:
  LOO-CV R²: {loo_r2:.4f}
  LOO-CV RMSE: {loo_rmse:.4f}

NEXT STEPS (Phase 2: Bayesian Optimization):
  1. Load bo_initial_data.csv as initial training data
  2. Use bo_bounds.csv for search space
  3. Fit GP surrogate model
  4. Run acquisition function loop (EI or UCB)
  5. Expected iterations: {20*len(selected_features)}-{40*len(selected_features)}

OUTPUT FILES:
  • bo_bounds.csv - Feature bounds for BO
  • bo_initial_data.csv - Initial GP training data
  • bo_feature_info.csv - Feature details
""")

# Extra output-file bullet, shown only when selected_binary is non-empty. It is
# printed after the block above, so it lands below the blank line that closes
# the OUTPUT FILES list.
if selected_binary:
    print(f"  • bo_binary_mappings.csv - Binary encoding reference")

print("=" * 70)
print("READY FOR PHASE 2: BAYESIAN OPTIMIZATION")
print("=" * 70)