In [None]:
# =============================================================================
# CELL 1: IMPORTS (UPDATED)
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel, WhiteKernel
from sklearn.cross_decomposition import PLSRegression  # NEW
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score, mean_squared_error

# Optional dependency: XGBoost.
# The import is guarded so the notebook still runs without the package;
# XGBOOST_AVAILABLE records the outcome for later cells to branch on.
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
    print("✓ XGBoost available")
except ImportError:
    XGBOOST_AVAILABLE = False
    print("⚠ XGBoost not installed. Run: pip install xgboost")

# Single seed shared by the split and every model below; also seeds NumPy's
# legacy global RNG for code that calls np.random.* directly.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("✓ Imports complete")

In [None]:
# =============================================================================
# CELL 2: CONFIGURATION - MODIFY THESE
# =============================================================================

# User-tunable settings read by the cells below.
FILE_PATH = "your_data.xlsx"           # Excel file passed to pd.read_excel
RESPONSE_COLUMN = "yield"              # column used as the regression target
EXCLUDE_COLUMNS = ["experiment_id"]    # numeric columns that must NOT be used as features
TEST_SIZE = 0.2                        # fraction of rows held out by train_test_split
TOP_K = 5                              # number of features taken from the consensus ranking

# Echo the active configuration so it is visible in the notebook output.
print(f"Response: {RESPONSE_COLUMN}")
print(f"Exclude: {EXCLUDE_COLUMNS}")
print(f"Top K: {TOP_K}")

In [None]:
# =============================================================================
# CELL 3: LOAD DATA
# =============================================================================

# Load the configured Excel file. If it cannot be read, fall back to a
# synthetic demo dataset -- but report WHY, so a mistyped FILE_PATH or a
# missing Excel engine is not silently mistaken for a successful load.
try:
    df = pd.read_excel(FILE_PATH)
    print(f"✓ Loaded: {df.shape}")
except Exception as exc:  # not a bare except: KeyboardInterrupt/SystemExit still propagate
    print(f"⚠ Could not load '{FILE_PATH}' ({type(exc).__name__}: {exc})")
    print("Creating sample chemical data...")
    np.random.seed(42)
    n = 200
    df = pd.DataFrame({
        'experiment_id': range(1, n+1),
        'temperature_C': np.random.uniform(20, 100, n),
        'pressure_bar': np.random.uniform(1, 10, n),
        'pH': np.random.uniform(2, 12, n),
        'concentration_mol_L': np.random.exponential(0.5, n),
        'reaction_time_min': np.random.uniform(5, 120, n),
        'catalyst_amount_g': np.random.uniform(0.1, 5, n),
        'stirring_speed_rpm': np.random.uniform(100, 1000, n),
        'solvent_ratio': np.random.uniform(0.1, 0.9, n),
        'humidity_percent': np.random.uniform(30, 80, n),
        'particle_size_um': np.random.uniform(1, 100, n),
    })
    # Response with known relationships: only temperature, catalyst amount,
    # pH and reaction time drive the yield; the other columns are pure noise.
    df['yield'] = (
        0.5 * df['temperature_C'] +
        2.0 * df['catalyst_amount_g'] +
        -0.3 * df['pH'] +
        0.1 * df['reaction_time_min'] +
        np.random.normal(0, 5, n)
    ).clip(0, 100)

    # Add some missing values (rows are drawn with replacement, so up to 5)
    df.loc[np.random.choice(n, 5), 'humidity_percent'] = np.nan
    print(f"✓ Sample data created: {df.shape}")

# Summarise what was loaded; only columns that actually have gaps are listed.
missing_counts = df.isnull().sum()
print(f"\nColumns: {list(df.columns)}")
print(f"\nMissing values:\n{missing_counts[missing_counts > 0]}")
df.head()

In [None]:
# =============================================================================
# CELL 4: PREPARE FEATURES AND TARGET
# =============================================================================

# Candidate features = every numeric column that is neither the response
# nor explicitly listed in EXCLUDE_COLUMNS.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
not_features = EXCLUDE_COLUMNS + [RESPONSE_COLUMN]
feature_cols = list(filter(lambda col: col not in not_features, numeric_cols))

# Work on copies so later cells never write back into df.
X = df.loc[:, feature_cols].copy()
y = df.loc[:, RESPONSE_COLUMN].copy()

print(f"Features ({len(feature_cols)}): {feature_cols}")
print(f"Target: {RESPONSE_COLUMN}")
print(f"Shape: X={X.shape}, y={y.shape}")

In [None]:
# =============================================================================
# CELL 5: CHEMICAL DATA CHECKS
# =============================================================================

# --- Outliers: values more than 3 sample standard deviations from the column mean
print("=" * 50)
print("OUTLIER DETECTION (Z-score > 3)")
print("=" * 50)
z_scores = ((X - X.mean()) / X.std()).abs()
outliers = z_scores.gt(3).sum()
for col, n_flagged in outliers[outliers > 0].items():
    print(f"  {col}: {n_flagged} outliers")
if outliers.sum() == 0:
    print("  No outliers detected")

# --- Multicollinearity: report every feature pair with |r| above 0.9
print("\n" + "=" * 50)
print("MULTICOLLINEARITY CHECK (|r| > 0.9)")
print("=" * 50)
corr = X.corr().abs()
high_corr = []
corr_names = list(corr.columns)
# Walk the strict upper triangle so each unordered pair is visited once.
for i, first in enumerate(corr_names):
    for j in range(i + 1, len(corr_names)):
        pair_r = corr.iloc[i, j]
        if pair_r > 0.9:
            high_corr.append((first, corr_names[j], pair_r))
            print(f"  {first} ↔ {corr_names[j]}: {pair_r:.3f}")
if len(high_corr) == 0:
    print("  No highly correlated features")

# --- Signed correlation heatmap for visual inspection
plt.figure(figsize=(10, 8))
sns.heatmap(X.corr(), annot=True, cmap='RdBu_r', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 6: TRAIN/TEST SPLIT (BEFORE STANDARDIZATION - NO LEAKAGE)
# =============================================================================

# Split first; every fitted transform below (imputer, scaler) is learned
# from the training part only.
split_parts = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

In [None]:
# =============================================================================
# CELL 7: IMPUTE MISSING VALUES (FIT ON TRAIN ONLY)
# =============================================================================

# Drop rows whose target is missing BEFORE fitting the imputer, so the
# medians are learned only from rows that are actually used for training.
train_mask = ~y_train.isnull()
test_mask = ~y_test.isnull()
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]

# With its default settings SimpleImputer discards a column that has no
# observed value at fit time, which would otherwise surface below as an
# opaque shape-mismatch error when the DataFrame is rebuilt.
empty_cols = X_train.columns[X_train.isnull().all()].tolist()
if empty_cols:
    raise ValueError(
        f"Features with no observed values in the training split: {empty_cols}. "
        "Add them to EXCLUDE_COLUMNS (or supply data) before imputing."
    )

imputer = SimpleImputer(strategy='median')

# Fit on train, transform both; rebuild DataFrames to keep labels and index
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

print(f"✓ Missing values imputed (fitted on training only)")
print(f"  Final training: {len(X_train)}, test: {len(X_test)}")

In [None]:
# =============================================================================
# CELL 8: STANDARDIZATION (FIT ON TRAIN ONLY - NO LEAKAGE)
# =============================================================================

# Learn mean/std from the training rows only, then reuse those statistics
# for both splits.
scaler = StandardScaler()
scaler.fit(X_train)


def _scaled_frame(frame):
    """Apply the fitted scaler and keep the frame's column labels and index."""
    return pd.DataFrame(
        scaler.transform(frame),
        columns=frame.columns,
        index=frame.index
    )


X_train_scaled = _scaled_frame(X_train)
X_test_scaled = _scaled_frame(X_test)

print("✓ Standardization complete (NO DATA LEAKAGE)")
print("  - Scaler fitted on training data only")
print("  - Training mean applied to test data")
print(f"\nTraining data stats (should be ~0 mean, ~1 std):")
train_stats = X_train_scaled.describe().loc[['mean', 'std']]
print(train_stats.round(4))

In [None]:
# =============================================================================
# CELL 9: METHOD 1 - LINEAR REGRESSION
# =============================================================================

print("=" * 50)
print("METHOD 1: LINEAR REGRESSION")
print("=" * 50)

lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Features are standardized, so |coefficient| is comparable across features
# and is used directly as the importance score.
lr_coef_table = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'coefficient': lr_model.coef_,
    'importance': np.abs(lr_model.coef_)
})
lr_importance = lr_coef_table.sort_values('importance', ascending=False)
lr_importance = lr_importance.reset_index(drop=True)
lr_importance['rank'] = range(1, len(lr_importance) + 1)

# Goodness of fit on both splits
lr_train_r2 = r2_score(y_train, lr_model.predict(X_train_scaled))
lr_test_r2 = r2_score(y_test, lr_model.predict(X_test_scaled))

print(f"\nTrain R²: {lr_train_r2:.4f}")
print(f"Test R²:  {lr_test_r2:.4f}")
print(f"\nFeature Importance (|coefficient|):")
print(lr_importance[['rank', 'feature', 'coefficient', 'importance']].to_string(index=False))

# Horizontal bars, most important at the top; colour encodes coefficient sign
colors = []
for coef_value in lr_importance['coefficient']:
    colors.append('green' if coef_value > 0 else 'red')

plt.figure(figsize=(10, 6))
plt.barh(
    lr_importance['feature'].iloc[::-1],
    lr_importance['importance'].iloc[::-1],
    color=list(reversed(colors))
)
plt.xlabel('|Coefficient|')
plt.title('Linear Regression Feature Importance\n(Green=Positive, Red=Negative)')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 10: METHOD 2 - RANDOM FOREST
# =============================================================================

print("=" * 50)
print("METHOD 2: RANDOM FOREST")
print("=" * 50)

rf_model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
rf_model.fit(X_train_scaled, y_train)

# Impurity-based importance (MDI) straight from the fitted forest
rf_mdi_table = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'importance_mdi': rf_model.feature_importances_
})
rf_importance = rf_mdi_table.sort_values('importance_mdi', ascending=False).reset_index(drop=True)

# Permutation importance on the held-out split (more reliable)
print("\nCalculating permutation importance...")
perm_result = permutation_importance(
    rf_model, X_test_scaled, y_test,
    n_repeats=30, random_state=RANDOM_STATE, n_jobs=-1
)
# importances_mean follows the column order of X_test_scaled; label it so it
# can be re-ordered to match the MDI-sorted table.
rf_perm_by_feature = pd.Series(perm_result.importances_mean, index=X_train_scaled.columns)
rf_importance['importance_perm'] = rf_perm_by_feature.loc[rf_importance['feature'].tolist()].to_numpy()
rf_importance['rank'] = range(1, len(rf_importance) + 1)

# Goodness of fit on both splits
rf_train_r2 = r2_score(y_train, rf_model.predict(X_train_scaled))
rf_test_r2 = r2_score(y_test, rf_model.predict(X_test_scaled))

print(f"\nTrain R²: {rf_train_r2:.4f}")
print(f"Test R²:  {rf_test_r2:.4f}")
print(f"\nFeature Importance:")
print(rf_importance[['rank', 'feature', 'importance_mdi', 'importance_perm']].to_string(index=False))

# Side-by-side bars: MDI (left) vs. permutation importance (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
mdi_ax, perm_ax = axes[0], axes[1]

mdi_ax.barh(
    rf_importance['feature'].iloc[::-1],
    rf_importance['importance_mdi'].iloc[::-1],
    color='forestgreen'
)
mdi_ax.set_xlabel('Importance (MDI)')
mdi_ax.set_title('Random Forest - Mean Decrease Impurity')

rf_perm_sorted = rf_importance.sort_values('importance_perm', ascending=False)
perm_ax.barh(
    rf_perm_sorted['feature'].iloc[::-1],
    rf_perm_sorted['importance_perm'].iloc[::-1],
    color='steelblue'
)
perm_ax.set_xlabel('Importance (Permutation)')
perm_ax.set_title('Random Forest - Permutation Importance')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 11: METHOD 3 - GAUSSIAN PROCESS (BAYESIAN)
# =============================================================================

print("=" * 50)
print("METHOD 3: GAUSSIAN PROCESS (BAYESIAN)")
print("=" * 50)

# Subsample if needed (GP is O(n³)).
# A local generator seeded with RANDOM_STATE is used instead of the global
# np.random state, so the chosen subset does not depend on which cells were
# run (or re-run) before this one.
max_samples = 500
if len(X_train_scaled) > max_samples:
    print(f"Subsampling to {max_samples} for GP (computational efficiency)")
    subsample_rng = np.random.default_rng(RANDOM_STATE)
    idx = subsample_rng.choice(len(X_train_scaled), max_samples, replace=False)
    X_train_gp = X_train_scaled.iloc[idx]
    y_train_gp = y_train.iloc[idx]
else:
    X_train_gp = X_train_scaled
    y_train_gp = y_train

# Kernel with ARD: one Matern length scale per feature, scaled by a constant
# amplitude, plus a white-noise term for observation noise.
n_features = X_train_scaled.shape[1]
kernel = ConstantKernel(1.0) * Matern(length_scale=np.ones(n_features), nu=2.5) + WhiteKernel(1.0)

print("Fitting Gaussian Process (may take a moment)...")
gp_model = GaussianProcessRegressor(
    kernel=kernel,
    n_restarts_optimizer=5,
    random_state=RANDOM_STATE,
    normalize_y=True
)
gp_model.fit(X_train_gp, y_train_gp)

# Permutation importance on the held-out split
print("Calculating permutation importance...")
gp_perm = permutation_importance(
    gp_model, X_test_scaled, y_test,
    n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1
)

gp_importance = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'importance': gp_perm.importances_mean,
    'importance_std': gp_perm.importances_std
}).sort_values('importance', ascending=False).reset_index(drop=True)
gp_importance['rank'] = range(1, len(gp_importance) + 1)

# Performance (train R² is measured on the rows the GP was actually fitted on)
gp_train_r2 = r2_score(y_train_gp, gp_model.predict(X_train_gp))
gp_test_r2 = r2_score(y_test, gp_model.predict(X_test_scaled))

print(f"\nTrain R²: {gp_train_r2:.4f}")
print(f"Test R²:  {gp_test_r2:.4f}")
print(f"\nFeature Importance (Permutation):")
print(gp_importance[['rank', 'feature', 'importance', 'importance_std']].to_string(index=False))

# Plot: most important feature at the top, error bars = std over repeats
plt.figure(figsize=(10, 6))
plt.barh(gp_importance['feature'][::-1], gp_importance['importance'][::-1],
         xerr=gp_importance['importance_std'][::-1], color='darkorange', capsize=3)
plt.xlabel('Permutation Importance')
plt.title('Gaussian Process - Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 12: METHOD 4 - XGBOOST
# =============================================================================
"""
WHY XGBOOST FOR CHEMISTRY:
- State-of-the-art predictive performance
- Handles non-linear relationships
- Built-in regularization prevents overfitting
- Handles missing values natively
- Fast training
"""

print("=" * 60)
print("METHOD 4: XGBOOST")
print("=" * 60)

if XGBOOST_AVAILABLE:
    xgb_model = xgb.XGBRegressor(
        n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        verbosity=0
    )
    xgb_model.fit(X_train_scaled, y_train)

    # Importance reported by the fitted booster, sorted high to low
    xgb_gain_table = pd.DataFrame({
        'feature': X_train_scaled.columns,
        'importance_gain': xgb_model.feature_importances_
    })
    xgb_importance = xgb_gain_table.sort_values('importance_gain', ascending=False).reset_index(drop=True)

    # Permutation importance on the held-out split
    print("Calculating permutation importance...")
    xgb_perm = permutation_importance(
        xgb_model, X_test_scaled, y_test,
        n_repeats=30, random_state=RANDOM_STATE, n_jobs=-1
    )
    # Label the per-column means so they can be aligned with the sorted table
    xgb_perm_by_feature = pd.Series(xgb_perm.importances_mean, index=X_train_scaled.columns)
    xgb_importance['importance_perm'] = xgb_perm_by_feature.loc[xgb_importance['feature'].tolist()].to_numpy()
    xgb_importance['rank'] = range(1, len(xgb_importance) + 1)

    # Goodness of fit; predictions are computed once per split and reused
    xgb_train_pred = xgb_model.predict(X_train_scaled)
    xgb_test_pred = xgb_model.predict(X_test_scaled)
    xgb_train_r2 = r2_score(y_train, xgb_train_pred)
    xgb_test_r2 = r2_score(y_test, xgb_test_pred)
    xgb_train_rmse = np.sqrt(mean_squared_error(y_train, xgb_train_pred))
    xgb_test_rmse = np.sqrt(mean_squared_error(y_test, xgb_test_pred))

    print(f"\nTrain R²: {xgb_train_r2:.4f}, RMSE: {xgb_train_rmse:.4f}")
    print(f"Test R²:  {xgb_test_r2:.4f}, RMSE: {xgb_test_rmse:.4f}")
    print(f"\nFeature Importance:")
    print(xgb_importance[['rank', 'feature', 'importance_gain', 'importance_perm']].to_string(index=False))

    # Side-by-side bars: built-in importance (left) vs. permutation (right)
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    gain_ax, perm_ax = axes[0], axes[1]

    gain_ax.barh(
        xgb_importance['feature'].iloc[::-1],
        xgb_importance['importance_gain'].iloc[::-1],
        color='darkgreen'
    )
    gain_ax.set_xlabel('Importance (Gain)')
    gain_ax.set_title('XGBoost - Gain-based Importance')

    xgb_perm_sorted = xgb_importance.sort_values('importance_perm', ascending=False)
    perm_ax.barh(
        xgb_perm_sorted['feature'].iloc[::-1],
        xgb_perm_sorted['importance_perm'].iloc[::-1],
        color='teal'
    )
    perm_ax.set_xlabel('Importance (Permutation)')
    perm_ax.set_title('XGBoost - Permutation Importance')

    plt.tight_layout()
    plt.show()
else:
    # Leave sentinels so downstream cells can detect that XGBoost was skipped
    print("⚠ XGBoost not installed. Skipping...")
    xgb_importance = None
    xgb_test_r2 = 0

In [None]:
# =============================================================================
# CELL 13: METHOD 5 - PLS REGRESSION
# =============================================================================
"""
WHY PLS FOR CHEMISTRY:
- Standard method in chemometrics
- Handles multicollinearity (common in chemical data)
- Works when features > samples
- VIP scores are the standard importance metric in chemistry
- Finds latent variables explaining both X and y
"""

print("=" * 60)
print("METHOD 5: PLS REGRESSION (Partial Least Squares)")
print("=" * 60)

# Find optimal number of components via cross-validation.
# The upper bound must hold for every CV *training fold*, not just for the
# full training set: with k folds the largest validation fold has ceil(n / k)
# rows, so the smallest training fold has n - ceil(n / k). One more is
# subtracted, mirroring the "n - 1" margin of the full-data bound.
n_cv_folds = 5
n_train_rows = len(X_train_scaled)
smallest_fold_train = n_train_rows - (-(-n_train_rows // n_cv_folds))
max_components = max(1, min(10, X_train_scaled.shape[1], smallest_fold_train - 1))
cv_scores = []

print("Finding optimal number of components...")
for n_comp in range(1, max_components + 1):
    pls_temp = PLSRegression(n_components=n_comp)
    scores = cross_val_score(pls_temp, X_train_scaled, y_train, cv=n_cv_folds, scoring='r2')
    cv_scores.append(scores.mean())

# argmax is 0-based while component counts start at 1
optimal_components = int(np.argmax(cv_scores)) + 1
print(f"Optimal components: {optimal_components} (CV R² = {max(cv_scores):.4f})")

# Fit final model on the full training split
pls_model = PLSRegression(n_components=optimal_components)
pls_model.fit(X_train_scaled, y_train)


# Calculate VIP scores (Variable Importance in Projection)
def calculate_vip_scores(model, X=None):
    """
    Calculate VIP (Variable Importance in Projection) scores.
    VIP > 1 is typically considered important in chemometrics.

    Parameters
    ----------
    model : fitted PLS-like object
        Must expose ``x_scores_`` (n_samples, n_components),
        ``x_weights_`` (n_features, n_components) and
        ``y_loadings_`` (n_targets, n_components).
    X : ignored
        Not used by the computation; accepted only so existing calls of the
        form ``calculate_vip_scores(model, X)`` keep working.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        One VIP score per feature. The mean of the squared scores is 1.
    """
    t = np.asarray(model.x_scores_)                   # X scores
    w = np.asarray(model.x_weights_)                  # X weights
    q = np.atleast_2d(np.asarray(model.y_loadings_))  # Y loadings

    n_features = w.shape[0]

    # Sum of squares of Y explained by each component. Summing q² over the
    # target axis equals q.flatten()² for a single response and also handles
    # several response columns.
    ss_y = np.sum(t ** 2, axis=0) * np.sum(q ** 2, axis=0)

    # Share of each component's (squared, normalised) weight held by each feature
    weight_share = (w ** 2) / np.sum(w ** 2, axis=0)

    # Explained-variance-weighted sum over components, rescaled by n_features
    return np.sqrt(n_features * (weight_share @ ss_y) / np.sum(ss_y))


# VIP score per feature, in the column order of X_train_scaled
vip_scores = calculate_vip_scores(pls_model, X_train_scaled)

# Create importance dataframe (sorted by VIP, highest first).
# flatten() collapses coef_ to 1-D so it lines up with the feature column;
# assumes a single response column -- TODO confirm if y ever becomes 2-D.
pls_importance = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'VIP': vip_scores,
    'coefficient': pls_model.coef_.flatten(),
    'abs_coefficient': np.abs(pls_model.coef_.flatten())
}).sort_values('VIP', ascending=False).reset_index(drop=True)
pls_importance['rank'] = range(1, len(pls_importance) + 1)

# Performance on both splits (R² and RMSE)
pls_train_r2 = r2_score(y_train, pls_model.predict(X_train_scaled))
pls_test_r2 = r2_score(y_test, pls_model.predict(X_test_scaled))
pls_train_rmse = np.sqrt(mean_squared_error(y_train, pls_model.predict(X_train_scaled)))
pls_test_rmse = np.sqrt(mean_squared_error(y_test, pls_model.predict(X_test_scaled)))

print(f"\nTrain R²: {pls_train_r2:.4f}, RMSE: {pls_train_rmse:.4f}")
print(f"Test R²:  {pls_test_r2:.4f}, RMSE: {pls_test_rmse:.4f}")

print(f"\nFeature Importance (VIP > 1 = important in chemometrics):")
print(pls_importance[['rank', 'feature', 'VIP', 'coefficient']].to_string(index=False))

# Identify important features by VIP threshold:
# three disjoint bands -- VIP >= 1.0, 0.8 <= VIP < 1.0, VIP < 0.8
important_vip = pls_importance[pls_importance['VIP'] >= 1.0]['feature'].tolist()
moderately_important = pls_importance[(pls_importance['VIP'] >= 0.8) & (pls_importance['VIP'] < 1.0)]['feature'].tolist()
less_important = pls_importance[pls_importance['VIP'] < 0.8]['feature'].tolist()

print(f"\n✓ Important (VIP ≥ 1.0): {important_vip}")
print(f"⚠ Moderate (0.8 ≤ VIP < 1.0): {moderately_important}")
print(f"✗ Less Important (VIP < 0.8): {less_important}")

# Plot: VIP scores (left) and |coefficients| (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# VIP scores plot -- bar colour follows the same three bands as above;
# series and colours are reversed so the top-ranked feature is drawn at the top
colors = ['darkgreen' if v >= 1.0 else 'orange' if v >= 0.8 else 'lightcoral' 
          for v in pls_importance['VIP']]
axes[0].barh(pls_importance['feature'][::-1], pls_importance['VIP'][::-1], color=colors[::-1])
axes[0].axvline(x=1.0, color='red', linestyle='--', linewidth=2, label='VIP = 1.0 threshold')
axes[0].axvline(x=0.8, color='orange', linestyle='--', linewidth=1, label='VIP = 0.8')
axes[0].set_xlabel('VIP Score')
axes[0].set_title('PLS - Variable Importance in Projection (VIP)\n(Green ≥1.0, Orange ≥0.8, Red <0.8)')
axes[0].legend()

# Coefficient plot -- bar length is |coefficient|, colour encodes the sign;
# features stay in VIP order, not coefficient order
coef_colors = ['forestgreen' if c > 0 else 'crimson' for c in pls_importance['coefficient']]
axes[1].barh(pls_importance['feature'][::-1], pls_importance['abs_coefficient'][::-1], color=coef_colors[::-1])
axes[1].set_xlabel('|Coefficient|')
axes[1].set_title('PLS - Regression Coefficients\n(Green = Positive, Red = Negative)')

plt.tight_layout()
plt.show()

# Component scores plot: mean CV R² for each candidate component count,
# with the selected count marked by a vertical line
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(1, max_components + 1), cv_scores, 'bo-', linewidth=2, markersize=8)
ax.axvline(x=optimal_components, color='red', linestyle='--', label=f'Optimal = {optimal_components}')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cross-Validation R²')
ax.set_title('PLS Component Selection')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 14: AGGREGATE RANKINGS (UPDATED FOR 5 METHODS)
# =============================================================================

print("=" * 60)
print("AGGREGATED RANKINGS (ALL 5 METHODS)")
print("=" * 60)

features = list(X_train_scaled.columns)


def _as_ranking(importance_table, rank_col):
    """Return a feature/rank frame; rank is the row position (1 = top) in the already-sorted table."""
    ranking = importance_table[['feature']].copy()
    ranking[rank_col] = range(1, len(ranking) + 1)
    return ranking


# One ranking frame per method, each in that method's own sort order
lr_ranking = _as_ranking(lr_importance, 'rank_lr')
rf_ranking = _as_ranking(rf_importance, 'rank_rf')
gp_ranking = _as_ranking(gp_importance, 'rank_gp')
if xgb_importance is None:
    # XGBoost was skipped: keep the column, filled with NaN, so the merge still works
    xgb_ranking = pd.DataFrame({'feature': features, 'rank_xgb': [np.nan] * len(features)})
else:
    xgb_ranking = _as_ranking(xgb_importance, 'rank_xgb')
pls_ranking = _as_ranking(pls_importance, 'rank_pls')

# Join all rankings on the feature name
combined = lr_ranking
for method_ranking in (rf_ranking, gp_ranking, xgb_ranking, pls_ranking):
    combined = combined.merge(method_ranking, on='feature')

# Average rank across methods; NaN ranks (missing XGBoost) are skipped
rank_cols = ['rank_lr', 'rank_rf', 'rank_gp', 'rank_xgb', 'rank_pls']
combined['avg_rank'] = combined.loc[:, rank_cols].mean(axis=1, skipna=True)
combined = combined.sort_values('avg_rank').reset_index(drop=True)
combined['final_rank'] = range(1, len(combined) + 1)

# Add agreement score (how many methods agree on top 5)
def count_top_k_agreement(row, k=5):
    """Count how many of the rank_cols columns place this row at rank k or better (NaN ranks are skipped)."""
    in_top_k = [
        col for col in rank_cols
        if pd.notna(row[col]) and row[col] <= k
    ]
    return len(in_top_k)

# Number of methods that rank each feature within their top 5
combined['top5_agreement'] = combined.apply(count_top_k_agreement, axis=1, k=5)

print("\nConsensus Ranking (All 5 Methods):")
consensus_columns = ['final_rank', 'feature', 'rank_lr', 'rank_rf', 'rank_gp',
                     'rank_xgb', 'rank_pls', 'avg_rank', 'top5_agreement']
print(combined[consensus_columns].to_string(index=False))

# Top K by average rank
top_k_features = combined['feature'].head(TOP_K).tolist()
print(f"\n✓ TOP {TOP_K} FEATURES (by consensus): {top_k_features}")

# Features that at least 3 methods place in their top 5
agreed_rows = combined['top5_agreement'] >= 3
high_agreement = combined.loc[agreed_rows, 'feature'].tolist()
print(f"\n✓ HIGH AGREEMENT FEATURES (≥3 methods rank in top 5): {high_agreement}")

# Independent copy for later cells
consensus_ranking = combined.copy()

In [None]:
# =============================================================================
# CELL 15: RANKING VISUALIZATION (UPDATED FOR 5 METHODS)
# =============================================================================

# 2x2 dashboard; this part fills the first three panels
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Bar comparison for top features
#    One group of bars per feature, one bar per method, offset by `width`
ax1 = axes[0, 0]
x = np.arange(len(consensus_ranking.head(TOP_K)))
width = 0.15

ax1.bar(x - 2*width, consensus_ranking.head(TOP_K)['rank_lr'], width, label='Linear Reg', color='steelblue')
ax1.bar(x - width, consensus_ranking.head(TOP_K)['rank_rf'], width, label='Random Forest', color='forestgreen')
ax1.bar(x, consensus_ranking.head(TOP_K)['rank_gp'], width, label='Gaussian Process', color='darkorange')
# The XGBoost bars are drawn only when that method actually ran
if xgb_importance is not None:
    ax1.bar(x + width, consensus_ranking.head(TOP_K)['rank_xgb'], width, label='XGBoost', color='teal')
ax1.bar(x + 2*width, consensus_ranking.head(TOP_K)['rank_pls'], width, label='PLS', color='purple')

ax1.set_xlabel('Feature')
ax1.set_ylabel('Rank (lower = better)')
ax1.set_title(f'Ranking Comparison - Top {TOP_K} Features')
ax1.set_xticks(x)
ax1.set_xticklabels(consensus_ranking.head(TOP_K)['feature'], rotation=45, ha='right')
ax1.legend(loc='upper right')
# Flip the y axis so rank 1 sits at the top of the panel
ax1.invert_yaxis()

# 2. Heatmap of all rankings (first 10 rows of the consensus table)
ax2 = axes[0, 1]
heatmap_cols = ['rank_lr', 'rank_rf', 'rank_gp', 'rank_xgb', 'rank_pls']
heatmap_data = consensus_ranking.set_index('feature')[heatmap_cols].head(10)
# Replace internal column names with display labels (same order as heatmap_cols)
heatmap_data.columns = ['Linear Reg', 'Random Forest', 'Gaussian Proc', 'XGBoost', 'PLS']
sns.heatmap(heatmap_data, annot=True, fmt='.0f', cmap='RdYlGn_r', ax=ax2, cbar_kws={'label': 'Rank'})
ax2.set_title('Feature Rankings Heatmap (Top 10)\n(Lower = More Important)')

# 3. Agreement visualization
#    Bar colour: >= 4 methods agree, >= 3 methods agree, otherwise
ax3 = axes[1, 0]
agreement_data = consensus_ranking.head(10)
colors = ['darkgreen' if a >= 4 else 'orange' if a >= 3 else 'lightcoral' for a in agreement_data['top5_agreement']]
ax3.barh(agreement_data['feature'][::-1], agreement_data['top5_agreement'][::-1], color=colors[::-1])
ax3.axvline(x=3, color='orange', linestyle='--', linewidth=2, label='Good agreement (3+)')
ax3.set_xlabel('Number of Methods Ranking in Top 5')
ax3.set_title('Method Agreement on Top Features')
ax3.legend()

# 4. Model performance comparison
ax4 = axes[1, 1]
model_names = ['Linear Reg', 'Random Forest', 'Gaussian Proc', 'XGBoost', 'PLS']
# XGBoost is reported as 0 when it was skipped
test_r2_scores = [lr_test_r2, rf_test_r2, gp_test_r2, 
                  xgb_test_r2 if xgb_importance is not None else 0, 
                  pls_test_r2]
bar_colors = ['steelblue', 'forestgreen', 'darkorange', 'teal', 'purple']

bars = ax4.bar(model_names, test_r2_scores, color=bar_colors)
ax4.set_ylabel('Test R²')
ax4.set_title('Model Performance Comparison')
# Test R² is negative when a model does worse than predicting the mean.
# A fixed (0, 1) range would clip such bars and their labels, so the lower
# limit is extended (with 10% headroom) whenever a score is below zero.
r2_floor = min(0.0, 1.1 * min(test_r2_scores))
ax4.set_ylim(r2_floor, 1)
if r2_floor < 0:
    ax4.axhline(0, color='black', linewidth=0.8)

# Label each bar with its score; labels of negative bars hang below the bar end
for bar, score in zip(bars, test_r2_scores):
    ax4.annotate(f'{score:.3f}', xy=(bar.get_x() + bar.get_width()/2, bar.get_height()),
                ha='center', va='bottom' if score >= 0 else 'top',
                fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

# Print model performance summary
print("\n" + "=" * 60)
print("MODEL PERFORMANCE SUMMARY")
print("=" * 60)
# Suitability bands: R² > 0.3 -> 'Yes', R² > 0.15 -> 'Caution', otherwise 'No'
performance_df = pd.DataFrame({
    'Model': model_names,
    'Test_R2': test_r2_scores,
    'Suitable_for_Feature_Importance': ['Yes' if r2 > 0.3 else 'Caution' if r2 > 0.15 else 'No' for r2 in test_r2_scores]
})
performance_df = performance_df.sort_values('Test_R2', ascending=False)
print(performance_df.to_string(index=False))

In [None]:
# =============================================================================
# CELL 16: MANUAL FEATURE SELECTION (UPDATED FOR 5 METHODS)
# =============================================================================

def select_features(features_to_use):
    """
    Keep the requested feature names that exist in the scaled training data.

    Names that are not columns of X_train_scaled are reported and ignored.
    For every accepted name the per-method ranks and the PLS VIP score are
    printed.

    Parameters:
    -----------
    features_to_use : list
        List of feature names to use

    Returns:
    --------
    selected_features : list
        The valid names, in the order they were requested.
    """
    known = list(X_train_scaled.columns)

    def _rank_in(table, name):
        # 'rank' of the first row whose 'feature' equals name
        return int(table.loc[table['feature'] == name, 'rank'].values[0])

    invalid = []
    for name in features_to_use:
        if name not in known:
            invalid.append(name)

    valid = []
    for name in features_to_use:
        if name in known:
            valid.append(name)

    if len(invalid) > 0:
        print(f"⚠ Invalid features (ignored): {invalid}")

    print(f"\n✓ Selected {len(valid)} features:")
    position = 0
    for f in valid:
        position += 1
        lr_rank = _rank_in(lr_importance, f)
        rf_rank = _rank_in(rf_importance, f)
        gp_rank = _rank_in(gp_importance, f)
        if xgb_importance is not None:
            xgb_rank = _rank_in(xgb_importance, f)
        else:
            xgb_rank = 'N/A'
        pls_rank = _rank_in(pls_importance, f)
        vip = pls_importance.loc[pls_importance['feature'] == f, 'VIP'].values[0]
        print(f"  {position}. {f} (LR:{lr_rank}, RF:{rf_rank}, GP:{gp_rank}, XGB:{xgb_rank}, PLS:{pls_rank}, VIP:{vip:.2f})")

    return valid


def show_all_features():
    """Print one table row per feature: its rank under each of the 5 methods, its VIP score and its average rank."""
    rule = "-" * 90
    print("\nALL AVAILABLE FEATURES (Ranked by 5 Methods):")
    print(rule)
    print(f"{'#':<4} {'Feature':<22} {'LR':<5} {'RF':<5} {'GP':<5} {'XGB':<5} {'PLS':<5} {'VIP':<6} {'Avg':<6}")
    print(rule)

    for _, row in consensus_ranking.iterrows():
        name = row['feature']
        vip = pls_importance.loc[pls_importance['feature'] == name, 'VIP'].values[0]
        # rank_xgb is NaN when XGBoost was skipped
        if pd.notna(row['rank_xgb']):
            xgb_rank = f"{int(row['rank_xgb'])}"
        else:
            xgb_rank = 'N/A'
        left_part = f"{int(row['final_rank']):<4} {name:<22} "
        rank_part = (f"{int(row['rank_lr']):<5} {int(row['rank_rf']):<5} "
                     f"{int(row['rank_gp']):<5} {xgb_rank:<5} {int(row['rank_pls']):<5} ")
        score_part = f"{vip:<6.2f} {row['avg_rank']:<6.2f}"
        print(left_part + rank_part + score_part)


def use_top_k(k):
    """Return the names of the first k rows of the consensus ranking."""
    top_rows = consensus_ranking.head(k)
    features = top_rows['feature'].tolist()
    print(f"✓ Selected top {k} features: {features}")
    return features


def use_high_agreement(min_agreement=3):
    """Return features whose top5_agreement count is at least min_agreement."""
    enough_votes = consensus_ranking['top5_agreement'] >= min_agreement
    features = consensus_ranking.loc[enough_votes, 'feature'].tolist()
    print(f"✓ Selected {len(features)} features with ≥{min_agreement} methods agreement: {features}")
    return features


def use_vip_threshold(threshold=1.0):
    """Pick the features whose PLS VIP score is at least ``threshold``.

    Filters the notebook-level ``pls_importance`` frame on its ``VIP``
    column, prints the selection, and returns the matching feature names
    as a list.
    """
    passes = pls_importance['VIP'] >= threshold
    chosen = pls_importance.loc[passes, 'feature'].tolist()
    print(f"✓ Selected {len(chosen)} features with VIP ≥ {threshold}: {chosen}")
    return chosen


# Show all options: the full ranking table, the current default selection,
# and a cheat-sheet of the selection helpers.
show_all_features()
print(f"\nCurrent selection (top {TOP_K}): {top_k_features}")
print("\nSELECTION FUNCTIONS AVAILABLE:")

# Each call is shown exactly as it can be typed; in particular the keyword for
# use_high_agreement is `min_agreement` (there is no `min` parameter).
selection_help = [
    ("use_top_k(k)", "Top K by average rank"),
    ("use_high_agreement(min_agreement=3)", "Features with method agreement"),
    ("use_vip_threshold(threshold=1.0)", "Features by PLS VIP score"),
    ("select_features(['feat1', ...])", "Manual selection"),
]
# Pad every call to the longest one so the descriptions line up.
help_width = max(len(call) for call, _ in selection_help)
for call, description in selection_help:
    print(f"  - {call:<{help_width}}: {description}")

In [None]:
# =============================================================================
# CELL 17: EXAMPLE - OVERRIDE FEATURE SELECTION (UPDATED)
# =============================================================================
# Keep exactly one OPTION active. It sets `selected_features`, the list of
# column names the final-model cell uses to subset the scaled train/test data.

# OPTION 1: Use top K from consensus (all 5 methods)
selected_features = use_top_k(TOP_K)

# OPTION 2: Use features with high method agreement
# selected_features = use_high_agreement(min_agreement=3)

# OPTION 3: Use chemometrics standard (VIP >= 1.0)
# selected_features = use_vip_threshold(threshold=1.0)

# OPTION 4: Manually select specific features
# selected_features = select_features(['temperature_C', 'catalyst_amount_g', 'pH'])

# OPTION 5: Combine approaches (intersection or union)
# Filter the ranked lists rather than calling list() on a set: iteration order
# of a set of strings can change between kernel restarts, which would reorder
# the feature columns from run to run.
# vip_features = use_vip_threshold(1.0)
# consensus_features = use_top_k(5)
# selected_features = [f for f in consensus_features if f in vip_features]  # intersection
# selected_features = consensus_features + [f for f in vip_features if f not in consensus_features]  # union

# An empty selection (e.g. a threshold nothing passes, or an empty
# intersection) cannot be modelled; stop here with a clear message.
if len(selected_features) == 0:
    raise ValueError(
        "No features were selected. Relax the selection criteria "
        "(larger k, lower min_agreement or VIP threshold) and re-run this cell."
    )

banner = '=' * 60
print(f"\n{banner}")
print(f"FINAL SELECTED FEATURES: {selected_features}")
print(banner)

In [None]:
# =============================================================================
# CELL 18: FINAL MODEL WITH SELECTED FEATURES (UPDATED FOR 5 METHODS)
# =============================================================================
# Refit each model on the selected feature columns only, score it once on the
# held-out test split, then tabulate the scores and plot predicted vs. actual.


def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and evaluate it once on the evaluation split.

    Returns:
        tuple: ``(r2, rmse, predictions)`` where ``predictions`` is the
        model's output on ``X_eval`` flattened to 1-D.
    """
    model.fit(X_fit, y_fit)
    # predict() output may be 2-D; flatten it so the metrics and the plots
    # below all work from the same 1-D array.
    predictions = np.asarray(model.predict(X_eval)).ravel()
    r2 = r2_score(y_eval, predictions)
    rmse = np.sqrt(mean_squared_error(y_eval, predictions))
    return r2, rmse, predictions


# Fail early with a clear message instead of an opaque estimator error.
if len(selected_features) == 0:
    raise ValueError(
        "selected_features is empty - select at least one feature "
        "before fitting the final models."
    )

# Subset data to selected features
X_train_final = X_train_scaled[selected_features]
X_test_final = X_test_scaled[selected_features]

print("=" * 60)
print("FINAL MODEL COMPARISON (Selected Features Only)")
print("=" * 60)
print(f"Features used ({len(selected_features)}): {selected_features}\n")

results_list = []
test_predictions = {}  # model name -> 1-D test predictions, reused by the plots

# 1. Linear Regression
lr_final = LinearRegression()
lr_r2_final, lr_rmse_final, test_predictions['Linear Regression'] = _fit_and_score(
    lr_final, X_train_final, y_train, X_test_final, y_test)
results_list.append(['Linear Regression', lr_r2_final, lr_rmse_final])

# 2. Random Forest
rf_final = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
rf_r2_final, rf_rmse_final, test_predictions['Random Forest'] = _fit_and_score(
    rf_final, X_train_final, y_train, X_test_final, y_test)
results_list.append(['Random Forest', rf_r2_final, rf_rmse_final])

# 3. Gaussian Process
# One Matern length scale per selected feature, plus a white-noise term.
n_feat = len(selected_features)
gp_kernel = ConstantKernel(1.0) * Matern(length_scale=np.ones(n_feat), nu=2.5) + WhiteKernel(1.0)
gp_final = GaussianProcessRegressor(kernel=gp_kernel, n_restarts_optimizer=5,
                                     random_state=RANDOM_STATE, normalize_y=True)
gp_r2_final, gp_rmse_final, test_predictions['Gaussian Process'] = _fit_and_score(
    gp_final, X_train_final, y_train, X_test_final, y_test)
results_list.append(['Gaussian Process', gp_r2_final, gp_rmse_final])

# 4. XGBoost (only when the optional dependency was importable)
if XGBOOST_AVAILABLE:
    xgb_final = xgb.XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1,
                                  random_state=RANDOM_STATE, verbosity=0)
    xgb_r2_final, xgb_rmse_final, test_predictions['XGBoost'] = _fit_and_score(
        xgb_final, X_train_final, y_train, X_test_final, y_test)
    results_list.append(['XGBoost', xgb_r2_final, xgb_rmse_final])

# 5. PLS
# The component count cannot exceed the number of selected features.
pls_final = PLSRegression(n_components=min(optimal_components, len(selected_features)))
pls_r2_final, pls_rmse_final, test_predictions['PLS'] = _fit_and_score(
    pls_final, X_train_final, y_train, X_test_final, y_test)
results_list.append(['PLS', pls_r2_final, pls_rmse_final])

# Display results, best test R² first
results_df = pd.DataFrame(results_list, columns=['Model', 'Test_R2', 'Test_RMSE'])
results_df = results_df.sort_values('Test_R2', ascending=False)
print(results_df.to_string(index=False))

# Best model
best_model = results_df.iloc[0]['Model']
best_r2 = results_df.iloc[0]['Test_R2']
print(f"\n✓ Best Model: {best_model} (R² = {best_r2:.4f})")

# Plot predictions for all models
n_models = len(results_list)
fig, axes = plt.subplots(1, n_models, figsize=(4*n_models, 4))

models_dict = {
    'Linear Regression': lr_final,
    'Random Forest': rf_final,
    'Gaussian Process': gp_final,
    'PLS': pls_final
}
if XGBOOST_AVAILABLE:
    models_dict['XGBoost'] = xgb_final

for ax, name in zip(axes, models_dict):
    # Reuse the predictions made during scoring instead of predicting again.
    y_pred = test_predictions[name]

    ax.scatter(y_test, y_pred, alpha=0.6, edgecolors='black', linewidth=0.5)

    # Dashed y = x reference line spanning both actual and predicted ranges.
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    ax.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)

    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    r2 = r2_score(y_test, y_pred)
    ax.set_title(f'{name}\nR² = {r2:.4f}')

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# CELL 19: EXPORT RESULTS (UPDATED)
# =============================================================================
# Writes the rankings, per-method importances, selected features and VIP
# analysis to files in the current working directory, then prints a summary.


def _vip_category(vip):
    """Map a VIP score to the label stored in pls_vip_analysis.csv."""
    if vip >= 1.0:
        return 'Important (≥1.0)'
    if vip >= 0.8:
        return 'Moderate (≥0.8)'
    return 'Less Important (<0.8)'


# Save complete rankings to CSV
consensus_ranking.to_csv('feature_rankings_5methods.csv', index=False)
print("✓ Saved: feature_rankings_5methods.csv")

# Save individual method rankings
lr_importance.to_csv('importance_linear_regression.csv', index=False)
rf_importance.to_csv('importance_random_forest.csv', index=False)
gp_importance.to_csv('importance_gaussian_process.csv', index=False)
if xgb_importance is not None:
    xgb_importance.to_csv('importance_xgboost.csv', index=False)
pls_importance.to_csv('importance_pls.csv', index=False)
print("✓ Saved: Individual importance files")

# Save selected features, one name per line.
# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open('selected_features.txt', 'w', encoding='utf-8') as features_file:
    features_file.write('\n'.join(selected_features))
print("✓ Saved: selected_features.txt")

# Save PLS VIP analysis
vip_analysis = pls_importance[['feature', 'VIP', 'coefficient']].copy()
vip_analysis['VIP_category'] = vip_analysis['VIP'].apply(_vip_category)
vip_analysis.to_csv('pls_vip_analysis.csv', index=False)
print("✓ Saved: pls_vip_analysis.csv")

# Only list XGBoost when it actually contributed an importance table.
methods_used = ['Linear Regression', 'Random Forest', 'Gaussian Process']
if xgb_importance is not None:
    methods_used.append('XGBoost')
methods_used.append('PLS')

vip_scores = pls_importance['VIP']
important_features = pls_importance[vip_scores >= 1.0]['feature'].tolist()
moderate_features = pls_importance[(vip_scores >= 0.8) & (vip_scores < 1.0)]['feature'].tolist()

# Summary
print("\n" + "=" * 60)
print("FINAL SUMMARY")
print("=" * 60)
print(f"Total features analyzed: {len(feature_cols)}")
print(f"Methods used: {', '.join(methods_used)}")
print(f"Selected features ({len(selected_features)}): {selected_features}")
print(f"Best model: {best_model} (R² = {best_r2:.4f})")
print("\nFeatures by PLS VIP:")
print(f"  Important (VIP ≥ 1.0): {important_features}")
print(f"  Moderate (VIP ≥ 0.8): {moderate_features}")