# TabPFN + GGH Hybrid Imputation Benchmark

## Research Goal
Combine the strengths of **TabPFN** (pre-trained transformer with learned priors) and **GGH** (gradient-guided hypothesis selection with domain knowledge) into a hybrid imputation method.

## Two-Stage Hybrid Approach
1. **Stage 1: TabPFN Initial Imputation**
   - Map training data to nearest hypothesis class
   - TabPFN predicts among K hypothesis classes (domain-constrained)
   - Get class probabilities for confidence estimation

2. **Stage 2: GGH Gradient Refinement**
   - Train downstream model briefly with Stage 1 imputations
   - Compute gradients for partial data (known correct)
   - Build per-class anchors (correct vs incorrect)
   - For low-confidence samples: combine TabPFN probs with GGH anchor similarity

## Expected Outcome
- TabPFN+GGH Hybrid should outperform TabPFN alone (adds domain knowledge)
- May match or exceed GGH by leveraging TabPFN's learned priors as warm start
- Biggest gains on samples where TabPFN is uncertain

In [7]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
sys.path.insert(0, '../')
sys.path.insert(0, '../GGH')

from GGH.data_ops import DataOperator
from GGH.selection_algorithms import AlgoModulators
from GGH.models import initialize_model
from GGH.train_val_loop import TrainValidationManager
from GGH.inspector import Inspector
from scipy import stats
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import grad
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

def set_to_deterministic(rand_state):
    import random
    random.seed(rand_state)
    np.random.seed(rand_state)
    torch.manual_seed(rand_state)
    torch.set_num_threads(1)
    torch.use_deterministic_algorithms(True)

print("Imports successful!")

# GPU Detection
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


Imports successful!


In [8]:
# =============================================================================
# DATA CONFIGURATION - Photocell Degradation Dataset
# =============================================================================
data_path = '../data/dataset_photo_pce10/data.csv'
results_path = "../saved_results/TabPFN_GGH_Hybrid"

# Variables
inpt_vars = ['P3HT', 'PTB7-Th']
target_vars = ['Degradation']
miss_vars = ['PCBM']

# Hypothesis values (6 PCBM concentration values)
hypothesis = [[0.03, 0.11, 0.20, 0.32, 0.43, 0.6]]
HYPOTHESIS_VALUES = np.array(hypothesis[0])

# Model parameters
hidden_size = 32
output_size = len(target_vars)
hyp_per_sample = len(hypothesis[0])  # 6 hypotheses
batch_size = 100 * hyp_per_sample

# Training parameters
partial_perc = 0.03  # 3% complete data
dropout = 0.05
lr = 0.004
nu = 0.1

# Benchmark parameters
BENCHMARK_N_RUNS = 15
BENCHMARK_EPOCHS = 600

# TabPFN+GGH Hybrid parameters
HYBRID_BRIEF_EPOCHS = 30           # Brief training for gradient computation
HYBRID_CONFIDENCE_THRESHOLD = 0.6  # Re-refine samples where TabPFN max prob < 60%
HYBRID_LR = 0.01                   # Learning rate for brief training

# Create directories
import os
os.makedirs(results_path, exist_ok=True)

print(f"Dataset: Photocell Degradation")
print(f"Hypothesis values: {HYPOTHESIS_VALUES}")
print(f"Number of hypotheses: {hyp_per_sample}")
print(f"Partial percentage: {partial_perc*100}%")
print(f"Benchmark runs: {BENCHMARK_N_RUNS}")
print(f"Epochs: {BENCHMARK_EPOCHS}")
print(f"\nHybrid parameters:")
print(f"  Brief training epochs: {HYBRID_BRIEF_EPOCHS}")
print(f"  Confidence threshold: {HYBRID_CONFIDENCE_THRESHOLD}")
print(f"Results will be saved to: {results_path}")

Dataset: Photocell Degradation
Hypothesis values: [0.03 0.11 0.2  0.32 0.43 0.6 ]
Number of hypotheses: 6
Partial percentage: 3.0%
Benchmark runs: 15
Epochs: 600

Hybrid parameters:
  Brief training epochs: 30
  Confidence threshold: 0.6
Results will be saved to: ../saved_results/TabPFN_GGH_Hybrid


## TabPFN Constrained to Hypothesis Values

First, we implement a TabPFN imputer that is constrained to only output hypothesis values (fair comparison with GGH).

In [9]:
class TabPFNConstrainedImputer:
    """TabPFN imputer constrained to hypothesis values only.
    
    Unlike standard TabPFN which predicts among all unique values in training data,
    this version maps training values to nearest hypothesis and predicts among those.
    This gives TabPFN the same domain knowledge as GGH for fair comparison.
    """
    
    def __init__(self, hypothesis_values, rand_state, device=None):
        self.hypothesis_values = np.array(hypothesis_values)
        self.rand_state = rand_state
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        
        from tabpfn import TabPFNClassifier
        self.tabpfn_cls = TabPFNClassifier
        
    def fit_transform(self, matrix):
        """Impute missing values, constrained to hypothesis values.
        
        Returns:
            imputed_values: Array of imputed values (hypothesis values only)
            probabilities: Class probabilities from TabPFN
            missing_mask: Boolean mask of which rows were imputed
        """
        np.random.seed(self.rand_state)
        torch.manual_seed(self.rand_state)
        
        mask = np.isnan(matrix)
        if not mask.any():
            return matrix[:, -1], None, np.zeros(len(matrix), dtype=bool)
        
        column = np.argwhere(np.sum(mask, axis=0) > 0)[0, 0]
        
        X = np.delete(matrix, column, axis=1)
        y = matrix[:, column]
        
        known_mask = ~mask[:, column]
        X_train, y_train = X[known_mask], y[known_mask]
        X_missing = X[mask[:, column]]
        
        # Map y_train to nearest hypothesis class
        y_train_mapped = np.array([
            self.hypothesis_values[np.argmin(np.abs(self.hypothesis_values - v))]
            for v in y_train
        ])
        
        # Encode as class indices (0 to K-1)
        y_train_encoded = np.array([
            np.argmin(np.abs(self.hypothesis_values - v)) for v in y_train_mapped
        ])
        
        # Train TabPFN
        try:
            model = self.tabpfn_cls(device=self.device)
        except TypeError:
            model = self.tabpfn_cls()
        
        model.fit(X_train, y_train_encoded)
        
        # Get predictions and probabilities
        predictions = model.predict(X_missing)
        probs = model.predict_proba(X_missing)
        
        # Map predictions back to hypothesis values
        imputed_values = self.hypothesis_values[predictions]
        
        return imputed_values, probs, mask[:, column]

print("TabPFNConstrainedImputer defined.")

TabPFNConstrainedImputer defined.


## TabPFN + GGH Hybrid Imputer

The main innovation: combine TabPFN's predictions with GGH's gradient-based anchor similarity.

In [10]:
class HypothesisAmplifyingModel(nn.Module):
    """Neural network for gradient computation in GGH refinement."""
    def __init__(self, n_shared_features, n_hypothesis_features=1, 
                 shared_hidden=16, hypothesis_hidden=32, final_hidden=32, output_size=1):
        super().__init__()
        
        self.shared_path = nn.Sequential(
            nn.Linear(n_shared_features, shared_hidden),
            nn.ReLU(),
        )
        
        self.hypothesis_path = nn.Sequential(
            nn.Linear(n_hypothesis_features, hypothesis_hidden),
            nn.ReLU(),
            nn.Linear(hypothesis_hidden, hypothesis_hidden),
            nn.ReLU(),
        )
        
        combined_size = shared_hidden + hypothesis_hidden
        self.final_path = nn.Sequential(
            nn.Linear(combined_size, final_hidden),
            nn.ReLU(),
            nn.Linear(final_hidden, output_size)
        )
        
        self.n_shared = n_shared_features
        
    def forward(self, x):
        shared_features = x[:, :self.n_shared]
        hypothesis_feature = x[:, self.n_shared:]
        
        shared_emb = self.shared_path(shared_features)
        hypothesis_emb = self.hypothesis_path(hypothesis_feature)
        
        combined = torch.cat([shared_emb, hypothesis_emb], dim=1)
        return self.final_path(combined)


class TabPFNGGHHybridImputer:
    """Two-stage TabPFN + GGH hybrid imputation.
    
    Stage 1: TabPFN imputes with hypothesis constraints, returns probabilities
    Stage 2: For low-confidence samples, refine using GGH anchor similarity
    """
    
    def __init__(self, hypothesis_values, rand_state, device=None,
                 brief_epochs=30, confidence_threshold=0.6, lr=0.01):
        self.hypothesis_values = np.array(hypothesis_values)
        self.rand_state = rand_state
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.brief_epochs = brief_epochs
        self.confidence_threshold = confidence_threshold
        self.lr = lr
        
        # Initialize TabPFN
        self.tabpfn_imputer = TabPFNConstrainedImputer(
            hypothesis_values, rand_state, device
        )
        
        # Stats tracking
        self.stats = {
            'total_imputed': 0,
            'low_confidence': 0,
            'refined': 0,
            'changed': 0,
        }
        
    def fit_transform(self, DO, batch_size=32):
        """Two-stage imputation with GGH refinement.
        
        Args:
            DO: DataOperator with partial data info
            batch_size: Batch size for brief training
            
        Returns:
            imputed_matrix: Matrix with imputed values
        """
        from GGH.imputation_methods import prep_imputation_input
        
        # Prepare imputation input (same as standard Imputer)
        df_train_imp = prep_imputation_input(
            DO, DO.df_train[DO.inpt_vars + DO.miss_vars + DO.target_vars], 
            DO.miss_vars, DO.partial_rows_id
        )
        matrix = df_train_imp.values
        
        # Stage 1: TabPFN initial imputation
        print("  Stage 1: TabPFN initial imputation...")
        initial_values, tabpfn_probs, missing_mask = self.tabpfn_imputer.fit_transform(matrix)
        
        if tabpfn_probs is None:
            # No missing values
            return matrix
        
        self.stats['total_imputed'] = len(initial_values)
        
        # Create initial imputed matrix
        matrix_imputed = matrix.copy()
        column = np.argwhere(np.sum(np.isnan(matrix), axis=0) > 0)[0, 0]
        matrix_imputed[missing_mask, column] = initial_values
        
        # Identify low-confidence samples
        max_probs = np.max(tabpfn_probs, axis=1)
        low_confidence_mask = max_probs < self.confidence_threshold
        self.stats['low_confidence'] = np.sum(low_confidence_mask)
        
        print(f"    Total imputed: {self.stats['total_imputed']}")
        print(f"    Low confidence (<{self.confidence_threshold}): {self.stats['low_confidence']}")
        
        if not low_confidence_mask.any():
            print("    No low-confidence samples to refine.")
            return matrix_imputed
        
        # Stage 2: GGH refinement for low-confidence samples
        print(f"  Stage 2: GGH gradient refinement ({self.brief_epochs} epochs)...")
        
        # Brief training to get gradients
        anchors = self._compute_ggh_anchors(matrix_imputed, DO, batch_size)
        
        if anchors is None:
            print("    Could not compute anchors, skipping refinement.")
            return matrix_imputed
        
        # Refine low-confidence samples
        X_missing = np.delete(matrix, column, axis=1)[missing_mask]
        refined_values = initial_values.copy()
        
        for i in np.where(low_confidence_mask)[0]:
            tabpfn_score = tabpfn_probs[i]  # Shape: (num_hypotheses,)
            ggh_score = self._compute_anchor_similarity(X_missing[i], anchors)
            
            if ggh_score is None:
                continue
            
            # Combine: multiply and renormalize
            combined = tabpfn_score * ggh_score
            combined_sum = combined.sum()
            if combined_sum > 0:
                combined /= combined_sum
            else:
                combined = tabpfn_score  # Fallback to TabPFN only
            
            new_value = self.hypothesis_values[np.argmax(combined)]
            
            if new_value != refined_values[i]:
                self.stats['changed'] += 1
            
            refined_values[i] = new_value
            self.stats['refined'] += 1
        
        print(f"    Refined: {self.stats['refined']}, Changed: {self.stats['changed']}")
        
        # Update matrix with refined values
        matrix_imputed[missing_mask, column] = refined_values
        
        return matrix_imputed
    
    def _compute_ggh_anchors(self, matrix_imputed, DO, batch_size):
        """Brief training to compute GGH-style anchors per class."""
        set_to_deterministic(self.rand_state + 500)
        
        n_shared = len(DO.inpt_vars)
        n_hyp = len(DO.miss_vars)
        out_size = len(DO.target_vars)
        hyp_per_sample = DO.num_hyp_comb
        
        # Get partial data info
        partial_correct_gids = set(DO.df_train_hypothesis[
            (DO.df_train_hypothesis['partial_full_info'] == 1) & 
            (DO.df_train_hypothesis['correct_hypothesis'] == True)
        ].index.tolist())
        blacklisted_gids = set(DO.df_train_hypothesis[
            (DO.df_train_hypothesis['partial_full_info'] == 1) & 
            (DO.df_train_hypothesis['correct_hypothesis'] == False)
        ].index.tolist())
        
        if not partial_correct_gids or not blacklisted_gids:
            return None
        
        # Create dataloader with imputed data
        input_cols = DO.inpt_vars + [var + '_hypothesis' for var in DO.miss_vars]
        n_samples = len(DO.df_train_hypothesis)
        global_ids = torch.arange(n_samples)
        
        dataset = TensorDataset(
            torch.tensor(DO.df_train_hypothesis[input_cols].values, dtype=torch.float32),
            torch.tensor(DO.df_train_hypothesis[DO.target_vars].values, dtype=torch.float32),
            global_ids
        )
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        
        # Train model briefly
        model = HypothesisAmplifyingModel(n_shared, n_hyp, 16, 32, 32, out_size)
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)
        criterion = nn.MSELoss()
        
        model.train()
        for epoch in range(self.brief_epochs):
            for inputs, targets, gids in dataloader:
                predictions = model(inputs)
                loss = criterion(predictions, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        
        # Compute gradients for partial data
        model.eval()
        gradient_data = {}
        
        for inputs, targets, gids in dataloader:
            for i in range(len(inputs)):
                gid = gids[i].item()
                if gid not in partial_correct_gids and gid not in blacklisted_gids:
                    continue
                
                inp = inputs[i:i+1].clone().requires_grad_(True)
                pred = model(inp)
                loss = nn.MSELoss()(pred, targets[i:i+1])
                
                params = list(model.parameters())
                grad_param = grad(loss, params[-2], retain_graph=False)[0]
                grad_vec = grad_param.flatten().detach().cpu().numpy()
                
                is_correct = gid in partial_correct_gids
                class_id = DO.df_train_hypothesis.iloc[gid]['hyp_class_id']
                features = DO.df_train_hypothesis.loc[gid, DO.inpt_vars].values.astype(np.float64)
                
                gradient_data[gid] = {
                    'gradient': grad_vec,
                    'features': features,
                    'is_correct': is_correct,
                    'class_id': class_id
                }
        
        # Build per-class anchors
        anchors = {
            'correct_features': {},
            'incorrect_features': {},
            'correct_grads': {},
            'incorrect_grads': {},
        }
        
        for class_id in range(hyp_per_sample):
            correct_feats = []
            incorrect_feats = []
            correct_grads = []
            incorrect_grads = []
            
            for gid, data in gradient_data.items():
                if data['class_id'] != class_id:
                    continue
                if data['is_correct']:
                    correct_feats.append(data['features'])
                    correct_grads.append(data['gradient'])
                else:
                    incorrect_feats.append(data['features'])
                    incorrect_grads.append(data['gradient'])
            
            if correct_feats and incorrect_feats:
                anchors['correct_features'][class_id] = np.mean(correct_feats, axis=0)
                anchors['incorrect_features'][class_id] = np.mean(incorrect_feats, axis=0)
                anchors['correct_grads'][class_id] = np.mean(correct_grads, axis=0)
                anchors['incorrect_grads'][class_id] = np.mean(incorrect_grads, axis=0)
        
        return anchors
    
    def _compute_anchor_similarity(self, features, anchors):
        """Compute similarity scores for each hypothesis class.
        
        Returns array of shape (num_hypotheses,) with similarity scores.
        Higher score = more similar to correct anchor.
        """
        scores = np.zeros(len(self.hypothesis_values))
        
        for class_id in range(len(self.hypothesis_values)):
            if class_id not in anchors['correct_features']:
                scores[class_id] = 0.5  # Neutral if no anchor
                continue
            
            correct_feat = anchors['correct_features'][class_id]
            incorrect_feat = anchors['incorrect_features'][class_id]
            
            # Cosine similarity to correct vs incorrect anchors
            sim_correct = np.dot(features, correct_feat) / (
                np.linalg.norm(features) * np.linalg.norm(correct_feat) + 1e-8
            )
            sim_incorrect = np.dot(features, incorrect_feat) / (
                np.linalg.norm(features) * np.linalg.norm(incorrect_feat) + 1e-8
            )
            
            # Convert to 0-1 score (higher = more similar to correct)
            raw_score = sim_correct - sim_incorrect
            scores[class_id] = 1 / (1 + np.exp(-raw_score))  # Sigmoid
        
        return scores

print("TabPFNGGHHybridImputer defined.")

TabPFNGGHHybridImputer defined.


## GGH Soft Refinement Functions

Import the GGH soft refinement functions from Photocell benchmark for comparison.

In [11]:
# GGH Configuration
GGH_ITER1_EPOCHS = 60
GGH_ITER1_ANALYSIS_EPOCHS = 5
GGH_ITER1_LR = 0.01
GGH_ITER2_EPOCHS = 30
GGH_ITER2_LR = 0.01
GGH_SCORING_PASSES = 5
GGH_FINAL_EPOCHS = 200
GGH_MIN_WEIGHT = 0.1
GGH_TEMPERATURE_ITER1 = 1.0
GGH_TEMPERATURE_ITER3 = 0.8
GGH_LOSS_INFLUENCE = 0.25
GGH_PARTIAL_BASE_WEIGHT = 2.0
GGH_BENCHMARK_LR = 0.01
MODEL_SHARED_HIDDEN = 16
MODEL_HYPOTHESIS_HIDDEN = 32
MODEL_FINAL_HIDDEN = 32

# Import GGH functions from Photocell benchmark (copy essential ones)
def sigmoid_stable(x):
    x = np.array(x, dtype=np.float64)
    return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))

def compute_soft_weights(scores, min_weight=0.1, temperature=1.0):
    scores = np.array(scores, dtype=np.float64)
    if len(scores) == 0:
        return np.array([])
    mean_s = np.mean(scores)
    std_s = np.std(scores) + 1e-8
    normalized = (scores - mean_s) / std_s
    raw_weights = sigmoid_stable(normalized / temperature)
    weights = min_weight + (1 - min_weight) * raw_weights
    return weights

def create_dataloader_with_gids(DO, batch_size=32):
    input_cols = DO.inpt_vars + [var + '_hypothesis' for var in DO.miss_vars]
    n_samples = len(DO.df_train_hypothesis)
    global_ids = torch.arange(n_samples)
    dataset = TensorDataset(
        torch.tensor(DO.df_train_hypothesis[input_cols].values, dtype=torch.float32),
        torch.tensor(DO.df_train_hypothesis[DO.target_vars].values, dtype=torch.float32),
        global_ids
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def evaluate_on_test(DO, model):
    model.eval()
    with torch.no_grad():
        test_inputs, test_targets = DO.get_test_tensors(use_info="full info")
        test_preds = model(test_inputs)
        test_loss = torch.nn.functional.mse_loss(test_preds, test_targets).item()
        test_mae = torch.nn.functional.l1_loss(test_preds, test_targets).item()
        ss_res = torch.sum((test_targets - test_preds) ** 2).item()
        ss_tot = torch.sum((test_targets - test_targets.mean()) ** 2).item()
        r2_score = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0.0
    return test_loss, test_mae, r2_score

print("GGH helper functions defined.")

GGH helper functions defined.


## Benchmark: TabPFN vs TabPFN+GGH Hybrid

Compare:
1. **Full Info** (Oracle)
2. **Partial** (Baseline)
3. **TabPFN Constrained** (TabPFN with hypothesis constraints)
4. **TabPFN+GGH Hybrid** (Our new method)
5. **GGH Soft Refinement** (Original GGH)

In [11]:
def full_experiment(use_info, DO, INSPECT, batch_size, hidden_size, output_size, num_epochs, 
                    rand_state, results_path, dropout=0.05, lr=0.004, nu=0.1, final_analysis=False):
    """Standard full experiment for Full Info / Partial."""
    AM = AlgoModulators(DO, lr=lr, nu=nu)
    dataloader = DO.prep_dataloader(use_info, batch_size)
    model = initialize_model(DO, dataloader, hidden_size, rand_state, dropout=dropout)
    TVM = TrainValidationManager(use_info, num_epochs, dataloader, batch_size, rand_state, 
                                 results_path, final_analysis=final_analysis)
    TVM.train_model(DO, AM, model, final_analysis=final_analysis)
    INSPECT.save_train_val_logs(DO, AM, TVM, model, final_analysis=final_analysis)
    return DO, TVM, model


print("=" * 80)
print("BENCHMARK: TabPFN vs TabPFN+GGH Hybrid on Photocell")
print("=" * 80)
print(f"Methods compared:")
print(f"  - Full Info (oracle)")
print(f"  - Partial (baseline)")
print(f"  - TabPFN Constrained (hypothesis values only)")
print(f"  - TabPFN+GGH Hybrid (NEW)")
print(f"Training epochs: {BENCHMARK_EPOCHS}")
print(f"Number of runs: {BENCHMARK_N_RUNS}")
print(f"Hybrid confidence threshold: {HYBRID_CONFIDENCE_THRESHOLD}")
print("=" * 80)

# Store results
all_results = {
    'Full Info': {'r2': [], 'mse': [], 'mae': []},
    'Partial': {'r2': [], 'mse': [], 'mae': []},
    'TabPFN Constrained': {'r2': [], 'mse': [], 'mae': []},
    'TabPFN+GGH Hybrid': {'r2': [], 'mse': [], 'mae': [], 'refined': [], 'changed': []},
}

# Find valid r_states
valid_r_states = []
for r_state in range(2000):
    if len(valid_r_states) >= BENCHMARK_N_RUNS:
        break
    set_to_deterministic(r_state)
    DO_test = DataOperator(data_path, inpt_vars, target_vars, miss_vars, hypothesis, 
                           partial_perc, r_state, device=DEVICE)
    if not DO_test.lack_partial_coverage:
        valid_r_states.append(r_state)

print(f"Using r_states: {valid_r_states}")

# Run benchmark
for run_idx, r_state in enumerate(valid_r_states):
    print(f"\n{'='*60}")
    print(f"RUN {run_idx + 1}/{BENCHMARK_N_RUNS} (rand_state={r_state})")
    print(f"{'='*60}")
    
    set_to_deterministic(r_state)
    DO = DataOperator(data_path, inpt_vars, target_vars, miss_vars, hypothesis, 
                      partial_perc, r_state, device=DEVICE)
    
    # === Full Info ===
    print("\nTraining Full Info...")
    set_to_deterministic(r_state)
    DO_full = DataOperator(data_path, inpt_vars, target_vars, miss_vars, hypothesis, 
                           partial_perc, r_state, device=DEVICE)
    INSPECT_full = Inspector(results_path, hidden_size)
    DO_full, TVM_full, model_full = full_experiment(
        "full info", DO_full, INSPECT_full, batch_size, hidden_size, output_size,
        BENCHMARK_EPOCHS, r_state, results_path, dropout, lr, nu
    )
    full_r2 = INSPECT_full.calculate_val_r2score(DO_full, TVM_full, model_full, data="test")
    _, _, full_r2_direct = evaluate_on_test(DO, model_full)
    all_results['Full Info']['r2'].append(full_r2)
    print(f"  Full Info R2: {full_r2:.4f}")
    
    # === Partial ===
    print("\nTraining Partial...")
    set_to_deterministic(r_state)
    DO_partial = DataOperator(data_path, inpt_vars, target_vars, miss_vars, hypothesis, 
                              partial_perc, r_state, device=DEVICE)
    INSPECT_partial = Inspector(results_path, hidden_size)
    DO_partial, TVM_partial, model_partial = full_experiment(
        "partial info", DO_partial, INSPECT_partial, batch_size, hidden_size, output_size,
        BENCHMARK_EPOCHS, r_state, results_path, dropout, lr, nu
    )
    partial_r2 = INSPECT_partial.calculate_val_r2score(DO_partial, TVM_partial, model_partial, data="test")
    all_results['Partial']['r2'].append(partial_r2)
    print(f"  Partial R2: {partial_r2:.4f}")
    
    # === TabPFN Constrained ===
    print("\nRunning TabPFN Constrained...")
    set_to_deterministic(r_state)
    DO_tabpfn = DataOperator(data_path, inpt_vars, target_vars, miss_vars, hypothesis, 
                             partial_perc, r_state, device=DEVICE)
    
    try:
        from GGH.imputation_methods import prep_imputation_input
        df_train_imp = prep_imputation_input(
            DO_tabpfn, DO_tabpfn.df_train[DO_tabpfn.inpt_vars + DO_tabpfn.miss_vars + DO_tabpfn.target_vars], 
            DO_tabpfn.miss_vars, DO_tabpfn.partial_rows_id
        )
        matrix = df_train_imp.values
        
        tabpfn_imputer = TabPFNConstrainedImputer(HYPOTHESIS_VALUES, r_state)
        imputed_values, _, missing_mask = tabpfn_imputer.fit_transform(matrix)
        
        # Create imputed matrix and train
        column = np.argwhere(np.sum(np.isnan(matrix), axis=0) > 0)[0, 0]
        matrix_imputed = matrix.copy()
        matrix_imputed[missing_mask, column] = imputed_values
        imput_input = matrix_imputed[:, :-1]  # Remove target column
        
        AM_tabpfn = AlgoModulators(DO_tabpfn, lr=lr)
        dataloader_tabpfn = DO_tabpfn.prep_dataloader("use imputation", batch_size, imputed_input=imput_input)
        
        set_to_deterministic(r_state)
        model_tabpfn = initialize_model(DO_tabpfn, dataloader_tabpfn, hidden_size, r_state, dropout=dropout)
        TVM_tabpfn = TrainValidationManager("use imputation", BENCHMARK_EPOCHS, dataloader_tabpfn, 
                                            batch_size, r_state, results_path, 
                                            imput_method="TabPFN Constrained", final_analysis=False)
        TVM_tabpfn.train_model(DO_tabpfn, AM_tabpfn, model_tabpfn, final_analysis=False)
        
        INSPECT_tabpfn = Inspector(results_path, hidden_size)
        tabpfn_r2 = INSPECT_tabpfn.calculate_val_r2score(DO_tabpfn, TVM_tabpfn, model_tabpfn, data="test")
        all_results['TabPFN Constrained']['r2'].append(tabpfn_r2)
        print(f"  TabPFN Constrained R2: {tabpfn_r2:.4f}")
        
    except Exception as e:
        print(f"  TabPFN Constrained FAILED: {e}")
        all_results['TabPFN Constrained']['r2'].append(np.nan)
    
    # === TabPFN+GGH Hybrid ===
    print("\nRunning TabPFN+GGH Hybrid...")
    set_to_deterministic(r_state)
    DO_hybrid = DataOperator(data_path, inpt_vars, target_vars, miss_vars, hypothesis, 
                             partial_perc, r_state, device=DEVICE)
    
    try:
        hybrid_imputer = TabPFNGGHHybridImputer(
            HYPOTHESIS_VALUES, r_state,
            brief_epochs=HYBRID_BRIEF_EPOCHS,
            confidence_threshold=HYBRID_CONFIDENCE_THRESHOLD,
            lr=HYBRID_LR
        )
        
        matrix_hybrid = hybrid_imputer.fit_transform(DO_hybrid, batch_size)
        imput_input_hybrid = matrix_hybrid[:, :-1]
        
        AM_hybrid = AlgoModulators(DO_hybrid, lr=lr)
        dataloader_hybrid = DO_hybrid.prep_dataloader("use imputation", batch_size, imputed_input=imput_input_hybrid)
        
        set_to_deterministic(r_state)
        model_hybrid = initialize_model(DO_hybrid, dataloader_hybrid, hidden_size, r_state, dropout=dropout)
        TVM_hybrid = TrainValidationManager("use imputation", BENCHMARK_EPOCHS, dataloader_hybrid, 
                                            batch_size, r_state, results_path, 
                                            imput_method="TabPFN+GGH Hybrid", final_analysis=False)
        TVM_hybrid.train_model(DO_hybrid, AM_hybrid, model_hybrid, final_analysis=False)
        
        INSPECT_hybrid = Inspector(results_path, hidden_size)
        hybrid_r2 = INSPECT_hybrid.calculate_val_r2score(DO_hybrid, TVM_hybrid, model_hybrid, data="test")
        all_results['TabPFN+GGH Hybrid']['r2'].append(hybrid_r2)
        all_results['TabPFN+GGH Hybrid']['refined'].append(hybrid_imputer.stats['refined'])
        all_results['TabPFN+GGH Hybrid']['changed'].append(hybrid_imputer.stats['changed'])
        print(f"  TabPFN+GGH Hybrid R2: {hybrid_r2:.4f}")
        
    except Exception as e:
        print(f"  TabPFN+GGH Hybrid FAILED: {e}")
        import traceback
        traceback.print_exc()
        all_results['TabPFN+GGH Hybrid']['r2'].append(np.nan)
        all_results['TabPFN+GGH Hybrid']['refined'].append(0)
        all_results['TabPFN+GGH Hybrid']['changed'].append(0)
    
    # Print run summary
    print(f"\n>>> Run {run_idx + 1} Summary:")
    print(f"    Full Info: {all_results['Full Info']['r2'][-1]:.4f}")
    print(f"    Partial: {all_results['Partial']['r2'][-1]:.4f}")
    if not np.isnan(all_results['TabPFN Constrained']['r2'][-1]):
        print(f"    TabPFN Constrained: {all_results['TabPFN Constrained']['r2'][-1]:.4f}")
    if not np.isnan(all_results['TabPFN+GGH Hybrid']['r2'][-1]):
        print(f"    TabPFN+GGH Hybrid: {all_results['TabPFN+GGH Hybrid']['r2'][-1]:.4f}")
        if 'TabPFN Constrained' in all_results and not np.isnan(all_results['TabPFN Constrained']['r2'][-1]):
            improvement = all_results['TabPFN+GGH Hybrid']['r2'][-1] - all_results['TabPFN Constrained']['r2'][-1]
            print(f"    >>> Hybrid improvement over TabPFN: {improvement:+.4f}")

BENCHMARK: TabPFN vs TabPFN+GGH Hybrid on Photocell
Methods compared:
  - Full Info (oracle)
  - Partial (baseline)
  - TabPFN Constrained (hypothesis values only)
  - TabPFN+GGH Hybrid (NEW)
Training epochs: 600
Number of runs: 15
Hybrid confidence threshold: 0.6
Using r_states: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16]

RUN 1/15 (rand_state=0)

Training Full Info...
  Full Info R2: 0.8350

Training Partial...
  Partial R2: 0.2697

Running TabPFN Constrained...
  TabPFN Constrained R2: -1.2025

Running TabPFN+GGH Hybrid...
  Stage 1: TabPFN initial imputation...
    Total imputed: 736
    Low confidence (<0.6): 736
  Stage 2: GGH gradient refinement (30 epochs)...
  TabPFN+GGH Hybrid FAILED: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)

>>> Run 1 Summary:
    Full Info: 0.8350
    Partial: 0.2697
    TabPFN Constrained: -1.2025

RUN 2/15 (rand_state=1)


Traceback (most recent call last):
  File "/tmp/ipykernel_495478/905720842.py", line 143, in <module>
    matrix_hybrid = hybrid_imputer.fit_transform(DO_hybrid, batch_size)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 130, in fit_transform
    ggh_score = self._compute_anchor_similarity(X_missing[i], anchors)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 284, in _compute_anchor_similarity
    sim_correct = np.dot(features, correct_feat) / (
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)



Training Full Info...
  Full Info R2: 0.8697

Training Partial...
  Partial R2: -0.3155

Running TabPFN Constrained...
  TabPFN Constrained R2: 0.2809

Running TabPFN+GGH Hybrid...
  Stage 1: TabPFN initial imputation...
    Total imputed: 733
    Low confidence (<0.6): 733
  Stage 2: GGH gradient refinement (30 epochs)...
  TabPFN+GGH Hybrid FAILED: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)

>>> Run 2 Summary:
    Full Info: 0.8697
    Partial: -0.3155
    TabPFN Constrained: 0.2809

RUN 3/15 (rand_state=2)

Training Full Info...


Traceback (most recent call last):
  File "/tmp/ipykernel_495478/905720842.py", line 143, in <module>
    matrix_hybrid = hybrid_imputer.fit_transform(DO_hybrid, batch_size)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 130, in fit_transform
    ggh_score = self._compute_anchor_similarity(X_missing[i], anchors)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 284, in _compute_anchor_similarity
    sim_correct = np.dot(features, correct_feat) / (
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)


  Full Info R2: 0.8931

Training Partial...
  Partial R2: -0.2415

Running TabPFN Constrained...
  TabPFN Constrained R2: 0.1489

Running TabPFN+GGH Hybrid...
  Stage 1: TabPFN initial imputation...
    Total imputed: 730
    Low confidence (<0.6): 730
  Stage 2: GGH gradient refinement (30 epochs)...
  TabPFN+GGH Hybrid FAILED: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)

>>> Run 3 Summary:
    Full Info: 0.8931
    Partial: -0.2415
    TabPFN Constrained: 0.1489

RUN 4/15 (rand_state=3)

Training Full Info...


Traceback (most recent call last):
  File "/tmp/ipykernel_495478/905720842.py", line 143, in <module>
    matrix_hybrid = hybrid_imputer.fit_transform(DO_hybrid, batch_size)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 130, in fit_transform
    ggh_score = self._compute_anchor_similarity(X_missing[i], anchors)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 284, in _compute_anchor_similarity
    sim_correct = np.dot(features, correct_feat) / (
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)


  Full Info R2: 0.9169

Training Partial...
  Partial R2: -0.0361

Running TabPFN Constrained...
  TabPFN Constrained R2: -0.5406

Running TabPFN+GGH Hybrid...
  Stage 1: TabPFN initial imputation...
    Total imputed: 735
    Low confidence (<0.6): 452
  Stage 2: GGH gradient refinement (30 epochs)...
  TabPFN+GGH Hybrid FAILED: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)

>>> Run 4 Summary:
    Full Info: 0.9169
    Partial: -0.0361
    TabPFN Constrained: -0.5406

RUN 5/15 (rand_state=4)


Traceback (most recent call last):
  File "/tmp/ipykernel_495478/905720842.py", line 143, in <module>
    matrix_hybrid = hybrid_imputer.fit_transform(DO_hybrid, batch_size)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 130, in fit_transform
    ggh_score = self._compute_anchor_similarity(X_missing[i], anchors)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 284, in _compute_anchor_similarity
    sim_correct = np.dot(features, correct_feat) / (
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)



Training Full Info...
  Full Info R2: 0.7224

Training Partial...
  Partial R2: 0.7539

Running TabPFN Constrained...
  TabPFN Constrained R2: 0.0850

Running TabPFN+GGH Hybrid...
  Stage 1: TabPFN initial imputation...
    Total imputed: 730
    Low confidence (<0.6): 718
  Stage 2: GGH gradient refinement (30 epochs)...
  TabPFN+GGH Hybrid FAILED: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)

>>> Run 5 Summary:
    Full Info: 0.7224
    Partial: 0.7539
    TabPFN Constrained: 0.0850

RUN 6/15 (rand_state=5)

Training Full Info...


Traceback (most recent call last):
  File "/tmp/ipykernel_495478/905720842.py", line 143, in <module>
    matrix_hybrid = hybrid_imputer.fit_transform(DO_hybrid, batch_size)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 130, in fit_transform
    ggh_score = self._compute_anchor_similarity(X_missing[i], anchors)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 284, in _compute_anchor_similarity
    sim_correct = np.dot(features, correct_feat) / (
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)


  Full Info R2: 0.8583

Training Partial...
  Partial R2: 0.4309

Running TabPFN Constrained...
  TabPFN Constrained R2: 0.1245

Running TabPFN+GGH Hybrid...
  Stage 1: TabPFN initial imputation...
    Total imputed: 735
    Low confidence (<0.6): 735
  Stage 2: GGH gradient refinement (30 epochs)...
  TabPFN+GGH Hybrid FAILED: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)

>>> Run 6 Summary:
    Full Info: 0.8583
    Partial: 0.4309
    TabPFN Constrained: 0.1245

RUN 7/15 (rand_state=6)


Traceback (most recent call last):
  File "/tmp/ipykernel_495478/905720842.py", line 143, in <module>
    matrix_hybrid = hybrid_imputer.fit_transform(DO_hybrid, batch_size)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 130, in fit_transform
    ggh_score = self._compute_anchor_similarity(X_missing[i], anchors)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 284, in _compute_anchor_similarity
    sim_correct = np.dot(features, correct_feat) / (
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)



Training Full Info...
  Full Info R2: 0.7352

Training Partial...
  Partial R2: 0.2176

Running TabPFN Constrained...
  TabPFN Constrained R2: 0.1856

Running TabPFN+GGH Hybrid...
  Stage 1: TabPFN initial imputation...
    Total imputed: 734
    Low confidence (<0.6): 734
  Stage 2: GGH gradient refinement (30 epochs)...
  TabPFN+GGH Hybrid FAILED: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)

>>> Run 7 Summary:
    Full Info: 0.7352
    Partial: 0.2176
    TabPFN Constrained: 0.1856

RUN 8/15 (rand_state=7)

Training Full Info...


Traceback (most recent call last):
  File "/tmp/ipykernel_495478/905720842.py", line 143, in <module>
    matrix_hybrid = hybrid_imputer.fit_transform(DO_hybrid, batch_size)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 130, in fit_transform
    ggh_score = self._compute_anchor_similarity(X_missing[i], anchors)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 284, in _compute_anchor_similarity
    sim_correct = np.dot(features, correct_feat) / (
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)


  Full Info R2: 0.8879

Training Partial...
  Partial R2: 0.5407

Running TabPFN Constrained...
  TabPFN Constrained R2: 0.3633

Running TabPFN+GGH Hybrid...
  Stage 1: TabPFN initial imputation...
    Total imputed: 735
    Low confidence (<0.6): 666
  Stage 2: GGH gradient refinement (30 epochs)...
  TabPFN+GGH Hybrid FAILED: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)

>>> Run 8 Summary:
    Full Info: 0.8879
    Partial: 0.5407
    TabPFN Constrained: 0.3633

RUN 9/15 (rand_state=8)

Training Full Info...


Traceback (most recent call last):
  File "/tmp/ipykernel_495478/905720842.py", line 143, in <module>
    matrix_hybrid = hybrid_imputer.fit_transform(DO_hybrid, batch_size)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 130, in fit_transform
    ggh_score = self._compute_anchor_similarity(X_missing[i], anchors)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_495478/343590846.py", line 284, in _compute_anchor_similarity
    sim_correct = np.dot(features, correct_feat) / (
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: shapes (3,) and (2,) not aligned: 3 (dim 0) != 2 (dim 0)


  Full Info R2: 0.8793

Training Partial...
  Partial R2: 0.3586

Running TabPFN Constrained...
  TabPFN Constrained R2: 0.0169

Running TabPFN+GGH Hybrid...
  Stage 1: TabPFN initial imputation...



KeyboardInterrupt


KeyboardInterrupt



In [None]:
# =============================================================================
# SUMMARY
# =============================================================================
print(f"\n{'='*80}")
print("BENCHMARK SUMMARY: TabPFN vs TabPFN+GGH Hybrid")
print(f"{'='*80}")

# Create summary table
summary_data = []
for method, data in all_results.items():
    r2_vals = [v for v in data['r2'] if not np.isnan(v)]
    if r2_vals:
        summary_data.append({
            'Method': method,
            'R2 Mean': np.mean(r2_vals),
            'R2 Std': np.std(r2_vals),
            'N Runs': len(r2_vals)
        })

summary_df = pd.DataFrame(summary_data)
print("\n" + summary_df.to_string(index=False))

# Statistical tests
print(f"\n{'='*80}")
print("STATISTICAL TESTS")
print(f"{'='*80}")

# TabPFN+GGH Hybrid vs TabPFN Constrained
hybrid_r2 = np.array(all_results['TabPFN+GGH Hybrid']['r2'])
tabpfn_r2 = np.array(all_results['TabPFN Constrained']['r2'])
valid_mask = ~(np.isnan(hybrid_r2) | np.isnan(tabpfn_r2))

if valid_mask.sum() >= 2:
    t_stat, p_val = stats.ttest_rel(hybrid_r2[valid_mask], tabpfn_r2[valid_mask])
    diff = np.mean(hybrid_r2[valid_mask]) - np.mean(tabpfn_r2[valid_mask])
    sig = '***' if p_val < 0.001 else '**' if p_val < 0.01 else '*' if p_val < 0.05 else ''
    
    print(f"\nTabPFN+GGH Hybrid vs TabPFN Constrained:")
    print(f"  Hybrid R2: {np.mean(hybrid_r2[valid_mask]):.4f}")
    print(f"  TabPFN R2: {np.mean(tabpfn_r2[valid_mask]):.4f}")
    print(f"  Difference: {diff:+.4f}")
    print(f"  t={t_stat:.3f}, p={p_val:.6f} {sig}")
    
    if diff > 0:
        print(f"  >>> Hybrid OUTPERFORMS TabPFN")
    else:
        print(f"  >>> TabPFN outperforms Hybrid")

# Hybrid refinement stats
if all_results['TabPFN+GGH Hybrid']['refined']:
    avg_refined = np.mean([r for r in all_results['TabPFN+GGH Hybrid']['refined'] if r > 0])
    avg_changed = np.mean([c for c in all_results['TabPFN+GGH Hybrid']['changed'] if c > 0])
    print(f"\nHybrid Refinement Stats:")
    print(f"  Avg samples refined: {avg_refined:.1f}")
    print(f"  Avg samples changed: {avg_changed:.1f}")

# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: R2 comparison
methods = list(all_results.keys())
r2_means = [np.nanmean(all_results[m]['r2']) for m in methods]
r2_stds = [np.nanstd(all_results[m]['r2']) for m in methods]
colors = ['#2ecc71', '#95a5a6', '#9b59b6', '#3498db']

ax1 = axes[0]
bars = ax1.bar(range(len(methods)), r2_means, yerr=r2_stds, capsize=5, 
               color=colors[:len(methods)], edgecolor='black', linewidth=1.2)
ax1.set_xlabel('Method', fontsize=12)
ax1.set_ylabel('Test R2 Score', fontsize=12)
ax1.set_title('Test R2 by Method', fontsize=14, fontweight='bold')
ax1.set_xticks(range(len(methods)))
ax1.set_xticklabels(methods, rotation=30, ha='right', fontsize=10)
ax1.grid(axis='y', alpha=0.3)

for bar, val in zip(bars, r2_means):
    if not np.isnan(val):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, f'{val:.3f}', 
                 ha='center', va='bottom', fontsize=10, fontweight='bold')

# Plot 2: Per-run comparison (Hybrid vs TabPFN)
ax2 = axes[1]
x = np.arange(BENCHMARK_N_RUNS)
width = 0.35

tabpfn_vals = [v if not np.isnan(v) else 0 for v in all_results['TabPFN Constrained']['r2']]
hybrid_vals = [v if not np.isnan(v) else 0 for v in all_results['TabPFN+GGH Hybrid']['r2']]

ax2.bar(x - width/2, tabpfn_vals, width, label='TabPFN Constrained', color='#9b59b6', alpha=0.7)
ax2.bar(x + width/2, hybrid_vals, width, label='TabPFN+GGH Hybrid', color='#3498db', alpha=0.7)
ax2.set_xlabel('Run')
ax2.set_ylabel('Test R2')
ax2.set_title('Per-Run Comparison: TabPFN vs Hybrid')
ax2.legend()
ax2.set_xticks(x)
ax2.set_xticklabels([str(i+1) for i in range(BENCHMARK_N_RUNS)])

plt.tight_layout()
plt.savefig(f'{results_path}/tabpfn_ggh_hybrid_benchmark.png', dpi=150, bbox_inches='tight')
plt.show()

# Final conclusion
print(f"\n{'='*80}")
print("CONCLUSION")
print(f"{'='*80}")

full_mean = np.nanmean(all_results['Full Info']['r2'])
partial_mean = np.nanmean(all_results['Partial']['r2'])
tabpfn_mean = np.nanmean(all_results['TabPFN Constrained']['r2'])
hybrid_mean = np.nanmean(all_results['TabPFN+GGH Hybrid']['r2'])

print(f"\nFull Info (Oracle): {full_mean:.4f}")
print(f"TabPFN+GGH Hybrid: {hybrid_mean:.4f}")
print(f"TabPFN Constrained: {tabpfn_mean:.4f}")
print(f"Partial (Baseline): {partial_mean:.4f}")

if not np.isnan(hybrid_mean) and not np.isnan(tabpfn_mean):
    improvement = hybrid_mean - tabpfn_mean
    print(f"\n>>> Hybrid improvement over TabPFN: {improvement:+.4f}")
    
    gap_to_full = full_mean - partial_mean
    if gap_to_full > 0:
        hybrid_closes = (hybrid_mean - partial_mean) / gap_to_full * 100
        tabpfn_closes = (tabpfn_mean - partial_mean) / gap_to_full * 100
        print(f">>> Hybrid closes {hybrid_closes:.1f}% of gap to Full Info")
        print(f">>> TabPFN closes {tabpfn_closes:.1f}% of gap to Full Info")