# TabPFN + GGH Fusion Benchmark

## Goal
Create a hybrid method that combines GGH and TabPFN to get the best of both worlds:
- **GGH excels** in extreme missingness (very little partial data)
- **TabPFN excels** when there's more partial data available

## Option A: TabPFN Confidence as GGH Prior
Use TabPFN's prediction probabilities to weight GGH's scoring:

```python
# TabPFN predicts probability for each hypothesis
tabpfn_probs = TabPFN.predict_proba(sample, hypotheses)

# GGH computes gradient-based scores  
ggh_scores = compute_enriched_scores(gradients, anchors)

# Combine: TabPFN acts as prior, GGH as evidence
final_scores = ggh_scores + alpha * log(tabpfn_probs)
```

## Methods Compared
1. **GGH Soft Refinement** (baseline)
2. **TabPFN Classifier** (constrained to hypothesis values)
3. **TabPFN + GGH Fusion** (Option A)
4. **Partial Only** (lower bound)
5. **Full Info** (oracle upper bound)

In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
sys.path.insert(0, '../')
sys.path.insert(0, '../GGH')

from GGH.data_ops import DataOperator
from GGH.selection_algorithms import AlgoModulators
from GGH.models import initialize_model
from GGH.train_val_loop import TrainValidationManager
from GGH.inspector import Inspector
from scipy import stats
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import grad
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# TabPFN imports
from tabpfn import TabPFNClassifier

def set_to_deterministic(rand_state):
    """Seed every RNG used in this notebook and force deterministic torch kernels.

    Args:
        rand_state: Integer seed applied to ``random``, NumPy and PyTorch.

    Side effects:
        Limits torch to a single intra-op thread, enables
        ``torch.use_deterministic_algorithms`` and, if it is not already
        set, defines ``CUBLAS_WORKSPACE_CONFIG`` in the process environment.
    """
    import os
    import random

    # Deterministic cuBLAS matmuls require this variable on CUDA >= 10.2;
    # without it the first GPU linear layer raises once deterministic
    # algorithms are enabled. setdefault keeps any user-chosen value.
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")

    random.seed(rand_state)
    np.random.seed(rand_state)
    torch.manual_seed(rand_state)
    torch.set_num_threads(1)
    torch.use_deterministic_algorithms(True)

print("Imports successful!")

# GPU Detection
# DEVICE is the module-level default used by the trainer/scorer classes below
# (their ``device=DEVICE`` default arguments) and by the TabPFN classifier.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")
if torch.cuda.is_available():
    # Report the name of the first visible CUDA device (index 0).
    print(f"GPU: {torch.cuda.get_device_name(0)}")


Imports successful!


In [2]:
# =============================================================================
# DATA CONFIGURATION - Photocell Degradation Dataset
# =============================================================================
# Paths are relative to the notebook's working directory.
data_path = '../data/dataset_photo_pce10/data.csv'
results_path = "../saved_results/TabPFN_GGH_Fusion"

# Variables
inpt_vars = ['P3HT', 'PTB7-Th']      # always-observed input columns
target_vars = ['Degradation']        # regression target column(s)
miss_vars = ['PCBM']                 # column whose value is hypothesised

# Hypothesis values (6 PCBM concentration values)
# One inner list per entry of miss_vars.
hypothesis = [[0.03, 0.11, 0.20, 0.32, 0.43, 0.6]]

# Model parameters
hidden_size = 32
output_size = len(target_vars)
hyp_per_sample = len(hypothesis[0])  # 6 hypotheses
# 100 x hyp_per_sample rows per batch; presumably each sample is expanded into
# one row per hypothesis, giving 100 samples per batch - TODO confirm.
batch_size = 100 * hyp_per_sample

# Training parameters
dropout = 0.05
lr = 0.004
nu = 0.1  # NOTE(review): not used in the visible part of the notebook; meaning not shown here

# Benchmark parameters
BENCHMARK_N_RUNS = 3
BENCHMARK_EPOCHS = 600

# GGH parameters
GGH_ITER1_EPOCHS = 60
GGH_ITER1_ANALYSIS_EPOCHS = 5
GGH_ITER1_LR = 0.01
GGH_ITER2_EPOCHS = 30
GGH_ITER2_LR = 0.01
GGH_SCORING_PASSES = 5
GGH_FINAL_EPOCHS = 200
GGH_MIN_WEIGHT = 0.1
GGH_TEMPERATURE_ITER1 = 1.0
GGH_TEMPERATURE_ITER3 = 0.8
GGH_LOSS_INFLUENCE = 0.25
GGH_PARTIAL_BASE_WEIGHT = 2.0
GGH_BENCHMARK_LR = 0.01

# Model architecture
# These values equal the keyword defaults of HypothesisAmplifyingModel.
MODEL_SHARED_HIDDEN = 16
MODEL_HYPOTHESIS_HIDDEN = 32
MODEL_FINAL_HIDDEN = 32

# Fusion parameters
FUSION_ALPHA = 1.0  # Weight for TabPFN log-probability
CONFIDENCE_THRESHOLD = 0.4  # Only apply TabPFN when confidence > threshold

# Test multiple partial percentages (reduced set for speed)
PARTIAL_PERCENTAGES = [0.03, 0.10, 0.25]

# Create directories
import os
# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(results_path, exist_ok=True)

print(f"Dataset: Photocell Degradation")
print(f"Hypothesis values: {hypothesis[0]}")
print(f"Partial percentages to test: {PARTIAL_PERCENTAGES}")
print(f"Fusion alpha: {FUSION_ALPHA}")
print(f"Confidence threshold: {CONFIDENCE_THRESHOLD}")

Dataset: Photocell Degradation
Hypothesis values: [0.03, 0.11, 0.2, 0.32, 0.43, 0.6]
Partial percentages to test: [0.03, 0.1, 0.25]
Fusion alpha: 1.0
Confidence threshold: 0.4


## GGH Soft Refinement Functions

Standard GGH implementation (copied from benchmark notebooks).

In [3]:
# =============================================================================
# GGH SOFT WEIGHT ITERATIVE REFINEMENT FUNCTIONS
# =============================================================================

class HypothesisAmplifyingModel(nn.Module):
    """Two-branch MLP that gives the hypothesis column(s) a dedicated encoder.

    The first ``n_shared_features`` input columns pass through a single
    Linear+ReLU "shared" encoder; every remaining column passes through a
    two-layer "hypothesis" encoder. The two embeddings are concatenated and
    mapped to ``output_size`` values by a small head.
    """

    def __init__(self, n_shared_features, n_hypothesis_features=1,
                 shared_hidden=16, hypothesis_hidden=32, final_hidden=32, output_size=1):
        super().__init__()
        self.n_shared = n_shared_features

        # Layers are built in the order shared -> hypothesis -> final so the
        # parameter ordering and seeded initialisation stay the same.
        shared_layers = [
            nn.Linear(n_shared_features, shared_hidden),
            nn.ReLU(),
        ]
        self.shared_path = nn.Sequential(*shared_layers)

        hypothesis_layers = [
            nn.Linear(n_hypothesis_features, hypothesis_hidden),
            nn.ReLU(),
            nn.Linear(hypothesis_hidden, hypothesis_hidden),
            nn.ReLU(),
        ]
        self.hypothesis_path = nn.Sequential(*hypothesis_layers)

        head_layers = [
            nn.Linear(shared_hidden + hypothesis_hidden, final_hidden),
            nn.ReLU(),
            nn.Linear(final_hidden, output_size),
        ]
        self.final_path = nn.Sequential(*head_layers)

    def forward(self, x):
        """Split ``x`` column-wise at ``n_shared``, encode both parts, and predict."""
        split_at = self.n_shared
        embeddings = (
            self.shared_path(x[:, :split_at]),
            self.hypothesis_path(x[:, split_at:]),
        )
        return self.final_path(torch.cat(embeddings, dim=1))


class UnbiasedTrainer:
    """Trainer that gives every hypothesis row the same weight.

    While training it can optionally record, per global row id, the row's
    loss and the gradient of that loss with respect to the second-to-last
    parameter tensor of the model (``list(model.parameters())[-2]``).
    """

    def __init__(self, DO, model, lr=0.001, device=DEVICE):
        self.DO = DO
        self.model = model
        self.device = device
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        self.criterion = nn.MSELoss(reduction='none')
        self.hyp_per_sample = DO.num_hyp_comb

        # gid -> list of per-epoch losses / flattened gradient vectors
        self.loss_history = {}
        self.gradient_history = {}

    def train_epoch(self, dataloader, epoch, track_data=False):
        """Run one optimisation pass; return the mean of the per-batch losses.

        When ``track_data`` is true, per-row losses and gradients are
        recorded for each batch before the optimiser step.
        """
        self.model.train()
        running_loss = 0
        batches_seen = 0

        for inputs, targets, global_ids in dataloader:
            inputs = inputs.to(self.device)
            targets = targets.to(self.device)

            per_row_loss = self.criterion(self.model(inputs), targets).mean(dim=1)
            batch_loss = per_row_loss.mean()

            if track_data:
                self._track_hypothesis_data(inputs, targets, global_ids, per_row_loss)

            self.optimizer.zero_grad()
            batch_loss.backward()
            self.optimizer.step()

            running_loss += batch_loss.item()
            batches_seen += 1

        return running_loss / batches_seen

    def _track_hypothesis_data(self, inputs, targets, global_ids, losses):
        """Append each row's loss and last-layer gradient to the history dicts."""
        self.model.eval()

        for row in range(len(inputs)):
            gid = global_ids[row].item()
            self.loss_history.setdefault(gid, []).append(losses[row].item())

            # Re-run the single row so its gradient can be isolated.
            single_input = inputs[row:row + 1].clone().requires_grad_(True)
            single_loss = nn.MSELoss()(self.model(single_input), targets[row:row + 1])

            tracked_param = list(self.model.parameters())[-2]
            row_grad = grad(single_loss, tracked_param, retain_graph=False)[0]
            self.gradient_history.setdefault(gid, []).append(
                row_grad.flatten().detach().cpu().numpy()
            )

        self.model.train()

    def get_hypothesis_analysis(self):
        """Summarise the recorded history as ``{gid: stats dict}``."""
        summary = {}
        for gid, losses in self.loss_history.items():
            grads = self.gradient_history.get(gid, [])
            summary[gid] = {
                'avg_loss': np.mean(losses),
                'loss_std': np.std(losses),
                'avg_gradient': np.mean(self.gradient_history[gid], axis=0) if gid in self.gradient_history else None,
                'gradient_magnitude': np.mean([np.linalg.norm(g) for g in grads]),
            }
        return summary


class WeightedTrainer:
    """Trainer that minimises a per-row weighted MSE.

    Rows whose gid is in ``partial_gids`` get ``partial_weight``; other rows
    use ``sample_weights[gid]`` when present and weight 0 otherwise.
    """

    def __init__(self, DO, model, sample_weights, partial_gids, partial_weight, lr=0.001, device=DEVICE):
        self.DO = DO
        self.model = model
        self.device = device
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        self.criterion = nn.MSELoss(reduction='none')
        self.hyp_per_sample = DO.num_hyp_comb

        self.sample_weights = sample_weights
        self.partial_gids = set(partial_gids)
        self.partial_weight = partial_weight

    def train_epoch(self, dataloader, epoch, track_data=False):
        """Run one weighted pass; return the weight-averaged loss (0 if no weight)."""
        self.model.train()
        loss_accum = 0
        weight_accum = 0

        for inputs, targets, global_ids in dataloader:
            inputs = inputs.to(self.device)
            targets = targets.to(self.device)

            per_row_loss = self.criterion(self.model(inputs), targets).mean(dim=1)

            weights = torch.zeros(len(inputs), device=self.device)
            for pos, gid_tensor in enumerate(global_ids):
                key = gid_tensor.item()
                # Partial rows take priority over learned soft weights.
                if key in self.partial_gids:
                    weights[pos] = self.partial_weight
                    continue
                if key in self.sample_weights:
                    weights[pos] = self.sample_weights[key]

            # A batch in which no row carries weight contributes nothing.
            if weights.sum() == 0:
                continue

            weighted_loss = (per_row_loss * weights).sum() / weights.sum()

            self.optimizer.zero_grad()
            weighted_loss.backward()
            self.optimizer.step()

            batch_weight = weights.sum().item()
            loss_accum += weighted_loss.item() * batch_weight
            weight_accum += batch_weight

        if weight_accum > 0:
            return loss_accum / weight_accum
        return 0


class RemainingDataScorer:
    """Collect per-row losses and last-layer gradients for selected samples.

    Only rows whose sample index (``gid // hyp_per_sample``) is in
    ``remaining_sample_indices`` are scored; the model is not updated.
    """

    def __init__(self, DO, model, remaining_sample_indices, device=DEVICE):
        self.DO = DO
        self.model = model
        self.device = device
        self.hyp_per_sample = DO.num_hyp_comb
        self.remaining_sample_indices = set(remaining_sample_indices)

        # gid -> list with one entry per scoring pass
        self.loss_scores = {}
        self.gradient_history = {}

    def compute_scores(self, dataloader, n_passes=5):
        """Iterate the dataloader ``n_passes`` times, recording loss and gradient per row."""
        self.model.eval()

        for _ in range(n_passes):
            for inputs, targets, global_ids in dataloader:
                inputs = inputs.to(self.device)
                targets = targets.to(self.device)

                for row in range(len(inputs)):
                    gid = global_ids[row].item()
                    if gid // self.hyp_per_sample not in self.remaining_sample_indices:
                        continue

                    single_input = inputs[row:row + 1].clone().requires_grad_(True)
                    single_loss = nn.MSELoss()(self.model(single_input), targets[row:row + 1])
                    self.loss_scores.setdefault(gid, []).append(single_loss.item())

                    # Gradient w.r.t. the second-to-last parameter tensor only.
                    tracked_param = list(self.model.parameters())[-2]
                    row_grad = grad(single_loss, tracked_param, retain_graph=False)[0]
                    self.gradient_history.setdefault(gid, []).append(
                        row_grad.flatten().detach().cpu().numpy()
                    )

    def get_analysis(self):
        """Summarise the recorded scores as ``{gid: stats dict}``."""
        summary = {}
        for gid, losses in self.loss_scores.items():
            grads = self.gradient_history.get(gid, [])
            summary[gid] = {
                'avg_loss': np.mean(losses),
                'loss_std': np.std(losses),
                'avg_gradient': np.mean(self.gradient_history[gid], axis=0) if gid in self.gradient_history else None,
                'gradient_magnitude': np.mean([np.linalg.norm(g) for g in grads]),
            }
        return summary


def sigmoid_stable(x):
    """Numerically stable sigmoid.

    Args:
        x: Scalar or array-like of real values.

    Returns:
        ``numpy.ndarray`` of float64 with the same shape as ``x`` holding
        ``1 / (1 + exp(-x))``.
    """
    x = np.array(x, dtype=np.float64)
    # np.where evaluates both branches for every element, so exponentiating
    # +x or -x directly overflows for large |x|. exp(-|x|) is always in
    # (0, 1] and yields the same value on whichever branch is selected.
    decay = np.exp(-np.abs(x))
    return np.where(x >= 0,
                    1 / (1 + decay),
                    decay / (1 + decay))


def compute_soft_weights(scores, min_weight=0.1, temperature=1.0):
    """Map raw scores to weights in ``[min_weight, 1]``.

    Scores are z-normalised, divided by ``temperature``, squashed with a
    sigmoid and rescaled linearly. An empty input returns an empty array.
    """
    scores = np.array(scores, dtype=np.float64)
    if len(scores) == 0:
        return np.array([])

    # The epsilon keeps the division finite when all scores are equal.
    z_scores = (scores - np.mean(scores)) / (np.std(scores) + 1e-8)
    squashed = sigmoid_stable(z_scores / temperature)
    return min_weight + (1 - min_weight) * squashed


def create_dataloader_with_gids(DO, batch_size=32):
    """Build a shuffled DataLoader yielding ``(inputs, targets, global_ids)``.

    Inputs are ``DO.inpt_vars`` plus one ``<var>_hypothesis`` column per
    entry of ``DO.miss_vars``; the global id of a row is its position in
    ``DO.df_train_hypothesis``.
    """
    frame = DO.df_train_hypothesis
    feature_cols = DO.inpt_vars + [f"{name}_hypothesis" for name in DO.miss_vars]

    features = torch.tensor(frame[feature_cols].values, dtype=torch.float32)
    targets = torch.tensor(frame[DO.target_vars].values, dtype=torch.float32)
    row_ids = torch.arange(len(frame))

    return DataLoader(
        TensorDataset(features, targets, row_ids),
        batch_size=batch_size,
        shuffle=True,
    )


def evaluate_on_test(DO, model):
    """Return ``(mse, mae, r2)`` of ``model`` on the full-information test tensors.

    R^2 is reported as 0.0 when the targets have zero total variance.
    """
    model.eval()
    with torch.no_grad():
        inputs, targets = DO.get_test_tensors(use_info="full info")
        preds = model(inputs)

        functional = torch.nn.functional
        mse = functional.mse_loss(preds, targets).item()
        mae = functional.l1_loss(preds, targets).item()

        residual_ss = ((targets - preds) ** 2).sum().item()
        total_ss = ((targets - targets.mean()) ** 2).sum().item()

    if total_ss > 0:
        r2 = 1 - (residual_ss / total_ss)
    else:
        r2 = 0.0
    return mse, mae, r2


def train_with_soft_weights(DO, model, sample_weights, partial_gids, partial_weight, lr, n_epochs=200, batch_size=32):
    """Train ``model`` with soft row weights and keep the best-validation epoch.

    Args:
        DO: Data operator providing the training frame and validation tensors.
        model: Network to train in place.
        sample_weights: Mapping gid -> weight for non-partial rows.
        partial_gids: Gids that receive ``partial_weight``.
        partial_weight: Weight assigned to rows in ``partial_gids``.
        lr: Learning rate for the optimiser.
        n_epochs: Number of training epochs.
        batch_size: Rows per batch.

    Returns:
        ``(model, best_epoch, best_val_loss)``. The model carries the weights
        of the epoch with the lowest validation MSE. If no epoch ever
        improved on ``inf`` (``n_epochs == 0`` or every validation loss was
        NaN), the model is returned as trained, with ``best_epoch == 0`` and
        ``best_val_loss == inf``.
    """
    dataloader = create_dataloader_with_gids(DO, batch_size)

    trainer = WeightedTrainer(DO, model, sample_weights=sample_weights,
                              partial_gids=partial_gids, partial_weight=partial_weight, lr=lr)

    best_val_loss = float('inf')
    best_epoch = 0
    best_state = None

    for epoch in range(n_epochs):
        trainer.train_epoch(dataloader, epoch, track_data=False)

        model.eval()
        with torch.no_grad():
            val_inputs, val_targets = DO.get_validation_tensors(use_info="full info")
            val_preds = model(val_inputs)
            val_loss = torch.nn.functional.mse_loss(val_preds, val_targets).item()
        model.train()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            # Clone so later optimiser steps cannot mutate the snapshot.
            best_state = {k: v.clone() for k, v in model.state_dict().items()}

    # best_state stays None when no epoch improved (no epochs run, or all
    # validation losses NaN); loading None would raise, so keep the weights.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_epoch, best_val_loss


print("GGH functions defined.")

GGH functions defined.


In [4]:
# =============================================================================
# GGH ANCHOR AND SCORING FUNCTIONS
# =============================================================================

def compute_anchor_data(trainer, DO):
    """Compute gradient-only anchors AND enriched anchors for each class.

    For every hypothesis class that has at least one known-correct and one
    known-incorrect partial row with a recorded gradient, the mean gradient
    ("gradient anchor") and the mean of gradient concatenated with
    standardised input features ("enriched anchor") are computed for the
    correct and the incorrect group.

    Args:
        trainer: Object exposing ``get_hypothesis_analysis()`` that returns
            ``{gid: {'avg_gradient': ndarray | None, ...}}``.
        DO: Data operator exposing ``num_hyp_comb``, ``inpt_vars`` and
            ``df_train_hypothesis`` (with columns ``partial_full_info``,
            ``correct_hypothesis`` and ``hyp_class_id``).

    Returns:
        Dict with the four anchor dicts keyed by class id, ``grad_scale``,
        ``feature_norm_params``, the sets ``partial_correct_gids`` /
        ``blacklisted_gids`` / ``partial_sample_indices``, and ``input_cols``.
    """
    analysis = trainer.get_hypothesis_analysis()
    hyp_per_sample = DO.num_hyp_comb
    input_cols = DO.inpt_vars
    frame = DO.df_train_hypothesis

    is_partial = frame['partial_full_info'] == 1
    partial_correct_gids = set(frame[
        is_partial & (frame['correct_hypothesis'] == True)
    ].index.tolist())
    blacklisted_gids = set(frame[
        is_partial & (frame['correct_hypothesis'] == False)
    ].index.tolist())
    partial_sample_indices = set(gid // hyp_per_sample for gid in partial_correct_gids)

    # Read the class-id column once instead of materialising a whole row per
    # gid per class; positional indexing mirrors the previous iloc lookup.
    class_ids = frame['hyp_class_id'].to_numpy()

    def _gather(gids):
        """Return (gradients, features) for gids that have a recorded gradient."""
        grads, feats = [], []
        for gid in gids:
            entry = analysis.get(gid)
            if entry is None or entry['avg_gradient'] is None:
                continue
            grads.append(entry['avg_gradient'])
            feats.append(frame.loc[gid, input_cols].values.astype(np.float64))
        return grads, feats

    anchor_correct_grad = {}
    anchor_incorrect_grad = {}
    anchor_correct_enriched = {}
    anchor_incorrect_enriched = {}
    feature_norm_params = {}

    # Mean gradient norm; used to bring the standardised features onto the
    # same scale as the gradients before concatenation.
    all_grads = [entry['avg_gradient'] for entry in analysis.values()
                 if entry['avg_gradient'] is not None]
    grad_scale = float(np.mean([np.linalg.norm(g) for g in all_grads])) if all_grads else 1.0

    for class_id in range(hyp_per_sample):
        class_correct_gids = [gid for gid in partial_correct_gids
                              if class_ids[gid] == class_id]
        class_incorrect_gids = [gid for gid in blacklisted_gids
                                if class_ids[gid] == class_id]

        correct_grads, correct_features = _gather(class_correct_gids)
        incorrect_grads, incorrect_features = _gather(class_incorrect_gids)

        # An anchor pair needs evidence from both groups.
        if not correct_grads or not incorrect_grads:
            continue

        anchor_correct_grad[class_id] = np.mean(correct_grads, axis=0)
        anchor_incorrect_grad[class_id] = np.mean(incorrect_grads, axis=0)

        correct_grads = np.array(correct_grads, dtype=np.float64)
        incorrect_grads = np.array(incorrect_grads, dtype=np.float64)
        correct_features = np.array(correct_features, dtype=np.float64)
        incorrect_features = np.array(incorrect_features, dtype=np.float64)

        all_features = np.vstack([correct_features, incorrect_features])
        feat_mean = np.mean(all_features, axis=0)
        feat_std = np.std(all_features, axis=0) + 1e-8  # epsilon avoids division by zero

        feature_norm_params[class_id] = {'mean': feat_mean, 'std': feat_std, 'scale': grad_scale}

        correct_features_norm = (correct_features - feat_mean) / feat_std * grad_scale
        incorrect_features_norm = (incorrect_features - feat_mean) / feat_std * grad_scale

        correct_enriched = np.hstack([correct_grads, correct_features_norm])
        incorrect_enriched = np.hstack([incorrect_grads, incorrect_features_norm])

        anchor_correct_enriched[class_id] = np.mean(correct_enriched, axis=0)
        anchor_incorrect_enriched[class_id] = np.mean(incorrect_enriched, axis=0)

    return {
        'anchor_correct_grad': anchor_correct_grad,
        'anchor_incorrect_grad': anchor_incorrect_grad,
        'anchor_correct_enriched': anchor_correct_enriched,
        'anchor_incorrect_enriched': anchor_incorrect_enriched,
        'grad_scale': grad_scale,
        'feature_norm_params': feature_norm_params,
        'partial_correct_gids': partial_correct_gids,
        'blacklisted_gids': blacklisted_gids,
        'partial_sample_indices': partial_sample_indices,
        'input_cols': input_cols
    }


def compute_enriched_score(gradient, features, class_id, anchor_data):
    """Score one hypothesis row against the anchors of its class.

    The score is the cosine similarity to the "correct" anchor minus the
    cosine similarity to the "incorrect" anchor. Enriched anchors (gradient
    plus scaled features) are preferred; when the class has none, the
    gradient-only anchors are used. Returns 0.0 when the class has no
    correct anchor at all.
    """
    norm_params = anchor_data.get('feature_norm_params', {}).get(class_id)
    if norm_params:
        scaled_features = (features - norm_params['mean']) / norm_params['std'] * norm_params['scale']
    else:
        # No per-class statistics: rescale the features to the gradient norm.
        fallback_scale = anchor_data.get('grad_scale', 1.0)
        scaled_features = features * fallback_scale / (np.linalg.norm(features) + 1e-8)

    vector = np.concatenate([gradient, scaled_features])
    positive = anchor_data.get('anchor_correct_enriched', {}).get(class_id)
    negative = anchor_data.get('anchor_incorrect_enriched', {}).get(class_id)

    if positive is None:
        # Fall back to gradient-only anchors and compare the bare gradient.
        vector = gradient
        positive = anchor_data.get('anchor_correct_grad', {}).get(class_id)
        negative = anchor_data.get('anchor_incorrect_grad', {}).get(class_id)
        if positive is None:
            return 0.0

    def _cosine(left, right):
        return float(np.dot(left, right) / (np.linalg.norm(left) * np.linalg.norm(right) + 1e-8))

    similarity_correct = _cosine(vector, positive)
    similarity_incorrect = _cosine(vector, negative) if negative is not None else 0.0
    return similarity_correct - similarity_incorrect


print("Anchor and scoring functions defined.")

Anchor and scoring functions defined.


## TabPFN Classifier for Hypothesis Selection

Use TabPFN as a classifier to predict which hypothesis class is correct.

In [5]:
# =============================================================================
# TABPFN HYPOTHESIS CLASSIFIER WITH DIAGNOSTICS
# =============================================================================

def get_tabpfn_probabilities(DO, rand_state, verbose=False):
    """
    Fit TabPFN on the known-correct partial rows and predict class
    probabilities for every sample that has no partial information.

    Features are ``DO.inpt_vars``; the label is ``hyp_class_id``.

    NOTE(review): ``rand_state`` is accepted but never used in this function.

    Returns:
        tabpfn_probs: dict mapping sample_idx -> array of probabilities for each
            hypothesis class; ``None`` when fewer than two training rows exist
            or TabPFN raised; ``{}`` when there is nothing to predict.
        diagnostics: dict with diagnostic info (classes seen, confidence stats, etc.)
    """
    hyp_per_sample = DO.num_hyp_comb
    frame = DO.df_train_hypothesis
    n_samples = len(frame) // hyp_per_sample

    # Rows that are both partial and flagged as the correct hypothesis.
    labelled_mask = (frame['partial_full_info'] == 1) & (frame['correct_hypothesis'] == True)
    partial_correct_gids = frame[labelled_mask].index.tolist()
    partial_sample_indices = {gid // hyp_per_sample for gid in partial_correct_gids}

    # Training set: input features -> hypothesis class
    train_features = []
    train_labels = []
    for gid in partial_correct_gids:
        labelled_row = frame.iloc[gid]
        train_features.append(labelled_row[DO.inpt_vars].values.astype(np.float64))
        train_labels.append(int(labelled_row['hyp_class_id']))
    X_train = np.array(train_features)
    y_train = np.array(train_labels)

    # DIAGNOSTIC: Class coverage
    unique_classes = np.unique(y_train)
    diagnostics = {
        'n_partial_samples': len(X_train),
        'classes_seen': unique_classes.tolist(),
        'n_classes_seen': len(unique_classes),
        'class_counts': {c: np.sum(y_train == c) for c in unique_classes},
        'missing_classes': [c for c in range(hyp_per_sample) if c not in unique_classes],
    }

    if verbose:
        print(f"    TabPFN Training: {len(X_train)} samples, {len(unique_classes)}/{hyp_per_sample} classes")
        print(f"    Classes seen: {unique_classes.tolist()}, Missing: {diagnostics['missing_classes']}")

    if len(X_train) < 2:
        print("    Warning: Not enough partial data for TabPFN")
        return None, diagnostics

    # Prediction set: one row per non-partial sample. The first hypothesis
    # row of a sample is read; only DO.inpt_vars columns are used from it.
    test_sample_indices = [idx for idx in range(n_samples) if idx not in partial_sample_indices]
    X_test = np.array([
        frame.iloc[idx * hyp_per_sample][DO.inpt_vars].values.astype(np.float64)
        for idx in test_sample_indices
    ])

    if len(X_test) == 0:
        return {}, diagnostics

    # Train TabPFN and get probabilities. Any failure is reported through the
    # diagnostics and a None result instead of propagating.
    try:
        tabpfn = TabPFNClassifier(device=DEVICE)
        tabpfn.fit(X_train, y_train)
        probs = tabpfn.predict_proba(X_test)

        # Fewer probability columns than classes means some classes were
        # never seen in training and need a placeholder.
        needs_padding = probs.shape[1] < hyp_per_sample

        tabpfn_probs = {}
        confidence_scores = []
        for i, sample_idx in enumerate(test_sample_indices):
            if needs_padding:
                # Unseen classes keep 1/hyp_per_sample; seen classes take the
                # TabPFN output.
                # NOTE(review): the padded vector is not renormalised, so it
                # sums to more than 1 - confirm this is intended.
                full_probs = np.ones(hyp_per_sample) / hyp_per_sample
                for j, cls in enumerate(tabpfn.classes_):
                    full_probs[cls] = probs[i, j]
                tabpfn_probs[sample_idx] = full_probs
            else:
                tabpfn_probs[sample_idx] = probs[i]

            # Confidence = largest class probability of the sample
            confidence_scores.append(np.max(tabpfn_probs[sample_idx]))

        # DIAGNOSTIC: Confidence statistics
        diagnostics['avg_confidence'] = np.mean(confidence_scores)
        diagnostics['std_confidence'] = np.std(confidence_scores)
        diagnostics['min_confidence'] = np.min(confidence_scores)
        diagnostics['max_confidence'] = np.max(confidence_scores)

        if verbose:
            print(f"    TabPFN Confidence: avg={diagnostics['avg_confidence']:.3f}, "
                  f"std={diagnostics['std_confidence']:.3f}, "
                  f"range=[{diagnostics['min_confidence']:.3f}, {diagnostics['max_confidence']:.3f}]")

        return tabpfn_probs, diagnostics

    except Exception as e:
        print(f"    TabPFN error: {e}")
        diagnostics['error'] = str(e)
        return None, diagnostics


def analyze_tabpfn_vs_ggh_decisions(DO, tabpfn_probs, ggh_scores, sample_scores, verbose=True):
    """
    Analyze where TabPFN and GGH agree/disagree.

    Args:
        DO: Data operator exposing ``num_hyp_comb`` and ``df_train_hypothesis``
            (with columns ``hyp_class_id`` and ``correct_hypothesis``).
        tabpfn_probs: dict sample_idx -> class-probability array, or ``None``.
        ggh_scores: Unused; kept for call-site compatibility.
        sample_scores: dict sample_idx -> ``(score, selected_gid, is_correct)``
            describing the row GGH selected for the sample.
        verbose: Print a summary table when at least one sample was analysed.

    Returns:
        Diagnostic dict with agreement/disagreement statistics. Only samples
        that have TabPFN probabilities AND a row flagged as the correct
        hypothesis are counted, so ``total_samples`` always equals the sum of
        the four outcome counters.
    """
    hyp_per_sample = DO.num_hyp_comb
    frame = DO.df_train_hypothesis

    agreement_stats = {
        'total_samples': 0,
        'both_correct': 0,
        'both_wrong': 0,
        'ggh_correct_tabpfn_wrong': 0,
        'ggh_wrong_tabpfn_correct': 0,
        'tabpfn_predictions': [],
        'ggh_predictions': [],
        'true_classes': [],
    }

    # (ggh correct?, tabpfn correct?) -> counter to increment
    outcome_key = {
        (True, True): 'both_correct',
        (False, False): 'both_wrong',
        (True, False): 'ggh_correct_tabpfn_wrong',
        (False, True): 'ggh_wrong_tabpfn_correct',
    }

    for sample_idx, (ggh_score, ggh_gid, ggh_is_correct) in sample_scores.items():
        if tabpfn_probs is None or sample_idx not in tabpfn_probs:
            continue

        # GGH's choice
        ggh_class = frame.iloc[ggh_gid]['hyp_class_id']

        # TabPFN's choice (argmax of probabilities)
        tabpfn_class = np.argmax(tabpfn_probs[sample_idx])

        # True class: first row of the sample flagged as correct
        start = sample_idx * hyp_per_sample
        true_gid = next(
            (gid for gid in range(start, start + hyp_per_sample)
             if frame.iloc[gid]['correct_hypothesis']),
            None,
        )
        if true_gid is None:
            continue

        # Count the sample only once it is known to be classifiable; counting
        # it earlier would inflate the denominator of every percentage below.
        agreement_stats['total_samples'] += 1

        true_class = frame.iloc[true_gid]['hyp_class_id']
        tabpfn_correct = (tabpfn_class == true_class)

        agreement_stats['tabpfn_predictions'].append(tabpfn_class)
        agreement_stats['ggh_predictions'].append(ggh_class)
        agreement_stats['true_classes'].append(true_class)

        agreement_stats[outcome_key[(bool(ggh_is_correct), bool(tabpfn_correct))]] += 1

    if verbose and agreement_stats['total_samples'] > 0:
        total = agreement_stats['total_samples']
        print(f"\n    === TabPFN vs GGH Agreement Analysis ===")
        print(f"    Both correct:           {agreement_stats['both_correct']:4d} ({agreement_stats['both_correct']/total*100:5.1f}%)")
        print(f"    Both wrong:             {agreement_stats['both_wrong']:4d} ({agreement_stats['both_wrong']/total*100:5.1f}%)")
        print(f"    GGH correct, TabPFN wrong: {agreement_stats['ggh_correct_tabpfn_wrong']:4d} ({agreement_stats['ggh_correct_tabpfn_wrong']/total*100:5.1f}%)")
        print(f"    GGH wrong, TabPFN correct: {agreement_stats['ggh_wrong_tabpfn_correct']:4d} ({agreement_stats['ggh_wrong_tabpfn_correct']/total*100:5.1f}%)")

        # Standalone accuracies derived from the outcome counters
        tabpfn_accuracy = (agreement_stats['both_correct'] + agreement_stats['ggh_wrong_tabpfn_correct']) / total * 100
        ggh_accuracy = (agreement_stats['both_correct'] + agreement_stats['ggh_correct_tabpfn_wrong']) / total * 100
        print(f"    TabPFN standalone accuracy: {tabpfn_accuracy:.1f}%")
        print(f"    GGH standalone accuracy:    {ggh_accuracy:.1f}%")

    return agreement_stats


print("TabPFN functions with diagnostics defined.")

TabPFN functions with diagnostics defined.


In [6]:
# =============================================================================
# TABPFN DIAGNOSTIC: Classification vs Regression, 2-feat vs 3-feat
# =============================================================================
# Rationale: when imputing the missing variable, every known quantity should be
# exploited -- including the target variable (Degradation), which is observed
# for each training sample.

from tabpfn import TabPFNClassifier, TabPFNRegressor

DIAGNOSTIC_RUNS = 5         # number of random states to evaluate
DIAGNOSTIC_PARTIAL = 0.08   # 8% partial data

# Per-approach metric accumulators (one list entry per successful run).
results = {
    **{name: {'accuracy': [], 'confidence': []} for name in ('clf_2feat', 'clf_3feat')},
    **{name: {'mae': [], 'class_accuracy': []} for name in ('reg_2feat', 'reg_3feat')},
}

# Banner describing the diagnostic configuration.
print(
    "=" * 70,
    "TabPFN Imputation Diagnostic: Classification vs Regression",
    "=" * 70,
    f"Partial percentage: {DIAGNOSTIC_PARTIAL*100}%",
    f"Random states: {DIAGNOSTIC_RUNS}",
    "=" * 70,
    sep="\n",
)

# Candidate hypothesis values, used to snap a regression output onto a class.
hyp_values = np.array(hypothesis[0])  # [0.03, 0.11, 0.20, 0.32, 0.43, 0.6]

# Run the four TabPFN imputation variants (classifier/regressor x 2/3 features)
# once per random state and append each run's metrics to `results`.
for r_state in range(DIAGNOSTIC_RUNS):
    # Re-seed python/numpy/torch so every run is reproducible for its r_state.
    set_to_deterministic(r_state)
    
    # Create DataOperator
    DO = DataOperator(data_path, inpt_vars, target_vars, miss_vars, hypothesis, 
                      DIAGNOSTIC_PARTIAL, r_state, device=DEVICE)
    
    # Skip this seed when the DataOperator flags insufficient partial coverage.
    # NOTE(review): exact meaning of `lack_partial_coverage` is defined in
    # DataOperator -- confirm there.
    if DO.lack_partial_coverage:
        print(f"  r_state={r_state}: Skipping (lack coverage)")
        continue
    
    hyp_per_sample = DO.num_hyp_comb
    
    # Get partial data (training) and remaining data (test)
    # Rows flagged as partial (fully observed) AND carrying the correct hypothesis.
    partial_correct_gids = DO.df_train_hypothesis[
        (DO.df_train_hypothesis['partial_full_info'] == 1) & 
        (DO.df_train_hypothesis['correct_hypothesis'] == True)
    ].index.tolist()
    
    # Rows are treated as consecutive groups of `hyp_per_sample` hypotheses per
    # sample, so integer division of a gid yields its sample index.
    partial_sample_indices = set(gid // hyp_per_sample for gid in partial_correct_gids)
    n_samples = len(DO.df_train_hypothesis) // hyp_per_sample
    
    # Build training data from partial samples
    X_train_2feat = []
    X_train_3feat = []
    y_train_class = []
    y_train_continuous = []
    
    # NOTE(review): gids are index labels but are used positionally via .iloc;
    # this presumably relies on a default 0..N-1 index -- confirm.
    for gid in partial_correct_gids:
        row = DO.df_train_hypothesis.iloc[gid]
        # 2-feature: just input vars
        X_train_2feat.append(row[DO.inpt_vars].values.astype(np.float64))
        # 3-feature: input vars + target (Degradation)
        feat_3 = np.concatenate([row[DO.inpt_vars].values.astype(np.float64), 
                                  row[DO.target_vars].values.astype(np.float64)])
        X_train_3feat.append(feat_3)
        # Class target
        y_train_class.append(int(row['hyp_class_id']))
        # Continuous target (actual PCBM value)
        y_train_continuous.append(float(row['PCBM_hypothesis']))
    
    X_train_2feat = np.array(X_train_2feat)
    X_train_3feat = np.array(X_train_3feat)
    y_train_class = np.array(y_train_class)
    y_train_continuous = np.array(y_train_continuous)
    
    # Build test data from non-partial samples (use correct hypothesis)
    X_test_2feat = []
    X_test_3feat = []
    y_test_class = []
    y_test_continuous = []
    
    for sample_idx in range(n_samples):
        if sample_idx in partial_sample_indices:
            continue
        
        # Find the correct hypothesis for this sample
        start = sample_idx * hyp_per_sample
        for hyp_idx in range(hyp_per_sample):
            gid = start + hyp_idx
            if DO.df_train_hypothesis.iloc[gid]['correct_hypothesis']:
                row = DO.df_train_hypothesis.iloc[gid]
                X_test_2feat.append(row[DO.inpt_vars].values.astype(np.float64))
                feat_3 = np.concatenate([row[DO.inpt_vars].values.astype(np.float64), 
                                          row[DO.target_vars].values.astype(np.float64)])
                X_test_3feat.append(feat_3)
                y_test_class.append(int(row['hyp_class_id']))
                y_test_continuous.append(float(row['PCBM_hypothesis']))
                # Only the first row flagged correct is used; a sample with no
                # flagged row contributes nothing to the test set.
                break
    
    X_test_2feat = np.array(X_test_2feat)
    X_test_3feat = np.array(X_test_3feat)
    y_test_class = np.array(y_test_class)
    y_test_continuous = np.array(y_test_continuous)
    
    print(f"\nr_state={r_state}: Train={len(X_train_2feat)}, Test={len(X_test_2feat)}")
    
    # Each variant below is best-effort: a failure is reported and the remaining
    # variants still run, so `results` lists may end up with different lengths.
    
    # === 1. Classification with 2 features ===
    try:
        clf_2 = TabPFNClassifier(device=DEVICE)
        clf_2.fit(X_train_2feat, y_train_class)
        pred_2 = clf_2.predict(X_test_2feat)
        probs_2 = clf_2.predict_proba(X_test_2feat)
        acc_2 = np.mean(pred_2 == y_test_class) * 100
        # Confidence = mean over test rows of the highest predicted probability.
        conf_2 = np.mean(np.max(probs_2, axis=1))
        results['clf_2feat']['accuracy'].append(acc_2)
        results['clf_2feat']['confidence'].append(conf_2)
        print(f"  Classification 2-feat: Acc={acc_2:.1f}%, Conf={conf_2:.3f}")
    except Exception as e:
        print(f"  Classification 2-feat: ERROR - {e}")
    
    # === 2. Classification with 3 features (+ Degradation) ===
    try:
        clf_3 = TabPFNClassifier(device=DEVICE)
        clf_3.fit(X_train_3feat, y_train_class)
        pred_3 = clf_3.predict(X_test_3feat)
        probs_3 = clf_3.predict_proba(X_test_3feat)
        acc_3 = np.mean(pred_3 == y_test_class) * 100
        conf_3 = np.mean(np.max(probs_3, axis=1))
        results['clf_3feat']['accuracy'].append(acc_3)
        results['clf_3feat']['confidence'].append(conf_3)
        print(f"  Classification 3-feat: Acc={acc_3:.1f}%, Conf={conf_3:.3f}")
    except Exception as e:
        print(f"  Classification 3-feat: ERROR - {e}")
    
    # === 3. Regression with 2 features ===
    try:
        reg_2 = TabPFNRegressor(device=DEVICE)
        reg_2.fit(X_train_2feat, y_train_continuous)
        pred_reg_2 = reg_2.predict(X_test_2feat)
        mae_2 = np.mean(np.abs(pred_reg_2 - y_test_continuous))
        # Map to nearest hypothesis class
        # NOTE(review): compares an index into hyp_values with hyp_class_id;
        # assumes class ids follow the order of hypothesis[0] -- confirm.
        pred_class_2 = np.array([np.argmin(np.abs(hyp_values - p)) for p in pred_reg_2])
        class_acc_2 = np.mean(pred_class_2 == y_test_class) * 100
        results['reg_2feat']['mae'].append(mae_2)
        results['reg_2feat']['class_accuracy'].append(class_acc_2)
        print(f"  Regression 2-feat:     MAE={mae_2:.4f}, ClassAcc={class_acc_2:.1f}%")
    except Exception as e:
        print(f"  Regression 2-feat: ERROR - {e}")
    
    # === 4. Regression with 3 features (+ Degradation) ===
    try:
        reg_3 = TabPFNRegressor(device=DEVICE)
        reg_3.fit(X_train_3feat, y_train_continuous)
        pred_reg_3 = reg_3.predict(X_test_3feat)
        mae_3 = np.mean(np.abs(pred_reg_3 - y_test_continuous))
        # Map to nearest hypothesis class
        pred_class_3 = np.array([np.argmin(np.abs(hyp_values - p)) for p in pred_reg_3])
        class_acc_3 = np.mean(pred_class_3 == y_test_class) * 100
        results['reg_3feat']['mae'].append(mae_3)
        results['reg_3feat']['class_accuracy'].append(class_acc_3)
        print(f"  Regression 3-feat:     MAE={mae_3:.4f}, ClassAcc={class_acc_3:.1f}%")
    except Exception as e:
        print(f"  Regression 3-feat: ERROR - {e}")

# === Summary ===
# Averages the per-run metrics collected in `results` and prints one table row
# per approach. Approaches with no successful run are omitted from the table.
print("\n" + "=" * 70)
print("SUMMARY: TabPFN Imputation Approaches")
print("=" * 70)

print(f"\n{'Approach':<25} | {'Accuracy':>10} | {'MAE':>10} | {'Confidence':>10}")
print("-" * 70)

if results['clf_2feat']['accuracy']:
    acc = np.mean(results['clf_2feat']['accuracy'])
    conf = np.mean(results['clf_2feat']['confidence'])
    print(f"{'Classification 2-feat':<25} | {acc:>9.1f}% | {'N/A':>10} | {conf:>10.3f}")

if results['clf_3feat']['accuracy']:
    acc = np.mean(results['clf_3feat']['accuracy'])
    conf = np.mean(results['clf_3feat']['confidence'])
    print(f"{'Classification 3-feat':<25} | {acc:>9.1f}% | {'N/A':>10} | {conf:>10.3f}")

if results['reg_2feat']['mae']:
    mae = np.mean(results['reg_2feat']['mae'])
    acc = np.mean(results['reg_2feat']['class_accuracy'])
    print(f"{'Regression 2-feat':<25} | {acc:>9.1f}% | {mae:>10.4f} | {'N/A':>10}")

if results['reg_3feat']['mae']:
    mae = np.mean(results['reg_3feat']['mae'])
    acc = np.mean(results['reg_3feat']['class_accuracy'])
    print(f"{'Regression 3-feat':<25} | {acc:>9.1f}% | {mae:>10.4f} | {'N/A':>10}")

# Improvement analysis
# Differences are between mean accuracies, i.e. expressed in percentage points.
if results['clf_2feat']['accuracy'] and results['clf_3feat']['accuracy']:
    improvement = np.mean(results['clf_3feat']['accuracy']) - np.mean(results['clf_2feat']['accuracy'])
    print(f"\n>>> Adding Degradation feature improves classification by: {improvement:+.1f}%")

if results['reg_2feat']['class_accuracy'] and results['reg_3feat']['class_accuracy']:
    improvement = np.mean(results['reg_3feat']['class_accuracy']) - np.mean(results['reg_2feat']['class_accuracy'])
    print(f">>> Adding Degradation feature improves regression class accuracy by: {improvement:+.1f}%")

# Best approach
# `all_accs` keeps a 0 placeholder for approaches without results so that the
# mapping always has all four keys.
all_accs = {
    'clf_2feat': np.mean(results['clf_2feat']['accuracy']) if results['clf_2feat']['accuracy'] else 0,
    'clf_3feat': np.mean(results['clf_3feat']['accuracy']) if results['clf_3feat']['accuracy'] else 0,
    'reg_2feat': np.mean(results['reg_2feat']['class_accuracy']) if results['reg_2feat']['class_accuracy'] else 0,
    'reg_3feat': np.mean(results['reg_3feat']['class_accuracy']) if results['reg_3feat']['class_accuracy'] else 0,
}

# Only approaches that actually produced results may be declared "best";
# otherwise a 0 placeholder would be reported as a measured 0.0% accuracy
# when every run was skipped or failed.
approaches_with_results = [
    name
    for name, metric in (
        ('clf_2feat', 'accuracy'),
        ('clf_3feat', 'accuracy'),
        ('reg_2feat', 'class_accuracy'),
        ('reg_3feat', 'class_accuracy'),
    )
    if results[name][metric]
]

if approaches_with_results:
    # Ties resolve to the earliest approach in the order listed above.
    best = max(approaches_with_results, key=all_accs.get)
    print(f"\n>>> BEST APPROACH: {best} with {all_accs[best]:.1f}% accuracy")
else:
    best = None
    print("\n>>> BEST APPROACH: n/a (no diagnostic run produced results)")

TabPFN Imputation Diagnostic: Classification vs Regression
Partial percentage: 8.0%
Random states: 5

r_state=0: Train=59, Test=689
  Classification 2-feat: Acc=30.2%, Conf=0.360
  Classification 3-feat: Acc=33.7%, Conf=0.367
  Regression 2-feat:     MAE=0.1286, ClassAcc=24.5%
  Regression 3-feat:     MAE=0.1249, ClassAcc=27.3%

r_state=1: Train=59, Test=689
  Classification 2-feat: Acc=29.8%, Conf=0.298
  Classification 3-feat: Acc=36.0%, Conf=0.335
  Regression 2-feat:     MAE=0.1256, ClassAcc=22.8%
  Regression 3-feat:     MAE=0.1209, ClassAcc=26.6%

r_state=2: Train=59, Test=689
  Classification 2-feat: Acc=26.3%, Conf=0.359
  Classification 3-feat: Acc=38.8%, Conf=0.449
  Regression 2-feat:     MAE=0.1241, ClassAcc=24.1%
  Regression 3-feat:     MAE=0.1115, ClassAcc=27.6%

r_state=3: Train=59, Test=689
  Classification 2-feat: Acc=29.5%, Conf=0.333
  Classification 3-feat: Acc=37.6%, Conf=0.383
  Regression 2-feat:     MAE=0.1273, ClassAcc=23.9%
  Regression 3-feat:     MAE=0.1201

## Option A: TabPFN + GGH Fusion

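Combine TabPFN's per-hypothesis probabilities with GGH's gradient-based scores; `alpha` controls how strongly the TabPFN log-probability shifts each GGH score: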

```
final_score = ggh_score + alpha * log(tabpfn_prob)
```

In [7]:
# =============================================================================
# TABPFN + GGH FUSION (OPTION A) WITH CONFIDENCE GATING
# =============================================================================

def run_ggh_with_tabpfn_prior(DO, rand_state, alpha=1.0, confidence_threshold=0.4, verbose_diagnostics=True):
    """
    GGH Soft Refinement with TabPFN probabilities as prior.
    
    CONFIDENCE-GATED: Only applies TabPFN when max(tabpfn_prob) > threshold.
    Otherwise falls back to pure GGH.
    
    final_score = ggh_score + alpha * log(tabpfn_prob)  [if confident]
    final_score = ggh_score                              [if not confident]
    
    Pipeline:
      1. Iter1 - train an unbiased model, score every non-partial hypothesis,
         keep the best-scoring hypothesis per sample and turn the scores into
         soft weights.
      2. Iter2 - train a second model using those weights.
      3. Iter3 - rescore the selected hypotheses with the Iter2 model (no
         TabPFN term) and multiply the weights, then rescale to
         [GGH_MIN_WEIGHT, 1].
      4. Iter4 - down-weight selections with high loss under the Iter2 model.
    
    Args:
        DO: data operator exposing df_train_hypothesis, num_hyp_comb,
            inpt_vars, miss_vars and target_vars.
        rand_state: seed passed to set_to_deterministic (Iter2 re-seeds with
            rand_state + 100).
        alpha: multiplier applied to log(tabpfn_prob) in the fused score.
        confidence_threshold: fusion is applied to a sample only when its
            highest TabPFN probability is strictly greater than this value.
        verbose_diagnostics: when True, print gating / agreement / fusion
            impact diagnostics; also forwarded to get_tabpfn_probabilities.
    
    Returns: gid_weights, effective_precision, partial_gids, partial_weight_dynamic, diagnostics
        gid_weights: dict mapping selected hypothesis gid -> final weight.
        effective_precision: weight share (in %) held by correct selections.
        partial_gids: set of gids that are partial and correct.
        partial_weight_dynamic: GGH_PARTIAL_BASE_WEIGHT scaled by
            (1 + (1 - mean final weight)).
        diagnostics: dict with TabPFN diagnostics, precisions and gating counts.
    """
    set_to_deterministic(rand_state)
    
    hyp_per_sample = DO.num_hyp_comb
    n_samples = len(DO.df_train_hypothesis) // hyp_per_sample
    n_shared = len(DO.inpt_vars)
    n_hyp = len(DO.miss_vars)
    out_size = len(DO.target_vars)
    
    # Partial rows with the correct hypothesis act as trusted anchors; partial
    # rows with a wrong hypothesis are blacklisted and never selected.
    partial_correct_gids = set(DO.df_train_hypothesis[
        (DO.df_train_hypothesis['partial_full_info'] == 1) & 
        (DO.df_train_hypothesis['correct_hypothesis'] == True)
    ].index.tolist())
    blacklisted_gids = set(DO.df_train_hypothesis[
        (DO.df_train_hypothesis['partial_full_info'] == 1) & 
        (DO.df_train_hypothesis['correct_hypothesis'] == False)
    ].index.tolist())
    # Rows are grouped as hyp_per_sample consecutive hypotheses per sample.
    partial_sample_indices = set(gid // hyp_per_sample for gid in partial_correct_gids)
    
    dataloader = create_dataloader_with_gids(DO, batch_size=32)
    
    # === Get TabPFN probabilities with diagnostics ===
    print("    Getting TabPFN probabilities...")
    tabpfn_probs, tabpfn_diagnostics = get_tabpfn_probabilities(DO, rand_state, verbose=verbose_diagnostics)
    
    # === ITERATION 1: Unbiased training + Initial soft weights ===
    print("    Iter1: Unbiased training...")
    model_unbiased = HypothesisAmplifyingModel(n_shared, n_hyp, 
                                               MODEL_SHARED_HIDDEN, MODEL_HYPOTHESIS_HIDDEN, 
                                               MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
    trainer_unbiased = UnbiasedTrainer(DO, model_unbiased, lr=GGH_ITER1_LR)
    
    # Warm-up epochs without tracking, then the last GGH_ITER1_ANALYSIS_EPOCHS
    # epochs with track_data=True so per-hypothesis analysis data is recorded.
    for epoch in range(GGH_ITER1_EPOCHS - GGH_ITER1_ANALYSIS_EPOCHS):
        trainer_unbiased.train_epoch(dataloader, epoch, track_data=False)
    for epoch in range(GGH_ITER1_EPOCHS - GGH_ITER1_ANALYSIS_EPOCHS, GGH_ITER1_EPOCHS):
        trainer_unbiased.train_epoch(dataloader, epoch, track_data=True)
    
    anchor_data = compute_anchor_data(trainer_unbiased, DO)
    analysis = trainer_unbiased.get_hypothesis_analysis()
    input_cols = anchor_data['input_cols']
    
    # === SCORING WITH CONFIDENCE-GATED TABPFN PRIOR ===
    # Both dicts map sample_idx -> (best score, best gid, is_correct).
    ggh_only_scores = {}  # For diagnostic comparison
    sample_scores = {}
    
    # Track gating statistics
    n_samples_with_fusion = 0
    n_samples_pure_ggh = 0
    
    for sample_idx in range(n_samples):
        if sample_idx in partial_sample_indices:
            continue
        
        start = sample_idx * hyp_per_sample
        best_ggh_score, best_ggh_gid, best_ggh_is_correct = -np.inf, None, False
        best_combined_score, best_combined_gid, best_combined_is_correct = -np.inf, None, False
        
        # Get TabPFN probabilities for this sample
        if tabpfn_probs and sample_idx in tabpfn_probs:
            sample_tabpfn_probs = tabpfn_probs[sample_idx]
            max_tabpfn_confidence = np.max(sample_tabpfn_probs)
        else:
            # No TabPFN output: fall back to a uniform distribution.
            sample_tabpfn_probs = np.ones(hyp_per_sample) / hyp_per_sample
            max_tabpfn_confidence = 1.0 / hyp_per_sample  # Uniform = low confidence
        
        # Determine if we should use TabPFN for this sample
        use_tabpfn = (max_tabpfn_confidence > confidence_threshold)
        
        if use_tabpfn:
            n_samples_with_fusion += 1
        else:
            n_samples_pure_ggh += 1
        
        for hyp_idx in range(hyp_per_sample):
            gid = start + hyp_idx
            if gid in blacklisted_gids or gid not in analysis or analysis[gid]['avg_gradient'] is None:
                continue
            
            gradient = analysis[gid]['avg_gradient']
            class_id = DO.df_train_hypothesis.iloc[gid]['hyp_class_id']
            features = DO.df_train_hypothesis.loc[gid, input_cols].values.astype(np.float64)
            is_correct = DO.df_train_hypothesis.iloc[gid]['correct_hypothesis']
            
            # GGH gradient score (pure)
            ggh_score = compute_enriched_score(gradient, features, class_id, anchor_data)
            
            # Track best GGH-only choice
            if ggh_score > best_ggh_score:
                best_ggh_score = ggh_score
                best_ggh_gid = gid
                best_ggh_is_correct = is_correct
            
            # Combined score: CONFIDENCE-GATED
            if use_tabpfn:
                # TabPFN is confident - apply fusion
                # int() guards against a float-typed class id coming out of the
                # DataFrame row, which cannot be used as an array index.
                tabpfn_prob = sample_tabpfn_probs[int(class_id)]
                # Small epsilon keeps log() finite for zero probabilities.
                tabpfn_log_prob = np.log(tabpfn_prob + 1e-10)
                combined_score = ggh_score + alpha * tabpfn_log_prob
            else:
                # TabPFN is not confident - use pure GGH
                combined_score = ggh_score
            
            if combined_score > best_combined_score:
                best_combined_score = combined_score
                best_combined_gid = gid
                best_combined_is_correct = is_correct
        
        if best_ggh_gid is not None:
            ggh_only_scores[sample_idx] = (best_ggh_score, best_ggh_gid, best_ggh_is_correct)
        if best_combined_gid is not None:
            sample_scores[sample_idx] = (best_combined_score, best_combined_gid, best_combined_is_correct)
    
    # === DIAGNOSTIC: Gating statistics ===
    total_samples_scored = n_samples_with_fusion + n_samples_pure_ggh
    if verbose_diagnostics and total_samples_scored > 0:
        print(f"\n    === Confidence Gating (threshold={confidence_threshold}) ===")
        print(f"    Samples using TabPFN+GGH fusion: {n_samples_with_fusion:4d} ({n_samples_with_fusion/total_samples_scored*100:5.1f}%)")
        print(f"    Samples using pure GGH:          {n_samples_pure_ggh:4d} ({n_samples_pure_ggh/total_samples_scored*100:5.1f}%)")
    
    # === DIAGNOSTIC: Analyze GGH vs TabPFN agreement ===
    if verbose_diagnostics:
        agreement_stats = analyze_tabpfn_vs_ggh_decisions(DO, tabpfn_probs, None, ggh_only_scores, verbose=True)
        
        # Check: when fusion flips GGH's decision, is it helpful or harmful?
        # Flips between two wrong hypotheses are not counted in any bucket.
        fusion_flips_to_correct = 0
        fusion_flips_to_wrong = 0
        fusion_agrees_with_ggh = 0
        
        for sample_idx in ggh_only_scores:
            if sample_idx not in sample_scores:
                continue
            
            _, ggh_gid, ggh_correct = ggh_only_scores[sample_idx]
            _, fusion_gid, fusion_correct = sample_scores[sample_idx]
            
            if ggh_gid == fusion_gid:
                fusion_agrees_with_ggh += 1
            elif not ggh_correct and fusion_correct:
                fusion_flips_to_correct += 1
            elif ggh_correct and not fusion_correct:
                fusion_flips_to_wrong += 1
        
        # Guard the denominator: with no scorable sample every count is 0 and
        # the report should read 0.0% instead of raising ZeroDivisionError.
        total_comparable = max(len(ggh_only_scores), 1)
        print(f"\n    === Fusion Impact Analysis ===")
        print(f"    Fusion agrees with GGH:      {fusion_agrees_with_ggh:4d} ({fusion_agrees_with_ggh/total_comparable*100:5.1f}%)")
        print(f"    Fusion flips GGH wrong→correct: {fusion_flips_to_correct:4d} ({fusion_flips_to_correct/total_comparable*100:5.1f}%) [GOOD]")
        print(f"    Fusion flips GGH correct→wrong: {fusion_flips_to_wrong:4d} ({fusion_flips_to_wrong/total_comparable*100:5.1f}%) [BAD]")
        
        net_benefit = fusion_flips_to_correct - fusion_flips_to_wrong
        print(f"    Net benefit of fusion: {net_benefit:+d} samples")
    
    # Convert to soft weights
    scores_list = [s[0] for s in sample_scores.values()]
    weights_iter1 = compute_soft_weights(scores_list, GGH_MIN_WEIGHT, GGH_TEMPERATURE_ITER1)
    
    # weights_iter1 is positionally aligned with sample_scores' iteration order.
    gid_weights = {}
    for i, (sample_idx, (score, gid, is_correct)) in enumerate(sample_scores.items()):
        gid_weights[gid] = float(weights_iter1[i])
    
    iter1_correct = sum(1 for s in sample_scores.values() if s[2])
    iter1_precision = iter1_correct / len(sample_scores) * 100 if sample_scores else 0
    
    print(f"    Iter1+TabPFN: {len(sample_scores)} samples, precision: {iter1_precision:.1f}%")
    
    # === ITERATION 2: Weighted training ===
    print("    Iter2: Weighted training...")
    # Fresh seed so the second model's initialisation differs from Iter1's.
    set_to_deterministic(rand_state + 100)
    model_weighted = HypothesisAmplifyingModel(n_shared, n_hyp,
                                               MODEL_SHARED_HIDDEN, MODEL_HYPOTHESIS_HIDDEN,
                                               MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
    
    trainer_weighted = WeightedTrainer(DO, model_weighted, sample_weights=gid_weights,
                                       partial_gids=partial_correct_gids,
                                       partial_weight=GGH_PARTIAL_BASE_WEIGHT, lr=GGH_ITER2_LR)
    
    for epoch in range(GGH_ITER2_EPOCHS):
        trainer_weighted.train_epoch(dataloader, epoch)
    
    # === ITERATION 3: Biased rescoring -> Multiply weights ===
    print("    Iter3: Biased rescoring...")
    selected_sample_indices = set(sample_scores.keys())
    scorer = RemainingDataScorer(DO, model_weighted, selected_sample_indices | partial_sample_indices)
    scorer.compute_scores(dataloader, n_passes=GGH_SCORING_PASSES)
    biased_analysis = scorer.get_analysis()
    
    # Build biased anchor data
    # Per-class anchors (mean vectors) for correct vs. incorrect partial rows,
    # plus the normalisation parameters needed to rescore other rows.
    anchor_data_biased = {
        'anchor_correct_grad': {},
        'anchor_incorrect_grad': {},
        'anchor_correct_enriched': {},
        'anchor_incorrect_enriched': {},
        'feature_norm_params': {},
        'loss_norm_params': {},
    }
    
    # Mean gradient norm over all partial rows; used to bring features and
    # losses onto the same scale as the gradients before concatenation.
    all_grads = [biased_analysis[gid]['avg_gradient'] for gid in partial_correct_gids | blacklisted_gids
                 if gid in biased_analysis and biased_analysis[gid]['avg_gradient'] is not None]
    grad_scale = np.mean([np.linalg.norm(g) for g in all_grads]) if all_grads else 1.0
    anchor_data_biased['grad_scale'] = grad_scale
    
    inpt_vars_list = DO.inpt_vars
    
    for class_id in range(hyp_per_sample):
        correct_grads, incorrect_grads = [], []
        correct_features, incorrect_features = [], []
        correct_losses, incorrect_losses = [], []
        
        for gid in partial_correct_gids:
            if gid in biased_analysis and DO.df_train_hypothesis.iloc[gid]['hyp_class_id'] == class_id:
                if biased_analysis[gid]['avg_gradient'] is not None:
                    correct_grads.append(biased_analysis[gid]['avg_gradient'])
                    correct_features.append(DO.df_train_hypothesis.loc[gid, inpt_vars_list].values.astype(np.float64))
                    correct_losses.append(biased_analysis[gid]['avg_loss'])
        
        for gid in blacklisted_gids:
            if gid in biased_analysis and DO.df_train_hypothesis.iloc[gid]['hyp_class_id'] == class_id:
                if biased_analysis[gid]['avg_gradient'] is not None:
                    incorrect_grads.append(biased_analysis[gid]['avg_gradient'])
                    incorrect_features.append(DO.df_train_hypothesis.loc[gid, inpt_vars_list].values.astype(np.float64))
                    incorrect_losses.append(biased_analysis[gid]['avg_loss'])
        
        # Anchors are only built for classes that have both correct and
        # incorrect examples; other classes fall back to defaults in rescoring.
        if correct_grads and incorrect_grads:
            anchor_data_biased['anchor_correct_grad'][class_id] = np.mean(correct_grads, axis=0)
            anchor_data_biased['anchor_incorrect_grad'][class_id] = np.mean(incorrect_grads, axis=0)
            
            all_features = correct_features + incorrect_features
            feat_mean = np.mean(all_features, axis=0)
            feat_std = np.std(all_features, axis=0) + 1e-8
            anchor_data_biased['feature_norm_params'][class_id] = {'mean': feat_mean, 'std': feat_std, 'scale': grad_scale}
            
            correct_features_norm = [(f - feat_mean) / feat_std * grad_scale for f in correct_features]
            incorrect_features_norm = [(f - feat_mean) / feat_std * grad_scale for f in incorrect_features]
            
            all_losses = correct_losses + incorrect_losses
            loss_mean = np.mean(all_losses)
            loss_std = np.std(all_losses) + 1e-8
            anchor_data_biased['loss_norm_params'][class_id] = {'mean': loss_mean, 'std': loss_std, 'scale': grad_scale}
            
            # Sign is flipped so that a lower loss yields a larger component.
            correct_losses_norm = [-(l - loss_mean) / loss_std * grad_scale for l in correct_losses]
            incorrect_losses_norm = [-(l - loss_mean) / loss_std * grad_scale for l in incorrect_losses]
            
            # Enriched vector = [gradient, normalised features, normalised loss].
            correct_enriched = [np.concatenate([g, f, [l]]) 
                               for g, f, l in zip(correct_grads, correct_features_norm, correct_losses_norm)]
            incorrect_enriched = [np.concatenate([g, f, [l]]) 
                                 for g, f, l in zip(incorrect_grads, incorrect_features_norm, incorrect_losses_norm)]
            
            anchor_data_biased['anchor_correct_enriched'][class_id] = np.mean(correct_enriched, axis=0)
            anchor_data_biased['anchor_incorrect_enriched'][class_id] = np.mean(incorrect_enriched, axis=0)
    
    # Rescore with biased model (NO TabPFN in Iter3 - just gradient-based)
    iter3_scores = {}
    for sample_idx, (_, gid, _) in sample_scores.items():
        if gid in biased_analysis and biased_analysis[gid]['avg_gradient'] is not None:
            gradient = biased_analysis[gid]['avg_gradient']
            loss = biased_analysis[gid]['avg_loss']
            class_id = DO.df_train_hypothesis.iloc[gid]['hyp_class_id']
            features = DO.df_train_hypothesis.loc[gid, inpt_vars_list].values.astype(np.float64)
            
            norm_params = anchor_data_biased.get('feature_norm_params', {}).get(class_id)
            loss_params = anchor_data_biased.get('loss_norm_params', {}).get(class_id)
            
            if norm_params:
                features_norm = (features - norm_params['mean']) / norm_params['std'] * norm_params['scale']
            else:
                # No per-class statistics: rescale to unit norm times grad_scale.
                features_norm = features * grad_scale / (np.linalg.norm(features) + 1e-8)
            
            if loss_params:
                loss_norm = -((loss - loss_params['mean']) / loss_params['std']) * loss_params['scale']
            else:
                loss_norm = -loss * grad_scale
            
            enriched = np.concatenate([gradient, features_norm, [loss_norm]])
            
            anchor_c = anchor_data_biased.get('anchor_correct_enriched', {}).get(class_id)
            anchor_i = anchor_data_biased.get('anchor_incorrect_enriched', {}).get(class_id)
            
            # Score = cosine similarity to the "correct" anchor minus cosine
            # similarity to the "incorrect" anchor; 0.0 when no anchor exists.
            if anchor_c is not None:
                sim_c = float(np.dot(enriched, anchor_c) / (np.linalg.norm(enriched) * np.linalg.norm(anchor_c) + 1e-8))
                sim_i = float(np.dot(enriched, anchor_i) / (np.linalg.norm(enriched) * np.linalg.norm(anchor_i) + 1e-8)) if anchor_i is not None else 0.0
                iter3_scores[gid] = sim_c - sim_i
            else:
                iter3_scores[gid] = 0.0
    
    scores_list_iter3 = list(iter3_scores.values())
    gids_iter3 = list(iter3_scores.keys())
    weights_iter3_raw = compute_soft_weights(scores_list_iter3, GGH_MIN_WEIGHT, GGH_TEMPERATURE_ITER3)
    
    # Multiply Iter1 weights by Iter3 weights (gids missing from Iter3 keep
    # their Iter1 weight unchanged).
    for i, gid in enumerate(gids_iter3):
        gid_weights[gid] = gid_weights[gid] * weights_iter3_raw[i]
    
    # Rescale so the largest weight becomes 1 and the floor is GGH_MIN_WEIGHT.
    if gid_weights:
        max_w = max(gid_weights.values())
        if max_w > 0:
            for gid in gid_weights:
                gid_weights[gid] = GGH_MIN_WEIGHT + (gid_weights[gid] / max_w) * (1 - GGH_MIN_WEIGHT)
    
    # === ITERATION 4: Loss-based adjustment ===
    losses = {gid: biased_analysis[gid]['avg_loss']
              for gid in gid_weights if gid in biased_analysis}
    
    if losses:
        loss_values = list(losses.values())
        loss_mean = np.mean(loss_values)
        loss_std = np.std(loss_values) + 1e-8
        
        for gid in gid_weights:
            if gid in losses:
                # Higher-than-average loss -> larger sigmoid -> smaller factor.
                norm_loss = (losses[gid] - loss_mean) / loss_std
                loss_factor = 1 - GGH_LOSS_INFLUENCE * sigmoid_stable(norm_loss)
                gid_weights[gid] = max(GGH_MIN_WEIGHT, gid_weights[gid] * loss_factor)
    
    # Calculate effective precision
    # Share of the total selection weight that sits on correct hypotheses.
    correct_weights_final = [gid_weights[s[1]] for s in sample_scores.values() if s[2] and s[1] in gid_weights]
    total_weight_correct = sum(correct_weights_final)
    total_weight_all = sum(gid_weights.values())
    effective_precision = total_weight_correct / total_weight_all * 100 if total_weight_all > 0 else 0
    
    print(f"    Final effective precision: {effective_precision:.1f}%")
    
    # The lower the average selection weight, the more the partial data is
    # emphasised (factor ranges from 1x to 2x of the base weight).
    avg_final_weight = np.mean(list(gid_weights.values())) if gid_weights else 0.5
    partial_weight_dynamic = GGH_PARTIAL_BASE_WEIGHT * (1 + (1 - avg_final_weight))
    
    # Collect all diagnostics
    all_diagnostics = {
        'tabpfn': tabpfn_diagnostics,
        'iter1_precision': iter1_precision,
        'effective_precision': effective_precision,
        'n_samples_with_fusion': n_samples_with_fusion,
        'n_samples_pure_ggh': n_samples_pure_ggh,
        'fusion_ratio': n_samples_with_fusion / total_samples_scored if total_samples_scored > 0 else 0,
    }
    
    return gid_weights, effective_precision, partial_correct_gids, partial_weight_dynamic, all_diagnostics


# Cell-completion marker: emitted once the definition above has executed.
print("Fusion function with confidence gating defined.")

Fusion function with confidence gating defined.


In [8]:
# =============================================================================
# STANDARD GGH (WITHOUT TABPFN) FOR COMPARISON - FULL 4 ITERATIONS
# =============================================================================

def run_ggh_soft_refinement(DO, rand_state):
    """
    Standard GGH soft refinement (without TabPFN prior).
    Full 4-iteration implementation matching Photocell_Benchmark.

    Stages, as implemented below:
      1. Train an unbiased model, track data during the last
         GGH_ITER1_ANALYSIS_EPOCHS epochs, keep the best-scoring hypothesis
         for every non-partial sample and turn the scores into soft weights.
      2. Train a fresh model with those weights (seeded with rand_state + 100).
      3. Re-score the selected hypotheses with the weighted model against
         anchors built from the partial rows, multiply the weights by the new
         soft weights and rescale so the largest weight becomes 1.
      4. Shrink each weight by a sigmoid of its standardized loss.

    Args:
        DO: data operator; this function reads df_train_hypothesis,
            num_hyp_comb, inpt_vars, miss_vars and target_vars from it.
        rand_state: integer seed passed to set_to_deterministic.

    Returns:
        Tuple (gid_weights, effective_precision, partial_correct_gids,
        partial_weight_dynamic) where gid_weights maps a selected hypothesis
        row id to its final weight, effective_precision is the percentage of
        total weight carried by correct selections, partial_correct_gids is
        the set of partial rows flagged correct, and partial_weight_dynamic is
        GGH_PARTIAL_BASE_WEIGHT scaled up as the average final weight drops.
    """
    set_to_deterministic(rand_state)
    
    hyp_per_sample = DO.num_hyp_comb
    n_samples = len(DO.df_train_hypothesis) // hyp_per_sample
    n_shared = len(DO.inpt_vars)
    n_hyp = len(DO.miss_vars)
    out_size = len(DO.target_vars)
    
    # Partial rows flagged as the correct hypothesis (known ground truth).
    partial_correct_gids = set(DO.df_train_hypothesis[
        (DO.df_train_hypothesis['partial_full_info'] == 1) & 
        (DO.df_train_hypothesis['correct_hypothesis'] == True)
    ].index.tolist())
    # Partial rows flagged as an incorrect hypothesis; never selected below.
    blacklisted_gids = set(DO.df_train_hypothesis[
        (DO.df_train_hypothesis['partial_full_info'] == 1) & 
        (DO.df_train_hypothesis['correct_hypothesis'] == False)
    ].index.tolist())
    # The integer division relies on each sample occupying hyp_per_sample
    # consecutive rows (the same layout used for `start` further down).
    partial_sample_indices = set(gid // hyp_per_sample for gid in partial_correct_gids)
    
    dataloader = create_dataloader_with_gids(DO, batch_size=32)
    
    # === ITERATION 1: Unbiased training + Initial soft weights ===
    model_unbiased = HypothesisAmplifyingModel(n_shared, n_hyp, 
                                               MODEL_SHARED_HIDDEN, MODEL_HYPOTHESIS_HIDDEN, 
                                               MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
    trainer_unbiased = UnbiasedTrainer(DO, model_unbiased, lr=GGH_ITER1_LR)
    
    # Warm-up epochs without tracking, then the final analysis epochs with
    # track_data=True so the trainer can report per-hypothesis statistics.
    for epoch in range(GGH_ITER1_EPOCHS - GGH_ITER1_ANALYSIS_EPOCHS):
        trainer_unbiased.train_epoch(dataloader, epoch, track_data=False)
    for epoch in range(GGH_ITER1_EPOCHS - GGH_ITER1_ANALYSIS_EPOCHS, GGH_ITER1_EPOCHS):
        trainer_unbiased.train_epoch(dataloader, epoch, track_data=True)
    
    anchor_data = compute_anchor_data(trainer_unbiased, DO)
    analysis = trainer_unbiased.get_hypothesis_analysis()
    input_cols = anchor_data['input_cols']
    
    # Standard scoring (no TabPFN)
    # sample_scores: sample_idx -> (best_score, best_gid, best_is_correct)
    sample_scores = {}
    for sample_idx in range(n_samples):
        if sample_idx in partial_sample_indices:
            continue
        
        start = sample_idx * hyp_per_sample
        best_score, best_gid, best_is_correct = -np.inf, None, False
        
        for hyp_idx in range(hyp_per_sample):
            gid = start + hyp_idx
            if gid in blacklisted_gids or gid not in analysis or analysis[gid]['avg_gradient'] is None:
                continue
            
            gradient = analysis[gid]['avg_gradient']
            # NOTE(review): gid is used positionally (.iloc) here and as a label
            # (.loc) on the next line; the two agree only for a default
            # 0..n-1 index — confirm against DataOperator.
            class_id = DO.df_train_hypothesis.iloc[gid]['hyp_class_id']
            features = DO.df_train_hypothesis.loc[gid, input_cols].values.astype(np.float64)
            score = compute_enriched_score(gradient, features, class_id, anchor_data)
            
            if score > best_score:
                best_score = score
                best_gid = gid
                best_is_correct = DO.df_train_hypothesis.iloc[gid]['correct_hypothesis']
        
        # Samples whose hypotheses were all skipped get no entry.
        if best_gid is not None:
            sample_scores[sample_idx] = (best_score, best_gid, best_is_correct)
    
    # Convert to soft weights
    scores_list = [s[0] for s in sample_scores.values()]
    weights_iter1 = compute_soft_weights(scores_list, GGH_MIN_WEIGHT, GGH_TEMPERATURE_ITER1)
    
    # weights_iter1 is indexed in the same order as sample_scores' iteration order.
    gid_weights = {}
    for i, (sample_idx, (score, gid, is_correct)) in enumerate(sample_scores.items()):
        gid_weights[gid] = float(weights_iter1[i])
    
    # Unweighted share of selections that are the correct hypothesis (percent).
    # Computed for reference only; it is not part of the return value.
    iter1_correct = sum(1 for s in sample_scores.values() if s[2])
    iter1_precision = iter1_correct / len(sample_scores) * 100 if sample_scores else 0
    
    # === ITERATION 2: Weighted training ===
    set_to_deterministic(rand_state + 100)
    model_weighted = HypothesisAmplifyingModel(n_shared, n_hyp,
                                               MODEL_SHARED_HIDDEN, MODEL_HYPOTHESIS_HIDDEN,
                                               MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
    
    trainer_weighted = WeightedTrainer(DO, model_weighted, sample_weights=gid_weights,
                                       partial_gids=partial_correct_gids,
                                       partial_weight=GGH_PARTIAL_BASE_WEIGHT, lr=GGH_ITER2_LR)
    
    for epoch in range(GGH_ITER2_EPOCHS):
        trainer_weighted.train_epoch(dataloader, epoch)
    
    # === ITERATION 3: Biased rescoring -> Multiply weights ===
    selected_sample_indices = set(sample_scores.keys())
    scorer = RemainingDataScorer(DO, model_weighted, selected_sample_indices | partial_sample_indices)
    scorer.compute_scores(dataloader, n_passes=GGH_SCORING_PASSES)
    biased_analysis = scorer.get_analysis()
    
    # Build biased anchor data
    # Per-class dictionaries are filled only for classes that have both
    # correct and incorrect partial examples (see the loop below).
    anchor_data_biased = {
        'anchor_correct_grad': {},
        'anchor_incorrect_grad': {},
        'anchor_correct_enriched': {},
        'anchor_incorrect_enriched': {},
        'feature_norm_params': {},
        'loss_norm_params': {},
    }
    
    # grad_scale: mean gradient norm over all partial rows (correct and
    # blacklisted); used to bring features and losses to the gradient's scale.
    all_grads = [biased_analysis[gid]['avg_gradient'] for gid in partial_correct_gids | blacklisted_gids
                 if gid in biased_analysis and biased_analysis[gid]['avg_gradient'] is not None]
    grad_scale = np.mean([np.linalg.norm(g) for g in all_grads]) if all_grads else 1.0
    anchor_data_biased['grad_scale'] = grad_scale
    
    inpt_vars_list = DO.inpt_vars
    
    for class_id in range(hyp_per_sample):
        correct_grads, incorrect_grads = [], []
        correct_features, incorrect_features = [], []
        correct_losses, incorrect_losses = [], []
        
        for gid in partial_correct_gids:
            if gid in biased_analysis and DO.df_train_hypothesis.iloc[gid]['hyp_class_id'] == class_id:
                if biased_analysis[gid]['avg_gradient'] is not None:
                    correct_grads.append(biased_analysis[gid]['avg_gradient'])
                    correct_features.append(DO.df_train_hypothesis.loc[gid, inpt_vars_list].values.astype(np.float64))
                    correct_losses.append(biased_analysis[gid]['avg_loss'])
        
        for gid in blacklisted_gids:
            if gid in biased_analysis and DO.df_train_hypothesis.iloc[gid]['hyp_class_id'] == class_id:
                if biased_analysis[gid]['avg_gradient'] is not None:
                    incorrect_grads.append(biased_analysis[gid]['avg_gradient'])
                    incorrect_features.append(DO.df_train_hypothesis.loc[gid, inpt_vars_list].values.astype(np.float64))
                    incorrect_losses.append(biased_analysis[gid]['avg_loss'])
        
        # Anchors need both sides; classes missing either are left without entries.
        if correct_grads and incorrect_grads:
            anchor_data_biased['anchor_correct_grad'][class_id] = np.mean(correct_grads, axis=0)
            anchor_data_biased['anchor_incorrect_grad'][class_id] = np.mean(incorrect_grads, axis=0)
            
            # Standardize features over correct + incorrect rows, then rescale to grad_scale.
            all_features = correct_features + incorrect_features
            feat_mean = np.mean(all_features, axis=0)
            feat_std = np.std(all_features, axis=0) + 1e-8
            anchor_data_biased['feature_norm_params'][class_id] = {'mean': feat_mean, 'std': feat_std, 'scale': grad_scale}
            
            correct_features_norm = [(f - feat_mean) / feat_std * grad_scale for f in correct_features]
            incorrect_features_norm = [(f - feat_mean) / feat_std * grad_scale for f in incorrect_features]
            
            all_losses = correct_losses + incorrect_losses
            loss_mean = np.mean(all_losses)
            loss_std = np.std(all_losses) + 1e-8
            anchor_data_biased['loss_norm_params'][class_id] = {'mean': loss_mean, 'std': loss_std, 'scale': grad_scale}
            
            # Sign is flipped so that a lower loss yields a larger component.
            correct_losses_norm = [-(l - loss_mean) / loss_std * grad_scale for l in correct_losses]
            incorrect_losses_norm = [-(l - loss_mean) / loss_std * grad_scale for l in incorrect_losses]
            
            # Enriched vector = [gradient, normalized features, normalized loss].
            correct_enriched = [np.concatenate([g, f, [l]]) 
                               for g, f, l in zip(correct_grads, correct_features_norm, correct_losses_norm)]
            incorrect_enriched = [np.concatenate([g, f, [l]]) 
                                 for g, f, l in zip(incorrect_grads, incorrect_features_norm, incorrect_losses_norm)]
            
            anchor_data_biased['anchor_correct_enriched'][class_id] = np.mean(correct_enriched, axis=0)
            anchor_data_biased['anchor_incorrect_enriched'][class_id] = np.mean(incorrect_enriched, axis=0)
    
    # Rescore with biased model
    # Only hypotheses selected in iteration 1 are rescored.
    iter3_scores = {}
    for sample_idx, (_, gid, _) in sample_scores.items():
        if gid in biased_analysis and biased_analysis[gid]['avg_gradient'] is not None:
            gradient = biased_analysis[gid]['avg_gradient']
            loss = biased_analysis[gid]['avg_loss']
            class_id = DO.df_train_hypothesis.iloc[gid]['hyp_class_id']
            features = DO.df_train_hypothesis.loc[gid, inpt_vars_list].values.astype(np.float64)
            
            norm_params = anchor_data_biased.get('feature_norm_params', {}).get(class_id)
            loss_params = anchor_data_biased.get('loss_norm_params', {}).get(class_id)
            
            # Fallbacks for classes without anchors: features are rescaled to
            # norm grad_scale and the raw loss is negated and scaled.
            if norm_params:
                features_norm = (features - norm_params['mean']) / norm_params['std'] * norm_params['scale']
            else:
                features_norm = features * grad_scale / (np.linalg.norm(features) + 1e-8)
            
            if loss_params:
                loss_norm = -((loss - loss_params['mean']) / loss_params['std']) * loss_params['scale']
            else:
                loss_norm = -loss * grad_scale
            
            enriched = np.concatenate([gradient, features_norm, [loss_norm]])
            
            anchor_c = anchor_data_biased.get('anchor_correct_enriched', {}).get(class_id)
            anchor_i = anchor_data_biased.get('anchor_incorrect_enriched', {}).get(class_id)
            
            # Score = cosine similarity to the correct anchor minus cosine
            # similarity to the incorrect anchor; 0.0 when the class has no anchor.
            if anchor_c is not None:
                sim_c = float(np.dot(enriched, anchor_c) / (np.linalg.norm(enriched) * np.linalg.norm(anchor_c) + 1e-8))
                sim_i = float(np.dot(enriched, anchor_i) / (np.linalg.norm(enriched) * np.linalg.norm(anchor_i) + 1e-8)) if anchor_i is not None else 0.0
                iter3_scores[gid] = sim_c - sim_i
            else:
                iter3_scores[gid] = 0.0
    
    # Multiply weights
    scores_list_iter3 = list(iter3_scores.values())
    gids_iter3 = list(iter3_scores.keys())
    weights_iter3_raw = compute_soft_weights(scores_list_iter3, GGH_MIN_WEIGHT, GGH_TEMPERATURE_ITER3)
    
    # Hypotheses that were not rescored keep their iteration-1 weight.
    for i, gid in enumerate(gids_iter3):
        gid_weights[gid] = gid_weights[gid] * weights_iter3_raw[i]
    
    # Renormalize
    # Affine rescale: the largest weight maps to 1 and every weight stays above GGH_MIN_WEIGHT.
    if gid_weights:
        max_w = max(gid_weights.values())
        if max_w > 0:
            for gid in gid_weights:
                gid_weights[gid] = GGH_MIN_WEIGHT + (gid_weights[gid] / max_w) * (1 - GGH_MIN_WEIGHT)
    
    # === ITERATION 4: Loss-based adjustment ===
    losses = {gid: biased_analysis[gid]['avg_loss']
              for gid in gid_weights if gid in biased_analysis}
    
    if losses:
        loss_values = list(losses.values())
        loss_mean = np.mean(loss_values)
        loss_std = np.std(loss_values) + 1e-8
        
        for gid in gid_weights:
            if gid in losses:
                # Higher-than-average loss -> larger sigmoid -> smaller factor;
                # the result is floored at GGH_MIN_WEIGHT.
                norm_loss = (losses[gid] - loss_mean) / loss_std
                loss_factor = 1 - GGH_LOSS_INFLUENCE * sigmoid_stable(norm_loss)
                gid_weights[gid] = max(GGH_MIN_WEIGHT, gid_weights[gid] * loss_factor)
    
    # Calculate effective precision
    # Weight mass on correct selections divided by total weight mass, in percent.
    correct_weights_final = [gid_weights[s[1]] for s in sample_scores.values() if s[2] and s[1] in gid_weights]
    total_weight_correct = sum(correct_weights_final) if correct_weights_final else 0
    total_weight_all = sum(gid_weights.values()) if gid_weights else 1
    effective_precision = total_weight_correct / total_weight_all * 100 if total_weight_all > 0 else 0
    
    # The lower the average final weight, the larger the partial weight
    # (between 1x and 2x the base for average weights in [0, 1]).
    avg_final_weight = np.mean(list(gid_weights.values())) if gid_weights else 0.5
    partial_weight_dynamic = GGH_PARTIAL_BASE_WEIGHT * (1 + (1 - avg_final_weight))
    
    return gid_weights, effective_precision, partial_correct_gids, partial_weight_dynamic


# Cell-completion marker: printed once run_ggh_soft_refinement has been defined.
print("Standard GGH function defined (full 4 iterations).")

Standard GGH function defined (full 4 iterations).


## Benchmark: GGH vs TabPFN+GGH Fusion

Compare across multiple partial percentages to see where each method excels.

In [None]:
# =============================================================================
# BENCHMARK EXECUTION WITH CONFIDENCE-GATED FUSION
# =============================================================================

print("=" * 80)
print("BENCHMARK: GGH vs TabPFN+GGH Fusion (Confidence-Gated)")
print("=" * 80)
print(f"Dataset: Photocell Degradation")
print(f"Partial percentages: {PARTIAL_PERCENTAGES}")
print(f"Runs per percentage: {BENCHMARK_N_RUNS}")
print(f"Fusion alpha: {FUSION_ALPHA}")
print(f"Confidence threshold: {CONFIDENCE_THRESHOLD}")
print("=" * 80)

# all_results[partial_perc][method] -> list with one result dict per valid run.
all_results = {partial_perc: {'GGH': [], 'TabPFN': [], 'TabPFN+GGH': [], 'Partial': [], 'Full Info': []} 
               for partial_perc in PARTIAL_PERCENTAGES}

# Store aggregated diagnostics
# (only the first, verbose run of each percentage is recorded; see below)
all_diagnostics = {partial_perc: [] for partial_perc in PARTIAL_PERCENTAGES}

for partial_perc in PARTIAL_PERCENTAGES:
    print(f"\n{'='*60}")
    print(f"PARTIAL PERCENTAGE: {partial_perc*100}%")
    print(f"{'='*60}")
    
    valid_runs = 0
    r_state = 0
    
    # Try successive seeds until BENCHMARK_N_RUNS usable splits were found;
    # the 2000 cap keeps the loop finite when usable splits are rare.
    while valid_runs < BENCHMARK_N_RUNS and r_state < 2000:
        set_to_deterministic(r_state)
        
        # Create DataOperator
        DO = DataOperator(data_path, inpt_vars, target_vars, miss_vars, hypothesis, 
                          partial_perc, r_state, device=DEVICE)
        
        # Skip seeds flagged by the DataOperator as lacking partial coverage.
        if DO.lack_partial_coverage:
            r_state += 1
            continue
        
        print(f"\nRun {valid_runs + 1}/{BENCHMARK_N_RUNS} (r_state={r_state})")
        
        n_shared = len(DO.inpt_vars)
        n_hyp = len(DO.miss_vars)
        out_size = len(DO.target_vars)
        
        partial_gids = set(DO.df_train_hypothesis[
            (DO.df_train_hypothesis['partial_full_info'] == 1) & 
            (DO.df_train_hypothesis['correct_hypothesis'] == True)
        ].index.tolist())
        
        # === Standard GGH ===
        print("  Running Standard GGH...")
        ggh_weights, ggh_precision, _, ggh_partial_weight = run_ggh_soft_refinement(DO, r_state)
        
        # Each method's final model gets its own seed offset (+200, +300, ...).
        set_to_deterministic(r_state + 200)
        model_ggh = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN, 
                                              MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_ggh, _, _ = train_with_soft_weights(
            DO, model_ggh, sample_weights=ggh_weights, partial_gids=partial_gids,
            partial_weight=ggh_partial_weight, lr=GGH_BENCHMARK_LR, n_epochs=GGH_FINAL_EPOCHS
        )
        _, _, ggh_r2 = evaluate_on_test(DO, model_ggh)
        print(f"    GGH R2: {ggh_r2:.4f}, Precision: {ggh_precision:.1f}%")
        
        # === TabPFN + GGH Fusion (Confidence-Gated) ===
        print("  Running TabPFN+GGH Fusion (Confidence-Gated)...")
        verbose = (valid_runs == 0)  # Only verbose for first run
        fusion_result = run_ggh_with_tabpfn_prior(
            DO, r_state, 
            alpha=FUSION_ALPHA, 
            confidence_threshold=CONFIDENCE_THRESHOLD,
            verbose_diagnostics=verbose
        )
        fusion_weights, fusion_precision, _, fusion_partial_weight, diagnostics = fusion_result
        
        # Diagnostics are kept for the verbose (first) run of each percentage only.
        if verbose:
            all_diagnostics[partial_perc].append(diagnostics)
        
        set_to_deterministic(r_state + 300)
        model_fusion = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN, 
                                                 MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_fusion, _, _ = train_with_soft_weights(
            DO, model_fusion, sample_weights=fusion_weights, partial_gids=partial_gids,
            partial_weight=fusion_partial_weight, lr=GGH_BENCHMARK_LR, n_epochs=GGH_FINAL_EPOCHS
        )
        _, _, fusion_r2 = evaluate_on_test(DO, model_fusion)
        print(f"    Fusion R2: {fusion_r2:.4f}, Precision: {fusion_precision:.1f}%")
        
        # === TabPFN Standalone ===
        print("  Running TabPFN Standalone...")
        # Get TabPFN probabilities
        tabpfn_probs, _ = get_tabpfn_probabilities(DO, r_state, verbose=False)
        if tabpfn_probs is None:
            # NOTE(review): presumably None signals that TabPFN could not be
            # fitted — verify against get_tabpfn_probabilities. Without this
            # guard the membership test below raises TypeError.
            print("    TabPFN returned no probabilities; using partial data only.")
            tabpfn_probs = {}
        
        # Create TabPFN-only weights (hard assignment)
        tabpfn_weights = {}
        hyp_per_sample = DO.num_hyp_comb
        n_samples = len(DO.df_train_hypothesis) // hyp_per_sample
        partial_sample_indices = set(gid // hyp_per_sample for gid in partial_gids)
        
        for sample_idx in range(n_samples):
            if sample_idx in partial_sample_indices:
                continue
            if sample_idx in tabpfn_probs:
                # Most probable class picks exactly one hypothesis row per sample.
                pred_class = np.argmax(tabpfn_probs[sample_idx])
                gid = sample_idx * hyp_per_sample + pred_class
                tabpfn_weights[gid] = 1.0
        
        set_to_deterministic(r_state + 500)
        model_tabpfn = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN,
                                                  MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_tabpfn, _, _ = train_with_soft_weights(DO, model_tabpfn, tabpfn_weights, partial_gids,
                                                      GGH_PARTIAL_BASE_WEIGHT, GGH_BENCHMARK_LR, GGH_FINAL_EPOCHS)
        _, _, tabpfn_r2 = evaluate_on_test(DO, model_tabpfn)
        print(f"    TabPFN R2: {tabpfn_r2:.4f}")
        
        # === Partial Only ===
        print("  Running Partial Only...")
        set_to_deterministic(r_state + 400)
        model_partial = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN,
                                                  MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_partial, _, _ = train_with_soft_weights(
            DO, model_partial, sample_weights={}, partial_gids=partial_gids,
            partial_weight=1.0, lr=GGH_BENCHMARK_LR, n_epochs=GGH_FINAL_EPOCHS
        )
        _, _, partial_r2 = evaluate_on_test(DO, model_partial)
        print(f"    Partial R2: {partial_r2:.4f}")
        
        # === Full Info (Fair - same architecture as GGH/Fusion) ===
        print("  Running Full Info...")
        # Use the same hypotheses-per-sample count as every other section.
        # The previous len(DO.hypothesis) only covered all rows by accident.
        hyp_per_sample = DO.num_hyp_comb
        n_samples_full = len(DO.df_train_hypothesis) // hyp_per_sample
        full_info_weights = {}
        for sample_idx in range(n_samples_full):
            for hyp_idx in range(hyp_per_sample):
                gid = sample_idx * hyp_per_sample + hyp_idx
                if DO.df_train_hypothesis.iloc[gid]['correct_hypothesis']:
                    full_info_weights[gid] = 1.0
        
        set_to_deterministic(r_state + 600)
        model_full = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN,
                                               MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        # Every correct row is supplied through the weights, so no partial set is passed.
        model_full, _, _ = train_with_soft_weights(DO, model_full, full_info_weights, set(),
                                                    1.0, GGH_BENCHMARK_LR, GGH_FINAL_EPOCHS)
        _, _, full_r2 = evaluate_on_test(DO, model_full)
        print(f"    Full Info R2: {full_r2:.4f}")
        
        # Store results
        all_results[partial_perc]['GGH'].append({'r2': ggh_r2, 'precision': ggh_precision})
        all_results[partial_perc]['TabPFN'].append({'r2': tabpfn_r2})
        all_results[partial_perc]['TabPFN+GGH'].append({'r2': fusion_r2, 'precision': fusion_precision})
        all_results[partial_perc]['Partial'].append({'r2': partial_r2})
        all_results[partial_perc]['Full Info'].append({'r2': full_r2})
        
        print(f"  >>> Fusion vs GGH: {fusion_r2 - ggh_r2:+.4f}")
        
        valid_runs += 1
        r_state += 1

print(f"\n{'='*80}")
print("BENCHMARK COMPLETE")
print(f"{'='*80}")

BENCHMARK: GGH vs TabPFN+GGH Fusion (Confidence-Gated)
Dataset: Photocell Degradation
Partial percentages: [0.03, 0.1, 0.25]
Runs per percentage: 3
Fusion alpha: 1.0
Confidence threshold: 0.4

PARTIAL PERCENTAGE: 3.0%

Run 1/3 (r_state=0)
  Running Standard GGH...
    GGH R2: 0.4798, Precision: 31.7%
  Running TabPFN+GGH Fusion (Confidence-Gated)...
    Getting TabPFN probabilities...
    TabPFN Training: 22 samples, 6/6 classes
    Classes seen: [0, 1, 2, 3, 4, 5], Missing: []
    TabPFN Confidence: avg=0.338, std=0.069, range=[0.200, 0.497]
    Iter1: Unbiased training...

    === Confidence Gating (threshold=0.4) ===
    Samples using TabPFN+GGH fusion:  148 ( 20.4%)
    Samples using pure GGH:           578 ( 79.6%)

    === TabPFN vs GGH Agreement Analysis ===
    Both correct:            110 ( 15.2%)
    Both wrong:              453 ( 62.4%)
    GGH correct, TabPFN wrong:   87 ( 12.0%)
    GGH wrong, TabPFN correct:   76 ( 10.5%)
    TabPFN standalone accuracy: 25.6%
    GGH stan

In [None]:
# =============================================================================
# RESULTS SUMMARY
# =============================================================================

# Header banner for the summary section.
print("\n" + "=" * 80)
print("SUMMARY: GGH vs TabPFN+GGH Fusion")
print("=" * 80)

# One row per partial percentage; becomes the summary table at the end.
summary_data = []

for partial_perc in PARTIAL_PERCENTAGES:
    runs_by_method = all_results[partial_perc]

    # Test R2 per run for every method.
    ggh_r2s = [run['r2'] for run in runs_by_method['GGH']]
    tabpfn_r2s = [run['r2'] for run in runs_by_method['TabPFN']]
    fusion_r2s = [run['r2'] for run in runs_by_method['TabPFN+GGH']]
    partial_r2s = [run['r2'] for run in runs_by_method['Partial']]
    full_r2s = [run['r2'] for run in runs_by_method['Full Info']]

    # Selection precision is only recorded for the two GGH-based methods.
    ggh_prec = [run['precision'] for run in runs_by_method['GGH']]
    fusion_prec = [run['precision'] for run in runs_by_method['TabPFN+GGH']]

    # Nothing to compare unless both GGH and Fusion produced results.
    if not (ggh_r2s and fusion_r2s):
        continue

    # Paired t-test: both methods were run on the same seeds.
    t_stat, p_val = stats.ttest_rel(fusion_r2s, ggh_r2s)
    diff = np.mean(fusion_r2s) - np.mean(ggh_r2s)

    # A winner is declared only for a significant difference (p < 0.05).
    if diff > 0 and p_val < 0.05:
        winner = 'Fusion'
    elif diff < 0 and p_val < 0.05:
        winner = 'GGH'
    else:
        winner = 'Tie'

    row = {}
    row['Partial %'] = f"{partial_perc*100:.0f}%"
    row['Full Info'] = f"{np.mean(full_r2s):.4f}"
    row['GGH R2'] = f"{np.mean(ggh_r2s):.4f} ± {np.std(ggh_r2s):.4f}"
    row['Fusion R2'] = f"{np.mean(fusion_r2s):.4f} ± {np.std(fusion_r2s):.4f}"
    row['Partial R2'] = f"{np.mean(partial_r2s):.4f}"
    row['Δ (Fusion-GGH)'] = f"{diff:+.4f}"
    row['p-value'] = f"{p_val:.4f}"
    row['Winner'] = winner
    summary_data.append(row)

    print(f"\nPartial {partial_perc*100:.0f}%:")
    print(f"  Full Info:    {np.mean(full_r2s):.4f}")
    print(f"  TabPFN:       {np.mean(tabpfn_r2s):.4f} ± {np.std(tabpfn_r2s):.4f}")
    print(f"  GGH:          {np.mean(ggh_r2s):.4f} ± {np.std(ggh_r2s):.4f} (precision: {np.mean(ggh_prec):.1f}%)")
    print(f"  TabPFN+GGH:   {np.mean(fusion_r2s):.4f} ± {np.std(fusion_r2s):.4f} (precision: {np.mean(fusion_prec):.1f}%)")
    print(f"  Partial:      {np.mean(partial_r2s):.4f}")
    print(f"  >>> Fusion vs GGH: {diff:+.4f} (p={p_val:.4f})")

summary_df = pd.DataFrame(summary_data)
print(f"\n{'='*80}")
print("Summary Table:")
print(f"{'='*80}")
print(summary_df.to_string(index=False))

In [None]:
# =============================================================================
# VISUALIZATION
# =============================================================================

def _r2_runs(partial_perc, method):
    """Return the test R2 of every recorded run for one method and percentage."""
    return [run['r2'] for run in all_results[partial_perc][method]]


def _r2_summary(method, reducer):
    """Apply reducer (np.mean / np.std) to a method's R2 runs at each percentage."""
    return [reducer(_r2_runs(p, method)) for p in PARTIAL_PERCENTAGES]


fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes[0], axes[1]

# ---- Left panel: test R2 as a function of the partial percentage ----
partial_pcts = [p * 100 for p in PARTIAL_PERCENTAGES]
ggh_means = _r2_summary('GGH', np.mean)
ggh_stds = _r2_summary('GGH', np.std)
fusion_means = _r2_summary('TabPFN+GGH', np.mean)
fusion_stds = _r2_summary('TabPFN+GGH', np.std)
partial_means = _r2_summary('Partial', np.mean)
full_means = _r2_summary('Full Info', np.mean)

# GGH and Fusion carry error bars; the two reference curves are plain lines.
ax1.errorbar(partial_pcts, ggh_means, yerr=ggh_stds,
             marker='o', label='GGH', capsize=5, linewidth=2)
ax1.errorbar(partial_pcts, fusion_means, yerr=fusion_stds,
             marker='s', label='TabPFN+GGH Fusion', capsize=5, linewidth=2)
ax1.plot(partial_pcts, partial_means,
         marker='^', label='Partial Only', linestyle='--', alpha=0.7)
ax1.plot(partial_pcts, full_means,
         marker='d', label='Full Info (Oracle)', linestyle=':', alpha=0.7)

ax1.set_xlabel('Partial Data (%)', fontsize=12)
ax1.set_ylabel('Test R2 Score', fontsize=12)
ax1.set_title('Test R2 by Partial Percentage', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# ---- Right panel: mean R2 gain of Fusion over GGH ----
improvements = [fusion_mean - ggh_mean
                for fusion_mean, ggh_mean in zip(fusion_means, ggh_means)]

# Green bars for a gain, red otherwise.
colors = []
for imp in improvements:
    colors.append('green' if imp > 0 else 'red')

bars = ax2.bar(partial_pcts, improvements, color=colors, alpha=0.7, edgecolor='black')
ax2.axhline(y=0, color='black', linestyle='-', linewidth=1)

ax2.set_xlabel('Partial Data (%)', fontsize=12)
ax2.set_ylabel('R2 Improvement (Fusion - GGH)', fontsize=12)
ax2.set_title('TabPFN+GGH Fusion Improvement over GGH', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)

# Value labels sit just beyond the end of each bar, on the side it points to.
for bar, val in zip(bars, improvements):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.002 * np.sign(val)
    v_align = 'bottom' if val > 0 else 'top'
    ax2.text(label_x, label_y, f'{val:+.4f}',
             ha='center', va=v_align, fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig(f'{results_path}/ggh_vs_fusion_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

# =============================================================================
# CONCLUSION
# =============================================================================
print(f"\n{'='*80}")
print("CONCLUSION")
print(f"{'='*80}")

# Count the percentages at which each method has the higher mean R2.
fusion_wins = len([imp for imp in improvements if imp > 0])
ggh_wins = len([imp for imp in improvements if imp < 0])

print(f"\nAcross {len(PARTIAL_PERCENTAGES)} partial percentages tested:")
print(f"  TabPFN+GGH Fusion wins: {fusion_wins}")
print(f"  Standard GGH wins: {ggh_wins}")
print(f"  Ties: {len(PARTIAL_PERCENTAGES) - fusion_wins - ggh_wins}")

# Regime hints: the first two entries stand for "low" partial data and the
# last two for "high" partial data.
if any(imp < 0 for imp in improvements[:2]):
    print(f"\n>>> At low partial data: GGH may still be better (gradient signal dominates)")
if any(imp > 0 for imp in improvements[-2:]):
    print(f">>> At high partial data: Fusion benefits from TabPFN's pattern recognition")

In [None]:
# =============================================================================
# AIRFOIL BENCHMARK: GGH vs TabPFN (5-feat) vs Fusion
# =============================================================================
# TabPFN uses ALL available information: 4 input features + 1 target (sound pressure)
# to predict the missing variable (chord-length class)

_AIRFOIL_RULE = "=" * 80

print(_AIRFOIL_RULE)
print("AIRFOIL BENCHMARK: GGH vs TabPFN (5-feat) vs Fusion")
print(_AIRFOIL_RULE)

# ---- Dataset layout ----
AIRFOIL_DATA_PATH = '../data/airfoil_self_noise/data.csv'
AIRFOIL_RESULTS_PATH = "../saved_results/Airfoil_TabPFN_Fusion"
AIRFOIL_INPT_VARS = [
    'frequency',
    'attack-angle',
    'free-stream-velocity',
    'suction-side-displacement-thickness',
]
AIRFOIL_TARGET_VARS = ['scaled-sound-pressure']
AIRFOIL_MISS_VARS = ['chord-length']
# Candidate values for the missing variable (one inner list per missing variable).
AIRFOIL_HYPOTHESIS = [[0.0254, 0.0508, 0.1016, 0.1524, 0.2286, 0.3048]]
AIRFOIL_HYP_VALUES = np.array(AIRFOIL_HYPOTHESIS[0])

# ---- Benchmark settings ----
AIRFOIL_PARTIAL_PERCENTAGES = [0.03, 0.08]
AIRFOIL_N_RUNS = 8
AIRFOIL_EPOCHS = 500
AIRFOIL_FUSION_ALPHA = 1.0
AIRFOIL_CONFIDENCE_THRESHOLD = 0.4

# ---- GGH epoch budget for Airfoil ----
AIRFOIL_GGH_ITER1_EPOCHS = 60
AIRFOIL_GGH_ITER1_ANALYSIS_EPOCHS = 5
AIRFOIL_GGH_ITER2_EPOCHS = 30
AIRFOIL_GGH_FINAL_EPOCHS = 200

# Make sure the output directory exists before anything is saved to it.
os.makedirs(AIRFOIL_RESULTS_PATH, exist_ok=True)

# Echo the configuration so the notebook output documents the run.
for _label, _value in (
    ("Input features", AIRFOIL_INPT_VARS),
    ("Target", AIRFOIL_TARGET_VARS),
    ("Missing", AIRFOIL_MISS_VARS),
    ("Hypothesis values", AIRFOIL_HYPOTHESIS[0]),
    ("Partial percentages", AIRFOIL_PARTIAL_PERCENTAGES),
    ("Runs per percentage", AIRFOIL_N_RUNS),
):
    print(f"{_label}: {_value}")
print(_AIRFOIL_RULE)

# airfoil_results[partial_percentage][method] -> list of per-run results.
_AIRFOIL_METHODS = ('GGH', 'TabPFN', 'Fusion', 'Partial', 'Full')
airfoil_results = {
    p: {method: [] for method in _AIRFOIL_METHODS}
    for p in AIRFOIL_PARTIAL_PERCENTAGES
}

def get_tabpfn_5feat_probs(DO, rand_state, verbose=False):
    """TabPFN with 5 features: 4 inputs + 1 target (sound pressure).

    Fits a TabPFN classifier on the partial rows flagged as the correct
    hypothesis (features = DO.inpt_vars + DO.target_vars, label =
    'hyp_class_id') and predicts class probabilities for every other sample.

    Args:
        DO: data operator; df_train_hypothesis, num_hyp_comb, inpt_vars and
            target_vars are read from it. Rows are addressed positionally, with
            each sample occupying num_hyp_comb consecutive rows.
        rand_state: currently unused; kept so the signature matches the
            callers' convention.
        verbose: print training-set size and average confidence.

    Returns:
        Tuple (tabpfn_probs, diagnostics). tabpfn_probs maps sample index to a
        probability vector of length num_hyp_comb. It is None when fewer than
        two partial rows exist or TabPFN raised, and {} when there is no
        sample to predict. diagnostics always holds 'n_partial', 'n_classes'
        and 'classes'; after a successful prediction it also holds
        'avg_confidence', 'predictions' and 'test_indices'.
    """
    df = DO.df_train_hypothesis
    hyp_per_sample = DO.num_hyp_comb
    n_samples = len(df) // hyp_per_sample
    feature_cols = list(DO.inpt_vars) + list(DO.target_vars)

    # Get partial data
    partial_correct_gids = df[
        (df['partial_full_info'] == 1) &
        (df['correct_hypothesis'] == True)
    ].index.tolist()
    partial_sample_indices = {gid // hyp_per_sample for gid in partial_correct_gids}

    # Build training data: one row per known-correct partial hypothesis.
    partial_rows = df.iloc[partial_correct_gids]
    X_train = partial_rows[feature_cols].to_numpy(dtype=np.float64)
    y_train = partial_rows['hyp_class_id'].to_numpy().astype(int)

    unique_classes = np.unique(y_train)

    diagnostics = {
        'n_partial': len(X_train),
        'n_classes': len(unique_classes),
        'classes': unique_classes.tolist(),
    }

    if verbose:
        # Report against the actual number of hypothesis classes rather than a
        # hard-coded 6, so the message is right for any dataset.
        print(f"    TabPFN 5-feat: {len(X_train)} samples, {len(unique_classes)}/{hyp_per_sample} classes")

    # Too little data to fit a classifier.
    if len(X_train) < 2:
        return None, diagnostics

    # Build test data: for every non-partial sample take the first row flagged
    # as the correct hypothesis; samples without one are skipped.
    # NOTE(review): inputs and target are presumably identical across a
    # sample's hypothesis rows, so the flag is only used to pick a row — confirm.
    correct_mask = (
        df['correct_hypothesis'].to_numpy().astype(bool)[:n_samples * hyp_per_sample]
        .reshape(n_samples, hyp_per_sample)
    )
    has_correct = correct_mask.any(axis=1)
    first_correct = correct_mask.argmax(axis=1)

    test_sample_indices = [
        sample_idx for sample_idx in range(n_samples)
        if sample_idx not in partial_sample_indices and has_correct[sample_idx]
    ]

    if not test_sample_indices:
        return {}, diagnostics

    correct_gids = [
        sample_idx * hyp_per_sample + int(first_correct[sample_idx])
        for sample_idx in test_sample_indices
    ]
    X_test = df.iloc[correct_gids][feature_cols].to_numpy(dtype=np.float64)

    # Train TabPFN. Failures are deliberately best-effort: they are reported
    # and signalled with None so the caller can fall back.
    try:
        tabpfn = TabPFNClassifier(device=DEVICE)
        tabpfn.fit(X_train, y_train)
        probs = tabpfn.predict_proba(X_test)

        # TabPFN only outputs columns for classes seen in training.
        needs_padding = probs.shape[1] < hyp_per_sample
        seen_classes = np.asarray(tabpfn.classes_, dtype=int)

        tabpfn_probs = {}
        confidence_scores = []

        for i, sample_idx in enumerate(test_sample_indices):
            if needs_padding:
                # Unseen classes keep a uniform 1/K placeholder; seen classes
                # get TabPFN's value. The padded vector is not renormalized.
                full_probs = np.ones(hyp_per_sample) / hyp_per_sample
                full_probs[seen_classes] = probs[i]
                tabpfn_probs[sample_idx] = full_probs
            else:
                tabpfn_probs[sample_idx] = probs[i]
            confidence_scores.append(np.max(tabpfn_probs[sample_idx]))

        diagnostics['avg_confidence'] = np.mean(confidence_scores)
        diagnostics['predictions'] = tabpfn.predict(X_test)
        diagnostics['test_indices'] = test_sample_indices

        if verbose:
            print(f"    TabPFN Confidence: avg={diagnostics['avg_confidence']:.3f}")

        return tabpfn_probs, diagnostics

    except Exception as e:
        print(f"    TabPFN error: {e}")
        return None, diagnostics


def run_airfoil_ggh(DO, rand_state):
    """Run the first GGH iteration on Airfoil and derive soft sample weights.

    Seeds the RNGs, trains a HypothesisAmplifyingModel with UnbiasedTrainer,
    scores every non-blacklisted hypothesis of each sample that has no
    partial information, keeps the best-scoring hypothesis per sample and
    converts those best scores into soft weights.

    Args:
        DO: DataOperator; ``df_train_hypothesis`` is read as blocks of
            ``DO.num_hyp_comb`` consecutive rows per sample.
        rand_state: Seed forwarded to ``set_to_deterministic``.

    Returns:
        Tuple ``(gid_weights, precision, partial_correct_gids, sample_scores)``:
        ``gid_weights`` maps each selected gid to its soft weight,
        ``precision`` is the percentage of selected hypotheses flagged
        ``correct_hypothesis`` (0 when nothing was selected),
        ``partial_correct_gids`` is the set of correct gids that carry partial
        information, and ``sample_scores`` maps ``sample_idx`` to
        ``(best_score, best_gid, best_is_correct)``.
    """
    set_to_deterministic(rand_state)

    frame = DO.df_train_hypothesis
    hyp_per_sample = DO.num_hyp_comb
    n_samples = len(frame) // hyp_per_sample

    # Rows with partial information: the correct ones are known labels, the
    # incorrect ones are blacklisted so they can never be selected.
    has_partial = frame['partial_full_info'] == 1
    partial_correct_gids = set(
        frame[has_partial & (frame['correct_hypothesis'] == True)].index.tolist()
    )
    blacklisted_gids = set(
        frame[has_partial & (frame['correct_hypothesis'] == False)].index.tolist()
    )
    partial_sample_indices = {gid // hyp_per_sample for gid in partial_correct_gids}

    dataloader = create_dataloader_with_gids(DO, batch_size=32)

    # Iteration 1: plain epochs first, then the trailing "analysis" epochs
    # during which the trainer tracks per-hypothesis data.
    model = HypothesisAmplifyingModel(
        len(DO.inpt_vars),
        len(DO.miss_vars),
        MODEL_SHARED_HIDDEN,
        MODEL_HYPOTHESIS_HIDDEN,
        MODEL_FINAL_HIDDEN,
        len(DO.target_vars),
    )
    trainer = UnbiasedTrainer(DO, model, lr=GGH_ITER1_LR)

    first_tracked_epoch = AIRFOIL_GGH_ITER1_EPOCHS - AIRFOIL_GGH_ITER1_ANALYSIS_EPOCHS
    for epoch in range(first_tracked_epoch):
        trainer.train_epoch(dataloader, epoch, track_data=False)
    for epoch in range(first_tracked_epoch, AIRFOIL_GGH_ITER1_EPOCHS):
        trainer.train_epoch(dataloader, epoch, track_data=True)

    anchor_data = compute_anchor_data(trainer, DO)
    analysis = trainer.get_hypothesis_analysis()
    input_cols = anchor_data['input_cols']
    hyp_df = DO.df_train_hypothesis

    # Scoring: keep the single best hypothesis for every unlabeled sample.
    sample_scores = {}
    for sample_idx in range(n_samples):
        if sample_idx in partial_sample_indices:
            continue

        first_gid = sample_idx * hyp_per_sample
        top_score, top_gid, top_is_correct = -np.inf, None, False

        for gid in range(first_gid, first_gid + hyp_per_sample):
            if gid in blacklisted_gids or gid not in analysis:
                continue
            gradient = analysis[gid]['avg_gradient']
            if gradient is None:
                continue

            row = hyp_df.iloc[gid]
            features = hyp_df.loc[gid, input_cols].values.astype(np.float64)
            candidate = compute_enriched_score(
                gradient, features, row['hyp_class_id'], anchor_data
            )

            if candidate > top_score:
                top_score = candidate
                top_gid = gid
                top_is_correct = row['correct_hypothesis']

        if top_gid is not None:
            sample_scores[sample_idx] = (top_score, top_gid, top_is_correct)

    # Soft weights: one weight per selected hypothesis, in selection order.
    selected = list(sample_scores.values())
    weights = compute_soft_weights(
        [entry[0] for entry in selected], GGH_MIN_WEIGHT, GGH_TEMPERATURE_ITER1
    )
    gid_weights = {
        entry[1]: float(weights[pos]) for pos, entry in enumerate(selected)
    }

    if sample_scores:
        n_correct = sum(1 for entry in selected if entry[2])
        precision = n_correct / len(sample_scores) * 100
    else:
        precision = 0

    return gid_weights, precision, partial_correct_gids, sample_scores


def run_airfoil_fusion(DO, rand_state, tabpfn_probs, alpha=1.0, threshold=0.4):
    """Run GGH iteration-1 scoring for Airfoil with a confidence-gated TabPFN prior.

    Trains the same iteration-1 model as the plain GGH run, then scores each
    candidate hypothesis of every sample without partial information. When
    TabPFN's maximum class probability for a sample exceeds ``threshold``,
    ``alpha * log(p_tabpfn[class_id])`` is added to the GGH score; otherwise
    the GGH score is used unchanged. The best hypothesis per sample is kept
    and the best scores are converted to soft weights.

    Args:
        DO: DataOperator; ``df_train_hypothesis`` is read as blocks of
            ``DO.num_hyp_comb`` consecutive rows per sample.
        rand_state: Seed forwarded to ``set_to_deterministic``.
        tabpfn_probs: Mapping ``sample_idx -> probability vector`` that is
            indexed here by ``hyp_class_id``. May be ``None`` or empty; samples
            without an entry get a uniform vector, which only passes the gate
            if ``1 / hyp_per_sample > threshold``.
        alpha: Weight of the log-probability prior term.
        threshold: Minimum TabPFN max-probability required to apply the prior.

    Returns:
        Tuple ``(gid_weights, precision, partial_correct_gids, n_fusion,
        n_pure_ggh)``: soft weight per selected gid, percentage of selected
        hypotheses flagged ``correct_hypothesis`` (0 when nothing was
        selected), the set of correct gids with partial information, and the
        number of samples scored with / without the TabPFN prior.
    """
    set_to_deterministic(rand_state)

    hyp_per_sample = DO.num_hyp_comb
    n_samples = len(DO.df_train_hypothesis) // hyp_per_sample
    n_shared = len(DO.inpt_vars)
    n_hyp = len(DO.miss_vars)
    out_size = len(DO.target_vars)

    # Rows with partial information: correct ones are known labels, incorrect
    # ones are blacklisted so they can never be selected.
    partial_correct_gids = set(DO.df_train_hypothesis[
        (DO.df_train_hypothesis['partial_full_info'] == 1) &
        (DO.df_train_hypothesis['correct_hypothesis'] == True)
    ].index.tolist())
    blacklisted_gids = set(DO.df_train_hypothesis[
        (DO.df_train_hypothesis['partial_full_info'] == 1) &
        (DO.df_train_hypothesis['correct_hypothesis'] == False)
    ].index.tolist())
    partial_sample_indices = set(gid // hyp_per_sample for gid in partial_correct_gids)

    dataloader = create_dataloader_with_gids(DO, batch_size=32)

    # Iteration 1: plain epochs first, then the trailing analysis epochs
    # during which the trainer tracks per-hypothesis data.
    model = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN,
                                      MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size)
    trainer = UnbiasedTrainer(DO, model, lr=GGH_ITER1_LR)

    for epoch in range(AIRFOIL_GGH_ITER1_EPOCHS - AIRFOIL_GGH_ITER1_ANALYSIS_EPOCHS):
        trainer.train_epoch(dataloader, epoch, track_data=False)
    for epoch in range(AIRFOIL_GGH_ITER1_EPOCHS - AIRFOIL_GGH_ITER1_ANALYSIS_EPOCHS, AIRFOIL_GGH_ITER1_EPOCHS):
        trainer.train_epoch(dataloader, epoch, track_data=True)

    anchor_data = compute_anchor_data(trainer, DO)
    analysis = trainer.get_hypothesis_analysis()
    input_cols = anchor_data['input_cols']

    # Scoring with TabPFN prior (confidence-gated)
    sample_scores = {}
    n_fusion = 0
    n_pure_ggh = 0

    for sample_idx in range(n_samples):
        if sample_idx in partial_sample_indices:
            continue

        start = sample_idx * hyp_per_sample
        best_score, best_gid, best_is_correct = -np.inf, None, False

        # Get TabPFN probs; fall back to a uniform prior when unavailable.
        if tabpfn_probs and sample_idx in tabpfn_probs:
            sample_probs = tabpfn_probs[sample_idx]
            max_conf = np.max(sample_probs)
        else:
            sample_probs = np.ones(hyp_per_sample) / hyp_per_sample
            max_conf = 1.0 / hyp_per_sample

        # The gate is decided once per sample, before looking at hypotheses.
        use_tabpfn = (max_conf > threshold)
        if use_tabpfn:
            n_fusion += 1
        else:
            n_pure_ggh += 1

        for hyp_idx in range(hyp_per_sample):
            gid = start + hyp_idx
            if gid in blacklisted_gids or gid not in analysis or analysis[gid]['avg_gradient'] is None:
                continue

            gradient = analysis[gid]['avg_gradient']
            class_id = DO.df_train_hypothesis.iloc[gid]['hyp_class_id']
            features = DO.df_train_hypothesis.loc[gid, input_cols].values.astype(np.float64)
            ggh_score = compute_enriched_score(gradient, features, class_id, anchor_data)

            if use_tabpfn:
                # A row Series can upcast the class id to float; NumPy rejects
                # float indices, so cast explicitly for the probability lookup.
                # The small epsilon keeps log() finite for zero probabilities.
                tabpfn_log = np.log(sample_probs[int(class_id)] + 1e-10)
                score = ggh_score + alpha * tabpfn_log
            else:
                score = ggh_score

            if score > best_score:
                best_score = score
                best_gid = gid
                best_is_correct = DO.df_train_hypothesis.iloc[gid]['correct_hypothesis']

        if best_gid is not None:
            sample_scores[sample_idx] = (best_score, best_gid, best_is_correct)

    # Soft weights: one weight per selected hypothesis, in selection order.
    scores_list = [s[0] for s in sample_scores.values()]
    weights = compute_soft_weights(scores_list, GGH_MIN_WEIGHT, GGH_TEMPERATURE_ITER1)

    gid_weights = {
        entry[1]: float(weights[i]) for i, entry in enumerate(sample_scores.values())
    }

    precision = sum(1 for s in sample_scores.values() if s[2]) / len(sample_scores) * 100 if sample_scores else 0

    return gid_weights, precision, partial_correct_gids, n_fusion, n_pure_ggh


# Run benchmark
# For every partial-information percentage, keep drawing seeds (r_state) until
# AIRFOIL_N_RUNS valid runs are collected or r_state reaches 500. Each valid
# run builds one DataOperator and, on that same split, evaluates TabPFN
# classification accuracy plus four trained models (GGH, Fusion, Partial Only,
# Full Info). Per-run metrics are appended to airfoil_results.
for partial_perc in AIRFOIL_PARTIAL_PERCENTAGES:
    print(f"\n{'='*60}")
    print(f"PARTIAL: {partial_perc*100}%")
    print(f"{'='*60}")
    
    valid_runs = 0
    r_state = 0
    
    while valid_runs < AIRFOIL_N_RUNS and r_state < 500:
        set_to_deterministic(r_state)
        
        DO = DataOperator(AIRFOIL_DATA_PATH, AIRFOIL_INPT_VARS, AIRFOIL_TARGET_VARS, 
                          AIRFOIL_MISS_VARS, AIRFOIL_HYPOTHESIS, partial_perc, r_state, device=DEVICE)
        
        # Seeds the DataOperator flags as lacking partial coverage are skipped
        # and do not count towards AIRFOIL_N_RUNS.
        if DO.lack_partial_coverage:
            r_state += 1
            continue
        
        print(f"\nRun {valid_runs+1}/{AIRFOIL_N_RUNS} (r_state={r_state})")
        
        n_shared = len(DO.inpt_vars)
        n_hyp = len(DO.miss_vars)
        out_size = len(DO.target_vars)
        hyp_per_sample = DO.num_hyp_comb
        
        # Index labels of rows that have partial info and are the correct hypothesis.
        partial_gids = set(DO.df_train_hypothesis[
            (DO.df_train_hypothesis['partial_full_info'] == 1) & 
            (DO.df_train_hypothesis['correct_hypothesis'] == True)
        ].index.tolist())
        
        # === TabPFN 5-feat (standalone) ===
        # Only the first valid run of each percentage prints TabPFN diagnostics.
        verbose = (valid_runs == 0)
        tabpfn_probs, tabpfn_diag = get_tabpfn_5feat_probs(DO, r_state, verbose=verbose)
        
        # Compute TabPFN standalone accuracy
        # Stays 0 when tabpfn_probs is None/empty or no test indices were recorded.
        tabpfn_acc = 0
        if tabpfn_probs and 'test_indices' in tabpfn_diag:
            correct = 0
            total = 0
            for i, sample_idx in enumerate(tabpfn_diag['test_indices']):
                # The argmax position is compared against hyp_class_id below,
                # i.e. probability vectors are treated as indexed by class id.
                pred_class = np.argmax(tabpfn_probs[sample_idx])
                # Find true class
                for hyp_idx in range(hyp_per_sample):
                    gid = sample_idx * hyp_per_sample + hyp_idx
                    if DO.df_train_hypothesis.iloc[gid]['correct_hypothesis']:
                        true_class = DO.df_train_hypothesis.iloc[gid]['hyp_class_id']
                        if pred_class == true_class:
                            correct += 1
                        total += 1
                        break
            tabpfn_acc = correct / total * 100 if total > 0 else 0
        
        print(f"  TabPFN 5-feat accuracy: {tabpfn_acc:.1f}%")
        
        # === GGH standalone ===
        # run_airfoil_ggh re-seeds with r_state itself; the final model below is
        # built under a distinct seed offset (+200).
        ggh_weights, ggh_prec, _, ggh_scores = run_airfoil_ggh(DO, r_state)
        
        set_to_deterministic(r_state + 200)
        model_ggh = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN, 
                                              MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_ggh, _, _ = train_with_soft_weights(DO, model_ggh, ggh_weights, partial_gids,
                                                   GGH_PARTIAL_BASE_WEIGHT, GGH_BENCHMARK_LR, 
                                                   AIRFOIL_GGH_FINAL_EPOCHS)
        _, _, ggh_r2 = evaluate_on_test(DO, model_ggh)
        print(f"  GGH: R2={ggh_r2:.4f}, Precision={ggh_prec:.1f}%")
        
        # === Fusion (GGH + TabPFN) ===
        # tabpfn_probs may be None here (TabPFN failure); run_airfoil_fusion then
        # falls back to a uniform prior for every sample.
        fusion_weights, fusion_prec, _, n_fus, n_pure = run_airfoil_fusion(
            DO, r_state, tabpfn_probs, AIRFOIL_FUSION_ALPHA, AIRFOIL_CONFIDENCE_THRESHOLD)
        
        set_to_deterministic(r_state + 300)
        model_fusion = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN, 
                                                 MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_fusion, _, _ = train_with_soft_weights(DO, model_fusion, fusion_weights, partial_gids,
                                                      GGH_PARTIAL_BASE_WEIGHT, GGH_BENCHMARK_LR, 
                                                      AIRFOIL_GGH_FINAL_EPOCHS)
        _, _, fusion_r2 = evaluate_on_test(DO, model_fusion)
        print(f"  Fusion: R2={fusion_r2:.4f}, Precision={fusion_prec:.1f}% (fusion:{n_fus}, pure:{n_pure})")
        
        # === Partial Only ===
        # Empty soft-weight dict: only partial_gids are passed, with weight 1.0.
        set_to_deterministic(r_state + 400)
        model_partial = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN,
                                                  MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_partial, _, _ = train_with_soft_weights(DO, model_partial, {}, partial_gids,
                                                       1.0, GGH_BENCHMARK_LR, AIRFOIL_GGH_FINAL_EPOCHS)
        _, _, partial_r2 = evaluate_on_test(DO, model_partial)
        
        # === Full Info (Fair - same architecture as GGH/Fusion) ===
        # Get all correct hypothesis GIDs with weight=1
        # NOTE(review): partial_gids are also in full_info_weights and are passed
        # again with GGH_PARTIAL_BASE_WEIGHT; the diagnostic cell further down
        # passes set() and 1.0 instead. How train_with_soft_weights resolves a gid
        # present in both is not visible here - confirm this is intended.
        n_samples_full = len(DO.df_train_hypothesis) // hyp_per_sample
        full_info_weights = {}
        for sample_idx in range(n_samples_full):
            for hyp_idx in range(hyp_per_sample):
                gid = sample_idx * hyp_per_sample + hyp_idx
                if DO.df_train_hypothesis.iloc[gid]['correct_hypothesis']:
                    full_info_weights[gid] = 1.0
        
        set_to_deterministic(r_state + 600)
        model_full = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN,
                                                MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_full, _, _ = train_with_soft_weights(DO, model_full, full_info_weights, partial_gids,
                                                    GGH_PARTIAL_BASE_WEIGHT, GGH_BENCHMARK_LR, AIRFOIL_GGH_FINAL_EPOCHS)
        _, _, full_r2 = evaluate_on_test(DO, model_full)
        
        # Store
        # One entry per valid run and method; list position k is run k for all methods.
        airfoil_results[partial_perc]['GGH'].append({'r2': ggh_r2, 'precision': ggh_prec})
        airfoil_results[partial_perc]['TabPFN'].append({'accuracy': tabpfn_acc})
        airfoil_results[partial_perc]['Fusion'].append({'r2': fusion_r2, 'precision': fusion_prec})
        airfoil_results[partial_perc]['Partial'].append({'r2': partial_r2})
        airfoil_results[partial_perc]['Full'].append({'r2': full_r2})
        
        print(f"  Full Info: R2={full_r2:.4f}")
        print(f"  >>> Fusion vs GGH: {fusion_r2 - ggh_r2:+.4f}, Fusion vs Full: {fusion_r2 - full_r2:+.4f}")
        
        valid_runs += 1
        r_state += 1

# Summary
# Aggregate the per-run metrics in airfoil_results: mean/std of R2 per method,
# mean precision, mean TabPFN accuracy, and paired t-tests of Fusion against
# GGH and against Full Info (reported as p=1 when there are fewer than 2 runs).
print("\n" + "=" * 80, "AIRFOIL BENCHMARK SUMMARY", "=" * 80, sep="\n")

for partial_perc in AIRFOIL_PARTIAL_PERCENTAGES:
    _by_method = airfoil_results[partial_perc]

    ggh_r2s = [run['r2'] for run in _by_method['GGH']]
    fusion_r2s = [run['r2'] for run in _by_method['Fusion']]
    tabpfn_accs = [run['accuracy'] for run in _by_method['TabPFN']]
    partial_r2s = [run['r2'] for run in _by_method['Partial']]
    full_r2s = [run['r2'] for run in _by_method['Full']]

    ggh_prec = [run['precision'] for run in _by_method['GGH']]
    fusion_prec = [run['precision'] for run in _by_method['Fusion']]

    print(
        f"\nPartial {partial_perc*100}%:",
        f"  Full Info:         {np.mean(full_r2s):.4f} ± {np.std(full_r2s):.4f}",
        f"  GGH:               {np.mean(ggh_r2s):.4f} ± {np.std(ggh_r2s):.4f} (prec: {np.mean(ggh_prec):.1f}%)",
        f"  Fusion:            {np.mean(fusion_r2s):.4f} ± {np.std(fusion_r2s):.4f} (prec: {np.mean(fusion_prec):.1f}%)",
        f"  TabPFN 5-feat acc: {np.mean(tabpfn_accs):.1f}%",
        f"  Partial Only:      {np.mean(partial_r2s):.4f}",
        sep="\n",
    )

    diff_fusion_ggh = np.mean(fusion_r2s) - np.mean(ggh_r2s)
    diff_fusion_full = np.mean(fusion_r2s) - np.mean(full_r2s)

    # A paired test needs at least two runs; otherwise report a neutral p-value.
    if len(fusion_r2s) > 1:
        _, p_fusion_ggh = stats.ttest_rel(fusion_r2s, ggh_r2s)
        _, p_fusion_full = stats.ttest_rel(fusion_r2s, full_r2s)
    else:
        _, p_fusion_ggh = (0, 1)
        _, p_fusion_full = (0, 1)

    print(f"  >>> Fusion vs GGH:  {diff_fusion_ggh:+.4f} (p={p_fusion_ggh:.4f})")
    print(f"  >>> Fusion vs Full: {diff_fusion_full:+.4f} (p={p_fusion_full:.4f})")

In [None]:
# =============================================================================
# TABPFN STANDALONE R2 - Complete Comparison
# =============================================================================
# Train a model using TabPFN's hard predictions (no GGH) to get TabPFN standalone R2
#
# For each partial percentage this rebuilds the DataOperator per seed, asks
# TabPFN for per-sample class probabilities, assigns weight 1.0 to the
# hypothesis row of the predicted class for every sample without partial
# information, trains a model on those weights and stores its test R2 in
# tabpfn_r2_results[partial_perc].
#
# NOTE(review): seeds where TabPFN fails are skipped here, whereas the main
# benchmark loop keeps such seeds. Run k in tabpfn_r2_results can therefore
# come from a different r_state than run k in airfoil_results; paired tests
# that combine the two assume they line up.

print("=" * 80)
print("TABPFN STANDALONE R2 COMPARISON")
print("=" * 80)
print("Using stored TabPFN predictions to train models and compute R2")
print("=" * 80)

# Re-run with TabPFN standalone R2
tabpfn_r2_results = {p: [] for p in AIRFOIL_PARTIAL_PERCENTAGES}

for partial_perc in AIRFOIL_PARTIAL_PERCENTAGES:
    print(f"\n{'='*60}")
    print(f"PARTIAL: {partial_perc*100}% - TabPFN Standalone R2")
    print(f"{'='*60}")

    valid_runs = 0
    r_state = 0

    while valid_runs < AIRFOIL_N_RUNS and r_state < 500:
        set_to_deterministic(r_state)

        DO = DataOperator(AIRFOIL_DATA_PATH, AIRFOIL_INPT_VARS, AIRFOIL_TARGET_VARS,
                          AIRFOIL_MISS_VARS, AIRFOIL_HYPOTHESIS, partial_perc, r_state, device=DEVICE)

        if DO.lack_partial_coverage:
            r_state += 1
            continue

        n_shared = len(DO.inpt_vars)
        n_hyp = len(DO.miss_vars)
        out_size = len(DO.target_vars)
        hyp_per_sample = DO.num_hyp_comb

        partial_gids = set(DO.df_train_hypothesis[
            (DO.df_train_hypothesis['partial_full_info'] == 1) &
            (DO.df_train_hypothesis['correct_hypothesis'] == True)
        ].index.tolist())

        # Get TabPFN predictions
        tabpfn_probs, tabpfn_diag = get_tabpfn_5feat_probs(DO, r_state, verbose=False)

        # None means TabPFN raised; a missing 'test_indices' means nothing was predicted.
        if tabpfn_probs is None or 'test_indices' not in tabpfn_diag:
            print(f"  r_state={r_state}: TabPFN failed, skipping")
            r_state += 1
            continue

        # Create TabPFN-only weights (hard assignment: weight=1 for predicted class)
        tabpfn_weights = {}
        n_samples = len(DO.df_train_hypothesis) // hyp_per_sample
        partial_sample_indices = set(gid // hyp_per_sample for gid in partial_gids)

        # Class id of every hypothesis row, in positional order, so the
        # predicted class can be matched to the row that actually carries it.
        hyp_class_ids = DO.df_train_hypothesis['hyp_class_id'].values

        for sample_idx in tabpfn_diag['test_indices']:
            if sample_idx in partial_sample_indices:
                continue

            # Get TabPFN's predicted class
            pred_class = np.argmax(tabpfn_probs[sample_idx])

            # Find the gid for this predicted class. Elsewhere in this notebook
            # the probability vector is indexed by hyp_class_id, so look the
            # class up by id instead of assuming rows are ordered by class id.
            # Fall back to the positional offset if no row carries that id.
            sample_start = sample_idx * hyp_per_sample
            class_matches = np.flatnonzero(
                hyp_class_ids[sample_start:sample_start + hyp_per_sample] == pred_class
            )
            hyp_offset = int(class_matches[0]) if len(class_matches) > 0 else int(pred_class)
            gid = sample_start + hyp_offset
            tabpfn_weights[gid] = 1.0  # Hard assignment

        # Train model with TabPFN predictions
        set_to_deterministic(r_state + 500)
        model_tabpfn = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN,
                                                  MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_tabpfn, _, _ = train_with_soft_weights(DO, model_tabpfn, tabpfn_weights, partial_gids,
                                                      GGH_PARTIAL_BASE_WEIGHT, GGH_BENCHMARK_LR,
                                                      AIRFOIL_GGH_FINAL_EPOCHS)
        _, _, tabpfn_r2 = evaluate_on_test(DO, model_tabpfn)

        tabpfn_r2_results[partial_perc].append(tabpfn_r2)
        print(f"  Run {valid_runs+1}/{AIRFOIL_N_RUNS} (r_state={r_state}): TabPFN R2={tabpfn_r2:.4f}")

        valid_runs += 1
        r_state += 1

# Final comparison summary
# Prints, per partial percentage, the mean R2 of every method next to its gap
# to the Full Info oracle, the mean TabPFN classification accuracy, and paired
# t-tests between TabPFN, GGH and Fusion.
print("\n" + "=" * 80)
print("COMPLETE COMPARISON: GGH vs TabPFN vs Fusion")
print("=" * 80)

for partial_perc in AIRFOIL_PARTIAL_PERCENTAGES:
    ggh_r2s = [r['r2'] for r in airfoil_results[partial_perc]['GGH']]
    fusion_r2s = [r['r2'] for r in airfoil_results[partial_perc]['Fusion']]
    tabpfn_r2s = tabpfn_r2_results[partial_perc]
    tabpfn_accs = [r['accuracy'] for r in airfoil_results[partial_perc]['TabPFN']]
    full_r2s = [r['r2'] for r in airfoil_results[partial_perc]['Full']]
    partial_r2s = [r['r2'] for r in airfoil_results[partial_perc]['Partial']]

    print(f"\n{'='*60}")
    print(f"PARTIAL {partial_perc*100}%")
    print(f"{'='*60}")
    print(f"{'Method':<20} | {'R2':>12} | {'vs Full Info':>12}")
    print("-" * 50)
    print(f"{'Full Info (oracle)':<20} | {np.mean(full_r2s):>12.4f} | {'-':>12}")
    print(f"{'Partial Only':<20} | {np.mean(partial_r2s):>12.4f} | {np.mean(partial_r2s)-np.mean(full_r2s):>+12.4f}")
    print(f"{'GGH':<20} | {np.mean(ggh_r2s):>12.4f} | {np.mean(ggh_r2s)-np.mean(full_r2s):>+12.4f}")
    print(f"{'TabPFN (5-feat)':<20} | {np.mean(tabpfn_r2s):>12.4f} | {np.mean(tabpfn_r2s)-np.mean(full_r2s):>+12.4f}")
    print(f"{'Fusion (GGH+TabPFN)':<20} | {np.mean(fusion_r2s):>12.4f} | {np.mean(fusion_r2s)-np.mean(full_r2s):>+12.4f}")

    print(f"\nTabPFN classification accuracy: {np.mean(tabpfn_accs):.1f}%")

    # Statistical tests
    if len(tabpfn_r2s) > 1 and len(ggh_r2s) > 1:
        # ttest_rel pairs observations by position and rejects inputs of
        # different length. The TabPFN R2 list is produced by a separate loop,
        # so verify the run counts agree before pairing.
        if len(tabpfn_r2s) == len(ggh_r2s) == len(fusion_r2s):
            _, p_tabpfn_ggh = stats.ttest_rel(tabpfn_r2s, ggh_r2s)
            _, p_fusion_tabpfn = stats.ttest_rel(fusion_r2s, tabpfn_r2s)
            _, p_fusion_ggh = stats.ttest_rel(fusion_r2s, ggh_r2s)

            print(f"\nStatistical significance (paired t-test):")
            print(f"  TabPFN vs GGH:   {np.mean(tabpfn_r2s)-np.mean(ggh_r2s):+.4f} (p={p_tabpfn_ggh:.4f})")
            print(f"  Fusion vs GGH:   {np.mean(fusion_r2s)-np.mean(ggh_r2s):+.4f} (p={p_fusion_ggh:.4f})")
            print(f"  Fusion vs TabPFN: {np.mean(fusion_r2s)-np.mean(tabpfn_r2s):+.4f} (p={p_fusion_tabpfn:.4f})")
        else:
            print(f"\nStatistical significance (paired t-test): skipped, run counts differ "
                  f"(TabPFN={len(tabpfn_r2s)}, GGH={len(ggh_r2s)}, Fusion={len(fusion_r2s)})")

# Best method identification
# For each partial percentage, pick the method (GGH, TabPFN or Fusion) with the
# highest mean R2 and flag it when that mean is above the Full Info mean.
print("\n" + "=" * 80, "CONCLUSION", "=" * 80, sep="\n")

for partial_perc in AIRFOIL_PARTIAL_PERCENTAGES:
    _by_method = airfoil_results[partial_perc]

    ggh_mean = np.mean([run['r2'] for run in _by_method['GGH']])
    fusion_mean = np.mean([run['r2'] for run in _by_method['Fusion']])
    tabpfn_mean = np.mean(tabpfn_r2_results[partial_perc])
    full_mean = np.mean([run['r2'] for run in _by_method['Full']])

    # Insertion order matters for ties: max() returns the first maximal key.
    methods = {
        'GGH': ggh_mean,
        'TabPFN': tabpfn_mean,
        'Fusion': fusion_mean,
    }
    best = max(methods, key=lambda method_name: methods[method_name])

    print(f"\nPartial {partial_perc*100}%:")
    print(f"  Best method: {best} (R2={methods[best]:.4f})")
    if methods[best] > full_mean:
        print(f"  >>> EXCEEDS Full Info by {methods[best]-full_mean:+.4f}!")

In [None]:
# =============================================================================
# DIAGNOSTIC: Partial Only vs Full Info (Verify Partial improves with more data)
# =============================================================================
# For each percentage in DIAG_PARTIAL_PERCENTAGES, collect DIAG_N_RUNS valid
# seeds and, per seed, train one model on the partial-information rows only and
# one on all correct-hypothesis rows. Test R2 of both and the number of partial
# rows are stored in diag_results.
#
# NOTE(review): this cell builds the DataOperator from data_path / inpt_vars /
# target_vars / miss_vars / hypothesis and trains for GGH_FINAL_EPOCHS, whereas
# the benchmark cells above use the AIRFOIL_* constants and
# AIRFOIL_GGH_FINAL_EPOCHS. Confirm which dataset and epoch count are intended.

DIAG_PARTIAL_PERCENTAGES = [0.03, 0.10, 0.25]
DIAG_N_RUNS = 10

print("=" * 80)
print("DIAGNOSTIC: Partial Only vs Full Info")
print("=" * 80)
print(f"Partial percentages: {DIAG_PARTIAL_PERCENTAGES}")
print(f"Runs per percentage: {DIAG_N_RUNS}")
print("=" * 80)

# Per percentage: parallel lists, one entry per valid run.
diag_results = {p: {'Partial': [], 'Full Info': [], 'n_partial_samples': []} for p in DIAG_PARTIAL_PERCENTAGES}

for partial_perc in DIAG_PARTIAL_PERCENTAGES:
    print(f"\n{'='*60}")
    print(f"PARTIAL: {partial_perc*100}%")
    print(f"{'='*60}")
    
    valid_runs = 0
    r_state = 0
    
    # Try seeds 0, 1, 2, ... until enough valid runs exist or the cap of 500 is hit.
    while valid_runs < DIAG_N_RUNS and r_state < 500:
        set_to_deterministic(r_state)
        
        DO = DataOperator(data_path, inpt_vars, target_vars, miss_vars, hypothesis, 
                          partial_perc, r_state, device=DEVICE)
        
        # Seeds flagged as lacking partial coverage do not count as a valid run.
        if DO.lack_partial_coverage:
            r_state += 1
            continue
        
        n_shared = len(DO.inpt_vars)
        n_hyp = len(DO.miss_vars)
        out_size = len(DO.target_vars)
        hyp_per_sample = DO.num_hyp_comb
        
        # Index labels of rows that have partial info and are the correct hypothesis.
        partial_gids = set(DO.df_train_hypothesis[
            (DO.df_train_hypothesis['partial_full_info'] == 1) & 
            (DO.df_train_hypothesis['correct_hypothesis'] == True)
        ].index.tolist())
        
        n_partial = len(partial_gids)
        
        # === Partial Only ===
        # No soft-weighted samples: only partial_gids are passed, with weight 1.0.
        set_to_deterministic(r_state + 400)
        model_partial = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN,
                                                  MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_partial, _, _ = train_with_soft_weights(
            DO, model_partial, sample_weights={}, partial_gids=partial_gids,
            partial_weight=1.0, lr=GGH_BENCHMARK_LR, n_epochs=GGH_FINAL_EPOCHS
        )
        _, _, partial_r2 = evaluate_on_test(DO, model_partial)
        
        # === Full Info ===
        # Every row flagged correct_hypothesis gets weight 1.0; rows are visited
        # positionally in blocks of hyp_per_sample per sample.
        n_samples_full = len(DO.df_train_hypothesis) // hyp_per_sample
        full_info_weights = {}
        for sample_idx in range(n_samples_full):
            for hyp_idx in range(hyp_per_sample):
                gid = sample_idx * hyp_per_sample + hyp_idx
                if DO.df_train_hypothesis.iloc[gid]['correct_hypothesis']:
                    full_info_weights[gid] = 1.0
        
        # An empty partial set is passed here, so all rows enter via full_info_weights.
        set_to_deterministic(r_state + 600)
        model_full = HypothesisAmplifyingModel(n_shared, n_hyp, MODEL_SHARED_HIDDEN,
                                               MODEL_HYPOTHESIS_HIDDEN, MODEL_FINAL_HIDDEN, out_size).to(DEVICE)
        model_full, _, _ = train_with_soft_weights(DO, model_full, full_info_weights, set(),
                                                    1.0, GGH_BENCHMARK_LR, GGH_FINAL_EPOCHS)
        _, _, full_r2 = evaluate_on_test(DO, model_full)
        
        print(f"  Run {valid_runs+1}/{DIAG_N_RUNS} (r_state={r_state}): n_partial={n_partial}, Partial R2={partial_r2:.4f}, Full R2={full_r2:.4f}")
        
        diag_results[partial_perc]['Partial'].append(partial_r2)
        diag_results[partial_perc]['Full Info'].append(full_r2)
        diag_results[partial_perc]['n_partial_samples'].append(n_partial)
        
        valid_runs += 1
        r_state += 1

# Summary
# One table row per partial percentage: mean number of partial rows, and
# mean ± std of test R2 for the Partial Only and Full Info models.
print("\n" + "=" * 80, "SUMMARY: Partial vs Full Info", "=" * 80, sep="\n")
print(f"{'Partial %':<12} | {'N Samples':<12} | {'Partial R2':<20} | {'Full Info R2':<15}")
print("-" * 70)
for partial_perc in DIAG_PARTIAL_PERCENTAGES:
    _diag_row = diag_results[partial_perc]
    partial_r2s, full_r2s, n_samples = (
        _diag_row[column] for column in ('Partial', 'Full Info', 'n_partial_samples')
    )
    print(f"{partial_perc*100:>10.0f}% | {np.mean(n_samples):>10.1f} | {np.mean(partial_r2s):>8.4f} ± {np.std(partial_r2s):.4f} | {np.mean(full_r2s):>8.4f} ± {np.std(full_r2s):.4f}")
