# Experiment 083: Pseudo-Labeling / Self-Training

**Rationale**: Use the model's own confident predictions on the unlabeled test inputs as pseudo-labels to augment the training data. This adapts the model to the test distribution.

**Implementation**:
1. Train initial model on training data
2. Make predictions on test data (from CV folds)
3. Select high-confidence predictions (low ensemble variance)
4. Add pseudo-labels to training data
5. Retrain model on augmented data

**Key Insight**: The intercept (0.052) is larger than the target (0.0347), so standard CV optimization alone cannot reach the target. We need approaches that adapt to the test distribution.

In [1]:
import sys
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

# One shared seed for every RNG this notebook touches, so repeated runs
# start from the same random state.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

print('Imports done')

Imports done


In [2]:
# Local data loading functions
def load_data(data_type):
    """Load one catechol yield table and split it into inputs and targets.

    Args:
        data_type: ``"single_solvent"`` reads
            ``/home/data/catechol_single_solvent_yields.csv``; ``"full"`` reads
            ``/home/data/catechol_full_data_yields.csv``.

    Returns:
        Tuple ``(X, Y)`` of DataFrames taken from the same CSV (so they share
        its row index). ``X`` holds the process/solvent columns for the chosen
        task, ``Y`` holds the ``SM``, ``Product 2`` and ``Product 3`` columns.

    Raises:
        ValueError: If ``data_type`` is not one of the two supported values.
    """
    target_cols = ['SM', 'Product 2', 'Product 3']

    if data_type == "single_solvent":
        path = '/home/data/catechol_single_solvent_yields.csv'
        feature_cols = ['Residence Time', 'Temperature', 'SOLVENT NAME']
    elif data_type == "full":
        path = '/home/data/catechol_full_data_yields.csv'
        feature_cols = ['Residence Time', 'Temperature', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%']
    else:
        # Previously this fell through to `return X, Y` with both names unbound.
        raise ValueError(
            f"Unknown data_type {data_type!r}; expected 'single_solvent' or 'full'"
        )

    df = pd.read_csv(path)
    return df[feature_cols], df[target_cols]

def load_features(feature_type):
    """Load a solvent descriptor lookup table.

    Args:
        feature_type: Name of the descriptor set. Only
            ``'spange_descriptors'`` is supported.

    Returns:
        DataFrame read from ``/home/data/spange_descriptors_lookup.csv`` with
        the first CSV column used as the index.

    Raises:
        ValueError: If ``feature_type`` is not a supported descriptor set.
    """
    if feature_type == 'spange_descriptors':
        return pd.read_csv('/home/data/spange_descriptors_lookup.csv', index_col=0)
    # Previously an unknown name silently returned None and only failed later,
    # at the first `.loc` lookup on the result.
    raise ValueError(
        f"Unknown feature_type {feature_type!r}; expected 'spange_descriptors'"
    )

print('Data functions defined')

Data functions defined


In [3]:
# Official CV split functions (DO NOT MODIFY)
from typing import Any, Generator

def generate_leave_one_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield one train/test split per unique value of ``X["SOLVENT NAME"]``.

    For each solvent, the test part is every row whose ``SOLVENT NAME`` equals
    that solvent and the train part is every other row. Folds are produced in
    the order returned by ``X["SOLVENT NAME"].unique()``.

    Args:
        X: Input frame; must contain a ``"SOLVENT NAME"`` column.
        Y: Target frame. It is indexed with boolean masks built from ``X``, so
            it is assumed to be row-aligned with ``X`` - confirm at the caller.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each held-out solvent.
    """
    for solvent in X["SOLVENT NAME"].unique():
        # Complementary masks: rows of the held-out solvent vs. all other rows.
        train_mask = X["SOLVENT NAME"] != solvent
        test_mask = X["SOLVENT NAME"] == solvent
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

def generate_leave_one_ramp_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield one train/test split per unique (solvent A, solvent B) pair.

    A "ramp" is identified by the string ``"<SOLVENT A NAME>_<SOLVENT B NAME>"``.
    For each distinct ramp, the test part is every row with that key and the
    train part is every other row. Folds follow the order of ``ramps.unique()``.

    Args:
        X: Input frame; must contain ``"SOLVENT A NAME"`` and
            ``"SOLVENT B NAME"`` columns.
        Y: Target frame. It is indexed with boolean masks built from ``X``, so
            it is assumed to be row-aligned with ``X`` - confirm at the caller.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each held-out ramp.
    """
    # Per-row ramp key; the pair is ordered, so "A_B" and "B_A" are distinct keys.
    ramps = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
    for ramp in ramps.unique():
        # Complementary masks: rows of the held-out ramp vs. all other rows.
        train_mask = ramps != ramp
        test_mask = ramps == ramp
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

print('CV split functions defined')

CV split functions defined


In [4]:
# Pseudo-labeling model
class PseudoLabelingModel:
    """CatBoost ensemble that self-trains on its own confident test predictions.

    ``train_model`` fits an ensemble (several seeds, one regressor per target
    column) on the labelled data. ``predict`` then runs up to ``n_iterations``
    rounds of pseudo-labelling on the rows it is asked to predict: rows whose
    ensemble spread is low are appended, with the ensemble mean as their label,
    to the original training set and a fresh ensemble is fit on the result.

    The adapted ensemble lives only for the duration of one ``predict`` call;
    ``self.models`` always holds the ensemble fit by ``train_model``, so
    repeated ``predict`` calls do not depend on each other.
    """

    def __init__(self, data='single', n_iterations=2, confidence_threshold=0.8):
        """Configure the model and load the Spange descriptor table.

        Args:
            data: ``'full'`` selects the two-solvent mixture feature layout;
                any other value selects the single-solvent layout.
            n_iterations: Maximum number of pseudo-labelling rounds in
                ``predict``.
            confidence_threshold: A row is pseudo-labelled when its confidence
                score (see ``predict``) is strictly greater than this value.
        """
        self.data = data
        self.mixed = (data == 'full')
        self.n_iterations = n_iterations
        self.confidence_threshold = confidence_threshold

        # Per-solvent descriptor rows, looked up by solvent name in _get_features.
        self.spange = load_features('spange_descriptors')

        # Fitted in train_model, reused unchanged in predict.
        self.scaler = StandardScaler()

    def _get_features(self, X):
        """Build the float32 feature matrix for the rows of ``X``.

        Single-solvent layout: residence time, temperature, solvent descriptors.
        Mixture layout: residence time, temperature, solvent-B fraction, and the
        descriptors of solvents A and B blended linearly by that fraction.
        """
        res_time = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)

        if self.mixed:
            # NOTE(review): dividing by 100 treats 'SolventB%' as a 0-100
            # percentage - confirm against the data file.
            sb_pct = X['SolventB%'].values.reshape(-1, 1) / 100.0

            feats_a = self.spange.loc[X['SOLVENT A NAME']].values
            feats_b = self.spange.loc[X['SOLVENT B NAME']].values

            # Linear mixing of the two solvents' descriptor vectors.
            solvent_feats = (1 - sb_pct) * feats_a + sb_pct * feats_b

            combined = np.hstack([res_time, temp, sb_pct, solvent_feats])
        else:
            solvent_feats = self.spange.loc[X['SOLVENT NAME']].values
            combined = np.hstack([res_time, temp, solvent_feats])

        return combined.astype(np.float32)

    def _train_ensemble(self, X_scaled, y_np, n_models=5):
        """Fit ``n_models`` independently seeded groups of CatBoost regressors.

        Args:
            X_scaled: 2-D feature matrix.
            y_np: 2-D target matrix; one regressor is fit per column.
            n_models: Number of ensemble members (seeds ``SEED`` .. ``SEED + n_models - 1``).

        Returns:
            List of length ``n_models``; each entry is a list with one fitted
            regressor per target column.
        """
        # Derive the target count from the data instead of hard-coding 3.
        n_targets = y_np.shape[1]
        models = []
        for seed in range(n_models):
            model_list = []
            for t in range(n_targets):
                m = CatBoostRegressor(
                    iterations=300,
                    learning_rate=0.05,
                    depth=6,
                    random_state=SEED + seed,
                    verbose=False
                )
                m.fit(X_scaled, y_np[:, t])
                model_list.append(m)
            models.append(model_list)
        return models

    def _predict_with_uncertainty(self, models, X_scaled):
        """Return the ensemble mean and spread for each row and target.

        Args:
            models: Ensemble as returned by ``_train_ensemble``.
            X_scaled: 2-D feature matrix.

        Returns:
            ``(mean_pred, std_pred)``, each of shape ``(n_samples, n_targets)``;
            ``std_pred`` is the standard deviation across ensemble members.
        """
        # Shape: (n_models, n_samples, n_targets)
        all_preds = np.array([
            np.column_stack([m.predict(X_scaled) for m in model_list])
            for model_list in models
        ])
        return all_preds.mean(axis=0), all_preds.std(axis=0)

    def train_model(self, train_X, train_Y):
        """Fit the scaler and the initial ensemble on the labelled data.

        The scaled features and raw targets are kept on the instance because
        ``predict`` re-uses them as the base of every augmented training set.
        """
        X_np = self._get_features(train_X)
        y_np = train_Y.values

        X_scaled = self.scaler.fit_transform(X_np)

        self.models = self._train_ensemble(X_scaled, y_np)

        self.X_train_scaled = X_scaled
        self.y_train = y_np

    def predict(self, test_X):
        """Predict targets for ``test_X`` after pseudo-labelling on those rows.

        Each round scores every row with ``1 - mean_t(std / (max_std_t + 1e-6))``,
        where ``max_std_t`` is the largest ensemble std for target ``t`` within
        this batch - so confidence is relative to the other rows of the same
        call. Rows scoring above ``confidence_threshold`` are appended to the
        original training data and a new ensemble is fit.

        Args:
            test_X: DataFrame with the same columns used at training time.

        Returns:
            ``torch.Tensor`` of shape ``(n_samples, n_targets)`` with values
            clipped to ``[0, 1]``.
        """
        X_np = self._get_features(test_X)
        X_scaled = self.scaler.transform(X_np)

        # Work on a local ensemble so the fitted state in self.models is not
        # replaced by models trained on this batch's pseudo-labels.
        models = self.models

        for _ in range(self.n_iterations):
            mean_pred, std_pred = self._predict_with_uncertainty(models, X_scaled)

            # Confidence = 1 - std normalised by the per-target batch maximum.
            max_std = std_pred.max(axis=0) + 1e-6
            normalized_std = std_pred / max_std
            confidence = 1 - normalized_std.mean(axis=1)

            confident_mask = confidence > self.confidence_threshold
            if not confident_mask.any():
                # The ensemble is unchanged, so further rounds would only
                # recompute the same predictions.
                break

            # Always augment the ORIGINAL training set; pseudo-labels from
            # earlier rounds are not accumulated.
            X_augmented = np.vstack([self.X_train_scaled, X_scaled[confident_mask]])
            y_augmented = np.vstack([self.y_train, mean_pred[confident_mask]])

            models = self._train_ensemble(X_augmented, y_augmented)

        mean_pred, _ = self._predict_with_uncertainty(models, X_scaled)

        # Clip to [0, 1]
        mean_pred = np.clip(mean_pred, 0, 1)

        return torch.tensor(mean_pred)

print('PseudoLabelingModel defined')

PseudoLabelingModel defined


In [None]:
# Run leave-one-solvent-out CV for the single solvent data.
# A fresh PseudoLabelingModel is built per fold, so no state carries over
# between folds.
import tqdm

X, Y = load_data("single_solvent")
# generate_leave_one_out_splits yields one fold per unique solvent, so this
# count is also the number of folds.
n_solvents = len(X['SOLVENT NAME'].unique())
print(f"Single solvent data: {len(X)} samples, {n_solvents} solvents")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
fold_mses = []

# Progress-bar length comes from the data rather than a hard-coded fold count.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_solvents):
    (train_X, train_Y), (test_X, test_Y) = split
    
    model = PseudoLabelingModel(data='single', n_iterations=2, confidence_threshold=0.7)
    model.train_model(train_X, train_Y)
    
    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()
    
    # Fold MSE: mean over every row and every target of the held-out fold
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)
    
    # One submission record per held-out row; "row" is the position within the fold
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions)
# Reported score is the unweighted mean of the per-fold MSEs
print(f"\nSingle solvent CV MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

In [None]:
# Run leave-one-ramp-out CV for the full (mixture) data.
# Relies on `tqdm` having been imported by the single-solvent CV cell.
X, Y = load_data("full")
print(f"Full data: {len(X)} samples")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []
fold_mses = []

# Same ramp key the split generator builds, so this equals the number of folds;
# it replaces a hard-coded progress-bar length.
n_ramps = len((X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)).unique())

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_ramps):
    (train_X, train_Y), (test_X, test_Y) = split
    
    model = PseudoLabelingModel(data='full', n_iterations=2, confidence_threshold=0.7)
    model.train_model(train_X, train_Y)
    
    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()
    
    # Fold MSE: mean over every row and every target of the held-out fold
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)
    
    # One submission record per held-out row; "row" is the position within the fold
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions)
# Reported score is the unweighted mean of the per-fold MSEs
print(f"\nFull data CV MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

In [None]:
# Stack the two tasks' predictions and write them out as the submission file.
# reset_index() is called without drop, so each task's original row labels
# survive as an "index" column next to the new running index named "id".
submission = pd.concat([submission_single_solvent, submission_full_data]).reset_index()
submission = submission.rename_axis("id")

print(f"Submission shape: {submission.shape}")

# Write with the "id" index as the first CSV column
submission.to_csv("/home/submission/submission.csv", index=True)
print(f"\nSubmission saved to /home/submission/submission.csv")

# Read the file back to confirm what actually landed on disk
submission_check = pd.read_csv("/home/submission/submission.csv")
print(f"\nSubmission rows: {len(submission_check)}")

# Report the min/max of each target column in the saved file
target_cols = ['target_1', 'target_2', 'target_3']
for col in target_cols:
    print(f"{col}: min={submission_check[col].min():.4f}, max={submission_check[col].max():.4f}")

In [None]:
# Print the end-of-run banner (this cell only prints; no score is computed here)
print("=" * 50, "EXPERIMENT 083 COMPLETE", "=" * 50, sep="\n")