# Experiment 119: Yield Ratio Prediction

**Goal**: Predict yield RATIOS instead of absolute yields, then multiply by predicted total.

**Key Insight**: The distribution shift might be in the absolute scale, not the relative proportions. By predicting ratios and total separately, we may achieve better generalization.

**Approach**:
1. Compute ratios: P2_ratio = P2/total, P3_ratio = P3/total, SM_ratio = SM/total
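2. Train separate models for P2_ratio, P3_ratio, and total; SM_ratio is not modelled directly but derived as 1 - P2_ratio - P3_ratio
3. At prediction: clip the ratios to [0, 1], renormalize them to sum to 1, then pred = ratio * total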

**Hypothesis**: Ratios might be more stable across solvents than absolute yields, potentially changing the CV-LB relationship.

**CRITICAL**: The model class `YieldRatioModel` will be used in BOTH CV computation AND submission cells.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import catboost as cb
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with the same value so repeated runs start from the same RNG state
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Newly created floating-point torch tensors default to float64
torch.set_default_dtype(torch.double)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
DATA_PATH = '/home/data'

INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Load one of the two catechol yield datasets from DATA_PATH.

    Args:
        name: "full" for the solvent-mixture dataset, or "single_solvent"
            for the single-solvent dataset.

    Returns:
        (X, Y): X holds the input columns for the chosen dataset
        (INPUT_LABELS_FULL_SOLVENT or INPUT_LABELS_SINGLE_SOLVENT) and Y holds
        the three target columns "Product 2", "Product 3" and "SM".

    Raises:
        ValueError: if `name` is not one of the two supported dataset names.
            Previously any unknown name (including typos such as "Full")
            silently loaded the single-solvent data.
    """
    # Validate before touching the filesystem so a typo fails fast and loudly.
    if name not in ("full", "single_solvent"):
        raise ValueError(
            f"Unknown dataset name {name!r}; expected 'full' or 'single_solvent'"
        )

    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[["Product 2", "Product 3", "SM"]]
    return X, Y

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out splits, one per distinct "SOLVENT NAME".

    Solvents are visited in sorted order. Each item is
    ((X_train, Y_train), (X_test, Y_test)) where the test part contains the
    rows of the held-out solvent and the train part contains all other rows.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        is_train = X["SOLVENT NAME"] != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out splits over distinct (solvent A, solvent B) pairs.

    Pairs are visited in order of first appearance in X. Each item is
    ((X_train, Y_train), (X_test, Y_test)) where the test part contains the
    rows whose A/B solvent names both match the held-out pair.
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for _, pair in unique_pairs.iterrows():
        same_a = X["SOLVENT A NAME"] == pair["SOLVENT A NAME"]
        same_b = X["SOLVENT B NAME"] == pair["SOLVENT B NAME"]
        in_ramp = same_a & same_b
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Read the three solvent descriptor tables; the first CSV column becomes the row index
SPANGE_DF, DRFP_DF, ACS_PCA_DF = (
    pd.read_csv(f'{DATA_PATH}/{lookup_file}', index_col=0)
    for lookup_file in (
        'spange_descriptors_lookup.csv',
        'drfps_catechol_lookup.csv',
        'acs_pca_descriptors_lookup.csv',
    )
)

# Keep only the DRFP columns whose variance is strictly positive
# (i.e. drop columns that are constant across all rows)
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance.index[drfp_variance > 0].tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Featurizer
class Featurizer:
    """Build the model input matrix from process conditions and solvent descriptors.

    Each row is the horizontal concatenation of five kinetic features and the
    Spange, filtered-DRFP and ACS-PCA descriptor rows of the solvent. With
    ``mixed=True`` the descriptors of solvent A and solvent B are blended
    linearly using the "SolventB%" column as the weight of solvent B.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        # 2 raw numeric inputs + 3 derived kinetic terms, then one column per descriptor
        n_kinetic = 2 + 3
        n_descriptors = sum(
            table.shape[1]
            for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )
        self.feats_dim = n_kinetic + n_descriptors

    def featurize(self, X, flip=False):
        """Return a 2-D float array with one feature row per row of X.

        Args:
            X: frame containing the INPUT_LABELS_NUMERIC columns plus
                "SOLVENT NAME" (single mode) or "SOLVENT A NAME",
                "SOLVENT B NAME" and "SolventB%" (mixed mode).
            flip: only read in mixed mode. The flipped expression weights B by
                1 - (1 - pct) and A by 1 - pct, which is the same blend as the
                non-flipped branch up to floating-point rounding.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        # Column 0 is "Residence Time", column 1 is "Temperature"
        time_m = numeric[:, 0:1]
        temp_c = numeric[:, 1:2]
        # presumably temperature is in degrees Celsius (273.15 offset) — verify against the data
        inv_temp = 1000.0 / (temp_c + 273.15)
        # Small offset keeps the log finite when the residence time is zero
        log_time = np.log(time_m + 1e-6)
        kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if not self.mixed:
            descriptor_blocks = [
                table.loc[X["SOLVENT NAME"]].values for table in tables
            ]
        else:
            # Look up both solvents in every table first, then read the mixing weight
            looked_up = [
                (
                    table.loc[X["SOLVENT A NAME"]].values,
                    table.loc[X["SOLVENT B NAME"]].values,
                )
                for table in tables
            ]
            pct = X["SolventB%"].values.reshape(-1, 1)
            descriptor_blocks = []
            for desc_a, desc_b in looked_up:
                if flip:
                    blended = desc_b * (1 - (1-pct)) + desc_a * (1-pct)
                else:
                    blended = desc_a * (1 - pct) + desc_b * pct
                descriptor_blocks.append(blended)

        return np.hstack([kinetic, *descriptor_blocks])

print(f'Feature dimension: {Featurizer().feats_dim}')

Feature dimension: 145


In [5]:
# Yield Ratio Model - predicts ratios and total separately
class YieldRatioModel:
    """Predict yield ratios instead of absolute yields.

    Key insight: The distribution shift might be in the absolute scale,
    not the relative proportions. By predicting ratios and total separately,
    we may achieve better generalization.

    Two regressors predict P2/total and P3/total, a third predicts
    total = P2 + P3 + SM. The SM ratio is derived as 1 - P2_ratio - P3_ratio.

    This is the SAME class used in both CV and submission cells.
    """

    # Upper bound applied to the predicted total before recombining with the ratios.
    TOTAL_CLIP_MAX = 1.5

    def __init__(self, data='single'):
        """Create an untrained model.

        Args:
            data: 'full' selects the mixed-solvent featurizer; any other value
                selects the single-solvent featurizer.
        """
        self.data_type = data
        self.featurizer = Featurizer(mixed=(data=='full'))
        self.scaler = StandardScaler()
        self.ratio_models = []  # Predict P2/total, P3/total
        self.total_model = None  # Predict total = P2 + P3 + SM

    def _make_regressor(self):
        """Build one CatBoost regressor with the configuration shared by all three heads."""
        return cb.CatBoostRegressor(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            l2_leaf_reg=3,
            random_seed=42,
            verbose=False
        )

    def train_model(self, X_train, y_train):
        """Fit the scaler, the two ratio regressors and the total regressor.

        Args:
            X_train: input frame accepted by the featurizer.
            y_train: frame whose `.values` has the targets in columns ordered
                (P2, P3, SM), as produced by `load_data`.

        Calling this method again replaces the previously trained models
        instead of appending to them.
        """
        # Featurize
        X_feat = self.featurizer.featurize(X_train)
        X_scaled = self.scaler.fit_transform(X_feat)

        y_vals = y_train.values
        total = y_vals.sum(axis=1)

        # Compute ratios (handle division by zero)
        ratios = y_vals / np.maximum(total.reshape(-1, 1), 1e-6)

        # Train ratio models for P2 and P3 (SM ratio = 1 - P2_ratio - P3_ratio).
        # Build into a fresh list so a repeated call cannot leave stale models
        # at indices 0/1, which predict() would otherwise keep using.
        ratio_models = []
        for i in range(2):  # Only P2 and P3 ratios
            model = self._make_regressor()
            model.fit(X_scaled, ratios[:, i])
            ratio_models.append(model)

        # Train total model
        total_model = self._make_regressor()
        total_model.fit(X_scaled, total)

        # Publish the new models only after all three fits succeeded.
        self.ratio_models = ratio_models
        self.total_model = total_model

    def predict(self, X):
        """Return predicted (P2, P3, SM) yields as a torch tensor of shape [N, 3].

        The ratios are clipped to [0, 1] and renormalized to sum to 1, the
        predicted total is clipped to [0, TOTAL_CLIP_MAX], and each ratio is
        multiplied by the total.
        """
        X_feat = self.featurizer.featurize(X)
        X_scaled = self.scaler.transform(X_feat)

        # Predict ratios
        p2_ratio = self.ratio_models[0].predict(X_scaled)
        p3_ratio = self.ratio_models[1].predict(X_scaled)
        # SM ratio is derived from the raw (unclipped) P2/P3 predictions
        sm_ratio = 1 - p2_ratio - p3_ratio

        # Clip ratios to [0, 1]
        p2_ratio = np.clip(p2_ratio, 0, 1)
        p3_ratio = np.clip(p3_ratio, 0, 1)
        sm_ratio = np.clip(sm_ratio, 0, 1)

        # Renormalize ratios to sum to 1
        total_ratio = p2_ratio + p3_ratio + sm_ratio
        p2_ratio = p2_ratio / np.maximum(total_ratio, 1e-6)
        p3_ratio = p3_ratio / np.maximum(total_ratio, 1e-6)
        sm_ratio = sm_ratio / np.maximum(total_ratio, 1e-6)

        # Predict total
        total = self.total_model.predict(X_scaled)
        total = np.clip(total, 0, self.TOTAL_CLIP_MAX)  # Reasonable range

        # Compute final predictions
        pred = np.column_stack([
            p2_ratio * total,
            p3_ratio * total,
            sm_ratio * total
        ])

        return torch.tensor(pred)

print('YieldRatioModel defined - will be used in both CV and submission cells')

YieldRatioModel defined - will be used in both CV and submission cells


In [6]:
# Inspect how the yield ratios and the total yield are distributed in both datasets
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")


def _total_and_ratios(y_vals):
    """Return (row totals, per-row ratios); the divisor is floored at 1e-6."""
    row_total = y_vals.sum(axis=1)
    return row_total, y_vals / np.maximum(row_total.reshape(-1, 1), 1e-6)


# Compute ratios
y_single = Y_single.values
total_single, ratios_single = _total_and_ratios(y_single)

y_full = Y_full.values
total_full, ratios_full = _total_and_ratios(y_full)

# Per-target mean / std of the ratios (columns ordered P2, P3, SM)
print("Ratio analysis (P2, P3, SM):")
print(f"Single solvent ratios - mean: {ratios_single.mean(axis=0)}")
print(f"Single solvent ratios - std: {ratios_single.std(axis=0)}")
print(f"\nFull data ratios - mean: {ratios_full.mean(axis=0)}")
print(f"Full data ratios - std: {ratios_full.std(axis=0)}")

# Mean / std of the summed yield per row
print(f"\nTotal yield analysis:")
print(f"Single solvent total - mean: {total_single.mean():.4f}, std: {total_single.std():.4f}")
print(f"Full data total - mean: {total_full.mean():.4f}, std: {total_full.std():.4f}")

Ratio analysis (P2, P3, SM):
Single solvent ratios - mean: [0.22965712 0.17191358 0.5984293 ]
Single solvent ratios - std: [0.24002157 0.17880151 0.38454835]

Full data ratios - mean: [0.24816379 0.19796903 0.55386718]
Full data ratios - std: [0.24257406 0.19178661 0.40216966]

Total yield analysis:
Single solvent total - mean: 0.7955, std: 0.1942
Full data total - mean: 0.8035, std: 0.2091


In [7]:
# Leave-one-solvent-out cross-validation on the single-solvent data
print("Computing CV score...")

# One MSE per held-out solvent
single_mses = []

single_splits = generate_leave_one_out_splits(X_single, Y_single)
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(single_splits):
    # Fresh model per fold — SAME CLASS AS SUBMISSION
    model = YieldRatioModel(data='single')
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()
    targets = test_Y.values

    # MSE over every row and all three targets of the held-out fold
    mse = np.mean(np.square(predictions - targets))
    single_mses.append(mse)

    # Only report every sixth fold to keep the output short
    if fold_idx % 6 == 0:
        print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

# Unweighted mean over folds
single_mse = np.mean(single_mses)
print(f"\nSingle solvent MSE: {single_mse:.6f}")

Computing CV score...


  Fold 0: MSE = 0.034207


  Fold 6: MSE = 0.004987


  Fold 12: MSE = 0.001943


  Fold 18: MSE = 0.009330



Single solvent MSE: 0.009321


In [8]:
# Leave-one-ramp-out cross-validation on the full (mixed-solvent) data
# One MSE per held-out (solvent A, solvent B) pair
full_mses = []

full_splits = generate_leave_one_ramp_out_splits(X_full, Y_full)
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(full_splits):
    # Fresh model per fold — SAME CLASS AS SUBMISSION
    model = YieldRatioModel(data='full')
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()
    targets = test_Y.values

    # MSE over every row and all three targets of the held-out fold
    mse = np.mean(np.square(predictions - targets))
    full_mses.append(mse)

    # Only report every third fold to keep the output short
    if fold_idx % 3 == 0:
        print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

# Unweighted mean over folds
full_mse = np.mean(full_mses)
print(f"\nFull data MSE: {full_mse:.6f}")

  Fold 0: MSE = 0.005945


  Fold 3: MSE = 0.007726


  Fold 6: MSE = 0.011801


  Fold 9: MSE = 0.003466


  Fold 12: MSE = 0.001913

Full data MSE: 0.007335


In [9]:
# Combined CV score: unweighted mean of the single-solvent and full-data MSEs
import json
import os

# Reference score this experiment is compared against (previously repeated inline)
BEST_CV = 0.0081
METRICS_PATH = '/home/code/experiments/119_yield_ratio/metrics.json'

cv_score = (single_mse + full_mse) / 2
print(f"\n=== CV Results ===")
print(f"Single solvent MSE: {single_mse:.6f}")
print(f"Full data MSE: {full_mse:.6f}")
print(f"Combined CV score: {cv_score:.6f}")

# Save metrics as plain Python floats so the JSON does not depend on NumPy scalar types
metrics = {
    'cv_score': float(cv_score),
    'single_mse': float(single_mse),
    'full_mse': float(full_mse)
}
# Create the experiment directory if needed so the write cannot fail on a fresh checkout
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
with open(METRICS_PATH, 'w') as f:
    json.dump(metrics, f)

print(f"\nComparison with best CV: {BEST_CV}")
print(f"This experiment: {cv_score:.6f}")
if cv_score < BEST_CV:
    print("IMPROVEMENT! This is better than best CV.")
else:
    print(f"No improvement. Difference: {cv_score - BEST_CV:.6f}")


=== CV Results ===
Single solvent MSE: 0.009321
Full data MSE: 0.007335
Combined CV score: 0.008328

Comparison with best CV: 0.0081
This experiment: 0.008328
No improvement. Difference: 0.000228


In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = YieldRatioModel(data='single')  # SAME CLASS AS CV: identical to the model built in the single-solvent CV loop above

    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = YieldRatioModel(data='full')  # SAME CLASS AS CV: identical to the model built in the full-data CV loop above

    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

print(f"Submission saved with {len(submission)} rows")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################