# Experiment 121: Per-Solvent-Class Models

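**Goal**: Train SEPARATE models for different solvent classes to potentially break away from the existing linear relationship between cross-validation (CV) score and leaderboard (LB) score (the "CV-LB line").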

**Key Insight**: Different solvent classes (alcohols, esters, ethers, etc.) may have different CV-LB relationships. By training class-specific models, we might reduce the intercept.

**Approach**:
1. Classify solvents into chemical classes
2. Train separate CatBoost+XGBoost models for each class
3. Use class-specific hyperparameters (allowed per competition rules) — note: the implementation below currently uses the same hyperparameters for every class
4. Check if this achieves a DIFFERENT CV-LB relationship

**CRITICAL**: The model class `PerClassModel` will be used in BOTH CV computation AND submission cells.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import catboost as cb
import tqdm
import warnings
# Silence every Python warning for the rest of the session (library deprecation
# noise included). NOTE(review): this also hides warnings that may matter.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG and torch's RNG. The CatBoost / XGBoost models
# defined later receive their own explicit seeds and do not depend on these.
np.random.seed(42)
torch.manual_seed(42)
# New floating-point torch tensors default to float64 from here on.
torch.set_default_dtype(torch.double)

# Pick the GPU when one is visible to torch, otherwise the CPU. In the cells
# visible in this notebook `device` is only printed, never passed to a model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory containing the yield CSVs and the descriptor lookup tables.
# NOTE(review): absolute path; adjust when running outside this environment.
DATA_PATH = '/home/data'

# Numeric process-condition columns shared by both datasets.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns of the single-solvent dataset.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns of the full (two-solvent mixture) dataset.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the yield tables and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the mixture dataset; any other value selects
            the single-solvent dataset.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: ``X`` holds the input columns for the
        chosen dataset, ``Y`` holds the columns "Product 2", "Product 3", "SM".
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    table = pd.read_csv(csv_path)
    inputs = table[input_columns]
    targets = table[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds.

    Solvents are visited in sorted order of the "SOLVENT NAME" column. Each
    fold is ``((X_train, Y_train), (X_test, Y_test))`` where the test part
    holds every row of the held-out solvent and the train part all other rows.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        keep_for_training = X["SOLVENT NAME"] != held_out
        training_part = (X[keep_for_training], Y[keep_for_training])
        held_out_part = (X[~keep_for_training], Y[~keep_for_training])
        yield training_part, held_out_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out folds for the mixture dataset.

    A "ramp" is one distinct ordered pair ("SOLVENT A NAME", "SOLVENT B NAME");
    pairs are visited in order of first appearance. Each fold is
    ``((X_train, Y_train), (X_test, Y_test))`` with the held-out ramp's rows
    as the test part. (A, B) and (B, A) count as different ramps.
    """
    solvent_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for solvent_a, solvent_b in solvent_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load the per-solvent descriptor lookup tables; the first CSV column becomes
# the index and is later used with `.loc[<solvent name>]`.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Drop constant DRFP columns: only columns with variance > 0 are kept. This is
# a non-zero-variance filter; no ranking by variance or top-k selection is done.
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

# NOTE(review): the tables need not list the same solvents (the recorded output
# shows 26 Spange rows vs 24 DRFP / ACS PCA rows); a solvent missing from any
# table will raise a KeyError on lookup.
print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Hand-curated grouping of solvents by dominant functional group.
# Keys are class labels; values list solvent names as they are spelled in
# SOLVENT_TO_CLASS lookups. Insertion order is the order used when printing.
SOLVENT_CLASSES = {
    # Alcohols: carry a hydroxyl (-OH) group
    'alcohol': [
        'IPA [Propan-2-ol]',
        'Ethanol',
        'Methanol',
        'Ethylene Glycol [1,2-Ethanediol]',
        'tert-Butanol [2-Methylpropan-2-ol]',
        '2,2,2-Trifluoroethanol',
        'Decanol',
        '1,1,1,3,3,3-Hexafluoropropan-2-ol',
    ],
    # Esters (incl. carbonate esters): carry a -C(=O)O- linkage
    'ester': [
        'Ethyl Acetate',
        'Dimethyl Carbonate',
        'Ethyl Lactate',
        'Methyl Propionate',
    ],
    # Ethers: carry a C-O-C linkage
    'ether': [
        '2-Methyltetrahydrofuran [2-MeTHF]',
        'Diethyl Ether [Ether]',
        'MTBE [tert-Butylmethylether]',
        'THF [Tetrahydrofuran]',
    ],
    # Ketones: carry a C=O group flanked by carbons
    'ketone': [
        'Butanone [MEK]',
        'Dihydrolevoglucosenone (Cyrene)',
    ],
    # Carboxylic acids: carry a -COOH group
    'acid': ['Acetic Acid'],
    # Amides: carry a -C(=O)N< group (DMA is a tertiary amide, no N-H)
    'amide': ['DMA [N,N-Dimethylacetamide]'],
    # Nitriles: carry a -C≡N group
    'nitrile': ['Acetonitrile'],
    # Hydrocarbons: no heteroatom functional group
    'hydrocarbon': ['Cyclohexane'],
    # Water
    'water': ['Water'],
    # Entries whose name joins two solvents with '.' are grouped together
    'mixture': [
        'Water.Acetonitrile',
        'Acetonitrile.Acetic Acid',
        'Water.2,2,2-Trifluoroethanol',
    ],
}

# Inverted lookup: solvent name -> class label. If a solvent were ever listed
# under two classes, the class appearing later in SOLVENT_CLASSES would win.
SOLVENT_TO_CLASS = {
    solvent_name: class_label
    for class_label, members in SOLVENT_CLASSES.items()
    for solvent_name in members
}

print(f"Defined {len(SOLVENT_CLASSES)} solvent classes:")
for cls, solvents in SOLVENT_CLASSES.items():
    print(f"  {cls}: {len(solvents)} solvents")

Defined 10 solvent classes:
  alcohol: 8 solvents
  ester: 4 solvents
  ether: 4 solvents
  ketone: 2 solvents
  acid: 1 solvents
  amide: 1 solvents
  nitrile: 1 solvents
  hydrocarbon: 1 solvents
  water: 1 solvents
  mixture: 3 solvents


In [5]:
# Featurizer
class Featurizer:
    """Turn raw experiment rows into a dense numeric feature matrix.

    Each row becomes: 2 raw numeric inputs, 3 derived kinetic terms, then the
    Spange, filtered-DRFP and ACS-PCA descriptor vectors of its solvent. For
    mixtures the descriptors of solvents A and B are blended linearly.
    """

    def __init__(self, mixed=False):
        """
        Args:
            mixed: True for the two-solvent dataset ("SOLVENT A NAME",
                "SOLVENT B NAME", "SolventB%"), False for "SOLVENT NAME".
        """
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        descriptor_width = sum(
            table.shape[1]
            for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )
        # 2 raw numeric columns + 3 derived kinetic columns + descriptors
        self.feats_dim = 2 + 3 + descriptor_width

    def featurize(self, X, flip=False):
        """Build the feature matrix for the rows of ``X``.

        Args:
            X: DataFrame with the numeric input columns plus the solvent
                column(s) matching ``self.mixed``.
            flip: Only consulted for mixtures. The flipped weighting is
                algebraically the same blend as the unflipped one (it may
                differ by floating-point rounding only).

        Returns:
            2-D NumPy array with one row per row of ``X``.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature_c = numeric[:, 1:2]

        # Derived terms: 1000/T[K], log(time) and their product
        inverse_temperature = 1000.0 / (temperature_c + 273.15)
        log_residence_time = np.log(residence_time + 1e-6)
        kinetic_block = np.hstack([
            numeric,
            inverse_temperature,
            log_residence_time,
            inverse_temperature * log_residence_time,
        ])

        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if not self.mixed:
            solvent_names = X["SOLVENT NAME"]
            descriptor_blocks = [table.loc[solvent_names].values for table in tables]
        else:
            names_a = X["SOLVENT A NAME"]
            names_b = X["SOLVENT B NAME"]
            endpoint_pairs = [
                (table.loc[names_a].values, table.loc[names_b].values)
                for table in tables
            ]
            # assumes "SolventB%" is a fraction in [0, 1] — TODO confirm
            pct = X["SolventB%"].values.reshape(-1, 1)
            weight_a = 1 - pct
            # The flipped form keeps the original's exact arithmetic.
            weight_b = (1 - (1 - pct)) if flip else pct
            descriptor_blocks = [
                desc_a * weight_a + desc_b * weight_b
                for desc_a, desc_b in endpoint_pairs
            ]

        return np.hstack([kinetic_block, *descriptor_blocks])

print(f'Feature dimension: {Featurizer().feats_dim}')

Feature dimension: 145


In [6]:
# Per-Class Model - trains separate models for different solvent classes
class PerClassModel:
    """Per-solvent-class ensemble with a global fallback.

    For every solvent class that has enough training rows, one
    (CatBoost, XGBoost) pair is fitted per target column. A second set of
    pairs, fitted on all training rows, serves rows whose class has no
    dedicated model (unseen class, or too few training rows). A prediction is
    the mean of the pair's two outputs, clipped to [0, 1].

    Key insight being tested: different solvent classes may have different
    CV-LB relationships, so class-specific models might reduce the intercept.

    Note: every class currently uses the same hyperparameters; only the
    training subset differs between classes.

    This is the SAME class used in both CV and submission cells.
    """

    def __init__(self, data='single', min_class_samples=5):
        """
        Args:
            data: ``'full'`` for the two-solvent mixture dataset (rows are
                classified by "SOLVENT A NAME"); anything else for the
                single-solvent dataset (classified by "SOLVENT NAME").
            min_class_samples: Minimum number of training rows a class needs
                before a dedicated model is fitted for it.
        """
        self.data_type = data
        self.min_class_samples = min_class_samples
        self.featurizer = Featurizer(mixed=(data == 'full'))
        self.scaler = StandardScaler()
        self.class_models = {}  # class -> list of (cb, xgb) pairs, one per target
        self.default_models = []  # fallback (cb, xgb) pairs, one per target
        self.train_mean = None  # per-target mean of the last training set

    def _get_solvent_class(self, solvent_name):
        """Get the chemical class of a solvent ('unknown' if unmapped)."""
        return SOLVENT_TO_CLASS.get(solvent_name, 'unknown')

    def _get_sample_class(self, X_row):
        """Get the class for a single sample (single solvent or mixture)."""
        if self.data_type == 'full':
            # For mixtures, use the class of solvent A (primary solvent)
            return self._get_solvent_class(X_row["SOLVENT A NAME"])
        return self._get_solvent_class(X_row["SOLVENT NAME"])

    def _get_classes(self, X):
        """Return an object array holding the class label of every row of X."""
        column = "SOLVENT A NAME" if self.data_type == 'full' else "SOLVENT NAME"
        return np.array(
            [self._get_solvent_class(name) for name in X[column]],
            dtype=object,
        )

    def _fit_target_pair(self, X, y):
        """Fit one CatBoost and one XGBoost regressor on a single target."""
        cb_model = cb.CatBoostRegressor(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            l2_leaf_reg=3,
            random_seed=42,
            verbose=False
        )
        cb_model.fit(X, y)

        xgb_model = xgb.XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            reg_lambda=1,
            random_state=42,
            verbosity=0
        )
        xgb_model.fit(X, y)

        return cb_model, xgb_model

    def _fit_all_targets(self, X, y):
        """Fit one (CatBoost, XGBoost) pair per column of ``y``."""
        return [self._fit_target_pair(X, y[:, i]) for i in range(y.shape[1])]

    def train_model(self, X_train, y_train):
        """Fit the scaler, the per-class models and the fallback models.

        Any models left over from an earlier call are discarded first, so
        calling this method repeatedly always leaves exactly one set of models.

        Args:
            X_train: DataFrame of raw inputs for the configured dataset type.
            y_train: DataFrame of targets, one column per target.
        """
        X_feat = self.featurizer.featurize(X_train)
        X_scaled = self.scaler.fit_transform(X_feat)
        y_vals = y_train.values

        # Drop state from any previous fit: the original appended to
        # default_models and kept stale class models across calls.
        self.class_models = {}
        self.default_models = []

        # Stored for reference; not used by predict().
        self.train_mean = y_vals.mean(axis=0)

        classes = self._get_classes(X_train)

        # Class-specific models, only for classes with enough rows
        for cls in sorted(set(classes.tolist())):
            mask = classes == cls
            if mask.sum() < self.min_class_samples:
                continue
            self.class_models[cls] = self._fit_all_targets(X_scaled[mask], y_vals[mask])

        # Fallback models trained on all rows
        self.default_models = self._fit_all_targets(X_scaled, y_vals)

    def predict(self, X):
        """Predict all targets for the rows of ``X``.

        Rows are grouped by class and each group is predicted in one batch
        with its class models, or with the fallback models when the class has
        none.

        Args:
            X: DataFrame of raw inputs for the configured dataset type.

        Returns:
            ``torch`` float64 tensor of shape (len(X), n_targets), values
            clipped to [0, 1].

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if not self.default_models:
            raise RuntimeError("train_model() must be called before predict()")

        X_feat = self.featurizer.featurize(X)
        X_scaled = self.scaler.transform(X_feat)
        classes = self._get_classes(X)

        pred = np.zeros((len(classes), len(self.default_models)), dtype=np.float64)
        for cls in sorted(set(classes.tolist())):
            mask = classes == cls
            x_cls = X_scaled[mask]
            # Use class-specific models if available, otherwise the fallback
            models = self.class_models.get(cls, self.default_models)
            for i, (cb_model, xgb_model) in enumerate(models):
                cb_pred = np.asarray(cb_model.predict(x_cls), dtype=np.float64)
                xgb_pred = np.asarray(xgb_model.predict(x_cls), dtype=np.float64)
                pred[mask, i] = (cb_pred + xgb_pred) / 2

        pred = np.clip(pred, 0, 1)

        return torch.tensor(pred)

print('PerClassModel defined - will be used in both CV and submission cells')

PerClassModel defined - will be used in both CV and submission cells


In [7]:
# Load both datasets and count how many rows fall into each solvent class.
# Solvents absent from SOLVENT_TO_CLASS are counted under 'unknown'.
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")

print("Single solvent class distribution:")
single_classes = X_single["SOLVENT NAME"].apply(
    lambda solvent_name: SOLVENT_TO_CLASS.get(solvent_name, 'unknown')
)
class_counts = single_classes.value_counts()
print(class_counts)

# For mixtures only solvent A determines the class.
print("\nFull data class distribution (by SOLVENT A):")
full_classes = X_full["SOLVENT A NAME"].apply(
    lambda solvent_name: SOLVENT_TO_CLASS.get(solvent_name, 'unknown')
)
class_counts_full = full_classes.value_counts()
print(class_counts_full)

Single solvent class distribution:
SOLVENT NAME
alcohol        217
ether          117
mixture         81
ester           71
nitrile         59
amide           41
ketone          36
hydrocarbon     34
Name: count, dtype: int64

Full data class distribution (by SOLVENT A):
SOLVENT A NAME
alcohol        534
ether          158
nitrile        125
mixture        125
amide          110
hydrocarbon    104
ketone          36
ester           35
Name: count, dtype: int64


In [None]:
# Leave-one-solvent-out cross-validation on the single-solvent dataset.
# A fresh PerClassModel is trained per fold; the fold score is the mean squared
# error over all rows and all targets, and folds are averaged unweighted.
print("Computing CV score...")

single_mses = []

single_folds = generate_leave_one_out_splits(X_single, Y_single)
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(single_folds):
    model = PerClassModel(data='single')  # SAME CLASS AS SUBMISSION
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()
    targets = test_Y.values
    squared_errors = (predictions - targets) ** 2

    mse = np.mean(squared_errors)
    single_mses.append(mse)

    # Report every 6th fold only, to keep the log short
    if fold_idx % 6 == 0:
        print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

single_mse = np.mean(single_mses)
print(f"\nSingle solvent MSE: {single_mse:.6f}")

In [None]:
# Leave-one-ramp-out cross-validation on the mixture dataset.
# A fresh PerClassModel is trained per fold; the fold score is the mean squared
# error over all rows and all targets, and folds are averaged unweighted.
full_mses = []

full_folds = generate_leave_one_ramp_out_splits(X_full, Y_full)
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(full_folds):
    model = PerClassModel(data='full')  # SAME CLASS AS SUBMISSION
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()
    targets = test_Y.values
    squared_errors = (predictions - targets) ** 2

    mse = np.mean(squared_errors)
    full_mses.append(mse)

    # Report every 3rd fold only, to keep the log short
    if fold_idx % 3 == 0:
        print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

full_mse = np.mean(full_mses)
print(f"\nFull data MSE: {full_mse:.6f}")

In [None]:
# Combined CV score: unweighted mean of the single-solvent and full-data MSEs.
import json
import os

# Best CV score obtained so far; this run is compared against it below.
BEST_CV_SCORE = 0.0081
METRICS_PATH = '/home/code/experiments/121_per_class_models/metrics.json'

cv_score = (single_mse + full_mse) / 2
print(f"\n=== CV Results ===")
print(f"Single solvent MSE: {single_mse:.6f}")
print(f"Full data MSE: {full_mse:.6f}")
print(f"Combined CV score: {cv_score:.6f}")

# Save metrics. Values are cast to plain floats so JSON serialization does not
# depend on the NumPy scalar type produced by np.mean.
metrics = {
    'cv_score': float(cv_score),
    'single_mse': float(single_mse),
    'full_mse': float(full_mse)
}
# Create the experiment directory if needed so a fresh run does not fail here.
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
with open(METRICS_PATH, 'w') as f:
    json.dump(metrics, f)

print(f"\nComparison with best CV: {BEST_CV_SCORE}")
print(f"This experiment: {cv_score:.6f}")
if cv_score < BEST_CV_SCORE:
    print("IMPROVEMENT! This is better than best CV.")
else:
    print(f"No improvement. Difference: {cv_score - BEST_CV_SCORE:.6f}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PerClassModel(data='single')  # SAME CLASS AS CV
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PerClassModel(data='full')  # SAME CLASS AS CV
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

print(f"Submission saved with {len(submission)} rows")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################