# Experiment 081: Solvent Clustering with Class-Specific Models

**Rationale**: Group solvents by chemical class using Spange descriptors, then train class-specific models that generalize within chemical families. Better generalization within clusters might lower the intercept of the CV-LB relationship described below.

**Key Insight**: The CV-LB relationship has an intercept of 0.052, which is above the target of 0.0347, so lowering the CV score alone cannot reach the target. We need approaches that CHANGE the relationship itself, not just improve CV.

In [None]:
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

# One seed for every source of randomness in this notebook: the NumPy and
# PyTorch global generators here, and the estimators that take
# random_state=SEED further down.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

print('Imports done')

In [None]:
# Local data loading functions
def load_data(data_type):
    """Load the input columns and target columns of one dataset.

    Args:
        data_type: ``"single_solvent"`` for the single-solvent yields file or
            ``"full"`` for the two-solvent mixture file.

    Returns:
        A pair ``(X, Y)`` of DataFrames: ``X`` holds the process conditions
        and solvent name column(s), ``Y`` holds the ``SM``, ``Product 2`` and
        ``Product 3`` columns.

    Raises:
        ValueError: if ``data_type`` is not one of the two supported names.
    """
    if data_type == "single_solvent":
        path = '/home/data/catechol_single_solvent_yields.csv'
        input_cols = ['Residence Time', 'Temperature', 'SOLVENT NAME']
    elif data_type == "full":
        path = '/home/data/catechol_full_data_yields.csv'
        input_cols = ['Residence Time', 'Temperature', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%']
    else:
        # Fail before touching the filesystem; previously this fell through to
        # the return statement and raised UnboundLocalError.
        raise ValueError(
            f"Unknown data_type {data_type!r}; expected 'single_solvent' or 'full'"
        )

    df = pd.read_csv(path)
    # Both files share the same three target columns.
    return df[input_cols], df[['SM', 'Product 2', 'Product 3']]

def load_features(feature_type):
    """Load a solvent descriptor lookup table.

    Args:
        feature_type: name of the descriptor set; only
            ``'spange_descriptors'`` is supported.

    Returns:
        DataFrame read from the lookup CSV, indexed by its first column.

    Raises:
        ValueError: if ``feature_type`` is not a supported descriptor set.
    """
    if feature_type == 'spange_descriptors':
        return pd.read_csv('/home/data/spange_descriptors_lookup.csv', index_col=0)
    # Returning None here would only surface later as an AttributeError on
    # `.values` / `.loc`, far from the actual mistake.
    raise ValueError(
        f"Unknown feature_type {feature_type!r}; expected 'spange_descriptors'"
    )

print('Data functions defined')

In [None]:
# Official CV split functions (DO NOT MODIFY)
from typing import Any, Generator

def generate_leave_one_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield one train/test split per distinct value of ``X["SOLVENT NAME"]``.

    For each solvent, the test part holds every row whose ``SOLVENT NAME``
    equals that solvent and the train part holds every other row.

    Args:
        X: inputs; must contain a ``"SOLVENT NAME"`` column.
        Y: targets, indexed with the same boolean masks as ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each solvent, in the
        order returned by ``X["SOLVENT NAME"].unique()``.
    """
    for solvent in X["SOLVENT NAME"].unique():
        # Complementary row masks: hold out exactly the rows of this solvent.
        train_mask = X["SOLVENT NAME"] != solvent
        test_mask = X["SOLVENT NAME"] == solvent
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

def generate_leave_one_ramp_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    A "ramp" is identified by the string ``"<SOLVENT A NAME>_<SOLVENT B NAME>"``.
    For each ramp, the test part holds every row with that key and the train
    part holds every other row.

    Args:
        X: inputs; must contain ``"SOLVENT A NAME"`` and ``"SOLVENT B NAME"``.
        Y: targets, indexed with the same boolean masks as ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each ramp, in the order
        returned by ``ramps.unique()``.
    """
    # Per-row ramp key; the pair is ordered, so "A_B" and "B_A" are distinct.
    ramps = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
    for ramp in ramps.unique():
        # Complementary row masks: hold out exactly the rows of this ramp.
        train_mask = ramps != ramp
        test_mask = ramps == ramp
        yield (
            (X[train_mask], Y[train_mask]),
            (X[test_mask], Y[test_mask]),
        )

print('CV split functions defined')

In [None]:
# Analyze solvent clusters using Spange descriptors
# Exploratory cell: prints a silhouette score for several cluster counts.
# `spange` and `spange_scaled` stay in notebook scope and are reused by the
# next cell.
spange = load_features('spange_descriptors')
print(f"Spange descriptors shape: {spange.shape}")
# The table's index is printed as the list of solvents.
print(f"Solvents: {spange.index.tolist()}")

# Standardize features for clustering
scaler = StandardScaler()
spange_scaled = scaler.fit_transform(spange.values)

# Try different numbers of clusters
from sklearn.metrics import silhouette_score

for n_clusters in [3, 4, 5, 6]:
    # Fixed random_state so the printed scores are reproducible across runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=SEED, n_init=10)
    labels = kmeans.fit_predict(spange_scaled)
    score = silhouette_score(spange_scaled, labels)
    print(f"K={n_clusters}: Silhouette score = {score:.4f}")

In [None]:
# Fit the final clustering with 4 clusters. The count is hard-coded here; it
# is not computed from the silhouette scores printed by the previous cell.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=SEED, n_init=10)
cluster_labels = kmeans.fit_predict(spange_scaled)

# One row per solvent with its assigned cluster id.
solvent_clusters = pd.DataFrame(
    {'SOLVENT NAME': spange.index, 'cluster': cluster_labels}
)

# List the members of every cluster.
print("\nSolvent clusters:")
for c in range(n_clusters):
    in_cluster = solvent_clusters['cluster'] == c
    solvents = solvent_clusters.loc[in_cluster, 'SOLVENT NAME'].tolist()
    print(f"\nCluster {c} ({len(solvents)} solvents):")
    print(f"  {solvents}")

In [None]:
# Simple MLP for each cluster
class ClusterMLP(nn.Module):
    """Feed-forward network with a sigmoid output layer.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; the final
    stage is Linear -> Sigmoid, so every output lies in [0, 1].

    NOTE(review): no other visible cell instantiates this class; the
    cluster-aware model below uses XGBoost instead.

    Args:
        input_dim: number of input features.
        hidden_dims: widths of the hidden layers, in order. Any iterable of
            ints is accepted; the default is an immutable tuple so it cannot
            be shared and mutated across calls.
        output_dim: number of outputs.
        dropout: dropout probability applied after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dims=(64, 32), output_dim=3, dropout=0.2):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers += [
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            prev_dim = h_dim
        # Output head: squash to [0, 1].
        layers += [nn.Linear(prev_dim, output_dim), nn.Sigmoid()]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for a batch ``x`` of shape (batch, input_dim)."""
        return self.network(x)

print('ClusterMLP defined')

In [None]:
# Cluster-aware model
class ClusterAwareModel:
    """Route each sample to an XGBoost model trained on its solvent cluster.

    Solvents are clustered with K-means on standardized Spange descriptors.
    ``train_model`` fits one multi-output XGBoost regressor per cluster that
    has enough training rows, plus a global regressor on all rows.
    ``predict`` uses the cluster's regressor when one exists and the global
    regressor otherwise.
    """

    # Clusters with fewer training rows than this get no dedicated model.
    MIN_CLUSTER_SAMPLES = 5
    # Width of the prediction matrix; replaced by the real target count in
    # train_model.
    n_targets = 3

    def __init__(self, data='single', n_clusters=4):
        """Load the descriptors and cluster the solvents.

        Args:
            data: ``'full'`` selects the two-solvent mixture feature layout;
                any other value selects the single-solvent layout.
            n_clusters: number of K-means clusters.
        """
        self.data = data
        self.n_clusters = n_clusters
        self.mixed = (data == 'full')

        # Load Spange descriptors and create cluster mapping
        self.spange = load_features('spange_descriptors')
        self.scaler_spange = StandardScaler()
        spange_scaled = self.scaler_spange.fit_transform(self.spange.values)

        # Cluster solvents (descriptors only; no target values are involved)
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=SEED, n_init=10)
        self.cluster_labels = self.kmeans.fit_predict(spange_scaled)
        self.solvent_to_cluster = dict(zip(self.spange.index, self.cluster_labels))

        # Feature scaler, fitted in train_model
        self.scaler = StandardScaler()

        # Per-cluster models and the global fallback, both set by train_model
        self.models = {}
        self.global_model = None

    @staticmethod
    def _make_regressor():
        """Build the (unfitted) multi-output XGBoost regressor used everywhere."""
        return MultiOutputRegressor(xgb.XGBRegressor(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=SEED,
            n_jobs=-1
        ))

    def _get_features(self, X):
        """Build the feature matrix and the per-row cluster ids.

        Returns:
            ``(features, clusters)``: a float32 array with one row per row of
            ``X``, and an array holding the cluster id of each row's solvent
            (for mixtures, the solvent making up the larger share; solvent B
            on an exact 50/50 split).
        """
        res_time = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)

        if self.mixed:
            # NOTE(review): assumes 'SolventB%' is on a 0-100 scale - confirm
            # against the data file.
            sb_pct = X['SolventB%'].values.reshape(-1, 1) / 100.0

            # Linear mixing of the two solvents' descriptors
            feats_a = self.spange.loc[X['SOLVENT A NAME']].values
            feats_b = self.spange.loc[X['SOLVENT B NAME']].values
            solvent_feats = (1 - sb_pct) * feats_a + sb_pct * feats_b

            # Dominant solvent per row, chosen in one vectorized step
            dominant = np.where(
                sb_pct[:, 0] < 0.5,
                X['SOLVENT A NAME'].to_numpy(),
                X['SOLVENT B NAME'].to_numpy(),
            )
            columns = [res_time, temp, sb_pct, solvent_feats]
        else:
            solvent_feats = self.spange.loc[X['SOLVENT NAME']].values
            dominant = X['SOLVENT NAME'].to_numpy()
            columns = [res_time, temp, solvent_feats]

        # Solvents missing from the mapping default to cluster 0
        clusters = np.array([self.solvent_to_cluster.get(s, 0) for s in dominant])

        return np.hstack(columns).astype(np.float32), clusters

    def train_model(self, train_X, train_Y, num_epochs=100, lr=1e-3, batch_size=32):
        """Fit the feature scaler, the per-cluster models and the global model.

        Args:
            train_X: training inputs in the layout selected by ``data``.
            train_Y: training targets (one column per target).
            num_epochs, lr, batch_size: accepted for interface compatibility;
                not used by the XGBoost models.
        """
        X_np, clusters = self._get_features(train_X)
        y_np = train_Y.values
        self.n_targets = y_np.shape[1]

        # Scale features
        X_scaled = self.scaler.fit_transform(X_np)

        # Start clean so a re-fit cannot keep a model from an earlier fit for
        # a cluster that is now too small to train.
        self.models = {}
        for c in range(self.n_clusters):
            mask = clusters == c
            if mask.sum() < self.MIN_CLUSTER_SAMPLES:
                continue

            model = self._make_regressor()
            model.fit(X_scaled[mask], y_np[mask])
            self.models[c] = model

        # Global model: fallback for clusters without a dedicated model
        global_model = self._make_regressor()
        global_model.fit(X_scaled, y_np)
        self.global_model = global_model

    def predict(self, test_X):
        """Predict targets for ``test_X``, clipped to [0, 1].

        Returns:
            ``torch.Tensor`` of shape ``(len(test_X), n_targets)``.

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if self.global_model is None:
            raise RuntimeError("ClusterAwareModel.predict called before train_model")

        X_np, clusters = self._get_features(test_X)
        X_scaled = self.scaler.transform(X_np)

        predictions = np.zeros((len(test_X), self.n_targets))

        # One batched predict call per cluster instead of one call per row
        for c in np.unique(clusters):
            rows = clusters == c
            model = self.models.get(int(c), self.global_model)
            predictions[rows] = model.predict(X_scaled[rows])

        # Clip to [0, 1]
        predictions = np.clip(predictions, 0, 1)

        return torch.tensor(predictions)

print('ClusterAwareModel defined')

In [None]:
# Run leave-one-solvent-out CV on the single-solvent data and collect the
# per-row predictions for the submission file.
import tqdm

X, Y = load_data("single_solvent")
# One fold per distinct solvent. Counted with .unique() so it matches the
# split generator exactly, instead of hard-coding the fold count.
n_folds = len(X['SOLVENT NAME'].unique())
print(f"Single solvent data: {len(X)} samples, {n_folds} solvents")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
fold_mses = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds):
    (train_X, train_Y), (test_X, test_Y) = split

    # Fresh model per fold so nothing fitted on one fold leaks into the next
    model = ClusterAwareModel(data='single', n_clusters=4)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # Fold MSE, averaged over all rows and all three targets
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)

    # "row" is the position within the fold's test set, not the original index
    all_predictions.extend(
        {
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(predictions_np)
    )

submission_single_solvent = pd.DataFrame(all_predictions)
# Unweighted mean over folds: every held-out solvent counts equally
print(f"\nSingle solvent CV MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

In [None]:
# Run leave-one-ramp-out CV on the full (mixture) data and collect the
# per-row predictions for the submission file.
X, Y = load_data("full")
print(f"Full data: {len(X)} samples")

# One fold per distinct "<solvent A>_<solvent B>" key, the same key the split
# generator builds, instead of hard-coding the fold count.
n_folds = (
    X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)
).nunique()

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []
fold_mses = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds):
    (train_X, train_Y), (test_X, test_Y) = split

    # Fresh model per fold so nothing fitted on one fold leaks into the next
    model = ClusterAwareModel(data='full', n_clusters=4)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # Fold MSE, averaged over all rows and all three targets
    fold_mse = np.mean((predictions_np - test_Y.values) ** 2)
    fold_mses.append(fold_mse)

    # "row" is the position within the fold's test set, not the original index
    all_predictions.extend(
        {
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(predictions_np)
    )

submission_full_data = pd.DataFrame(all_predictions)
# Unweighted mean over folds: every held-out ramp counts equally
print(f"\nFull data CV MSE: {np.mean(fold_mses):.6f} ± {np.std(fold_mses):.6f}")

In [None]:
# Stack the predictions of both tasks into one table and write it to disk.
# reset_index() keeps each frame's old row index as an "index" column and
# gives the combined table a fresh 0..N-1 index, which is written as "id".
submission = pd.concat(
    [submission_single_solvent, submission_full_data]
).reset_index()
submission.index.name = "id"

print(f"Submission shape: {submission.shape}")

# Write the file, index included
submission.to_csv("/home/submission/submission.csv", index=True)
print(f"\nSubmission saved to /home/submission/submission.csv")

# Read the file back as a sanity check
submission_check = pd.read_csv("/home/submission/submission.csv")
print(f"\nSubmission rows: {len(submission_check)}")

# Report the value range of every target column
target_cols = ['target_1', 'target_2', 'target_3']
for col in target_cols:
    col_min = submission_check[col].min()
    col_max = submission_check[col].max()
    print(f"{col}: min={col_min:.4f}, max={col_max:.4f}")

In [None]:
# Print the end-of-run summary banner (this cell computes no score).
banner = "=" * 50
print(banner)
print("EXPERIMENT 081 COMPLETE")
print(banner)
print(f"\nKey techniques implemented:")
for summary_line in (
    "1. Cluster solvents using Spange descriptors (K-means, 4 clusters)",
    "2. Train cluster-specific XGBoost models",
    "3. Use global model as fallback for unseen clusters",
    "\nThis approach aims to improve generalization within chemical families.",
):
    print(summary_line)