# Experiment 003: Simple Random Forest with Strong Regularization

Based on Loop 2 analysis:
- Simple RF (0.0742) outperforms complex MLP+GBDT ensemble (0.081)
- Path forward is SIMPLICITY, not COMPLEXITY
- Strong regularization: max_depth=8, min_samples_leaf=5, min_samples_split=10, max_features='sqrt'

**TEMPLATE COMPLIANCE**: Last 3 cells are EXACTLY as template, NO cells after them.

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Directory containing the CSV files read by load_data() and load_features().
DATA_PATH = '/home/data'
# Newly created floating-point torch tensors default to float64 from here on.
torch.set_default_dtype(torch.double)
print("Setup complete")

Setup complete


In [2]:
# --- UTILITY FUNCTIONS ---
# Names of the numeric process-condition columns (not referenced elsewhere in
# the visible notebook; the featurizer indexes these columns by literal name).
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Columns that load_data() selects as the regression targets Y.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Load the input columns and target yields for one dataset variant.

    Args:
        name: ``"full"`` reads ``catechol_full_data_yields.csv`` and keeps the
            two-solvent columns (solvent A/B names and ``SolventB%``);
            ``"single_solvent"`` reads ``catechol_single_solvent_yields.csv``
            and keeps the single ``SOLVENT NAME`` column.

    Returns:
        A tuple ``(X, Y)`` of DataFrames: ``X`` holds the residence time,
        temperature and solvent columns, ``Y`` holds ``TARGET_LABELS``.

    Raises:
        ValueError: If ``name`` is not one of the two supported variants.
    """
    variants = {
        "full": (
            "catechol_full_data_yields.csv",
            ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"],
        ),
        "single_solvent": (
            "catechol_single_solvent_yields.csv",
            ["Residence Time", "Temperature", "SOLVENT NAME"],
        ),
    }
    # Explicit check instead of `assert`: asserts vanish under `python -O`,
    # which would let an unknown name silently load the wrong file.
    if name not in variants:
        raise ValueError(
            f"Unknown dataset {name!r}; expected one of {sorted(variants)}"
        )

    filename, input_columns = variants[name]
    df = pd.read_csv(f'{DATA_PATH}/{filename}')
    X = df[input_columns]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` table from ``DATA_PATH``.

    The first CSV column becomes the DataFrame index, so rows can be looked
    up by that key with ``.loc``.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    table = pd.read_csv(lookup_path, index_col=0)
    return table

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the distinct ``SOLVENT NAME``
    values. Each item is ``((X_train, Y_train), (X_test, Y_test))`` where the
    test part contains exactly the rows of the held-out solvent.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out in sorted(solvent_column.unique()):
        in_test = solvent_column == held_out
        in_train = ~in_test
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    every row whose A and B names both match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        matches_a = X["SOLVENT A NAME"] == solvent_a
        matches_b = X["SOLVENT B NAME"] == solvent_b
        in_test = matches_a & matches_b
        in_train = ~in_test
        yield (X[in_train], Y[in_train]), (X[in_test], Y[in_test])

# Descriptor lookup table loaded once at module level; EnhancedFeaturizer
# indexes it by solvent name via .loc.
SPANGE_DF = load_features('spange_descriptors')
print(f"Loaded Spange descriptors: {SPANGE_DF.shape}")

Loaded Spange descriptors: (26, 13)


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Interface for objects that turn experiment rows into features.

    Both methods raise ``NotImplementedError``; a subclass must define its own
    ``__init__`` (without delegating to this one) and its own ``featurize``.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """Convert ``X`` into model features; must be overridden."""
        raise NotImplementedError

class BaseModel(ABC):
    """Interface for models driven through ``train_model`` and ``predict``.

    The methods here only raise ``NotImplementedError``; concrete models
    override both.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets; must be overridden."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``; must be overridden.

        ``X`` is accepted (and optional, for backward compatibility) so the
        signature matches how subclasses and callers use ``predict(X)``. A
        subclass that forgets to override now fails with
        ``NotImplementedError`` rather than an argument-count ``TypeError``.
        """
        raise NotImplementedError

In [4]:
# --- ENHANCED FEATURIZER WITH ARRHENIUS FEATURES ---
class EnhancedFeaturizer(SmilesFeaturizer):
    """Build a numeric feature matrix from process conditions and solvents.

    Every row gets five kinetic columns: residence time, temperature,
    ``1000 / (temperature + 273.15)``, ``log(residence time + 1e-6)`` and the
    product of the last two. Solvent descriptor columns from ``SPANGE_DF``
    are appended; in mixed mode the ``SolventB%`` column is inserted between
    the kinetic and descriptor columns.

    Args:
        mixed: ``True`` for two-solvent rows (``SOLVENT A NAME``,
            ``SOLVENT B NAME``, ``SolventB%``); ``False`` for single-solvent
            rows (``SOLVENT NAME``).
    """

    def __init__(self, mixed=False):
        # No super().__init__() on purpose: the base constructor raises
        # NotImplementedError.
        self.mixed = mixed
        self.featurizer = SPANGE_DF

    def featurize(self, X, flip=False):
        """Return the feature matrix for the rows of ``X``.

        Args:
            X: DataFrame with ``Residence Time``, ``Temperature`` and the
                solvent columns matching ``self.mixed``.
            flip: Only used in mixed mode. When ``True`` the same mixture is
                described with the solvent roles exchanged, i.e. as
                (B, A, 1 - SolventB%). The blended descriptors are unchanged
                by that relabelling, so only the fraction column differs.

        Returns:
            2-D ``numpy`` array with one row per row of ``X``.
        """
        rt = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temp = X['Temperature'].values.astype(np.float64).reshape(-1, 1)

        # Arrhenius kinetic features.
        # NOTE(review): the 273.15 offset assumes Temperature is in Celsius; confirm.
        temp_k = temp + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(rt + 1e-6)  # epsilon guards against log(0)
        interaction = inv_temp * log_time

        numeric_feats = np.hstack([rt, temp, inv_temp, log_time, interaction])

        if not self.mixed:
            solvent_feats = self.featurizer.loc[X["SOLVENT NAME"]].values
            return np.hstack([numeric_feats, solvent_feats])

        A = self.featurizer.loc[X["SOLVENT A NAME"]].values
        B = self.featurizer.loc[X["SOLVENT B NAME"]].values
        # NOTE(review): treated as the fraction of solvent B in [0, 1]; confirm
        # against the data file.
        pct = X["SolventB%"].values.reshape(-1, 1)

        # Linear blend of the two descriptor vectors by composition.
        solvent_feats = A * (1 - pct) + B * pct

        # Exchanging the roles of A and B must also complement the fraction,
        # otherwise the "flipped" row would describe a different mixture
        # (fraction pct of A instead of B) while keeping the same label.
        # With both changes applied the blend above is identical, so the
        # flipped view differs only in the fraction column.
        pct_feat = (1 - pct) if flip else pct

        return np.hstack([numeric_feats, pct_feat, solvent_feats])

In [5]:
# --- SIMPLE RANDOM FOREST MODEL ---
class SimpleRFModel(BaseModel):
    """Heavily regularized random forest, one forest per target column.

    Motivation (Loop 2 analysis): the plain RF scored 0.0742 versus 0.081 for
    the complex ensemble.

    Args:
        data: ``'full'`` selects the mixed-solvent featurizer and enables
            flip augmentation / test-time averaging; any other value uses the
            single-solvent path.
    """

    def __init__(self, data='single'):
        self.data_type = data
        uses_mixtures = data == 'full'
        self.featurizer = EnhancedFeaturizer(mixed=uses_mixtures)
        self.scaler = StandardScaler()

        # Shallow trees, large leaves and sqrt feature sampling keep each
        # forest from memorizing the training solvents.
        forest = RandomForestRegressor(
            n_estimators=200,
            max_depth=8,
            min_samples_leaf=5,
            min_samples_split=10,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1,
        )
        self.model = MultiOutputRegressor(forest)

    def train_model(self, X_train, y_train):
        """Featurize, scale and fit on the training rows.

        For ``'full'`` data the ``flip=True`` view of every row is stacked
        under the regular view, with the targets duplicated to match.
        """
        features = self.featurizer.featurize(X_train)
        targets = y_train.values

        if self.data_type == 'full':
            flipped = self.featurizer.featurize(X_train, flip=True)
            features = np.vstack([features, flipped])
            targets = np.vstack([targets, targets])

        scaled = self.scaler.fit_transform(features)
        self.model.fit(scaled, targets)

    def predict(self, X):
        """Predict the targets for ``X`` as a double-precision torch tensor.

        For ``'full'`` data the predictions of the regular and ``flip=True``
        views are averaged. The result is clipped to the range [0, 1].
        """
        if self.data_type != 'full':
            plain = self.featurizer.featurize(X)
            plain_scaled = self.scaler.transform(plain)
            raw = self.model.predict(plain_scaled)
        else:
            regular_view = self.featurizer.featurize(X, flip=False)
            flipped_view = self.featurizer.featurize(X, flip=True)

            regular_scaled = self.scaler.transform(regular_view)
            flipped_scaled = self.scaler.transform(flipped_view)

            regular_pred = self.model.predict(regular_scaled)
            flipped_pred = self.model.predict(flipped_scaled)
            raw = (regular_pred + flipped_pred) / 2

        clipped = np.clip(raw, 0, 1)
        return torch.tensor(clipped, dtype=torch.double)

In [6]:
# --- QUICK VALIDATION TEST ---
# Smoke test: train and score SimpleRFModel on a few leave-one-solvent-out
# folds of the single-solvent data before running the full cross-validation.
print("Testing SimpleRFModel...")
X_test, Y_test = load_data("single_solvent")

# Quick leave-one-out test on first 3 solvents
errors = []  # per-fold mean absolute error
split_gen = generate_leave_one_out_splits(X_test, Y_test)
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(split_gen):
    # Folds arrive in sorted solvent order; stop once three have been scored.
    if i >= 3: break
    # Fresh model per fold so nothing fitted carries over between folds.
    model = SimpleRFModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # MAE averaged over every row and every target column of the held-out fold.
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    print(f"Fold {i}: MAE = {mae:.4f}")

# Unweighted mean of the per-fold MAEs (folds may differ in row count).
print(f"\nQuick test MAE: {np.mean(errors):.4f}")

Testing SimpleRFModel...


Fold 0: MAE = 0.1878


Fold 1: MAE = 0.1249


Fold 2: MAE = 0.0415

Quick test MAE: 0.1181


## Template-Compliant Cross-Validation

The following 3 cells are the FINAL 3 cells - EXACTLY as in the template.

In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimpleRFModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  2.06it/s]

2it [00:01,  1.98it/s]

3it [00:01,  2.00it/s]

4it [00:02,  1.97it/s]

5it [00:02,  1.96it/s]

6it [00:03,  1.96it/s]

7it [00:03,  1.95it/s]

8it [00:04,  1.95it/s]

9it [00:04,  1.95it/s]

10it [00:05,  1.96it/s]

11it [00:05,  1.89it/s]

12it [00:06,  1.90it/s]

13it [00:06,  1.89it/s]

14it [00:07,  1.90it/s]

15it [00:07,  1.89it/s]

16it [00:08,  1.88it/s]

17it [00:08,  1.90it/s]

18it [00:09,  1.92it/s]

19it [00:09,  1.93it/s]

20it [00:10,  1.92it/s]

21it [00:10,  1.93it/s]

22it [00:11,  1.93it/s]

23it [00:11,  1.97it/s]

24it [00:12,  1.95it/s]

24it [00:12,  1.93it/s]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = SimpleRFModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  1.62it/s]

2it [00:01,  1.62it/s]

3it [00:01,  1.60it/s]

4it [00:02,  1.62it/s]

5it [00:03,  1.63it/s]

6it [00:03,  1.63it/s]

7it [00:04,  1.64it/s]

8it [00:04,  1.62it/s]

9it [00:05,  1.62it/s]

10it [00:06,  1.62it/s]

11it [00:06,  1.62it/s]

12it [00:07,  1.63it/s]

13it [00:07,  1.65it/s]

13it [00:07,  1.63it/s]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################