# Experiment 057: 'ens-model' Kernel Approach with ALL Features

**Goal:** Implement the matthewmaree 'ens-model' kernel approach that combines ALL 5 feature sources.

**Key components:**
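1. Combined solvent feature table from ALL sources (spange, acs_pca, drfps, fragprints, smiles) — note: the `smiles` lookup is a raw string column, so the table builder below skips it and only the four numeric sources contribute features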
2. Correlation-based feature filtering with priority (spange > acs > drfps > frag > smiles)
3. Numeric feature engineering (T_x_RT, RT_log, T_inv, RT_scaled)
4. CatBoost + XGBoost ensemble (7:6 for single, 1:2 for full)
5. Clip predictions to [0, 1] WITHOUT normalizing to sum to 1

**Hypothesis:** Combining ALL feature sources may provide better generalization to unseen solvents.

In [1]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC, abstractmethod
import tqdm
from functools import reduce
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Make float64 the default dtype for newly created floating-point tensors;
# the featurizers and models below also build their tensors as torch.double.
torch.set_default_dtype(torch.double)

# Root directory (local execution) holding the dataset CSVs and the
# "<name>_lookup.csv" feature tables read by load_data() / load_features().
DATA_PATH = "/home/data"

print("Imports complete.")

Imports complete.


In [2]:
# Constants from official template
# Input columns selected by load_data("full"): two named solvents plus the
# "SolventB%" mixing column.
INPUT_LABELS_FULL_SOLVENT = [
    "Residence Time",
    "Temperature",
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

# Input columns selected by load_data("single_solvent").
INPUT_LABELS_SINGLE_SOLVENT = [
    "Residence Time",
    "Temperature",
    "SOLVENT NAME",
]

# Purely numeric process inputs; the featurizers pass exactly these columns
# to add_numeric_features().
INPUT_LABELS_NUMERIC = [
    "Residence Time",
    "Temperature",
]

# Target columns returned as Y by load_data(); the models fit one regressor
# per column, in this order.
TARGET_LABELS = [
    "Product 2",
    "Product 3",
    "SM",
]

print("Constants defined.")

Constants defined.


In [3]:
# Data loading functions
def load_data(name="full"):
    """Load one of the two yield datasets and split it into inputs and targets.

    Args:
        name: "full" for the mixed-solvent dataset or "single_solvent" for the
            single-solvent dataset. Any other value fails the assertion.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: ``X`` holds the input columns for the
        chosen dataset, ``Y`` holds the TARGET_LABELS columns.
    """
    assert name in ("full", "single_solvent")

    if name == "full":
        csv_name = "catechol_full_data_yields.csv"
        input_labels = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_name = "catechol_single_solvent_yields.csv"
        input_labels = INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(f"{DATA_PATH}/{csv_name}")
    return frame[input_labels], frame[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` feature table from DATA_PATH.

    Args:
        name: One of the known lookup tables; any other value fails the
            assertion.

    Returns:
        DataFrame whose index is the first CSV column.
    """
    known_tables = (
        "spange_descriptors",
        "acs_pca_descriptors",
        "drfps_catechol",
        "fragprints",
        "smiles",
    )
    assert name in known_tables

    lookup_path = f"{DATA_PATH}/{name}_lookup.csv"
    return pd.read_csv(lookup_path, index_col=0)

# CV functions from official template
def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the "SOLVENT NAME" column. Each
    item is ``((X_train, Y_train), (X_test, Y_test))`` where the test part
    contains exactly the rows of the held-out solvent.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        is_test = X["SOLVENT NAME"] == held_out
        is_train = ~is_test
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per (solvent A, solvent B) ramp.

    The distinct solvent pairs are visited sorted by "SOLVENT A NAME" and then
    "SOLVENT B NAME". Each item is ``((X_train, Y_train), (X_test, Y_test))``
    where the test part contains exactly the rows of the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    ramps = X[pair_columns].drop_duplicates().sort_values(by=pair_columns)

    for solvent_a, solvent_b in ramps.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (
            (X[outside_ramp], Y[outside_ramp]),
            (X[in_ramp], Y[in_ramp]),
        )

print("Data loading and CV functions defined.")

Data loading and CV functions defined.


In [4]:
# Base classes from official template
class SmilesFeaturizer(ABC):
    """Template base class for featurizers.

    Both methods only raise NotImplementedError, so subclasses must override
    ``__init__`` and ``featurize``. No method is marked ``@abstractmethod``,
    so the ABC machinery itself does not enforce the overrides.
    """

    def __init__(self):
        # Raising here means a subclass must define its own __init__ and must
        # not delegate to this one.
        raise NotImplementedError

    def featurize(self, X):
        """Turn the input frame ``X`` into model features (subclass hook)."""
        raise NotImplementedError

class BaseModel(ABC):
    """Template base class for models with a train_model / predict interface.

    No method is marked ``@abstractmethod``; the stubs below raise
    NotImplementedError when a subclass does not override them.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets (subclass hook)."""
        raise NotImplementedError

    # NOTE(review): this stub takes no input argument, whereas the concrete
    # models in this notebook define predict(self, X) — confirm against the
    # official template before relying on the base signature.
    def predict(self):
        """Return predictions (subclass hook)."""
        raise NotImplementedError

print("Base classes defined.")

Base classes defined.


In [5]:
# Global cache for solvent feature table.
# Stays None until build_solvent_feature_table() has run once; that function
# stores its result here and returns the stored table on later calls.
_SOLVENT_TABLE_CACHE = None

def feature_priority(name: str) -> int:
    """Return the keep-priority of a feature column, judged by its prefix.

    Higher scores mean the feature is preferred when two correlated columns
    compete. Names without a known prefix score 0.
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    for prefix, score in ranked_prefixes:
        if name.startswith(prefix):
            return score
    return 0

def filter_correlated_features(df: pd.DataFrame, threshold: float = 0.8):
    """Remove constant columns and one column of every highly correlated pair.

    Only numeric columns are analysed; non-numeric columns are never dropped.
    Pairs are visited in column order (first column, then second). For a pair
    whose absolute correlation exceeds ``threshold`` and whose members are
    both still kept, the column with the lower ``feature_priority`` is
    dropped; on a tie the column that appears later in ``df`` is dropped.

    Args:
        df: Feature table to filter.
        threshold: Absolute correlation above which a pair counts as redundant.

    Returns:
        Tuple ``(filtered_df, dropped_columns)``. The order of
        ``dropped_columns`` is unspecified (it is built from a set).
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        return df, []

    # Zero-variance columns go first.
    spread = numeric.std(axis=0)
    constant_cols = spread[spread == 0].index.tolist()
    if constant_cols:
        numeric = numeric.drop(columns=constant_cols)

    # Keep only the strict upper triangle of |corr| so each pair appears once;
    # undefined correlations count as 0.
    abs_corr = numeric.corr().abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal).fillna(0.0)
    names = upper.columns.tolist()

    # np.nonzero reports hits row by row, which matches the order of a nested
    # "for i / for j > i" scan over the matrix.
    exceeds = np.triu(upper.to_numpy() > threshold, k=1)
    dropped = set()
    for i, j in zip(*np.nonzero(exceeds)):
        first, second = names[i], names[j]
        if first in dropped or second in dropped:
            continue

        rank_first = feature_priority(first)
        rank_second = feature_priority(second)
        if rank_first > rank_second:
            loser = second
        elif rank_second > rank_first:
            loser = first
        elif df.columns.get_loc(first) > df.columns.get_loc(second):
            loser = first
        else:
            loser = second
        dropped.add(loser)

    all_to_drop = list(set(constant_cols) | dropped)
    return df.drop(columns=all_to_drop, errors="ignore"), all_to_drop

print("Feature filtering functions defined.")

Feature filtering functions defined.


In [6]:
def add_numeric_features(X_numeric: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X_numeric`` extended with engineered columns.

    When both "Temperature" and "Residence Time" are present, the copy gets
    "Temperature" shifted by +273.15 plus four new columns: "T_x_RT",
    "RT_log", "T_inv" and "RT_scaled". Otherwise the copy is returned as is.

    Note that "RT_scaled" divides by the mean residence time of the frame
    passed in, so its values depend on which rows are featurized together.
    """
    X_num = X_numeric.copy()
    if not {"Temperature", "Residence Time"}.issubset(X_num.columns):
        return X_num

    # Shift temperature by 273.15 (Celsius -> Kelvin conversion).
    X_num["Temperature"] = X_num["Temperature"] + 273.15
    kelvin = X_num["Temperature"]
    residence = X_num["Residence Time"]

    X_num["T_x_RT"] = kelvin * residence            # interaction term
    X_num["RT_log"] = np.log(residence + 1e-6)      # epsilon keeps log finite at 0
    X_num["T_inv"] = 1 / kelvin                     # inverse temperature
    X_num["RT_scaled"] = residence / residence.mean()
    return X_num

print("Numeric feature engineering defined.")

Numeric feature engineering defined.


In [7]:
def build_solvent_feature_table(threshold: float = 0.90):
    """Build (and memoize) the combined, correlation-filtered solvent table.

    Loads the spange, acs_pca, drfps and fragprints lookup tables, prefixes
    their feature columns by source, outer-merges them on "SOLVENT NAME" and
    runs ``filter_correlated_features`` on the merged table. The "smiles"
    lookup is skipped because it is a string column that cannot be used
    directly as a numeric feature.

    The result is memoized in ``_SOLVENT_TABLE_CACHE``. The cached table is
    reused only when it was built with the same ``threshold``; a call with a
    different threshold rebuilds the table and replaces the cache, so the
    argument is never silently ignored.

    Args:
        threshold: Absolute-correlation cutoff forwarded to
            ``filter_correlated_features``.

    Returns:
        DataFrame with a "SOLVENT NAME" column followed by the kept features.
        The same object is returned on cache hits, so callers should not
        mutate it in place.
    """
    global _SOLVENT_TABLE_CACHE

    # The threshold of the cached build is remembered on the function object
    # so that the module-level cache variable stays a plain DataFrame.
    cached_threshold = getattr(build_solvent_feature_table, "_cached_threshold", None)
    if _SOLVENT_TABLE_CACHE is not None and cached_threshold == threshold:
        return _SOLVENT_TABLE_CACHE

    print(">>> Building solvent feature table...")

    # Lookup table name -> column prefix understood by feature_priority().
    # Skip "smiles" - it's a string column that can't be used directly.
    source_prefixes = {
        "spange_descriptors": "spange",
        "acs_pca_descriptors": "acs",
        "drfps_catechol": "drfps",
        "fragprints": "frag",
    }
    fingerprint_sources = {"drfps_catechol", "fragprints"}

    dfs = []
    for src, prefix in source_prefixes.items():
        df_src = load_features(src).copy()

        # load_features() puts the first CSV column into the index; move it
        # back into a regular "SOLVENT NAME" column for merging.
        if "SOLVENT NAME" not in df_src.columns:
            df_src = df_src.reset_index().rename(columns={"index": "SOLVENT NAME"})

        if src in fingerprint_sources:
            # Drop all-zero and all-one columns
            df_src = df_src.loc[:, (df_src != 0).any(axis=0)]
            df_src = df_src.loc[:, (df_src != 1).any(axis=0)]

            # Drop columns whose values sum to exactly 1 (for a binary
            # fingerprint: a bit that is set for a single solvent only).
            column_sums = df_src.drop(columns=["SOLVENT NAME"]).sum(axis=0)
            df_src = df_src.drop(columns=column_sums[column_sums == 1].index)

        feature_cols = [c for c in df_src.columns if c != "SOLVENT NAME"]
        df_src = df_src.rename(columns={c: f"{prefix}_{c}" for c in feature_cols})

        dfs.append(df_src)
        print(f"  {src}: {df_src.shape[1] - 1} features")

    # Merge all dataframes
    merged = reduce(
        lambda left, right: pd.merge(left, right, on="SOLVENT NAME", how="outer"),
        dfs
    )

    print(f">>> Merged shape before filtering: {merged.shape}")

    # Apply correlation filtering
    merged_filtered, dropped = filter_correlated_features(merged, threshold=threshold)

    print(f">>> Final shape after filtering: {merged_filtered.shape}")
    print(f">>> Dropped {len(dropped)} features")

    _SOLVENT_TABLE_CACHE = merged_filtered
    build_solvent_feature_table._cached_threshold = threshold
    return merged_filtered

print("Solvent feature table builder defined.")

Solvent feature table builder defined.


In [8]:
# Build the feature table once. This first call also fills
# _SOLVENT_TABLE_CACHE, so the featurizers constructed later reuse the table
# instead of re-reading and re-filtering the lookup CSVs.
solvent_table = build_solvent_feature_table(threshold=0.90)
print(f"\nFinal solvent table shape: {solvent_table.shape}")
# Only the first 20 column names are printed to keep the output short.
print(f"Columns: {solvent_table.columns.tolist()[:20]}...")

>>> Building solvent feature table...
  spange_descriptors: 13 features
  acs_pca_descriptors: 5 features
  drfps_catechol: 40 features
  fragprints: 55 features
  smiles: 1 features
>>> Merged shape before filtering: (26, 115)
>>> Final shape after filtering: (26, 68)
>>> Dropped 47 features

Final solvent table shape: (26, 68)
Columns: ['SOLVENT NAME', 'spange_dielectric constant', 'spange_ET(30)', 'spange_beta', 'spange_pi*', 'spange_SB', 'spange_SP', 'spange_SdP', 'spange_N', 'spange_n', 'acs_PC1', 'acs_PC2', 'acs_PC3', 'acs_PC4', 'acs_PC5', 'drfps_34', 'drfps_67', 'drfps_110', 'drfps_125', 'drfps_209']...


In [None]:
# Featurizers
class PrecomputedFeaturizer(SmilesFeaturizer):
    """Single-solvent featurizer backed by the combined solvent table.

    Output rows are the engineered numeric features followed by the table row
    of the solvent named in "SOLVENT NAME".
    """

    def __init__(self):
        indexed_table = build_solvent_feature_table().set_index("SOLVENT NAME")
        self.solvent_table = indexed_table
        # Solvent features plus the 6 columns produced by add_numeric_features.
        self.feats_dim = indexed_table.shape[1] + 6

    def featurize(self, X):
        """Return a double tensor of [numeric features | solvent features]."""
        numeric = add_numeric_features(X[INPUT_LABELS_NUMERIC].copy())
        solvent_rows = self.solvent_table.loc[X["SOLVENT NAME"]].values

        pieces = [numeric.values, solvent_rows]
        return torch.cat(
            [torch.tensor(piece, dtype=torch.double) for piece in pieces],
            dim=1,
        )


class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Mixed-solvent featurizer backed by the combined solvent table.

    Output rows are the engineered numeric features, the solvent-B fraction,
    and the fraction-weighted blend of the two solvents' table rows.

    NOTE(review): "SolventB%" is divided by 100 here, i.e. it is treated as a
    0-100 percentage — confirm the dataset does not already store a 0-1
    fraction.
    """

    def __init__(self):
        indexed_table = build_solvent_feature_table().set_index("SOLVENT NAME")
        self.solvent_table = indexed_table
        # Solvent features plus 6 numeric columns plus the SolventB fraction.
        self.feats_dim = indexed_table.shape[1] + 7

    def featurize(self, X):
        """Return a double tensor of [numeric | fraction of B | blended solvent]."""
        numeric = add_numeric_features(X[INPUT_LABELS_NUMERIC].copy())

        feats_a = self.solvent_table.loc[X["SOLVENT A NAME"]].values
        feats_b = self.solvent_table.loc[X["SOLVENT B NAME"]].values
        frac_b = X["SolventB%"].values.reshape(-1, 1) / 100.0

        # Linear interpolation between the two solvents' feature rows.
        blended = feats_a * (1 - frac_b) + feats_b * frac_b

        pieces = [numeric.values, frac_b, blended]
        return torch.cat(
            [torch.tensor(piece, dtype=torch.double) for piece in pieces],
            dim=1,
        )

print("Featurizers defined.")

In [None]:
# CatBoost Model
class CatBoostModel(BaseModel):
    """One CatBoost regressor per target, with per-dataset hyperparameters.

    ``data == "single"`` selects the single-solvent featurizer and settings;
    any other value selects the mixed-solvent ones.
    """

    def __init__(self, data: str = "single", random_state: int = 42):
        self.data_mode = data
        self.random_state = random_state

        if data == "single":
            self.smiles_featurizer = PrecomputedFeaturizer()
            tuned = {
                "iterations": 700,
                "depth": 6,
                "learning_rate": 0.05,
                "l2_leaf_reg": 3.0,
            }
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed()
            tuned = {
                "iterations": 500,
                "depth": 5,
                "learning_rate": 0.03,
                "l2_leaf_reg": 5.0,
            }
        self.cat_params = {**tuned, "random_seed": random_state, "verbose": False}

        # Populated by train_model().
        self.models = None
        self.n_targets = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit an independent regressor for every column of ``train_Y``.

        ``device`` and ``verbose`` are accepted for interface compatibility
        and are not used.
        """
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values
        self.n_targets = targets.shape[1]

        self.models = []
        for target_column in targets.T:
            regressor = CatBoostRegressor(**self.cat_params)
            regressor.fit(features, target_column)
            self.models.append(regressor)

    def predict(self, X):
        """Return an [N, n_targets] double tensor of predictions in [0, 1]."""
        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()
        stacked = np.column_stack([reg.predict(features) for reg in self.models])

        # Clip to [0, 1] - DO NOT normalize to sum to 1
        clipped = np.clip(stacked, 0.0, 1.0)
        return torch.tensor(clipped, dtype=torch.double)

print("CatBoostModel defined.")

In [None]:
# XGBoost Model
class XGBModel(BaseModel):
    """One XGBoost regressor per target, with per-dataset hyperparameters.

    ``data == "single"`` selects the single-solvent featurizer and settings;
    any other value selects the mixed-solvent ones.
    """

    def __init__(self, data: str = "single", random_state: int = 42):
        self.data_mode = data
        self.random_state = random_state

        if data == "single":
            self.smiles_featurizer = PrecomputedFeaturizer()
            tuned = {
                "n_estimators": 600,
                "max_depth": 5,
                "learning_rate": 0.05,
                "subsample": 0.8,
                "colsample_bytree": 0.8,
                "reg_alpha": 0.1,
                "reg_lambda": 1.0,
            }
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed()
            tuned = {
                "n_estimators": 400,
                "max_depth": 4,
                "learning_rate": 0.03,
                "subsample": 0.7,
                "colsample_bytree": 0.7,
                "reg_alpha": 0.2,
                "reg_lambda": 2.0,
            }
        self.xgb_params = {**tuned, "random_state": random_state, "verbosity": 0}

        # Populated by train_model().
        self.models = None
        self.n_targets = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit an independent regressor for every column of ``train_Y``.

        ``device`` and ``verbose`` are accepted for interface compatibility
        and are not used.
        """
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values
        self.n_targets = targets.shape[1]

        self.models = []
        for target_column in targets.T:
            regressor = XGBRegressor(**self.xgb_params)
            regressor.fit(features, target_column)
            self.models.append(regressor)

    def predict(self, X):
        """Return an [N, n_targets] double tensor of predictions in [0, 1]."""
        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()
        stacked = np.column_stack([reg.predict(features) for reg in self.models])

        # Clip to [0, 1] - DO NOT normalize to sum to 1
        clipped = np.clip(stacked, 0.0, 1.0)
        return torch.tensor(clipped, dtype=torch.double)

print("XGBModel defined.")

In [None]:
# Ensemble Model
class EnsembleModel(BaseModel):
    """Weighted blend of a CatBoostModel and an XGBModel.

    Raw CatBoost:XGBoost weights are 7:6 for ``data == "single"`` and 1:2 for
    any other value; they are normalized to sum to 1 before use.
    """

    def __init__(self, data: str = "single"):
        self.data_mode = data

        # Optimized weights from kernel
        cat_weight, xgb_weight = (7.0, 6.0) if data == "single" else (1.0, 2.0)
        total = cat_weight + xgb_weight
        self.cat_weight = cat_weight / total
        self.xgb_weight = xgb_weight / total

        self.cat_model = CatBoostModel(data=data)
        self.xgb_model = XGBModel(data=data)

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Train both base models on the same data.

        ``device`` and ``verbose`` are accepted for interface compatibility
        and are not used.
        """
        for base_model in (self.cat_model, self.xgb_model):
            base_model.train_model(train_X, train_Y)

    def predict(self, X):
        """Return the weighted average of both models, clamped to [0, 1]."""
        cat_pred = self.cat_model.predict(X)
        xgb_pred = self.xgb_model.predict(X)
        blended = self.cat_weight * cat_pred + self.xgb_weight * xgb_pred
        return torch.clamp(blended, 0.0, 1.0)

print("EnsembleModel defined.")

In [None]:
# Quick test: train the ensemble on a single leave-one-solvent-out fold and
# print one prediction next to the ground truth as a smoke check.
print("Testing EnsembleModel...")
X, Y = load_data("single_solvent")
print(f"Single solvent data: X={X.shape}, Y={Y.shape}")

# Test one fold (the first one the generator yields, i.e. the first solvent
# in sorted order is held out)
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

# Default constructor -> single-solvent configuration.
model = EnsembleModel()
model.train_model(train_X, train_Y)
preds = model.predict(test_X)

print(f"Predictions shape: {preds.shape}")
print(f"Predictions sample: {preds[0]}")
print(f"Actual sample: {test_Y.iloc[0].values}")
print("Test passed!")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel() # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Single solvent predictions: {len(submission_single_solvent)}")
print(f"Unique folds: {submission_single_solvent['fold'].nunique()}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Full data predictions: {len(submission_full_data)}")
print(f"Unique folds: {submission_full_data['fold'].nunique()}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Submission saved to /home/submission/submission.csv")
print(f"Total rows: {len(submission)}")

In [None]:
# Calculate CV for logging.
# This retrains a fresh EnsembleModel for every fold of both tasks; the
# predictions collected by the submission cells are not reused here.
print("\n" + "="*60)
print("CV CALCULATION")
print("="*60)

# Single solvent CV (leave-one-solvent-out)
X, Y = load_data("single_solvent")
fold_mses = []

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X, Y)):
    model = EnsembleModel()
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Fold MSE is averaged over every row and every target column at once.
    mse = np.mean((preds - test_Y.values) ** 2)
    fold_mses.append(mse)
    # Only every fifth fold is printed to keep the log short.
    if fold_idx % 5 == 0:
        print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

# Unweighted mean/std over folds: each fold counts equally regardless of how
# many rows it holds out.
single_cv = np.mean(fold_mses)
single_std = np.std(fold_mses)
print(f"\nSingle solvent CV MSE: {single_cv:.6f} ± {single_std:.6f}")

# Full data CV (leave-one-ramp-out)
X, Y = load_data("full")
full_fold_mses = []

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_ramp_out_splits(X, Y)):
    model = EnsembleModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mse = np.mean((preds - test_Y.values) ** 2)
    full_fold_mses.append(mse)
    print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

full_cv = np.mean(full_fold_mses)
full_std = np.std(full_fold_mses)
print(f"\nFull data CV MSE: {full_cv:.6f} ± {full_std:.6f}")

# The logged headline number is the single-solvent CV only; the full-data CV
# is printed above but not combined into it.
print(f"\nFINAL CV FOR LOGGING: {single_cv:.6f}")

In [None]:
# Verification: re-read the CSV written by the submission cell and print
# basic sanity statistics (row/fold counts per task, target ranges, NaNs).
print("\n" + "="*60)
print("SUBMISSION VERIFICATION")
print("="*60)

# Read back from disk so the check covers what was actually saved.
df = pd.read_csv('/home/submission/submission.csv')

print(f"\nColumns: {df.columns.tolist()}")
print(f"Total rows: {len(df)}")

print(f"\nTask 0 (single solvent):")
task0 = df[df['task'] == 0]
print(f"  Rows: {len(task0)}")
print(f"  Folds: {task0['fold'].nunique()}")
print(f"  Fold values: {sorted(task0['fold'].unique())}")

print(f"\nTask 1 (full data):")
task1 = df[df['task'] == 1]
print(f"  Rows: {len(task1)}")
print(f"  Folds: {task1['fold'].nunique()}")
print(f"  Fold values: {sorted(task1['fold'].unique())}")

# The models clip predictions to [0, 1], so out-of-range counts and NaN
# counts are expected to be zero.
print(f"\nTarget statistics:")
for col in ['target_1', 'target_2', 'target_3']:
    print(f"  {col}: min={df[col].min():.6f}, max={df[col].max():.6f}, mean={df[col].mean():.6f}")
    print(f"    Values > 1: {(df[col] > 1).sum()}, Values < 0: {(df[col] < 0).sum()}, NaN: {df[col].isna().sum()}")

In [None]:
# Summary: prints a fixed description of the approach plus the CV numbers and
# submission statistics computed by the preceding cells (single_cv,
# single_std, full_cv, full_std, df, task0, task1 must already exist).
print("\n" + "="*60)
print("EXPERIMENT 057: ENS-MODEL ALL FEATURES SUMMARY")
print("="*60)

# NOTE(review): the lines below describe five feature sources including
# smiles, but build_solvent_feature_table() in this notebook skips the smiles
# lookup, so only four sources reach the models — confirm whether the text
# should describe the kernel or this implementation.
print("\nAPPROACH (from matthewmaree 'ens-model' kernel):")
print("  - Combined ALL 5 feature sources (spange, acs_pca, drfps, fragprints, smiles)")
print("  - Correlation-based filtering with priority (spange > acs > drfps > frag > smiles)")
print("  - Numeric feature engineering (T_x_RT, RT_log, T_inv, RT_scaled)")
print("  - CatBoost + XGBoost ensemble (7:6 for single, 1:2 for full)")
print("  - Clip to [0, 1] WITHOUT normalizing to sum to 1")

print(f"\nCV SCORES:")
print(f"  Single solvent: {single_cv:.6f} ± {single_std:.6f}")
print(f"  Full data: {full_cv:.6f} ± {full_std:.6f}")

print(f"\nSUBMISSION FORMAT:")
print(f"  Total rows: {len(df)}")
print(f"  Task 0: {len(task0)} rows, {task0['fold'].nunique()} folds")
print(f"  Task 1: {len(task1)} rows, {task1['fold'].nunique()} folds")
print(f"  All targets in [0, 1]: {(df['target_1'].between(0, 1)).all() and (df['target_2'].between(0, 1)).all() and (df['target_3'].between(0, 1)).all()}")

print("\nHYPOTHESIS:")
print("  Combining ALL feature sources may provide better generalization to unseen solvents.")
print("  This could potentially CHANGE the CV-LB relationship (reduce the intercept).")

# Predicted LB using the CV-LB relationship
# NOTE(review): slope 4.31 and intercept 0.0525 are hard-coded; presumably
# they come from a linear fit over earlier submissions — confirm the source.
# Only the single-solvent CV enters the estimate.
predicted_lb = 4.31 * single_cv + 0.0525
print(f"\nPREDICTED LB (using CV-LB relationship): {predicted_lb:.4f}")
# NOTE(review): the two reference scores below are literal constants and are
# not derived from anything computed in this notebook.
print(f"  Best LB so far: 0.0877")
print(f"  Target: 0.0347")