# TabNet - Attention-Based Tabular Learning

**Hypothesis**: TabNet is a fundamentally different architecture (attention-based) that could have a different CV-LB relationship than our current GP+MLP+LGBM ensemble.

**Key Insight from Analysis**:
- CV-LB relationship: LB = 4.23×CV + 0.0533 (R²=0.98)
- Intercept (0.0533) > Target (0.0347): even a perfect CV of 0 would predict LB ≈ 0.0533, so CV minimization alone CANNOT reach the target
- We need an approach that CHANGES the CV-LB relationship

**Why TabNet**:
1. Uses sequential attention to select features at each decision step
2. Specifically designed for tabular data
3. Reported in the original TabNet paper to match or outperform gradient boosting on several tabular benchmarks (independent comparisons are more mixed)
4. Different architecture may have different generalization properties

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.tab_model import TabNetRegressor
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Choose the compute device once: CUDA when torch reports it as available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Request deterministic cuDNN behaviour and turn off cuDNN benchmark mode
# so repeated runs are as comparable as possible.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory that holds the yield tables and the descriptor lookup CSVs.
DATA_PATH = '/home/data'

# Input column groups. Both datasets share the two numeric process columns;
# they differ only in how the solvent is identified.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = [
    *INPUT_LABELS_NUMERIC,
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: "full" selects the mixed-solvent table; any other value
            selects the single-solvent table.

    Returns:
        (X, Y): X holds the input columns for the chosen table, Y holds the
        "Product 2", "Product 3" and "SM" columns.
    """
    is_full = name == "full"
    csv_name = 'catechol_full_data_yields.csv' if is_full else 'catechol_single_solvent_yields.csv'
    input_cols = INPUT_LABELS_FULL_SOLVENT if is_full else INPUT_LABELS_SINGLE_SOLVENT

    df = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return df[input_cols], df[["Product 2", "Product 3", "SM"]]

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct solvent, in sorted solvent order.

    Each item is ((train_X, train_Y), (test_X, test_Y)), where the test part
    contains exactly the rows whose "SOLVENT NAME" equals the held-out solvent.
    """
    solvent_col = X["SOLVENT NAME"]
    for held_out in sorted(solvent_col.unique()):
        in_test = solvent_col == held_out
        in_train = ~in_test
        yield (X[in_train], Y[in_train]), (X[in_test], Y[in_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in X. The pair is ordered:
    (A, B) and (B, A) count as different ramps. Each item is
    ((train_X, train_Y), (test_X, test_Y)) with the held-out pair's rows in test.
    """
    pair_cols = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_cols].drop_duplicates()
    for solvent_a, solvent_b in unique_pairs.itertuples(index=False, name=None):
        in_test = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        in_train = ~in_test
        yield (X[in_train], Y[in_train]), (X[in_test], Y[in_test])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load the three solvent descriptor lookup tables.
# The first CSV column becomes the row index of each table.
SPANGE_DF, DRFP_DF, ACS_PCA_DF = (
    pd.read_csv(f'{DATA_PATH}/{csv_name}', index_col=0)
    for csv_name in (
        'spange_descriptors_lookup.csv',
        'drfps_catechol_lookup.csv',
        'acs_pca_descriptors_lookup.csv',
    )
)

# Keep only the DRFP columns whose variance is non-zero; a column that is
# constant across all rows is dropped.
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance.index[drfp_variance > 0].tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Full Featurizer - 145 features
class FullFeaturizer:
    """Turn process conditions and solvent identity into one numeric feature matrix.

    Each row is the concatenation of:
      * 5 kinetic features: the two raw numeric inputs, 1000 / T[K],
        log(residence time) and the product of those two;
      * the Spange, filtered-DRFP and ACS-PCA descriptors of the solvent.

    With ``mixed=True`` the descriptors of solvents A and B are blended
    linearly with weight ``SolventB%`` on B.
    """

    def __init__(self, mixed=False):
        """
        Args:
            mixed: True for the two-solvent ("full") table, False for the
                single-solvent table.
        """
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        # 2 raw numeric inputs + 3 derived kinetic terms + descriptor widths.
        self.feats_dim = 2 + 3 + self.spange_df.shape[1] + self.drfp_df.shape[1] + self.acs_pca_df.shape[1]

    def featurize(self, X, flip=False):
        """Build the feature matrix for the rows of ``X``.

        Args:
            X: DataFrame with the numeric input columns and either
                "SOLVENT NAME" (single mode) or "SOLVENT A NAME",
                "SOLVENT B NAME" and "SolventB%" (mixed mode).
            flip: Accepted for backward compatibility and has no effect.
                The previous flip branch computed
                ``B * (1 - (1 - pct)) + A * (1 - pct)``, which is the same
                linear blend as the regular branch. Swapping A and B while
                replacing pct by 1 - pct cannot change a linear mixture.

        Returns:
            np.ndarray of shape (len(X), feats_dim).
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        time_m = numeric[:, 0:1]   # "Residence Time"
        temp_c = numeric[:, 1:2]   # "Temperature", treated as deg C (273.15 is added below)

        inv_temp = 1000.0 / (temp_c + 273.15)
        log_time = np.log(time_m + 1e-6)  # small offset keeps log finite at time == 0
        X_kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        lookups = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if self.mixed:
            # NOTE(review): assumes "SolventB%" is a fraction in [0, 1]; if it is
            # stored as 0-100 these blend weights are wrong. TODO confirm.
            pct = X["SolventB%"].values.reshape(-1, 1)
            solvent_a = X["SOLVENT A NAME"]
            solvent_b = X["SOLVENT B NAME"]
            descriptor_blocks = [
                table.loc[solvent_a].values * (1 - pct) + table.loc[solvent_b].values * pct
                for table in lookups
            ]
        else:
            solvent = X["SOLVENT NAME"]
            descriptor_blocks = [table.loc[solvent].values for table in lookups]

        return np.hstack([X_kinetic, *descriptor_blocks])

print(f'Full feature dimension: {FullFeaturizer().feats_dim}')

Full feature dimension: 145


In [7]:
# TabNet Model Wrapper
class TabNetModel:
    """TabNet model for multi-output regression.

    One independent ``TabNetRegressor`` is fitted per target column, all on
    the same standardised feature matrix produced by ``FullFeaturizer``.
    """

    def __init__(self, data='single'):
        """
        Args:
            data: 'full' selects the mixed-solvent featurisation; any other
                value selects the single-solvent featurisation.
        """
        self.data = data
        self.mixed = (data == 'full')
        self.featurizer = FullFeaturizer(mixed=self.mixed)
        self.scaler = StandardScaler()
        self.models = []  # One TabNet per target

    def train_model(self, X, Y):
        """Fit the scaler and one TabNet per column of ``Y``.

        Calling this again refits from scratch: the scaler is refitted and
        any previously trained regressors are discarded.

        Args:
            X: input DataFrame accepted by ``FullFeaturizer.featurize``.
            Y: target DataFrame; one regressor is trained per column.

        Returns:
            self, so construction and training can be chained.
        """
        X_feat = self.featurizer.featurize(X)
        Y_np = Y.values.astype(np.float32)  # TabNet expects float32

        X_scaled = self.scaler.fit_transform(X_feat).astype(np.float32)

        # Drop regressors from any earlier fit. Without this a second call
        # would append to the old list and predict() would return extra columns.
        self.models = []

        # Train TabNet for each target
        for i in range(Y_np.shape[1]):
            model = TabNetRegressor(
                n_d=16,  # Width of decision prediction layer
                n_a=16,  # Width of attention embedding
                n_steps=3,  # Number of decision steps
                gamma=1.3,  # Coefficient for feature reusage
                lambda_sparse=1e-3,  # Sparsity regularization
                optimizer_fn=torch.optim.Adam,
                optimizer_params=dict(lr=2e-2),
                scheduler_params={"step_size": 50, "gamma": 0.9},
                scheduler_fn=torch.optim.lr_scheduler.StepLR,
                mask_type='entmax',  # sparsemax or entmax
                verbose=0,
                seed=42,
                device_name='cuda' if torch.cuda.is_available() else 'cpu'
            )

            # NOTE(review): no eval_set is passed, so `patience` presumably has
            # nothing to monitor and all max_epochs run. Confirm against the
            # pytorch-tabnet docs, including the default of `drop_last`, which
            # matters when the training set is smaller than batch_size.
            model.fit(
                X_train=X_scaled,
                y_train=Y_np[:, i:i+1],  # keep 2-D shape (n_samples, 1)
                max_epochs=100,
                patience=20,
                batch_size=256,
                virtual_batch_size=128
            )
            self.models.append(model)

        return self

    def predict(self, X):
        """Predict every target for the rows of ``X``.

        Returns:
            torch.Tensor of shape (len(X), n_targets), clipped to [0, 1].
        """
        X_feat = self.featurizer.featurize(X)
        X_scaled = self.scaler.transform(X_feat).astype(np.float32)

        # Get predictions from each TabNet
        predictions = np.column_stack([model.predict(X_scaled).flatten() for model in self.models])

        # TTA for mixtures
        # NOTE(review): as FullFeaturizer.featurize is written in this notebook,
        # flip=True yields the same linear blend as flip=False, so this average
        # equals the un-flipped prediction (up to floating-point rounding) and
        # only doubles inference cost.
        if self.mixed:
            X_feat_flip = self.featurizer.featurize(X, flip=True)
            X_scaled_flip = self.scaler.transform(X_feat_flip).astype(np.float32)
            predictions_flip = np.column_stack([model.predict(X_scaled_flip).flatten() for model in self.models])
            predictions = (predictions + predictions_flip) / 2

        # Clip predictions into the [0, 1] range.
        predictions = np.clip(predictions, 0, 1)
        return torch.tensor(predictions)

print('TabNet Model defined')

TabNet Model defined


In [8]:
# Smoke test: confirm the wrapper trains and predicts end to end.
X_single, Y_single = load_data("single_solvent")
print(f'Single solvent data: X={X_single.shape}, Y={Y_single.shape}')

# Fit and predict on the same first 100 rows. This checks the plumbing
# (shapes, value range) only; it says nothing about generalisation.
X_test, Y_test = X_single.iloc[:100], Y_single.iloc[:100]

# train_model returns the wrapper itself, so construction and fitting chain.
model = TabNetModel(data='single').train_model(X_test, Y_test)
preds = model.predict(X_test)

print(f'Test predictions shape: {preds.shape}')
print(f'Test predictions range: [{preds.min():.4f}, {preds.max():.4f}]')

Single solvent data: X=(656, 3), Y=(656, 3)
Test predictions shape: torch.Size([100, 3])
Test predictions range: [0.0000, 0.7874]


In [9]:
# Run CV on single solvent data
# Leave-one-solvent-out: every fold trains a fresh TabNetModel on all other
# solvents and predicts the held-out one.
print('\n=== Single Solvent CV (TabNet) ===')
X_single, Y_single = load_data("single_solvent")

# One fold per distinct solvent. Deriving the count from the data keeps the
# progress bar correct if the dataset changes (it was hard-coded to 24).
n_folds_single = len(X_single["SOLVENT NAME"].unique())

split_generator = generate_leave_one_out_splits(X_single, Y_single)
all_predictions_single = []
all_actuals_single = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds_single):
    (train_X, train_Y), (test_X, test_Y) = split

    model = TabNetModel(data='single')
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)

    all_predictions_single.append(predictions.numpy())
    all_actuals_single.append(test_Y.values)

# Pool every held-out row and average the squared error over all rows and targets.
preds_single = np.vstack(all_predictions_single)
actuals_single = np.vstack(all_actuals_single)
mse_single = np.mean((preds_single - actuals_single) ** 2)
print(f'Single Solvent MSE: {mse_single:.6f} (n={len(preds_single)})')


=== Single Solvent CV (TabNet) ===


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:07<02:56,  7.66s/it]

  8%|▊         | 2/24 [00:15<02:45,  7.54s/it]

 12%|█▎        | 3/24 [00:22<02:38,  7.55s/it]

 17%|█▋        | 4/24 [00:30<02:30,  7.53s/it]

 21%|██        | 5/24 [00:38<02:25,  7.67s/it]

 25%|██▌       | 6/24 [00:45<02:19,  7.74s/it]

 29%|██▉       | 7/24 [00:53<02:10,  7.67s/it]

 33%|███▎      | 8/24 [01:00<02:01,  7.61s/it]

 38%|███▊      | 9/24 [01:08<01:53,  7.58s/it]

 42%|████▏     | 10/24 [01:16<01:45,  7.57s/it]

 46%|████▌     | 11/24 [01:23<01:38,  7.56s/it]

 50%|█████     | 12/24 [01:31<01:30,  7.56s/it]

 54%|█████▍    | 13/24 [01:38<01:23,  7.59s/it]

 58%|█████▊    | 14/24 [01:46<01:15,  7.58s/it]

 62%|██████▎   | 15/24 [01:53<01:08,  7.60s/it]

 67%|██████▋   | 16/24 [02:01<01:00,  7.60s/it]

 71%|███████   | 17/24 [02:09<00:53,  7.59s/it]

 75%|███████▌  | 18/24 [02:16<00:45,  7.59s/it]

 79%|███████▉  | 19/24 [02:24<00:37,  7.58s/it]

 83%|████████▎ | 20/24 [02:31<00:30,  7.59s/it]

 88%|████████▊ | 21/24 [02:39<00:22,  7.59s/it]

 92%|█████████▏| 22/24 [02:47<00:15,  7.59s/it]

 96%|█████████▌| 23/24 [02:54<00:07,  7.59s/it]

100%|██████████| 24/24 [03:02<00:00,  7.60s/it]

100%|██████████| 24/24 [03:02<00:00,  7.60s/it]

Single Solvent MSE: 0.027641 (n=656)





In [None]:
# Run CV on full data
# Leave-one-ramp-out: every fold trains a fresh TabNetModel on all other
# (solvent A, solvent B) pairs and predicts the held-out pair.
print('\n=== Full Data CV (TabNet) ===')
X_full, Y_full = load_data("full")

# One fold per distinct (A, B) pair. Deriving the count from the data keeps the
# progress bar correct if the dataset changes (it was hard-coded to 13).
n_folds_full = len(X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates())

split_generator = generate_leave_one_ramp_out_splits(X_full, Y_full)
all_predictions_full = []
all_actuals_full = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds_full):
    (train_X, train_Y), (test_X, test_Y) = split

    model = TabNetModel(data='full')
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)

    all_predictions_full.append(predictions.numpy())
    all_actuals_full.append(test_Y.values)

# Pool every held-out row and average the squared error over all rows and targets.
preds_full = np.vstack(all_predictions_full)
actuals_full = np.vstack(all_actuals_full)
mse_full = np.mean((preds_full - actuals_full) ** 2)
print(f'Full Data MSE: {mse_full:.6f} (n={len(preds_full)})')

In [None]:
# Calculate overall MSE
# Reference numbers used below, named once so the arithmetic and the printed
# messages cannot drift apart.
BEST_CV_MSE = 0.008194   # best CV so far (exp_030)
LB_SLOPE = 4.23          # fitted relation: LB = LB_SLOPE * CV + LB_INTERCEPT
LB_INTERCEPT = 0.0533
BEST_LB = 0.0877         # best leaderboard score achieved so far
TARGET_LB = 0.0347       # leaderboard score we are aiming for

# Row-weighted average of the two per-task MSEs.
n_single = len(preds_single)
n_full = len(preds_full)
overall_mse = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)

print(f'\n=== CV SCORE SUMMARY (TabNet) ===')
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Overall MSE: {overall_mse:.6f}')
print(f'\nBest CV (exp_030): {BEST_CV_MSE:.6f}')

# Relative change versus the best CV, in percent (lower MSE is better).
if overall_mse < BEST_CV_MSE:
    improvement = (BEST_CV_MSE - overall_mse) / BEST_CV_MSE * 100
    print(f'\n✓ IMPROVEMENT: {improvement:.2f}% better than best CV!')
else:
    degradation = (overall_mse - BEST_CV_MSE) / BEST_CV_MSE * 100
    print(f'\n✗ WORSE: {degradation:.2f}% worse than best CV')

# Predict LB using CV-LB relationship
predicted_lb = LB_SLOPE * overall_mse + LB_INTERCEPT
print(f'\nPredicted LB (using LB = {LB_SLOPE}*CV + {LB_INTERCEPT}): {predicted_lb:.4f}')
print(f'Best LB achieved: {BEST_LB:.4f}')
print(f'Target: {TARGET_LB:.4f}')

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = TabNetModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = TabNetModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
# Final verification: repeat the headline CV numbers at the end of the notebook.
# The report is assembled line by line and emitted with a single print call.
final_report_lines = [
    f'\n=== FINAL CV SCORE ===',
    f'Single Solvent MSE: {mse_single:.6f} (n={n_single})',
    f'Full Data MSE: {mse_full:.6f} (n={n_full})',
    f'Overall MSE: {overall_mse:.6f}',
    f'\nBest CV (exp_030): 0.008194',
    f'Predicted LB: {predicted_lb:.4f}',
    f'Target: 0.0347',
]
print('\n'.join(final_report_lines))