# Experiment 109: Ens-Model with CORRECT Submission Format

## Goal
Adapt the ens-model kernel approach (CatBoost + XGBoost ensemble) with the CORRECT submission format.

## Key Fix
The submission file MUST contain the columns `task`, `fold`, `row`, `target_1`, `target_2`, `target_3` (written to CSV together with an `id` index column),
NOT the raw target names `Product 2`, `Product 3`, `SM`.

## Approach
1. Copy the EnsembleModel class from ens-model kernel
2. Use the exact submission cell structure from the template
3. Verify format before submitting

In [1]:
import numpy as np
import pandas as pd
import torch
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add data path
sys.path.append('/home/data/')

from utils import (
    INPUT_LABELS_FULL_SOLVENT, INPUT_LABELS_SINGLE_SOLVENT, 
    INPUT_LABELS_NUMERIC, INPUT_LABELS_SINGLE_FEATURES, 
    INPUT_LABELS_FULL_FEATURES,
    generate_leave_one_out_splits, generate_leave_one_ramp_out_splits
)

# Override load functions to use local paths
# Directory prefix (with trailing slash) that load_data() and load_features()
# prepend to the CSV file names they read.
DATA_PATH = '/home/data/'
# Columns of the yields CSVs that load_data() returns as the target frame Y.
TARGET_LABELS = ['Product 2', 'Product 3', 'SM']

def load_data(name="full"):
    """Read one of the catechol yield CSVs from ``DATA_PATH``.

    Args:
        name: ``"full"`` reads ``catechol_full_data_yields.csv`` and selects
            ``INPUT_LABELS_FULL_SOLVENT`` as inputs; ``"single_solvent"`` reads
            ``catechol_single_solvent_yields.csv`` and selects
            ``INPUT_LABELS_SINGLE_SOLVENT``.

    Returns:
        A tuple ``(X, Y)`` of DataFrames: the input columns for the chosen
        dataset and the ``TARGET_LABELS`` columns.

    Raises:
        AssertionError: if ``name`` is not one of the two supported values.
    """
    assert name in ["full", "single_solvent"]
    wants_full = name == "full"
    csv_path = (
        f'{DATA_PATH}catechol_full_data_yields.csv'
        if wants_full
        else f'{DATA_PATH}catechol_single_solvent_yields.csv'
    )
    input_columns = INPUT_LABELS_FULL_SOLVENT if wants_full else INPUT_LABELS_SINGLE_SOLVENT
    frame = pd.read_csv(csv_path)
    return frame[input_columns], frame[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` table from ``DATA_PATH``.

    Args:
        name: One of ``"spange_descriptors"``, ``"acs_pca_descriptors"``,
            ``"drfps_catechol"``, ``"fragprints"`` or ``"smiles"``.

    Returns:
        The lookup table as a DataFrame, with the first CSV column as index.

    Raises:
        AssertionError: if ``name`` is not a supported lookup name.
    """
    supported = ["spange_descriptors", "acs_pca_descriptors", "drfps_catechol", "fragprints", "smiles"]
    assert name in supported
    lookup_path = f'{DATA_PATH}{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

print("Imports successful")

Imports successful


In [2]:
# Base classes and feature engineering (from ens-model kernel)
from abc import ABC, abstractmethod
from functools import reduce

# Use float64 as torch's default floating-point dtype for the rest of the notebook;
# the featurizers and models below also build their tensors with dtype=torch.double.
torch.set_default_dtype(torch.double)

class SmilesFeaturizer(ABC):
    """Base interface for featurizers that map an input table to features.

    Subclasses must provide their own ``__init__`` (the base one raises, so
    subclasses must not call ``super().__init__()``) and override
    ``featurize``.
    """

    def __init__(self):
        # Deliberately unusable: the base class cannot be instantiated.
        raise NotImplementedError

    def featurize(self, X):
        """Return the feature representation of ``X``.

        Args:
            X: The input rows to featurize (the concrete type is defined by
                the subclass).

        Raises:
            NotImplementedError: always, in the base class.
        """
        # The signature now matches the subclass overrides (instance method
        # taking one input); it previously lacked ``self``.
        raise NotImplementedError

class BaseModel(ABC):
    """Base interface for the models in this notebook.

    Subclasses override ``train_model`` and ``predict``; the base versions
    only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train, device=None, verbose=False):
        """Fit the model on the training inputs and targets.

        Args:
            X_train: Training inputs.
            y_train: Training targets.
            device: Optional device hint; accepted so the base signature
                matches the subclasses and positional four-argument calls.
            verbose: Optional verbosity flag, accepted for the same reason.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``.

        ``X`` defaults to ``None`` only so that legacy zero-argument calls to
        the base method keep raising ``NotImplementedError`` rather than
        ``TypeError``; subclasses take ``X`` as a required argument.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

# Module-level memo for build_solvent_feature_table(): None until the first
# call builds the table, after which that table is returned on every call.
_SOLVENT_TABLE_CACHE = None

def feature_priority(name: str) -> int:
    """Rank a feature column by the descriptor family its prefix names.

    Higher numbers win when two correlated features compete:
    ``spange_`` (5) > ``acs_`` (4) > ``drfps_`` (3) > ``frag_`` (2) >
    ``smiles_`` (1). Names with none of these prefixes rank 0.
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    for prefix, rank in ranked_prefixes:
        if name.startswith(prefix):
            return rank
    return 0

def filter_correlated_features(df, threshold=0.8):
    """Drop constant columns and one column of each highly correlated pair.

    Only numeric columns are analysed. Pairs are visited in upper-triangle,
    row-major order of the absolute correlation matrix; a pair is skipped if
    either member has already been marked for removal. Within a pair the
    column with the lower ``feature_priority`` is dropped; on a tie the
    column that appears later in ``df`` is dropped.

    Args:
        df: Feature table.
        threshold: A pair counts as correlated when its absolute correlation
            is strictly greater than this value.

    Returns:
        ``(filtered_df, dropped)`` where ``dropped`` lists the removed column
        names (constant columns included) in no particular order. If ``df``
        has no numeric columns it is returned unchanged with an empty list.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        return df, []

    # Zero-variance columns are removed before computing correlations.
    spread = numeric.std(axis=0)
    constant_cols = spread.index[spread == 0].tolist()
    if constant_cols:
        numeric = numeric.drop(columns=constant_cols)

    abs_corr = numeric.corr().abs()
    strict_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(strict_upper).fillna(0.0)
    names = upper.columns.tolist()

    to_drop = set()
    row_idx, col_idx = np.triu_indices(len(names), k=1)
    for i, j in zip(row_idx.tolist(), col_idx.tolist()):
        if not upper.iloc[i, j] > threshold:
            continue
        col_i, col_j = names[i], names[j]
        if col_i in to_drop or col_j in to_drop:
            continue
        rank_i = feature_priority(col_i)
        rank_j = feature_priority(col_j)
        if rank_i == rank_j:
            # Tie: remove whichever column sits later in the original frame.
            later_is_i = df.columns.get_loc(col_i) > df.columns.get_loc(col_j)
            loser = col_i if later_is_i else col_j
        else:
            loser = col_j if rank_i > rank_j else col_i
        to_drop.add(loser)

    all_to_drop = list(set(constant_cols) | to_drop)
    return df.drop(columns=all_to_drop, errors="ignore"), all_to_drop

def add_numeric_features(X_numeric):
    """Return a copy of ``X_numeric`` with derived process features appended.

    When both ``"Temperature"`` and ``"Residence Time"`` are present, the
    temperature column is shifted by +273.15 in the copy and four columns are
    appended in this order: ``T_x_RT`` (shifted temperature times residence
    time), ``RT_log`` (log of residence time plus 1e-6), ``T_inv`` (reciprocal
    of the shifted temperature) and ``RT_scaled`` (residence time divided by
    its mean over the rows passed in). Otherwise the copy is returned as is.
    The input frame is never modified.

    NOTE(review): ``RT_scaled`` uses the mean of the current batch, so the
    same row can get a different value depending on which rows accompany it.
    Confirm this is intended for train/test featurization.
    """
    augmented = X_numeric.copy()
    required = ("Temperature", "Residence Time")
    if not all(column in augmented.columns for column in required):
        return augmented

    shifted_temp = augmented["Temperature"] + 273.15
    residence = augmented["Residence Time"]

    augmented["Temperature"] = shifted_temp
    augmented["T_x_RT"] = shifted_temp * residence
    augmented["RT_log"] = np.log(residence + 1e-6)
    augmented["T_inv"] = 1 / shifted_temp
    augmented["RT_scaled"] = residence / residence.mean()
    return augmented

print("Feature engineering functions defined")

Feature engineering functions defined


In [3]:
def build_solvent_feature_table(threshold=0.90):
    """Build (once) the merged, decorrelated per-solvent feature table.

    Loads the four numeric lookup tables, prefixes their columns with the
    source family (``spange_``, ``acs_``, ``drfps_``, ``frag_``), outer-merges
    them on ``"SOLVENT NAME"`` and passes the result through
    ``filter_correlated_features``. For the two fingerprint sources, columns
    that are all 0, all 1, or sum to exactly 1 are removed first.

    The result is stored in the module-level ``_SOLVENT_TABLE_CACHE`` and
    returned directly on later calls.

    NOTE(review): the cache is not keyed on ``threshold``; once a table has
    been built, later calls return it regardless of the threshold passed.

    Args:
        threshold: Correlation threshold forwarded to
            ``filter_correlated_features`` on the first (building) call.

    Returns:
        DataFrame indexed by ``"SOLVENT NAME"``.
    """
    global _SOLVENT_TABLE_CACHE
    if _SOLVENT_TABLE_CACHE is not None:
        return _SOLVENT_TABLE_CACHE

    print(">>> Building solvent feature table...")
    # "smiles" is left out on purpose: it holds strings, not numeric features.
    prefix_by_source = {
        "spange_descriptors": "spange",
        "acs_pca_descriptors": "acs",
        "drfps_catechol": "drfps",
        "fragprints": "frag",
    }
    fingerprint_sources = ("drfps_catechol", "fragprints")

    tables = []
    for source, prefix in prefix_by_source.items():
        table = load_features(source).copy()
        if "SOLVENT NAME" not in table.columns:
            table = table.reset_index().rename(columns={"index": "SOLVENT NAME"})

        if source in fingerprint_sources:
            # Remove bits that are all 0 or all 1 across the table.
            table = table.loc[:, (table != 0).any(axis=0)]
            table = table.loc[:, (table != 1).any(axis=0)]
            # Remove bits whose column total is exactly 1.
            bit_totals = table.drop(columns={"SOLVENT NAME"}).sum(axis=0).T
            table = table.drop(columns=bit_totals[bit_totals == 1].index)

        renames = {c: f"{prefix}_{c}" for c in table.columns if c != "SOLVENT NAME"}
        tables.append(table.rename(columns=renames))

    combined = tables[0]
    for table in tables[1:]:
        combined = pd.merge(combined, table, on="SOLVENT NAME", how="outer")
    combined = combined.set_index("SOLVENT NAME")
    print(f"Combined feature table shape (before corr filter): {combined.shape}")

    combined, _ = filter_correlated_features(combined, threshold=threshold)
    print(f"Final solvent feature table shape: {combined.shape}")

    _SOLVENT_TABLE_CACHE = combined
    return combined

print("build_solvent_feature_table defined")

build_solvent_feature_table defined


In [4]:
# Featurizers
class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows using the precomputed solvent table.

    Each row's features are its engineered numeric columns followed by the
    solvent-table row looked up via ``"SOLVENT NAME"``.
    """

    def __init__(self):
        # The base __init__ raises, so it is intentionally not called.
        self.featurizer = build_solvent_feature_table()
        # Run a one-row dummy frame through the numeric feature engineering
        # purely to count how many numeric columns it yields.
        probe = pd.DataFrame([[0] * len(INPUT_LABELS_NUMERIC)], columns=INPUT_LABELS_NUMERIC)
        n_numeric = add_numeric_features(probe).shape[1]
        n_solvent = self.featurizer.shape[1]
        self.feats_dim = n_numeric + n_solvent

    def featurize(self, X):
        """Return a ``torch.double`` tensor of numeric plus solvent features for ``X``."""
        numeric_part = add_numeric_features(X[INPUT_LABELS_NUMERIC].copy()).values
        solvent_part = self.featurizer.loc[X["SOLVENT NAME"]].values
        stacked = np.concatenate([numeric_part, solvent_part], axis=1)
        return torch.tensor(stacked, dtype=torch.double)

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for two-solvent rows using the precomputed solvent table.

    Each row's features are its engineered numeric columns, the solvent-table
    rows for ``"SOLVENT A NAME"`` and ``"SOLVENT B NAME"``, and finally the
    ``"SolventB%"`` value.
    """

    def __init__(self):
        # The base __init__ raises, so it is intentionally not called.
        self.featurizer = build_solvent_feature_table()
        # Run a one-row dummy frame through the numeric feature engineering
        # purely to count how many numeric columns it yields.
        probe = pd.DataFrame([[0] * len(INPUT_LABELS_NUMERIC)], columns=INPUT_LABELS_NUMERIC)
        n_numeric = add_numeric_features(probe).shape[1]
        n_solvent = self.featurizer.shape[1]
        # Two solvent blocks plus one column for the solvent-B percentage.
        self.feats_dim = n_numeric + 2 * n_solvent + 1

    def featurize(self, X):
        """Return a ``torch.double`` tensor of numeric, A, B and B% features for ``X``."""
        numeric_part = add_numeric_features(X[INPUT_LABELS_NUMERIC].copy()).values
        solvent_a = self.featurizer.loc[X["SOLVENT A NAME"]].values
        solvent_b = self.featurizer.loc[X["SOLVENT B NAME"]].values
        b_fraction = X["SolventB%"].values.reshape(-1, 1)
        stacked = np.concatenate([numeric_part, solvent_a, solvent_b, b_fraction], axis=1)
        return torch.tensor(stacked, dtype=torch.double)

print("Featurizers defined")

Featurizers defined


In [5]:
# CatBoost Model
from catboost import CatBoostRegressor

class CatBoostModel(BaseModel):
    """Multi-target CatBoost regressor with post-processed predictions.

    ``data="single"`` uses ``PrecomputedFeaturizer`` and the single-solvent
    hyperparameters; any other value uses ``PrecomputedFeaturizerMixed`` and
    the mixed-solvent hyperparameters.
    """

    def __init__(self, data="single", verbose=False, random_state=42):
        self.data_mode = data
        self.verbose = verbose
        self.random_state = random_state

        single = data == "single"
        self.smiles_featurizer = PrecomputedFeaturizer() if single else PrecomputedFeaturizerMixed()

        # Settings common to both modes, then the per-mode tuned values.
        params = dict(
            random_seed=random_state,
            loss_function="MultiRMSE",
            depth=3,
            bootstrap_type="Bayesian",
            grow_policy="SymmetricTree",
            verbose=verbose,
        )
        if single:
            params.update(
                learning_rate=0.07, n_estimators=1050, l2_leaf_reg=3.5,
                bagging_temperature=0.225, rsm=0.75,
            )
        else:
            params.update(
                learning_rate=0.06, n_estimators=1100, l2_leaf_reg=2.5,
                bagging_temperature=0.2, rsm=0.7,
            )
        self.cat_params = params
        self.model = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit a fresh ``CatBoostRegressor`` on the featurized training data.

        ``device`` and ``verbose`` are accepted for interface compatibility
        and are not used here.
        """
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values
        self.model = CatBoostRegressor(**self.cat_params)
        self.model.fit(features, targets)

    def predict(self, X):
        """Predict targets for ``X`` as a ``torch.double`` tensor.

        Predictions are clipped at 0. When there is more than one target,
        rows whose total exceeds 1 are rescaled so that they sum to 1; rows
        with a total of at most 1 are left untouched.
        """
        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()
        preds = np.clip(self.model.predict(features), a_min=0.0, a_max=None)
        if preds.ndim == 1:
            preds = preds.reshape(-1, 1)
        if preds.shape[1] > 1:
            row_totals = preds.sum(axis=1, keepdims=True)
            # Dividing by max(total, 1) only shrinks rows that overshoot 1.
            preds = preds / np.maximum(row_totals, 1.0)
        return torch.tensor(preds, dtype=torch.double)

print("CatBoostModel defined")

CatBoostModel defined


In [6]:
# XGBoost Model
from xgboost import XGBRegressor

class XGBModel(BaseModel):
    """One ``XGBRegressor`` per target, with post-processed predictions.

    ``data="single"`` uses ``PrecomputedFeaturizer`` and the single-solvent
    hyperparameters; any other value uses ``PrecomputedFeaturizerMixed`` and
    the mixed-solvent hyperparameters.
    """

    def __init__(self, data="single", random_state=42, verbose=False):
        self.data_mode = data
        self.verbose = verbose
        self.random_state = random_state

        single = data == "single"
        self.smiles_featurizer = PrecomputedFeaturizer() if single else PrecomputedFeaturizerMixed()

        # Settings common to both modes, then the per-mode tuned values.
        params = dict(
            random_state=random_state,
            objective="reg:squarederror",
            reg_alpha=0.0,
            min_child_weight=1,
            max_delta_step=1,
            grow_policy="depthwise",
            gamma=0.0,
        )
        if single:
            params.update(
                tree_method="hist", subsample=0.5, reg_lambda=0.6,
                n_estimators=1000, max_depth=4, learning_rate=0.02,
                colsample_bytree=0.3, colsample_bylevel=0.6,
            )
        else:
            params.update(
                tree_method="approx", subsample=0.8, reg_lambda=0.5,
                n_estimators=1200, max_depth=5, learning_rate=0.015,
                colsample_bytree=0.4, colsample_bylevel=0.5,
            )
        self.xgb_params = params
        self.models = None
        self.n_targets = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit one regressor per column of ``train_Y``.

        ``device`` and ``verbose`` are accepted for interface compatibility
        and are not used here.
        """
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values
        self.n_targets = targets.shape[1]
        self.models = []
        for target_idx in range(self.n_targets):
            regressor = XGBRegressor(**self.xgb_params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, X):
        """Predict targets for ``X`` as a ``torch.double`` tensor.

        Per-target predictions are stacked column-wise and clipped at 0. When
        there is more than one target, rows whose total exceeds 1 are rescaled
        so that they sum to 1.
        """
        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()
        per_target = [regressor.predict(features) for regressor in self.models]
        preds = np.clip(np.column_stack(per_target), a_min=0.0, a_max=None)
        if preds.shape[1] > 1:
            row_totals = preds.sum(axis=1, keepdims=True)
            # Dividing by max(total, 1) only shrinks rows that overshoot 1.
            preds = preds / np.maximum(row_totals, 1.0)
        return torch.tensor(preds, dtype=torch.double)

print("XGBModel defined")

XGBModel defined


In [7]:
# Ensemble Model (from ens-model kernel)
class EnsembleModel(BaseModel):
    """Fixed-weight blend of ``CatBoostModel`` and ``XGBModel`` (from ens-model kernel).

    The raw CatBoost:XGBoost weights are 7:6 for ``data="single"`` and 1:2
    for any other value; they are normalised to sum to 1.
    """

    def __init__(self, data="single", verbose=False):
        self.data_mode = data
        self.verbose = verbose

        # Optimised fixed weights per dataset, as (catboost, xgboost).
        raw_cat, raw_xgb = (7.0, 6.0) if data == "single" else (1.0, 2.0)
        total = raw_cat + raw_xgb
        self.cat_weight = raw_cat / total
        self.xgb_weight = raw_xgb / total

        self.cat_model = CatBoostModel(data=data, verbose=verbose)
        self.xgb_model = XGBModel(data=data, verbose=verbose)

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Train both member models on the same data, CatBoost first."""
        for member in (self.cat_model, self.xgb_model):
            member.train_model(train_X, train_Y, device, verbose)

    def predict(self, X):
        """Return the weighted sum of the two members' predictions for ``X``."""
        cat_part = self.cat_weight * self.cat_model.predict(X)
        xgb_part = self.xgb_weight * self.xgb_model.predict(X)
        return cat_part + xgb_part

print("EnsembleModel defined")

EnsembleModel defined


In [8]:
# Quick CV evaluation to get CV score
import tqdm

def _run_cv_task(dataset, make_splits, model_mode, label):
    """Run one cross-validation task and return stacked (predictions, truths)."""
    X, Y = load_data(dataset)
    predicted, observed = [], []
    for _fold, split in tqdm.tqdm(enumerate(make_splits(X, Y)), desc=label):
        (train_X, train_Y), (test_X, test_Y) = split
        model = EnsembleModel(data=model_mode)
        model.train_model(train_X, train_Y)
        predicted.append(model.predict(test_X).numpy())
        observed.append(test_Y.values)
    return np.vstack(predicted), np.vstack(observed)


def evaluate_cv():
    """Cross-validate ``EnsembleModel`` on both datasets and report the MSEs.

    The single-solvent data is split with ``generate_leave_one_out_splits``
    and the full data with ``generate_leave_one_ramp_out_splits``; a fresh
    model is trained for every fold. Each task's MSE is printed as soon as
    that task finishes, followed by the combined score.

    Returns:
        ``(combined_mse, mse_single, mse_full)`` where the combined value is
        the average of the two MSEs weighted by the number of predicted
        values in each task.
    """
    # Single solvent
    preds_single, true_single = _run_cv_task(
        "single_solvent", generate_leave_one_out_splits, 'single', "single"
    )
    mse_single = np.mean((preds_single - true_single) ** 2)
    print(f"Single Solvent MSE: {mse_single:.6f}")

    # Full data
    preds_full, true_full = _run_cv_task(
        "full", generate_leave_one_ramp_out_splits, 'full', "full"
    )
    mse_full = np.mean((preds_full - true_full) ** 2)
    print(f"Full Data MSE: {mse_full:.6f}")

    # Combined: weight each task's MSE by its count of predicted values.
    n_single = preds_single.shape[0] * preds_single.shape[1]
    n_full = preds_full.shape[0] * preds_full.shape[1]
    weighted_sum = mse_single * n_single + mse_full * n_full
    combined_mse = weighted_sum / (n_single + n_full)
    print(f"\nCombined MSE (CV score): {combined_mse:.6f}")

    return combined_mse, mse_single, mse_full

cv_score, mse_single, mse_full = evaluate_cv()

single: 0it [00:00, ?it/s]

>>> Building solvent feature table...
Combined feature table shape (before corr filter): (26, 113)
Final solvent feature table shape: (26, 66)


single: 1it [00:01,  1.31s/it]

single: 2it [00:02,  1.08s/it]

single: 3it [00:03,  1.01s/it]

single: 4it [00:04,  1.02it/s]

single: 5it [00:05,  1.03it/s]

single: 6it [00:05,  1.04it/s]

single: 7it [00:06,  1.04it/s]

single: 8it [00:07,  1.05it/s]

single: 9it [00:08,  1.06it/s]

single: 10it [00:09,  1.06it/s]

single: 11it [00:10,  1.06it/s]

single: 12it [00:11,  1.07it/s]

single: 13it [00:12,  1.07it/s]

single: 14it [00:13,  1.03it/s]

single: 15it [00:14,  1.05it/s]

single: 16it [00:15,  1.06it/s]

single: 17it [00:16,  1.07it/s]

single: 18it [00:17,  1.07it/s]

single: 19it [00:18,  1.06it/s]

single: 20it [00:19,  1.06it/s]

single: 21it [00:20,  1.06it/s]

single: 22it [00:21,  1.07it/s]

single: 23it [00:22,  1.06it/s]

single: 24it [00:22,  1.07it/s]

single: 24it [00:22,  1.05it/s]




Single Solvent MSE: 0.008724


full: 0it [00:00, ?it/s]

full: 1it [00:01,  1.97s/it]

full: 2it [00:03,  1.96s/it]

full: 3it [00:05,  1.97s/it]

full: 4it [00:07,  1.97s/it]

full: 5it [00:09,  1.98s/it]

full: 6it [00:11,  1.96s/it]

full: 7it [00:13,  1.92s/it]

full: 8it [00:15,  1.94s/it]

full: 9it [00:17,  1.95s/it]

full: 10it [00:19,  1.96s/it]

full: 11it [00:21,  1.98s/it]

full: 12it [00:23,  1.97s/it]

full: 13it [00:25,  1.95s/it]

full: 13it [00:25,  1.96s/it]

Full Data MSE: 0.013733

Combined MSE (CV score): 0.011988





In [9]:
# Save metrics
import json

# Gather the CV results as plain Python floats so they serialise to JSON.
metrics = dict(
    cv_score=float(cv_score),
    mse_single=float(mse_single),
    mse_full=float(mse_full),
    notes='Ens-model kernel approach (CatBoost + XGBoost ensemble) with CORRECT submission format.',
)

# Persist the metrics next to the experiment's other artefacts.
with open('/home/code/experiments/109_ens_model_correct_format/metrics.json', 'w') as f:
    f.write(json.dumps(metrics, indent=2))

print("Metrics saved")
print(f"\nCV Score: {cv_score:.6f}")

Metrics saved

CV Score: 0.011988


## Submission Cells (CORRECT FORMAT)

These cells use the EXACT format from the template:
- task, fold, row, target_1, target_2, target_3 columns
- task=0 for single_solvent, task=1 for full data

In [13]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:01,  1.06s/it]

2it [00:02,  1.00it/s]

3it [00:03,  1.00s/it]

4it [00:03,  1.03it/s]

5it [00:04,  1.04it/s]

6it [00:05,  1.05it/s]

7it [00:06,  1.04it/s]

8it [00:07,  1.02it/s]

9it [00:08,  1.03it/s]

10it [00:09,  1.05it/s]

11it [00:10,  1.06it/s]

12it [00:11,  1.06it/s]

13it [00:12,  1.06it/s]

14it [00:13,  1.07it/s]

15it [00:14,  1.07it/s]

16it [00:15,  1.07it/s]

17it [00:16,  1.07it/s]

18it [00:17,  1.06it/s]

19it [00:18,  1.07it/s]

20it [00:19,  1.03it/s]

21it [00:20,  1.04it/s]

22it [00:21,  1.03it/s]

23it [00:22,  1.02it/s]

24it [00:23,  1.00it/s]

24it [00:23,  1.04it/s]




In [14]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:02,  2.00s/it]

2it [00:04,  2.00s/it]

3it [00:06,  2.02s/it]

4it [00:08,  2.03s/it]

5it [00:10,  2.03s/it]

6it [00:12,  2.02s/it]

7it [00:14,  2.01s/it]

8it [00:16,  2.01s/it]

9it [00:18,  2.00s/it]

10it [00:20,  2.00s/it]

11it [00:22,  2.01s/it]

12it [00:24,  2.07s/it]

13it [00:26,  2.06s/it]

13it [00:26,  2.03s/it]




In [15]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index(drop=True)  # Drop the old index
submission.index.name = "id"
submission.to_csv("/home/code/experiments/109_ens_model_correct_format/submission.csv", index=True)

# Also copy to main submission folder
import shutil
shutil.copy("/home/code/experiments/109_ens_model_correct_format/submission.csv", "/home/submission/submission.csv")

print(f"Submission shape: {submission.shape}")
print(f"Submission columns: {submission.columns.tolist()}")
print(f"\nFirst 5 rows:")
print(submission.head())
print(f"\nLast 5 rows:")
print(submission.tail())

# Read back and verify format
sub_check = pd.read_csv("/home/submission/submission.csv")
print(f"\nRead back columns: {sub_check.columns.tolist()}")
expected_cols = ['id', 'task', 'fold', 'row', 'target_1', 'target_2', 'target_3']
assert list(sub_check.columns) == expected_cols, f"Wrong columns: {list(sub_check.columns)}"
print(f"\n✅ FORMAT VERIFIED: {expected_cols}")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

Submission shape: (1883, 6)
Submission columns: ['task', 'fold', 'row', 'target_1', 'target_2', 'target_3']

First 5 rows:
    task  fold  row  target_1  target_2  target_3
id                                               
0      0     0    0  0.025106  0.024003  0.864946
1      0     0    1  0.029176  0.028944  0.873540
2      0     0    2  0.035391  0.041936  0.839781
3      0     0    3  0.060277  0.068248  0.753215
4      0     0    4  0.081435  0.091881  0.700487

Last 5 rows:
      task  fold  row  target_1  target_2  target_3
id                                                 
1878     1    12   30  0.079368  0.075368  0.682003
1879     1    12   31  0.079368  0.075368  0.682003
1880     1    12   32  0.079368  0.075368  0.682003
1881     1    12   33  0.079368  0.075368  0.682003
1882     1    12   34  0.079368  0.075368  0.682003

Read back columns: ['id', 'task', 'fold', 'row', 'target_1', 'target_2', 'target_3']

✅ FORMAT VERIFIED: ['id', 'task', 'fold', 'row', 'target_1', '