In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory currently being visited with the file name.
        print(os.path.join(dirname, filename))
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
sys.path.append('/kaggle/input/catechol-benchmark-hackathon/')

from utils import INPUT_LABELS_FULL_SOLVENT, INPUT_LABELS_SINGLE_SOLVENT, INPUT_LABELS_NUMERIC, INPUT_LABELS_SINGLE_FEATURES, INPUT_LABELS_FULL_FEATURES, load_data, load_features, generate_leave_one_out_splits, generate_leave_one_ramp_out_splits

In [None]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """
    Common base class for the featurizers defined in this notebook.

    Subclasses provide their own ``__init__`` and ``featurize``; both
    implementations here only raise ``NotImplementedError``.
    """
    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """
        Convert the raw inputs ``X`` into model features.

        The signature matches the ``featurize(self, X)`` methods implemented
        by the subclasses. The base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError

class BaseModel(ABC):
    """
    Common base class for the models defined in this notebook.

    Subclasses override ``train_model`` and ``predict``; the implementations
    here only raise ``NotImplementedError``.
    """
    def __init__(self):
        pass

    def train_model(self, X_train, y_train, device = None, verbose = False):
        """
        Fit the model on ``X_train`` / ``y_train``.

        ``device`` and ``verbose`` are optional so that the base signature
        accepts the same keywords as the subclasses in this notebook.
        Always raises ``NotImplementedError`` here.
        """
        raise NotImplementedError

    def predict(self, X = None):
        """
        Return predictions for ``X``.

        ``X`` defaults to ``None`` only so that the historical zero-argument
        call keeps raising ``NotImplementedError`` rather than ``TypeError``.
        """
        raise NotImplementedError

In [None]:
# Module-level cache for the combined solvent feature table; it stays None
# until build_solvent_feature_table() stores its result here.
_SOLVENT_TABLE_CACHE = None

from functools import reduce

import torch

# Use double precision as torch's default dtype for the rest of the notebook.
torch.set_default_dtype(torch.double)


def feature_priority(name : str) -> int:
    """
    Rank a feature by the source encoded in its column-name prefix.

    A larger value means the feature is preferred when one of two correlated
    columns has to be discarded. Names without a recognised prefix score 0.
    """
    # (prefix, score) pairs, highest priority first.
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    return next(
        (score for prefix, score in ranked_prefixes if name.startswith(prefix)),
        0,
    )


def filter_correlated_features(
    df : pd.DataFrame,
    threshold : float = 0.8,
):
    """
    Drop columns that are highly correlated with any other column.

    Logic:
      - Only numeric columns are considered; other columns are left in place.
      - Constant (zero-variance) columns are removed first.
      - Find all pairs with |corr| > threshold.
      - For each pair (visited in column order), drop ONE feature unless one
        of the two is already marked for removal:
          * Prefer to KEEP higher-priority prefixes (spange > acs > drfps > frag > smiles).
          * If equal priority, drop the one that appears later in the column order.

    Parameters:
      df        : table whose numeric columns are screened.
      threshold : absolute-correlation cut-off; pairs strictly above it are filtered.

    Returns:
      (df_filtered, all_to_drop) where ``all_to_drop`` lists the removed
      column names in the order they appear in ``df`` (deterministic).
    """
    numeric_df = df.select_dtypes(include = [np.number])

    print(f"[filter_correlated_features] numeric shape: {numeric_df.shape}")

    if numeric_df.shape[1] == 0:
        print("No numeric columns found, skipping correlation filter.")
        return df, []

    # Drop constant columns first (std = 0) to avoid NaNs in correlation
    std = numeric_df.std(axis = 0)
    constant_cols = std[std == 0].index.tolist()
    if constant_cols:
        print(
            f"[filter_correlated_features] dropping {len(constant_cols)} "
            f"constant columns before corr"
        )
        numeric_df = numeric_df.drop(columns = constant_cols)

    # Absolute correlation matrix as a plain array; NaN entries count as 0.
    corr = numeric_df.corr().abs()
    cols = corr.columns.tolist()
    corr_values = corr.to_numpy(dtype = float)
    corr_values = np.where(np.isnan(corr_values), 0.0, corr_values)

    # Strict upper triangle only, so every pair is seen once with i < j.
    # np.nonzero returns indices in row-major order, i.e. the same order a
    # nested "for i / for j > i" loop would visit them, without per-cell lookups.
    pair_rows, pair_cols = np.nonzero(np.triu(corr_values > threshold, k = 1))

    print(
        f"[filter_correlated_features] found {len(pair_rows)} "
        f"pairs with |corr| > {threshold}"
    )

    # For each pair, decide which column to drop
    to_drop = set()
    for i, j in zip(pair_rows.tolist(), pair_cols.tolist()):
        col_i, col_j = cols[i], cols[j]

        # If either already marked to drop, skip
        if col_i in to_drop or col_j in to_drop:
            continue

        # col_j is the later column (i < j and select_dtypes keeps column
        # order), so it is dropped unless it has strictly higher priority.
        if feature_priority(col_j) > feature_priority(col_i):
            to_drop.add(col_i)
        else:
            to_drop.add(col_j)

    # Merge with constant cols, reported in original column order so the
    # returned list does not depend on set iteration order.
    drop_set = set(constant_cols) | to_drop
    all_to_drop = [c for c in dict.fromkeys(df.columns) if c in drop_set]

    print(
        f"[filter_correlated_features] threshold = {threshold}, "
        f"dropping {len(to_drop)} correlated + {len(constant_cols)} constant "
        f"= {len(all_to_drop)} total columns"
    )

    df_filtered = df.drop(columns = all_to_drop, errors = "ignore")

    return df_filtered, all_to_drop


# ---------------- NUMERIC FEATURE ENGINEERING ---------------- #

def add_numeric_features(
    X_numeric : pd.DataFrame,
    rt_reference : float = None,
) -> pd.DataFrame:
    """
    Add engineered numeric features (e.g. temperature transformations,
    interaction terms, and scaled residence time).

    When both "Temperature" and "Residence Time" are present, a copy of the
    input is returned in which "Temperature" has 273.15 added and the columns
    "T_x_RT", "RT_log", "T_inv" and "RT_scaled" are appended (in that order).
    Otherwise an unmodified copy is returned. The input frame is never mutated.

    Parameters:
      X_numeric    : frame holding the numeric inputs.
      rt_reference : divisor used for "RT_scaled". When None (default) the mean
                     residence time of the rows passed in is used, which is the
                     historical behaviour. NOTE: with the default, two different
                     batches (e.g. train and test) are scaled by different
                     constants; pass a fixed reference to keep them comparable.
    """
    X_num = X_numeric.copy()

    if not {"Temperature", "Residence Time"} <= set(X_num.columns):
        return X_num

    # Shift Temperature by 273.15 (Celsius -> Kelvin, per the original comment)
    X_num["Temperature"] = X_num["Temperature"] + 273.15

    T = X_num["Temperature"]
    rt = X_num["Residence Time"]

    # Interaction term
    X_num["T_x_RT"] = T * rt

    # Log transformation (small offset avoids log(0))
    X_num["RT_log"] = np.log(rt + 1e-6)

    # Inverse temperature
    X_num["T_inv"] = 1 / T

    # Scaled residence time
    if rt_reference is None:
        rt_reference = rt.mean()
    X_num["RT_scaled"] = rt / rt_reference

    return X_num


# ---------------- SOLVENT TABLE COMBINER ---------------- #

def build_solvent_feature_table(threshold : float = 0.90):
    """
    Build and cache a combined solvent feature table from multiple sources,
    then apply correlation-based feature filtering.

    Each source is loaded with ``load_features``, its feature columns are
    prefixed with a short source tag, and the per-source tables are
    inner-joined on "SOLVENT NAME". The joined table is passed through
    ``filter_correlated_features`` with ``threshold``.

    Caching: the result is stored in the module-level ``_SOLVENT_TABLE_CACHE``
    together with the threshold it was built for. A later call with the same
    threshold returns the cached table; a call with a different threshold
    rebuilds it instead of silently returning a table filtered at the old
    threshold.
    """
    global _SOLVENT_TABLE_CACHE

    # Threshold of the cached table; None means "unknown" (e.g. the cache was
    # populated by other code), in which case the cached table is reused as-is.
    cached_threshold = getattr(build_solvent_feature_table, "_cached_threshold", None)
    if _SOLVENT_TABLE_CACHE is not None and cached_threshold in (None, threshold):
        return _SOLVENT_TABLE_CACHE

    if _SOLVENT_TABLE_CACHE is None:
        print(">>> Building solvent feature table (first and only time)...")
    else:
        print(f">>> Rebuilding solvent feature table for threshold = {threshold}...")

    # Feature source -> column prefix, in join order.
    source_prefixes = {
        "spange_descriptors" : "spange",
        "acs_pca_descriptors" : "acs",
        "drfps_catechol" : "drfps",
        "fragprints" : "frag",
        "smiles" : "smiles",
    }
    # Sources treated as binary fingerprints and pre-filtered bit-wise.
    binary_sources = {"drfps_catechol", "fragprints"}

    dfs = []

    for src, prefix in source_prefixes.items():
        df_src = load_features(src).copy()

        if "SOLVENT NAME" not in df_src.columns:
            df_src = df_src.reset_index().rename(columns = {"index" : "SOLVENT NAME"})

        # --- Bit-table filtering for binary fingerprints ---
        if src in binary_sources:
            # Drop all-zero and all-one columns
            df_src = df_src.loc[:, (df_src != 0).any(axis = 0)]
            df_src = df_src.loc[:, (df_src != 1).any(axis = 0)]

            # Drop columns whose values sum to exactly 1
            bit_counts = df_src.drop(columns = ["SOLVENT NAME"]).sum(axis = 0)
            df_src = df_src.drop(columns = bit_counts[bit_counts == 1].index)

        feature_cols = [c for c in df_src.columns if c != "SOLVENT NAME"]
        df_src = df_src.rename(columns = {c : f"{prefix}_{c}" for c in feature_cols})

        # Drop any SMILES-like columns that slipped through.
        # NOTE: the "smiles" prefix itself matches this test, so every column
        # of the "smiles" source is removed here; that source then only
        # influences which solvents survive the inner join below.
        smiles_like = [c for c in df_src.columns if "SMILES" in c.upper()]
        df_src = df_src.drop(columns = smiles_like, errors = "ignore")

        df_src = df_src.set_index("SOLVENT NAME")
        dfs.append(df_src)

    # Join all feature tables on solvent name
    featurizer = reduce(lambda l, r : l.join(r, how = "inner"), dfs)
    print(f"Combined feature table shape (before corr filter): {featurizer.shape}")

    featurizer_filtered, dropped_cols = filter_correlated_features(
        featurizer,
        threshold = threshold,
    )

    print(f"Dropped {len(dropped_cols)} columns at threshold = {threshold}")
    print(f"Final solvent feature table shape: {featurizer_filtered.shape}")

    # Cache the final table and remember which threshold produced it
    _SOLVENT_TABLE_CACHE = featurizer_filtered
    build_solvent_feature_table._cached_threshold = threshold

    return featurizer_filtered


# ---------------- SINGLE-SOLVENT FEATURIZER ---------------- #

class PrecomputedFeaturizer(SmilesFeaturizer):
    """
    Single-solvent featurizer.

    Every output row is the engineered numeric columns followed by the row of
    the shared solvent descriptor table that belongs to the experiment's
    "SOLVENT NAME".
    """
    def __init__(self):
        # Shared solvent descriptor table (built once, then served from cache).
        solvent_table = build_solvent_feature_table()
        self.featurizer = solvent_table

        # Push one all-zero row through the numeric pipeline purely to count
        # how many numeric columns it produces.
        probe_row = pd.DataFrame(
            [[0] * len(INPUT_LABELS_NUMERIC)],
            columns = INPUT_LABELS_NUMERIC,
        )
        n_numeric = add_numeric_features(probe_row).shape[1]
        n_solvent = solvent_table.shape[1]

        self.feats_dim = n_numeric + n_solvent

    def featurize(self, X):
        """
        Return a double-precision tensor with one feature row per row of ``X``.

        Note that the numeric features are computed from the rows of ``X``
        alone, so batch-dependent columns are relative to this batch.
        """
        numeric_part = add_numeric_features(X[INPUT_LABELS_NUMERIC].copy())
        # Look up the descriptor row of each experiment's solvent by name.
        solvent_part = self.featurizer.loc[X["SOLVENT NAME"]]

        features = np.hstack([numeric_part.values, solvent_part.values])

        return torch.tensor(features, dtype = torch.double)


# ---------------- MIXED-SOLVENT FEATURIZER ---------------- #

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """
    Mixed-solvent featurizer.

    Every output row is the engineered numeric columns followed by a linear
    blend of the descriptor rows of solvent A and solvent B, weighted by the
    "SolventB%" column.
    """
    def __init__(self):
        # Shared solvent descriptor table (built once, then served from cache).
        solvent_table = build_solvent_feature_table()
        self.featurizer = solvent_table

        # One all-zero row is enough to learn how many numeric columns the
        # feature-engineering step produces.
        probe_row = pd.DataFrame(
            [[0] * len(INPUT_LABELS_NUMERIC)],
            columns = INPUT_LABELS_NUMERIC,
        )
        n_numeric = add_numeric_features(probe_row).shape[1]
        n_solvent = solvent_table.shape[1]

        self.feats_dim = n_numeric + n_solvent

    def featurize(self, X):
        """
        Return a double-precision tensor with one feature row per row of ``X``.
        """
        numeric_part = add_numeric_features(X[INPUT_LABELS_NUMERIC].copy())

        table = self.featurizer
        feats_a = table.loc[X["SOLVENT A NAME"]].values
        feats_b = table.loc[X["SOLVENT B NAME"]].values

        # Column vector of blend weights so it broadcasts across descriptor columns.
        # The blend below presumes "SolventB%" is a 0-1 fraction — TODO confirm.
        weight_b = X["SolventB%"].values.reshape(-1, 1)
        weight_a = 1 - weight_b

        blended = feats_a * weight_a + feats_b * weight_b

        features = np.hstack([numeric_part.values, blended])

        return torch.tensor(features, dtype = torch.double)

In [None]:
from catboost import CatBoostRegressor


class CatBoostModel(BaseModel):
    """
    CatBoost-based model for reaction yields.

    Uses different hyperparameters for:
      - data="single": single-solvent dataset
      - data!="single": full / mixed-solvent dataset

    A single CatBoostRegressor with the "MultiRMSE" loss is fitted on all
    targets at once.
    """

    def __init__(
        self,
        data: str = "single",
        verbose: bool = False,
        random_state: int = 42,
    ):
        """
        Pick the featurizer and CatBoost hyperparameters for ``data``.

        ``verbose`` is passed to CatBoost and also enables the training
        summary print; ``random_state`` is used as CatBoost's ``random_seed``.
        """
        self.data_mode = data
        self.verbose = verbose
        self.random_state = random_state

        # Select featurizer and tuned CatBoost parameters based on data mode
        if data == "single":
            self.smiles_featurizer = PrecomputedFeaturizer()

            self.cat_params = dict(
                random_seed = random_state,
                loss_function = "MultiRMSE",
                depth = 3,
                learning_rate = 0.07,
                n_estimators = 1050,
                l2_leaf_reg = 3.5,
                bootstrap_type = "Bayesian",
                bagging_temperature = 0.225,
                grow_policy = "SymmetricTree",
                rsm = 0.75,
                verbose = verbose,
            )

        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed()

            self.cat_params = dict(
                random_seed = random_state,
                loss_function = "MultiRMSE",
                depth = 3,
                learning_rate = 0.06,
                n_estimators = 1100,
                l2_leaf_reg = 2.5,
                bootstrap_type = "Bayesian",
                bagging_temperature = 0.25,
                grow_policy = "SymmetricTree",
                rsm = 0.75,
                verbose = verbose,
            )

        # Populated by train_model()
        self.model = None
        self.n_targets = None

    def train_model(
        self,
        train_X,
        train_Y,
        device = None,
        verbose: bool = False,
    ):
        """
        Featurize inputs and fit CatBoostRegressor on multi-target labels.

        ``device`` is accepted for interface compatibility and is not used.
        ``verbose`` (or the instance-level flag) prints a short summary
        once fitting has finished.
        """
        # Featurize the inputs, then convert to NumPy for CatBoost
        X_tensor = self.smiles_featurizer.featurize(train_X)
        X_np = X_tensor.detach().cpu().numpy()

        # train_Y expected as shape (n_samples, n_targets)
        Y_np = train_Y.values
        self.n_targets = Y_np.shape[1]

        self.model = CatBoostRegressor(**self.cat_params)
        self.model.fit(X_np, Y_np)

        if verbose or self.verbose:
            print(
                f"[CatBoostModel] Training complete in '{self.data_mode}' mode "
                f"with {self.n_targets} target(s)."
            )

    def predict(self, X):
        """
        Predict yields, clip negatives to 0, and for multi-target outputs
        ensure non-negative rows with sum <= 1 by down-scaling if needed.

        Returns a double-precision tensor of shape (n_samples, n_targets).
        Raises RuntimeError when called before train_model().
        """
        if self.model is None:
            raise RuntimeError("Model is not trained. Call train_model(...) first.")

        # Featurize and convert to NumPy for prediction
        X_tensor = self.smiles_featurizer.featurize(X)
        X_np = X_tensor.detach().cpu().numpy()

        out = np.asarray(self.model.predict(X_np))

        # Ensure 2D shape: (n_samples, n_targets)
        if out.ndim == 1:
            out = out.reshape(-1, 1)

        # Clip to non-negative yields
        out = np.clip(out, a_min = 0.0, a_max = None)

        # For multi-target: if row-sum > 1, scale down so sum becomes 1.
        # Rows whose sum is already <= 1 are divided by 1 and stay unchanged.
        if out.shape[1] > 1:
            totals = out.sum(axis = 1, keepdims = True)
            divisor = np.maximum(totals, 1.0)
            out = out / divisor

        return torch.tensor(out, dtype = torch.double)

In [None]:
from xgboost import XGBRegressor

class XGBModel(BaseModel):
    """
    Reaction-yield model built from XGBoost regressors, one per target.

    Hyperparameters depend on the dataset:
      - data = "single": single-solvent dataset
      - data != "single": full / mixed-solvent dataset
    """

    def __init__(
        self,
        data : str = "single",
        random_state : int = 42,
        verbose : bool = False,
    ):
        self.data_mode = data
        self.verbose = verbose
        self.random_state = random_state

        # Settings shared by both dataset modes.
        shared_params = dict(
            random_state = random_state,
            objective = "reg:squarederror",
            subsample = 0.5,
            reg_lambda = 0.6,
            reg_alpha = 0.0,
            n_estimators = 1000,
            min_child_weight = 1,
            max_depth = 4,
            max_delta_step = 1,
            learning_rate = 0.02,
            gamma = 0.0,
            colsample_bytree = 0.3,
            colsample_bylevel = 0.6,
        )

        # Only the tree construction method and growth policy differ by mode.
        if data == "single":
            self.smiles_featurizer = PrecomputedFeaturizer()
            self.xgb_params = dict(
                shared_params,
                tree_method = "hist",
                grow_policy = "depthwise",
            )
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed()
            self.xgb_params = dict(
                shared_params,
                tree_method = "approx",
                grow_policy = "lossguide",
            )

        # Filled in by train_model()
        self.models = None
        self.n_targets = None

    def train_model(
        self,
        train_X,
        train_Y,
        device = None,
        verbose : bool = False,
    ):
        """
        Featurize the inputs and fit a separate XGBRegressor for every target
        column of ``train_Y``. ``device`` is accepted but not used.
        """
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()

        targets = train_Y.values
        self.n_targets = targets.shape[1]

        self.models = []
        for target_idx in range(self.n_targets):
            regressor = XGBRegressor(**self.xgb_params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

        if self.verbose or verbose:
            print(
                f"[XGBModel] Training complete in '{self.data_mode}' mode "
                f"with {self.n_targets} target(s) and {len(self.models)} model(s)."
            )

    def predict(self, X):
        """
        Predict yields with every per-target model, clip negatives to 0 and,
        when there is more than one target, scale down rows whose sum exceeds 1
        so that they sum to 1. Returns a double-precision tensor.
        """
        if self.models is None or self.n_targets is None:
            raise RuntimeError("Model is not trained. Call train_model(...) first.")

        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()

        # One prediction column per target model.
        out = np.column_stack([model.predict(features) for model in self.models])
        out = np.clip(out, a_min = 0.0, a_max = None)

        if out.shape[1] > 1:
            # Dividing by max(row_sum, 1) leaves rows with sum <= 1 untouched.
            row_sums = out.sum(axis = 1, keepdims = True)
            out = out / np.maximum(row_sums, 1.0)

        return torch.tensor(out, dtype = torch.double)

In [None]:
from sklearn.ensemble import RandomForestRegressor


class RFModel(BaseModel):
    """
    Reaction-yield model built from random forests, one regressor per target,
    with the same clipping / renormalisation post-processing as the CatBoost
    and XGB models.

    This model performed much worse than the CatBoost and XGB models and hurt
    the ensemble when it was added, so it is not actually used.
    """

    def __init__(
        self,
        data : str = "single",
        random_state : int = 42,
        verbose : bool = False,
    ):
        self.data_mode = data
        self.verbose = verbose
        self.random_state = random_state

        # Settings shared by both dataset modes.
        shared_params = dict(
            random_state = random_state,
            min_samples_split = 2,
            min_samples_leaf = 1,
            max_features = "sqrt",
            bootstrap = True,
        )

        # Forest size and depth limit are the only mode-specific settings.
        if data == "single":
            self.smiles_featurizer = PrecomputedFeaturizer()
            self.rf_params = dict(
                shared_params,
                n_estimators = 450,
                max_depth = 10,
            )
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed()
            self.rf_params = dict(
                shared_params,
                n_estimators = 300,
                max_depth = None,
            )

        # Filled in by train_model()
        self.models = None
        self.n_targets = None

    def train_model(
        self,
        train_X,
        train_Y,
        device = None,
        verbose : bool = False,
    ):
        """
        Featurize the inputs and fit a separate RandomForestRegressor for
        every target column of ``train_Y``. ``device`` is accepted but not used.
        """
        features = self.smiles_featurizer.featurize(train_X).detach().cpu().numpy()

        targets = train_Y.values
        self.n_targets = targets.shape[1]

        self.models = []
        for target_idx in range(self.n_targets):
            forest = RandomForestRegressor(**self.rf_params)
            forest.fit(features, targets[:, target_idx])
            self.models.append(forest)

        if self.verbose or verbose:
            print(
                f"[RFModel] Training complete in '{self.data_mode}' mode "
                f"with {self.n_targets} target(s) and {len(self.models)} model(s)."
            )

    def predict(self, X):
        """
        Predict yields, clip negatives to 0 and, when there is more than one
        target, scale down rows whose sum exceeds 1 so that they sum to 1.
        Returns a double-precision tensor.
        """
        if self.models is None:
            raise RuntimeError("RFModel not trained")

        features = self.smiles_featurizer.featurize(X).detach().cpu().numpy()

        # One prediction column per target model.
        out = np.column_stack([forest.predict(features) for forest in self.models])
        out = np.clip(out, a_min = 0.0, a_max = None)

        if out.shape[1] > 1:
            # Dividing by max(row_sum, 1) leaves rows with sum <= 1 untouched.
            row_sums = out.sum(axis = 1, keepdims = True)
            out = out / np.maximum(row_sums, 1.0)

        return torch.tensor(out, dtype = torch.double)

In [None]:
class EnsembleModel(BaseModel):
    """
    Fixed-weight blend of a CatBoostModel and an XGBModel.

    Both base models are trained on the same data and predict independently;
    the ensemble output is the weighted average of their predictions. Clipping
    and renormalisation are done inside the base models, not here.
    """

    def __init__(
        self,
        data : str = "single",
        verbose : bool = False,
    ):
        self.data_mode = data
        self.verbose = verbose

        # Raw (cat, xgb) weights per dataset; the second pair is used for the
        # multi-solvent / full dataset.
        raw_cat, raw_xgb = (7.0, 6.0) if data == "single" else (1.0, 2.0)

        # Normalise so the two weights sum to 1.
        total = raw_cat + raw_xgb
        self.cat_weight = raw_cat / total
        self.xgb_weight = raw_xgb / total

        # Base models with their fixed, mode-specific hyperparameters.
        self.cat_model = CatBoostModel(data = data)
        self.xgb_model = XGBModel(data = data)

    def train_model(
        self,
        train_X,
        train_Y,
        device = None,
        verbose : bool = False,
    ):
        """
        Fit both base models on the same training data.
        ``device`` is accepted but not used.
        """
        for base_model in (self.cat_model, self.xgb_model):
            base_model.train_model(train_X, train_Y)

        if self.verbose or verbose:
            print(
                f"[EnsembleModel] Trained CatBoost and XGB models "
                f"in '{self.data_mode}' mode."
            )

    def predict(self, X):
        """
        Return the weighted average of the two base-model predictions.
        The base models have already clipped and renormalised their outputs.
        """
        cat_pred = self.cat_model.predict(X)
        xgb_pred = self.xgb_model.predict(X)

        return self.cat_weight * cat_pred + self.xgb_weight * xgb_pred

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel() # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################