# Finetuning Pair Model
**Install module from git**

In [1]:
!pip install git+https://github.com/laurahsisson/dream.git#egg=odor-pair
!pip install optuna

Collecting odor-pair
  Cloning https://github.com/laurahsisson/dream.git to /tmp/pip-install-clil40mn/odor-pair_10e4f9d5475b458b970024ce9805a939
  Running command git clone --filter=blob:none --quiet https://github.com/laurahsisson/dream.git /tmp/pip-install-clil40mn/odor-pair_10e4f9d5475b458b970024ce9805a939
  Resolved https://github.com/laurahsisson/dream.git to commit 32c25530535aa8354107ee6f587afd691ba6c1f0
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-geometric (from odor-pair)
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit (from odor-pair)
  Downloading rdkit-2025.3.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Collecting ogb (from odor-pair)
  Downloading ogb-1.3.6-py3-none-any.whl.metadata (6.2 kB)
Collecting outdated>=0.2.0 (from ogb->odor-pair)
  Downloading outdated-0.2.2-py2.

**Initialize seeds**

In [2]:
import torch

SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7b807eaeabd0>

**Read in inputs**

In [3]:
import os
import pandas as pd

WORKSPACE_DIR = "drive/MyDrive/MIXTURE"
DATA_DIR = os.path.join(WORKSPACE_DIR,"Data")

def read_and_parse(fname):
  df = pd.read_csv(os.path.join(DATA_DIR,fname))
  df["components_SMILES"] = df["components_SMILES"].apply(lambda cs: cs.split(";"))
  return df

all_train_data_df = read_and_parse("train_data.csv")
print("Train Data")
display(all_train_data_df.head(3))

test_form_df = read_and_parse("test_form.csv")
print("Test Form")
display(test_form_df.head(3))

Train Data


Unnamed: 0,stimulus,Green,Cucumber,Herbal,Mint,Woody,Pine,Floral,Powdery,Fruity,...,Animal,Medicinal,Cooling,Sharp,Chlorine,Alcoholic,Plastic,Ozone,Metallic,components_SMILES
0,AA007,0.653846,2.807692,0.076923,0.038462,0.0,0.115385,0.461538,0.5,0.192308,...,0.192308,0.153846,0.269231,0.0,0.346154,0.076923,0.269231,0.0,0.0,"[CC/C=C\CCCCC=O, CC1COCC2=CC3=C(C=C12)C(C(C3(C..."
1,AA085,0.076923,0.0,0.692308,0.730769,0.0,0.0,0.153846,0.0,0.153846,...,0.0,2.346154,2.346154,0.923077,0.076923,0.692308,0.0,0.115385,0.0,"[CCOC(C)OCC, CC1(CCCC(N1[O])(C)C)C]"
2,AA088,0.884615,0.038462,0.692308,0.115385,0.153846,0.0,0.192308,0.307692,0.230769,...,0.0,0.846154,0.038462,0.038462,0.653846,0.346154,0.192308,0.384615,0.192308,"[COC1=C(C=C(C=C1)CC=C)OC, CC(C)CC=O]"


Test Form


Unnamed: 0,stimulus,Green,Cucumber,Herbal,Mint,Woody,Pine,Floral,Powdery,Fruity,...,Animal,Medicinal,Cooling,Sharp,Chlorine,Alcoholic,Plastic,Ozone,Metallic,components_SMILES
0,AA322,,,,,,,,,,...,,,,,,,,,,"[CCCC(=O)SC, CCCCCC1CCCC(=O)O1]"
1,AA374,,,,,,,,,,...,,,,,,,,,,"[CSCCC=O, CC(C1CCCCC1)OC(=O)C]"
2,AA444,,,,,,,,,,...,,,,,,,,,,"[CCOC(C)OCC, CC(=O)C1=CC=CC=N1]"


In [4]:
NOTES_COLS = [c for c in all_train_data_df.columns if not c in {'stimulus', 'components_SMILES'}]
print("Max Val", all_train_data_df[NOTES_COLS].max(axis=None))
len(NOTES_COLS), NOTES_COLS

Max Val 4.5


(51,
 ['Green',
  'Cucumber',
  'Herbal',
  'Mint',
  'Woody',
  'Pine',
  'Floral',
  'Powdery',
  'Fruity',
  'Citrus',
  'Tropical',
  'Berry',
  'Peach',
  'Sweet',
  'Caramellic',
  'Vanilla',
  'BrownSpice',
  'Smoky',
  'Burnt',
  'Roasted',
  'Grainy',
  'Meaty',
  'Nutty',
  'Fatty',
  'Coconut',
  'Waxy',
  'Dairy',
  'Buttery',
  'Cheesy',
  'Sour',
  'Fermented',
  'Sulfurous',
  'Garlic.Onion',
  'Earthy',
  'Mushroom',
  'Musty',
  'Ammonia',
  'Fishy',
  'Fecal',
  'Rotten.Decay',
  'Rubber',
  'Phenolic',
  'Animal',
  'Medicinal',
  'Cooling',
  'Sharp',
  'Chlorine',
  'Alcoholic',
  'Plastic',
  'Ozone',
  'Metallic'])

In [5]:
TARGET_CAPS = torch.tensor(all_train_data_df[NOTES_COLS].max().values,
                           dtype=torch.float32)
TARGET_CAPS

tensor([2.9333, 4.1176, 1.9667, 4.5000, 2.1000, 1.2500, 2.2667, 1.4000, 3.1667,
        3.8000, 1.8000, 2.3667, 1.0000, 3.6667, 2.2667, 2.2000, 3.3667, 2.2222,
        1.7667, 2.6667, 2.5862, 1.8621, 2.2759, 1.4412, 1.4667, 1.9000, 1.7778,
        1.6923, 1.9667, 1.6111, 1.3571, 1.7000, 4.1509, 1.6000, 1.0741, 1.7000,
        3.6000, 2.0370, 1.2059, 2.4333, 1.6000, 1.4444, 0.8667, 3.1154, 2.5385,
        2.2667, 1.4000, 4.2593, 1.1071, 0.9000, 0.7333])

In [6]:
import json

with open(os.path.join(DATA_DIR,"cid_to_smiles.json")) as f:
  cid_to_smiles = json.load(f)

len(cid_to_smiles), next(iter(cid_to_smiles.items()))

(209, ('10313079', 'CN(C)C.Cl'))

**Convert SMILES to torch graph**

In [7]:
from tqdm.notebook import tqdm
from ogb.utils import smiles2graph
from odorpair import pairdata

graph_data = dict()
errored = 0

# Compute individual graphs for all unique SMILES
for smiles in tqdm(cid_to_smiles.values(), desc="Processing SMILES to graphs"):
    try:
        graph_data[smiles] = pairdata.to_torch(smiles2graph(smiles))
    except AttributeError as e:
        print(f"Error processing SMILES: {smiles}. Error: {e}")
        errored += 1
    except TypeError as e:
        print(f"Error processing SMILES: {smiles}. Error: {e}")
        errored += 1

f"Errored smiles: {errored}"

Processing SMILES to graphs:   0%|          | 0/209 [00:00<?, ?it/s]

'Errored smiles: 0'

**Load in production model**

In [8]:
from odorpair import production

BASE_MODEL, CONFIG = production.load_pretrained()
BASE_MODEL

GCN(
  (feature_norm): BatchNorm1d(9, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (project_node_feats): Sequential(
    (0): Linear(in_features=9, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.04808747892762695, inplace=False)
  )
  (convs): ModuleList(
    (0-2): 3 x GINConv(nn=Sequential(
      (0): Linear(in_features=128, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=128, bias=True)
    ))
  )
  (norms): ModuleList(
    (0-2): 3 x BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (dropout): Dropout(p=0.04808747892762695, inplace=False)
  (readout): BlendAggregator(
    (readout): SetTransformerAggregation(128, num_seed_points=1, heads=2, layer_norm=False, dropout=0.04808747892762695)
  )
  (notes_predictor): Linear(in_features=128, out_features=101, bias=True)
)

In [9]:
import copy
from odorpair import gcn

def make_gcn(base_model: gcn.GCN, notes_dim: int, do_mlp: bool = False):
  base_gcn = copy.deepcopy(base_model)
  in_dim = base_model.readout.readout.channels
  if do_mlp:
    base_model.notes_predictor = torch.nn.Sequential(torch.nn.Linear(in_dim, in_dim),
                                                     torch.nn.ReLU(),
                                                     torch.nn.Linear(in_dim, notes_dim))
  else:
    base_model.notes_predictor = torch.nn.Linear(in_dim, notes_dim)

  return base_gcn

make_gcn(BASE_MODEL, len(NOTES_COLS))

GCN(
  (feature_norm): BatchNorm1d(9, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (project_node_feats): Sequential(
    (0): Linear(in_features=9, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.04808747892762695, inplace=False)
  )
  (convs): ModuleList(
    (0-2): 3 x GINConv(nn=Sequential(
      (0): Linear(in_features=128, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=128, bias=True)
    ))
  )
  (norms): ModuleList(
    (0-2): 3 x BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (dropout): Dropout(p=0.04808747892762695, inplace=False)
  (readout): BlendAggregator(
    (readout): SetTransformerAggregation(128, num_seed_points=1, heads=2, layer_norm=False, dropout=0.04808747892762695)
  )
  (notes_predictor): Linear(in_features=128, out_features=101, bias=True)
)

In [12]:
from odorpair import utils
utils.readout_counts(BASE_MODEL)

{'total': 537157,
 'feature_norm': 18,
 'project_node_feats': 1280,
 'convs': 99072,
 'norms': 768,
 'dropout': 0,
 'readout': 429440,
 'notes_predictor': 6579}

In [17]:
import optuna

hyperparameter_trial_name = "finetune_second"
DB_PATH = os.path.join(WORKSPACE_DIR, "optuna.db")
storage_name = f"sqlite:///{DB_PATH}"

print("Study DB", DB_PATH)

def create_study():
    sampler = optuna.samplers.TPESampler(
        seed=SEED,               # full reproducibility
        multivariate=True,       # model all params jointly
        group=True,              # sample a whole vector at once
        n_startup_trials=15,     # pure random → good density estimate
        n_ei_candidates=32,       # more EI draws ≈ better next point
        warn_independent_sampling=False,
    )

    return optuna.create_study(
        study_name=hyperparameter_trial_name,
        direction="maximize",   # pearson - cosine
        sampler=sampler,
        storage=storage_name,
        load_if_exists=True,
        pruner=None            # no pruning for k‑fold objective
    )

def delete_study():
  optuna.delete_study(study_name=hyperparameter_trial_name, storage=storage_name)

# delete_study()
study = create_study()
study.best_trial.values, study.best_trial.params, {k: v for k, v in study.best_trial.user_attrs.items() if k!="test_form"}

[I 2025-08-08 17:21:39,620] Using an existing study with name 'finetune_second' instead of creating a new one.


Study DB drive/MyDrive/MIXTURE/optuna.db


([0.5138469338417053],
 {'lr': 0.0039045179975724315,
  'weight_decay': 0.0003456459197114292,
  'warmup': 0.17066739421996546,
  'xi': 0.12985208127484738,
  'gamma': 0.01215390412457112,
  'use_target_caps': False,
  'do_mlp': False},
 {'all_cosine': [0.17322880029678345,
   0.18934112787246704,
   0.19801151752471924],
  'all_pearson': [0.7296997904777527, 0.691651463508606, 0.680770993232727],
  'cosine': 0.1868604818979899,
  'epochs': [109, 106, 114],
  'pearson': 0.7007074157396952})

In [10]:
raise KeyboardInterrupt

KeyboardInterrupt: 

In [None]:
from odorpair import data

def components_to_graph(components_SMILES):
  blend_smiles = sorted(components_SMILES)
  graphs = [graph_data[smiles] for smiles in blend_smiles]
  return data.combine_graphs(graphs)

components_to_graph(all_train_data_df.iloc[0]["components_SMILES"])

In [None]:
def get_layerwise_lr_params(model, lr, decay_rate):
    """
    Generates parameter groups for a layerwise decaying learning rate, processing only top-level and immediate children.

    Args:
        model (torch.nn.Module): The PyTorch model.
        lr (float): Initial learning rate for the topmost layer.
        decay_rate (float): Decay rate to scale the learning rate for deeper layers.

    Returns:
        tuple:
            - list: Parameter groups with associated learning rates for the optimizer.
            - list: Names of modules with their corresponding learning rates.
    """
    param_groups = []
    layer_lr_info = []

    # Collect all children and their submodules in reverse order first
    modules = []
    for name, child in model.named_children():
        if len(list(child.named_children())) > 0:  # If child has submodules
            for sub_name, sub_child in child.named_children():
                sub_child_name = f"{name}.{sub_name}"
                modules.append((sub_child_name, sub_child))
        else:
            modules.append((name, child))

    # Reverse the order to process from the deepest layer upwards
    modules.reverse()

    current_lr = lr
    for name, module in modules:
        param_groups.append({"params": module.parameters(), "lr": current_lr})
        layer_lr_info.append((name, current_lr))
        current_lr *= decay_rate

    return param_groups, layer_lr_info

# Example usage:
layerwise_params, layerwise_info = get_layerwise_lr_params(BASE_MODEL, 1e-1, 0.99)

# Print the layerwise info for inspection
layerwise_info

In [None]:
import warnings
from scipy.stats import ConstantInputWarning   # SciPy ≥1.11

# Suppress the PyTorch scheduler warning
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module=r"torch\.optim\.lr_scheduler",
    message=r".*epoch parameter in `scheduler\.step\(\)` was not necessary.*",
)

# Suppress the “input array is constant” Pearson‑R warning
warnings.filterwarnings(
    "ignore",                # action
    category=ConstantInputWarning
)

Rank: 1, Pearson: 0.771, Cosine: 0.148, Team: PL21

In [None]:
import torch_geometric as pyg
import sklearn
import numpy as np
import scipy
from odorpair import utils
from tqdm.auto import tqdm

BATCH_SIZE = 1024
# Even though the actual max is 4.5, we use suggested maximum of 5
MAX_VAL = 5
MIN_DELTA = 1e-4

def act_fn(
    logits: torch.Tensor,
    use_target_caps: bool = False   # default keeps old behaviour
) -> torch.Tensor:
    """
    Post‑process logits for scoring / submission.

    • use_target_caps = False  → clip to [0, 5]  (legacy)
    • use_target_caps = True   → clip each target to its empirical max
    """
    if use_target_caps:
        caps = TARGET_CAPS.to(logits.device)       # shape (51,)
        return torch.minimum(logits.clamp_min(0.), caps)

    return logits.clamp(0., 5.)

def make_data_graph(row):
  graph = components_to_graph(row["components_SMILES"])
  graph.y = torch.from_numpy(row[NOTES_COLS].values.astype(np.float32)).unsqueeze(0)
  return graph

def make_loader(data_df, **kwargs):
  graphs = data_df.apply(make_data_graph,axis=1).tolist()
  return pyg.loader.DataLoader(graphs,
                               batch_size=BATCH_SIZE,
                               **kwargs)

def cosine_dist(y_true, y_pred):
  return 1 - torch.nn.functional.cosine_similarity(y_true, y_pred).mean()

def loss_fn(y_true: torch.Tensor,
          y_pred: torch.Tensor,
          gamma: float) -> torch.Tensor:
  """
  Hybrid loss = γ·MSE + (1−γ)·cosine‑distance.
  """
  mse = torch.nn.functional.mse_loss(y_pred, y_true, reduction='mean')
  cos_dist = cosine_dist(y_pred, y_true)
  return gamma * mse + (1.0 - gamma) * cos_dist

def pearson_r(y_true, y_pred):
  return np.mean([scipy.stats.pearsonr(u, v)[0] for u, v in zip(y_true, y_pred)])

def get_single_batch(data_df, **kwargs):
  assert len(data_df) <= BATCH_SIZE
  return next(iter(make_loader(data_df, **kwargs)))

def calculate_score(config, model, data_df):
  batch = get_single_batch(data_df)
  batch.cuda()
  with torch.no_grad():
    pred = act_fn(model(batch)["logits"], config["use_target_caps"])
    loss = loss_fn(batch.y, pred, config["gamma"]).cpu().item()

  cos_score = cosine_dist(batch.y, pred).item()
  pearson_score = pearson_r(batch.y.cpu(), pred.cpu()).item()

  return {"pearson": pearson_score, "cosine": cos_score, "loss": loss}

def model_formatted_output(config, model, data_df):
  batch = get_single_batch(data_df)
  batch.cuda()
  with torch.no_grad():
    pred = act_fn(model(batch)["logits"], config["use_target_caps"])
    return pred.cpu().numpy().tolist()

def train_model(config, train_idx, test_idx, verbose=False):
  model = make_gcn(BASE_MODEL, len(NOTES_COLS), config["do_mlp"])
  model.cuda()

  train_df = all_train_data_df.iloc[train_idx]
  test_df = all_train_data_df.iloc[test_idx]
  train_loader = make_loader(train_df, shuffle=True)

  param_groups, _ = get_layerwise_lr_params(model,config["lr"],config["xi"])
  optimizer = torch.optim.AdamW(param_groups, weight_decay=config["weight_decay"])
  scheduler = utils.make_scheduler(optimizer, config["warmup"], config["epochs"]*len(train_loader))
  best_loss = float('inf')

  for epoch in range(config["epochs"]):
    for batch in train_loader:
      batch.cuda()

      # Calculate loss from forward pass
      pred = model(batch)["logits"]
      loss = loss_fn(pred, batch.y, config['gamma'])

      # Backward pass and optimizer step
      loss.backward()
      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()

    current_score = calculate_score(config, model, test_df)
    current_loss = current_score["loss"]
    if verbose:
      print(epoch, current_score, best_loss-current_loss)

    if current_loss > best_loss - MIN_DELTA:
      break
    else:
      best_loss = current_loss

  return calculate_score(config, model, test_df) | {
      "epoch": epoch,
      "test_form": model_formatted_output(config, model, test_form_df),
  }

ex_train, ex_test = sklearn.model_selection.train_test_split(all_train_data_df.index, random_state=SEED)
ex_results = train_model({'use_target_caps':False, 'epochs':10, 'lr': 0.2e-2, 'weight_decay': 3e-3, 'warmup': 0.5, 'xi': 0.9, 'gamma': 0.2, "do_mlp":True}, ex_train, ex_test, verbose=False)
print(f"epoch {ex_results['epoch']}, pearson {ex_results['pearson']:.3f}, cosine {ex_results['cosine']:.3f}\n{ex_results.keys()}")

In [None]:
import optuna

hyperparameter_trial_name = "finetune_second"
DB_PATH = os.path.join(WORKSPACE_DIR, "optuna.db")
storage_name = f"sqlite:///{DB_PATH}"

print("Study DB", DB_PATH)

def create_study():
    sampler = optuna.samplers.TPESampler(
        seed=SEED,               # full reproducibility
        multivariate=True,       # model all params jointly
        group=True,              # sample a whole vector at once
        n_startup_trials=15,     # pure random → good density estimate
        n_ei_candidates=32,       # more EI draws ≈ better next point
        warn_independent_sampling=False,
    )

    return optuna.create_study(
        study_name=hyperparameter_trial_name,
        direction="maximize",   # pearson - cosine
        sampler=sampler,
        storage=storage_name,
        load_if_exists=True,
        pruner=None            # no pruning for k‑fold objective
    )

def delete_study():
  optuna.delete_study(study_name=hyperparameter_trial_name, storage=storage_name)

# delete_study()
study = create_study()

In [None]:
def best_trials(with_params=False):
  return [f"{trial.number}: {trial.user_attrs['epochs']} (pearson: {trial.value:.3f}, cosine: {trial.value:.3f}) {trial.params if with_params else ''}" for trial in study.best_trials]
best_trials(with_params=True)

Best is 0.8050 - .12 = 0.685

In [None]:
import gc
import torch
import optuna
import collections

TOTAL_SPLITS = 3
rs = sklearn.model_selection.ShuffleSplit(n_splits=TOTAL_SPLITS, random_state=SEED)

def get_config(trial):
    """Sample a configuration dictionary from *trial*."""
    return {
        "epochs": 500,
        # For Adam
        "lr": trial.suggest_float("lr", 1e-4, 1e-2, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True),
        # For slanted triangular learning rate
        "warmup": trial.suggest_float("warmup", 0, 1),
        # Layerwise weight decay
        "xi": trial.suggest_float("xi", 1e-2,1.01, log=True),
        # For loss function
        "gamma": trial.suggest_float("gamma", 0, .4),
        # For activation function
        "use_target_caps": trial.suggest_categorical("use_target_caps", [True, False]),
        # For notes prediction
        "do_mlp": trial.suggest_categorical("do_mlp", [True, False]),
    }

def do_trial(trial):
    gc.collect()
    torch.cuda.empty_cache()

    config = get_config(trial)

    results = collections.defaultdict(list)
    for i, (train_index, test_index) in tqdm(enumerate(rs.split(all_train_data_df.index)),
                                              total=TOTAL_SPLITS, smoothing=0):
      fold_results = train_model(config, train_index, test_index)
      for k, v in fold_results.items():
        results[k].append(v)

    results = {
        "epochs": results["epoch"],
        "pearson": np.mean(results["pearson"]).item(),
        "cosine": np.mean(results["cosine"]).item(),
        "all_pearson": results["pearson"],
        "all_cosine": results["cosine"],
        "test_form": np.mean(results["test_form"], axis=0).tolist(),
    }

    # Store extra metrics for later inspection
    for k, v in results.items():
        trial.set_user_attr(k, v)

    print(results["epochs"], results["pearson"], results["cosine"])
    return results["pearson"] - results["cosine"]

TOTAL_TRIALS = None

study.optimize(
    do_trial,
    n_trials=TOTAL_TRIALS,
)


TODO:
* Should add an MLP for notes prediction?
* Sigmoid activation?