In [1]:
from IPython.core.display import HTML
import os
import rdkit
from rdkit import Chem
from rdkit.Chem import PandasTools
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from scikit_mol.fingerprints import MorganFingerprintTransformer
from scikit_mol.conversions import SmilesToMolTransformer
from scikit_mol.standardizer import Standardizer
from scikit_mol.descriptors import MolecularDescriptorTransformer


In [2]:
import torch

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.Chem.rdFingerprintGenerator as rdFingerprintGenerator

#from rdkit.Chem.rdFingerprintGenerator import MorganFingerprintGenerator


In [3]:
import yaml

# Path to the YAML config file. The NEURIPS_CONFIG_PATH environment variable,
# when set, overrides the machine-specific default below so the notebook can
# run elsewhere without editing this cell.
CONFIG_PATH = os.environ.get(
    "NEURIPS_CONFIG_PATH", r"D:\Skills\new\NeurIPS2\config.yaml"
)

# Load it. The encoding is given explicitly so parsing does not depend on the
# platform's default text encoding.
with open(CONFIG_PATH, 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)


In [4]:
# Entries read from the `output` section of the config
# (presumably CSV file paths, judging by the key names — confirm against config.yaml).
ffv_merged, Tc_cleaned, Tg_cleaned = (
    config['output'][key] for key in ('ffv_merged_csv', 'Tc_csv', 'Tg_csv')
)



In [5]:
# Locations of the precomputed feature tensor (read later with torch.load)
# and its companion metadata table (read later with pd.read_csv).
rdkit_ffv_pt, rdkit_ffv_meta = (
    config['output'][key] for key in ('rdkit_ffv', 'rdkit_ffv_meta')
)

In [6]:
import torch
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, random_split

# Load the precomputed feature tensor. map_location="cpu" keeps the load
# working on machines without the device the tensor was saved from; batches
# are moved to the training device later, inside the training loop.
# NOTE(review): torch.load unpickles the file — only load artifacts you trust.
X = torch.load(rdkit_ffv_pt, map_location="cpu")  # [N, 1024] — width presumably matches the model's input_dim; confirm
meta = pd.read_csv(rdkit_ffv_meta)

# Regression target: the FFV column as a float32 column vector
y = torch.tensor(meta['FFV'].values, dtype=torch.float32).unsqueeze(1)  # [N, 1]


In [10]:
# Spot-check a single row of the feature tensor (displayed as the cell output)
X[21]

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [11]:
from sklearn.model_selection import train_test_split

# Constants for the split and the loaders, named so they are easy to tune.
SPLIT_SEED = 42
BATCH_SIZE = 64

# Hold out 20% of the samples for validation; the fixed seed keeps the split
# identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

# The shuffle order is seeded too; without an explicit generator the batch
# order (and therefore the training trajectory) changes on every run even
# though the split itself is fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [12]:
import torch.nn as nn

class FFVPredictor(nn.Module):
    """Fully connected regressor mapping a feature vector to one scalar.

    Layer stack: Linear(input_dim, 512) -> ReLU -> BatchNorm1d(512)
    -> Linear(512, 128) -> ReLU -> Linear(128, 1).

    Args:
        input_dim: Number of input features per sample (default 1024).
    """

    def __init__(self, input_dim=1024):
        super().__init__()
        hidden_wide, hidden_narrow = 512, 128
        layers = [
            nn.Linear(input_dim, hidden_wide),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_wide),
            nn.Linear(hidden_wide, hidden_narrow),
            nn.ReLU(),
            nn.Linear(hidden_narrow, 1),  # single regression output
        ]
        # Stored under the attribute name `model`, so state_dict keys have the
        # form `model.<index>.<param>`.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run the layer stack on `x` and return its output."""
        prediction = self.model(x)
        return prediction


In [20]:
import torch
import torch.nn as nn
from tqdm import tqdm
import numpy as np

# --- Step 1: Define the weighted MAE (wMAE) function ---
def compute_ffv_weight(val_df, column='FFV', other_task_counts=None):
    """Compute the per-task weight applied to the FFV mean absolute error.

    The weight is ``(1 / r) * K * sqrt(1 / n) / sum_i sqrt(1 / n_i)`` where
    ``r`` is the value range (max - min) of ``val_df[column]``, ``n`` is its
    number of non-NaN entries, ``n_i`` runs over the sample counts of all
    tasks (this one plus the others), and ``K`` is the number of tasks.

    Args:
        val_df: DataFrame containing the target column.
        column: Name of the target column (default ``'FFV'``).
        other_task_counts: Optional mapping of other task name -> sample
            count. Defaults to the placeholder counts
            ``{"Tg": 50, "Tc": 40, "Density": 30, "Rg": 25}``, which simulate
            a five-task setup.

    Returns:
        The weight as a Python float.

    Raises:
        ValueError: If the column has no non-NaN values or its range is zero.
            Without this check the formula silently yields NaN or inf.
    """
    values = val_df[column]

    n_ffv = int(values.notna().sum())
    if n_ffv == 0:
        raise ValueError(f"Column {column!r} has no non-NaN values; cannot compute a weight.")

    r_ffv = float(values.max() - values.min())
    if not r_ffv > 0:
        raise ValueError(f"Column {column!r} has zero value range; cannot compute a weight.")

    if other_task_counts is None:
        # Simulate weights of 5 tasks (competition setup)
        other_task_counts = {"Tg": 50, "Tc": 40, "Density": 30, "Rg": 25}

    # This task's count first, followed by the counts of the remaining tasks.
    counts = [n_ffv, *other_task_counts.values()]
    K = len(counts)
    denom = sum(np.sqrt(1 / c) for c in counts)
    scale = (K * np.sqrt(1 / n_ffv)) / denom
    return float((1 / r_ffv) * scale)


def ffv_weighted_mae(preds, targets, weight):
    """Return ``weight`` times the mean absolute error over non-NaN targets.

    Positions where ``targets`` is NaN are dropped from both tensors before
    the error is averaged.

    Args:
        preds: Tensor of predictions, same shape as ``targets``.
        targets: Tensor of ground-truth values; NaN marks a missing label.
        weight: Scalar multiplier applied to the mean absolute error.

    Returns:
        The weighted MAE as a Python float, or ``None`` when no target is
        valid.
    """
    valid = torch.isnan(targets).logical_not()
    kept_preds = preds[valid]
    kept_targets = targets[valid]

    # Nothing to score when every target is missing.
    if len(kept_preds) == 0:
        return None

    abs_err = (kept_preds - kept_targets).abs()
    return (weight * abs_err.mean()).item()


In [21]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Network (placed on the chosen device), MSE training loss, and Adam optimizer
model = FFVPredictor(input_dim=1024).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


# Compute FFV weight from the meta CSV

In [23]:
# The weight is derived from the full metadata table (the rows loaded from the
# meta CSV, before the train/validation split), not from the validation rows only.
ffv_weight = compute_ffv_weight(val_df=meta, column="FFV")
print(f"FFV Weight (competition metric): {ffv_weight:.6f}")


FFV Weight (competition metric): 0.147560


# Training loop

In [24]:
EPOCHS = 20
best_wmae = float("inf")

for epoch in range(EPOCHS):
    # --- Training ---
    model.train()
    train_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()

        preds = model(batch_X)
        loss = loss_fn(preds, batch_y)
        loss.backward()
        optimizer.step()
        # loss is a batch mean; re-weight by batch size so the epoch figure is
        # a true per-sample average even when the last batch is smaller.
        train_loss += loss.item() * batch_X.size(0)

    avg_train_loss = train_loss / len(train_loader.dataset)

    # --- Validation ---
    model.eval()
    val_preds = []
    val_targets = []

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            preds = model(batch_X)
            # Collect on the CPU so the accumulated validation set does not
            # occupy device memory.
            val_preds.append(preds.cpu())
            val_targets.append(batch_y.cpu())

    val_preds = torch.cat(val_preds, dim=0)
    val_targets = torch.cat(val_targets, dim=0)

    # Compute wMAE
    wmae_score = ffv_weighted_mae(val_preds, val_targets, ffv_weight)

    # ffv_weighted_mae returns None when every validation target is NaN;
    # formatting or comparing None would raise, so report and skip the
    # best-model check for this epoch instead.
    if wmae_score is None:
        print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train_loss:.6f} | Val wMAE: n/a (no valid targets)")
        continue

    print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train_loss:.6f} | Val wMAE: {wmae_score:.6f}")

    # Keep the checkpoint with the lowest validation wMAE seen so far
    if wmae_score < best_wmae:
        best_wmae = wmae_score
        torch.save(model.state_dict(), "best_ffv_model.pth")
        print("🔥 Best model updated.")


Epoch 01 | Train Loss: 0.004750 | Val wMAE: 0.006100
🔥 Best model updated.
Epoch 02 | Train Loss: 0.002970 | Val wMAE: 0.004564
🔥 Best model updated.
Epoch 03 | Train Loss: 0.002292 | Val wMAE: 0.004407
🔥 Best model updated.
Epoch 04 | Train Loss: 0.001596 | Val wMAE: 0.003261
🔥 Best model updated.
Epoch 05 | Train Loss: 0.001210 | Val wMAE: 0.003377
Epoch 06 | Train Loss: 0.000941 | Val wMAE: 0.003530
Epoch 07 | Train Loss: 0.000981 | Val wMAE: 0.003765
Epoch 08 | Train Loss: 0.000754 | Val wMAE: 0.002903
🔥 Best model updated.
Epoch 09 | Train Loss: 0.000801 | Val wMAE: 0.002960
Epoch 10 | Train Loss: 0.000562 | Val wMAE: 0.003020
Epoch 11 | Train Loss: 0.000489 | Val wMAE: 0.002205
🔥 Best model updated.
Epoch 12 | Train Loss: 0.000493 | Val wMAE: 0.002760
Epoch 13 | Train Loss: 0.000445 | Val wMAE: 0.002336
Epoch 14 | Train Loss: 0.000466 | Val wMAE: 0.002501
Epoch 15 | Train Loss: 0.000427 | Val wMAE: 0.002353
Epoch 16 | Train Loss: 0.000407 | Val wMAE: 0.002544
Epoch 17 | Train Los