In [1]:
import os, types, math, torch, torch.nn as nn
from torch_geometric.loader import DataLoader
import numpy as np
import pandas as pd
from data_preparation import PolymerDataset, TARGETS, make_smile_canonical, get_data_paths, add_extra_data, load_polymer_dataset
from model import WDMPNNModel, MultiTaskHead
from train import fit  # 直接复用你已有的 fit()

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Training hyperparameters
HIDDEN_DIM = 128
MLP_HIDDEN = [256, 128]
NUM_EDGE_LAYERS = 4
DROPOUT = 0.001
LR = 5e-4
WARMUP_EPOCHS = 3
FINETUNE_EPOCHS = 20

# Fill in manually (must match the input dimensions used for QM9 pretraining)
QM9_NODE_DIM = 11  # e.g., 11 (example only; use your actual value)
QM9_EDGE_DIM = 4  # e.g., 4
# Pick one of the best pretrained checkpoints
CKPT_PATH = "checkpoints/trial_29_best.pt"  # replace with the checkpoint you want to use

In [2]:
# Resolve every data path once and report which files are present on disk.
P = get_data_paths()
for k, v in P.items():
    status = '[OK]' if v.exists() else '[MISSING]'
    print(f"{k}: {v}  {status}")

# Core official competition data, read in the order train / test / sample submission.
train, test, sub = (
    pd.read_csv(P[key]) for key in ('train_csv', 'test_csv', 'sample_submission')
)
print("Train/Test/Sub:", train.shape, test.shape, sub.shape)

train_csv: kaggle\input\neurips-open-polymer-prediction-2025\train.csv  [OK]
test_csv: kaggle\input\neurips-open-polymer-prediction-2025\test.csv  [OK]
sample_submission: kaggle\input\neurips-open-polymer-prediction-2025\sample_submission.csv  [OK]
tc_data: kaggle\input\tc-smiles\Tc_SMILES.csv  [OK]
tg_jcim_data: kaggle\input\smiles-extra-data\JCIM_sup_bigsmiles.csv  [OK]
tg_excel_data: kaggle\input\smiles-extra-data\data_tg3.xlsx  [OK]
density_data: kaggle\input\smiles-extra-data\data_dnst1.xlsx  [OK]
ffv_data: kaggle\input\neurips-open-polymer-prediction-2025\train_supplement\dataset4.csv  [OK]
dataset1: kaggle\input\neurips-open-polymer-prediction-2025\train_supplement\dataset1.csv  [OK]
dataset2: kaggle\input\neurips-open-polymer-prediction-2025\train_supplement\dataset2.csv  [OK]
dataset3: kaggle\input\neurips-open-polymer-prediction-2025\train_supplement\dataset3.csv  [OK]
Train/Test/Sub: (7973, 7) (3, 2) (3, 6)


In [3]:
# External data sources merged into `train`, in the order they are applied.
# Each entry is (key in P, target column, source_name, loader). A loader takes
# the file path and returns a frame whose target column already carries the
# competition name; loaders are lazy so a file is only read when it exists.
EXTRA_SOURCES = [
    ("tc_data", "Tc", "tc_data",
     lambda path: pd.read_csv(path).rename(columns={'TC_mean': 'Tc'})),
    ("tg_jcim_data", "Tg", "tg_jcim",
     lambda path: pd.read_csv(path, usecols=['SMILES', 'Tg (C)'])
                    .rename(columns={'Tg (C)': 'Tg'})),
    # Tg is given in Kelvin in this file; convert to Celsius.
    ("tg_excel_data", "Tg", "tg_excel_K_to_C",
     lambda path: pd.read_excel(path)
                    .rename(columns={'Tg [K]': 'Tg'})
                    .assign(Tg=lambda df: df['Tg'] - 273.15)),
    # Columns are already named SMILES, Tg.
    ("dataset3", "Tg", "dataset3",
     lambda path: pd.read_csv(path)),
    # Non-numeric densities become NaN; a constant 0.118 is subtracted from the
    # extra density values (the reason for this offset is not shown here).
    ("density_data", "Density", "density_extra_minus_0p118",
     lambda path: pd.read_excel(path)
                    .rename(columns={'density(g/cm3)': 'Density'})
                    .assign(Density=lambda df: pd.to_numeric(df['Density'], errors='coerce') - 0.118)),
    ("dataset1", "Tc", "dataset1",
     lambda path: pd.read_csv(path).rename(columns={'TC_mean': 'Tc'})),
    # The target column is already named FFV, so no rename is needed.
    ("ffv_data", "FFV", "ffv_dataset4",
     lambda path: pd.read_csv(path)),
]

for extra_key, extra_target, extra_source, load_extra in EXTRA_SOURCES:
    # Missing files are skipped so the notebook still runs on a partial dataset.
    if not os.path.exists(P[extra_key]):
        continue
    train = add_extra_data(
        train,
        load_extra(P[extra_key]),
        target=extra_target,
        source_name=extra_source
    )

print(f"train: {train.shape}")
print("targets non-null:", {t: int(train[t].notna().sum()) for t in TARGETS if t in train.columns})

[INFO] Working on tc_data (target=Tc)
  重复 SMILES: 737, 其中 2 条数值不同 (mean diff=0.000)
  [Tc] 增强: +130 条新样本, 填补 130 条缺失
[INFO] Working on tg_jcim (target=Tg)
  重复 SMILES: 7, 其中 0 条数值不同 (mean diff=0.000)
  [Tg] 增强: +655 条新样本, 填补 655 条缺失
[INFO] Working on tg_excel_K_to_C (target=Tg)
  [Tg] 增强: +499 条新样本, 填补 499 条缺失
[INFO] Working on dataset3 (target=Tg)
  [Tg] 增强: +46 条新样本, 填补 46 条缺失
[INFO] Working on density_extra_minus_0p118 (target=Density)
  重复 SMILES: 4, 其中 2 条数值不同 (mean diff=0.055)
  [Density] 增强: +782 条新样本, 填补 784 条缺失
[INFO] Working on dataset1 (target=Tc)
  重复 SMILES: 867, 其中 2 条数值不同 (mean diff=0.000)
  [Tc] 增强: +0 条新样本, 填补 0 条缺失
[INFO] Working on ffv_dataset4 (target=FFV)
  重复 SMILES: 37, 其中 0 条数值不同 (mean diff=nan)
  [FFV] 增强: +825 条新样本, 填补 862 条缺失
train: (10910, 7)
targets non-null: {'Tg': 1711, 'FFV': 7892, 'Tc': 867, 'Density': 1397, 'Rg': 614}


In [None]:
# Canonicalize the SMILES strings on a copy of the augmented training frame and
# drop the rows whose canonicalized SMILES is null.
# NOTE(review): the recorded output of this cell is "KeyError: 'uid'" —
# presumably raised while PolymerDataset builds the graphs; confirm which
# column it expects before relying on this cell.
df = train.copy()
df["SMILES"] = df["SMILES"].apply(make_smile_canonical)
df = df[df["SMILES"].notnull()].reset_index(drop=True)


poly_dataset_full = PolymerDataset(df, transform=None)

# Random train/val split: 10% of the samples (at least one) go to validation,
# with a fixed generator seed so the split is repeatable.
total = len(poly_dataset_full)
val_sz = max(1, int(0.1 * total))
train_sz = total - val_sz
g = torch.Generator().manual_seed(42)
train_set, val_set = torch.utils.data.random_split(poly_dataset_full, [train_sz, val_sz], generator=g)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_set,   batch_size=32, shuffle=False)

# Raw polymer feature dimensions (consumed by the adapters), read from the
# last axis of x / edge_attr of the first training batch.
b0 = next(iter(train_loader))
POLY_NODE_DIM = b0.x.size(-1)
POLY_EDGE_DIM = b0.edge_attr.size(-1)
print("POLY dims:", POLY_NODE_DIM, POLY_EDGE_DIM, "| train/val:", len(train_set), len(val_set))

KeyError: 'uid'

In [5]:
class LinearAdapter(nn.Module):
    """Lightweight ``in_dim -> out_dim`` projection.

    Optional pieces: an input LayerNorm, a bottleneck MLP, and a small
    learnable scalar gate applied to the output.

    Args:
        in_dim: size of the last axis of the input.
        out_dim: size of the last axis of the output.
        bottleneck: hidden width of the two-layer projection. It is only used
            when it is truthy and strictly smaller than both ``in_dim`` and
            ``out_dim``; otherwise a single ``nn.Linear`` is used.
        p_drop: dropout probability inside the bottleneck projection.
        use_gate: when true, the output is multiplied by a learnable scalar
            initialised to 0.01.
        use_ln: when true, the input is passed through ``nn.LayerNorm`` first.
    """

    def __init__(self, in_dim, out_dim, bottleneck=64, p_drop=0.1, use_gate=True, use_ln=True):
        super().__init__()

        if use_ln:
            self.norm = nn.LayerNorm(in_dim)
        else:
            self.norm = nn.Identity()

        # A bottleneck only makes sense when it is narrower than both ends.
        narrow = bool(bottleneck) and bottleneck < min(in_dim, out_dim)
        if not narrow:
            self.proj = nn.Linear(in_dim, out_dim)
        else:
            stages = [
                nn.Linear(in_dim, bottleneck),
                nn.GELU(),
                nn.Dropout(p_drop),
                nn.Linear(bottleneck, out_dim),
            ]
            self.proj = nn.Sequential(*stages)

        if use_gate:
            self.gate = nn.Parameter(torch.tensor(0.01))
        else:
            self.gate = None

    def forward(self, x):
        """Normalize, project, and (if a gate exists) scale ``x``."""
        projected = self.proj(self.norm(x))
        if self.gate is None:
            return projected
        return projected * self.gate

def attach_adapters(model, poly_node_dim, poly_edge_dim, qm9_node_dim, qm9_edge_dim):
    """Attach input adapters in front of the encoder and monkey-patch ``forward``.

    Two ``LinearAdapter`` modules are registered on ``model`` as
    ``node_adapter`` and ``edge_adapter``; they map the polymer node/edge
    feature sizes to the sizes the (QM9-pretrained) encoder was built with.
    ``model.forward`` is then replaced by a bound method that runs
    adapters -> ``model.encoder`` -> ``model.head``.

    The adapters are moved to the device of the model's existing parameters,
    so calling this after ``model.to(device)`` no longer leaves the adapters
    on the CPU (which caused a device mismatch on CUDA).

    Args:
        model: module exposing ``encoder`` and ``head`` sub-modules.
        poly_node_dim: node feature size of the polymer graphs.
        poly_edge_dim: edge feature size of the polymer graphs.
        qm9_node_dim: node feature size expected by the encoder.
        qm9_edge_dim: edge feature size expected by the encoder.

    Returns:
        The same ``model`` instance, modified in place.
    """
    # Sample the device before registering the adapters, so the reference
    # parameter is guaranteed to belong to the original model.
    reference = next(model.parameters(), None)

    node_adapter = LinearAdapter(poly_node_dim, qm9_node_dim, bottleneck=64, p_drop=0.1)
    edge_adapter = LinearAdapter(poly_edge_dim, qm9_edge_dim, bottleneck=32, p_drop=0.1)
    if reference is not None:
        node_adapter.to(reference.device)
        edge_adapter.to(reference.device)

    model.node_adapter = node_adapter
    model.edge_adapter = edge_adapter

    def forward_with_adapter(self, data):
        # edge_weight is optional on the batch; pass None when it is absent.
        edge_weight = getattr(data, "edge_weight", None)
        x2 = self.node_adapter(data.x)
        e2 = self.edge_adapter(data.edge_attr)
        g = self.encoder(x2, data.edge_index, e2, data.batch, edge_weight=edge_weight)
        return self.head(g)

    # Bind to this instance only; the model class itself is left untouched.
    model.forward = types.MethodType(forward_with_adapter, model)
    return model

@torch.no_grad()
def compute_task_stats_from_dataset(dataset, tasks):
    """Collect the per-task statistics needed for wMAE: counts n_i and ranges r_i.

    Every sample must expose ``y`` (target values) and ``mask`` (non-zero where
    the target is observed), each holding one entry per task in ``tasks``
    order. Both are flattened first, so 1-D ``(T,)`` and ``(1, T)`` layouts
    are accepted.

    Args:
        dataset: an object with a ``data_list`` attribute, or any iterable of
            samples.
        tasks: ordered task names; position ``i`` maps to ``y[i]`` / ``mask[i]``.

    Returns:
        ``(n, r)``: ``n[t]`` is the number of samples whose mask entry exceeds
        0.5, and ``r[t]`` is ``max - min`` of the observed values (``0.0`` when
        the task has no observed value or a bound is not finite).
    """
    n = {t: 0 for t in tasks}
    lo = {t: math.inf for t in tasks}
    hi = {t: -math.inf for t in tasks}

    for d in getattr(dataset, "data_list", dataset):
        # One conversion per sample instead of indexing tensors per element.
        values = d.y.detach().cpu().reshape(-1).tolist()
        observed = d.mask.detach().cpu().reshape(-1).tolist()
        for i, t in enumerate(tasks):
            if observed[i] > 0.5:
                n[t] += 1
                v = float(values[i])
                lo[t] = min(lo[t], v)
                hi[t] = max(hi[t], v)

    r = {}
    for t in tasks:
        if math.isfinite(lo[t]) and math.isfinite(hi[t]):
            r[t] = hi[t] - lo[t]
        else:
            # No observation for this task (bounds still at +/-inf).
            r[t] = 0.0
    return n, r

def load_trunk_only(model, ckpt_path, device=DEVICE):
    """Load only the ``encoder.*`` tensors of a checkpoint into ``model``.

    Every other entry (e.g. ``head.*``) is discarded, so the heads and any
    adapters keep their current initialisation.

    Args:
        model: module whose encoder parameters are named ``encoder.*``.
        ckpt_path: checkpoint file; either a plain state_dict or a dict that
            stores the state_dict under the ``"model_state"`` key.
        device: ``map_location`` passed to ``torch.load``.

    Raises:
        ValueError: if the checkpoint contains no ``encoder.*`` tensor, since
            the model would otherwise silently keep random encoder weights.
    """
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from a trusted source.
    sd_full = torch.load(ckpt_path, map_location=device)
    state = sd_full.get("model_state", sd_full)  # also accepts a plain state_dict
    trunk = {k: v for k, v in state.items() if k.startswith("encoder.")}
    if not trunk:
        raise ValueError(
            f"No 'encoder.*' tensors found in checkpoint {ckpt_path!r}; "
            "nothing would be loaded into the trunk."
        )

    # strict=False because head/adapter keys are intentionally absent from `trunk`.
    missing, unexpected = model.load_state_dict(trunk, strict=False)
    print(f"Loaded trunk: {len(trunk)} tensors | missing={len(missing)} | unexpected={len(unexpected)}")

    # Missing head/adapter keys are expected; missing encoder keys are not.
    missing_trunk = [k for k in missing if k.startswith("encoder.")]
    if missing_trunk:
        print(f"[WARN] {len(missing_trunk)} encoder tensors were not found in the checkpoint, "
              f"e.g. {missing_trunk[:3]}")

def freeze_encoder_only(model, freeze=True):
    """Toggle between the warm-up and the fine-tuning parameter setup.

    With ``freeze`` truthy (warm-up), every parameter whose name does not start
    with ``node_adapter``, ``edge_adapter`` or ``head.`` gets
    ``requires_grad=False``; adapter and head parameters are left as they are.
    With ``freeze`` falsy (fine-tuning), every parameter is made trainable.
    """
    if not freeze:
        for param in model.parameters():
            param.requires_grad_(True)
        return

    untouched_prefixes = ("node_adapter", "edge_adapter", "head.")
    for name, param in model.named_parameters():
        if name.startswith(untouched_prefixes):
            continue
        param.requires_grad_(False)

def build_optimizer_grouped(model, base_lr=1e-3, adapter_mult=2.0, head_mult=1.0, trunk_mult=0.2):
    """Build an Adam optimizer with per-group learning rates.

    Trainable parameters are split by name into three groups, always in this
    order: adapters (``node_adapter*`` / ``edge_adapter*``), head (``head.*``)
    and trunk (everything else). Parameters with ``requires_grad=False`` are
    left out.

    Args:
        model: module whose parameters are grouped.
        base_lr: reference learning rate.
        adapter_mult: multiplier on ``base_lr`` for the adapter group.
        head_mult: multiplier on ``base_lr`` for the head group.
        trunk_mult: multiplier on ``base_lr`` for the trunk group.

    Returns:
        ``torch.optim.Adam`` with three parameter groups. With the default
        multipliers the ordering is adapter > head > trunk.
    """
    adapter_params, head_params, trunk_params = [], [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if name.startswith(("node_adapter", "edge_adapter")):
            adapter_params.append(param)
        elif name.startswith("head."):
            head_params.append(param)
        else:
            trunk_params.append(param)

    return torch.optim.Adam([
        {"params": adapter_params, "lr": base_lr * adapter_mult},
        {"params": head_params,    "lr": base_lr * head_mult},
        {"params": trunk_params,   "lr": base_lr * trunk_mult},
    ], lr=base_lr)

In [6]:
# TARGETS (the competition tasks) is already imported in the first cell.

# 1) Build the backbone with the QM9 input dimensions (tasks=TARGETS)
model = WDMPNNModel(
    node_feat_dim=QM9_NODE_DIM,
    edge_feat_dim=QM9_EDGE_DIM,
    hidden_dim=HIDDEN_DIM,
    mlp_hidden=MLP_HIDDEN,
    tasks=TARGETS,
    num_edge_layers=NUM_EDGE_LAYERS,
    dropout=DROPOUT
)

# 2) Attach the input adapters in front of the encoder
attach_adapters(model, POLY_NODE_DIM, POLY_EDGE_DIM, QM9_NODE_DIM, QM9_EDGE_DIM)

# Move to DEVICE only after the adapters exist, so they end up on the same
# device as the encoder and head (otherwise they stay on the CPU).
model = model.to(DEVICE)

# 3) Task weights (wMAE)
n_dict, r_dict = compute_task_stats_from_dataset(poly_dataset_full, TARGETS)
model.set_task_stats(n_dict, r_dict)

# 4) Load only the encoder trunk weights
load_trunk_only(model, CKPT_PATH, device=DEVICE)

print("Ready. Hidden =", model.encoder.hidden_dim, "| params trainable =", sum(p.numel() for p in model.parameters() if p.requires_grad))

Loaded trunk: 10 tensors | missing=40 | unexpected=0
Ready. Hidden = 128 | params trainable = 367382


In [7]:
# Warm-up stage: freeze the encoder and optimize only the adapters and the head,
# each with its own learning rate.
freeze_encoder_only(model, freeze=True)

warmup_param_groups = [
    {
        "params": [p for n, p in model.named_parameters()
                   if n.startswith(("node_adapter", "edge_adapter"))],
        "lr": LR*5,
    },
    {
        "params": [p for n, p in model.named_parameters() if n.startswith("head.")],
        "lr": LR*2,
    },
]
optimizer = torch.optim.Adam(warmup_param_groups, lr=LR)

_ = fit(
    model, train_loader, val_loader, optimizer, DEVICE, TARGETS,
    epochs=WARMUP_EPOCHS,
    use_mask=True,
    run_wandb=False,
    project="polymer-finetune-adapter-warmup",
    ckpt_path="checkpoints/poly_adapter_warmup_best.pt",
)

✅ Device: cpu | Params: 0.37M | Tasks: 5


                                                                                          

[Epoch 1/3] Train 0.072, Val 0.066 | Tg:82.783 | FFV:0.027 | Tc:0.067


                                                                                          

[Epoch 2/3] Train 0.062, Val 0.053 | Tg:82.388 | FFV:0.020 | Tc:0.046


                                                                                          

[Epoch 3/3] Train 0.055, Val 0.051 | Tg:80.201 | FFV:0.019 | Tc:0.042




In [9]:
# Fine-tuning stage: make every parameter trainable again and rebuild the
# optimizer with the grouped learning rates.
freeze_encoder_only(model, freeze=False)
optimizer = build_optimizer_grouped(model, base_lr=LR)

history = fit(
    model, train_loader, val_loader, optimizer, DEVICE, TARGETS,
    epochs=FINETUNE_EPOCHS,
    use_mask=True,
    run_wandb=False,
    project="polymer-finetune-adapter",
    ckpt_path="checkpoints/poly_adapter_finetune_best.pt",
)

✅ Device: cpu | Params: 0.37M | Tasks: 5


                                                                                           

[Epoch 1/20] Train 0.052, Val 0.053 | Tg:74.267 | FFV:0.018 | Tc:0.039


                                                                                           

[Epoch 2/20] Train 0.050, Val 0.047 | Tg:69.485 | FFV:0.017 | Tc:0.037


                                                                                           

KeyboardInterrupt: 

In [10]:
# Restore the fine-tuning checkpoint and switch the model to evaluation mode.
best_state = torch.load("checkpoints/poly_adapter_finetune_best.pt", map_location=DEVICE)
model.load_state_dict(best_state)
model.eval()

@torch.no_grad()
def predict_batch(model, batch):
    out = model(batch.to(DEVICE))
    return {t: out[t].cpu().numpy() for t in model.tasks}

# Sanity check: predict the first validation batch and display the first five
# values of every task.
batch = next(iter(val_loader))
pred = predict_batch(model, batch)
{task: values[:5] for task, values in pred.items()}

{'Tg': array([ 31.952337, -17.538939,  85.358826,  73.86589 ,  57.80554 ],
       dtype=float32),
 'FFV': array([0.3515721 , 0.3814314 , 0.36397016, 0.35942668, 0.35875398],
       dtype=float32),
 'Tc': array([0.25661173, 0.38523132, 0.23553681, 0.20132993, 0.21671148],
       dtype=float32),
 'Density': array([1.0350479 , 0.89208084, 1.0976292 , 1.1937238 , 1.0376378 ],
       dtype=float32),
 'Rg': array([18.588612, 19.274672, 19.26153 , 18.443525, 18.358446],
       dtype=float32)}

In [12]:
# NOTE(review): the recorded output of this cell is a NameError for
# load_polymer_dataset although the first cell imports it — presumably a stale
# kernel; re-run the import cell before this one.
test_loader, test_dataset = load_polymer_dataset(P["test_csv"], batch_size=64, shuffle=False)

# Optional: report whether any samples were dropped
dropped_samples = getattr(test_dataset, "dropped", None)
if dropped_samples:
    print("Dropped samples:", len(dropped_samples), dropped_samples[:3])

# 2) Load the already trained model "with adapters"
best_state = torch.load("checkpoints/poly_adapter_finetune_best.pt", map_location=DEVICE)
model.load_state_dict(best_state)
model.eval()

# 3) 逐批预测（同一个 forward）
import pandas as pd
@torch.no_grad()
def predict_loader(model, loader):
    rows = []
    for batch in loader:
        batch = batch.to(DEVICE)
        out = model(batch)  # dict{task: Tensor[B]}
        # 如果 test 有 id，则在构图时已保存到 data.uid
        uids = batch.uid.view(-1).tolist() if hasattr(batch, "uid") else list(range(len(rows), len(rows) + out[TARGETS[0]].shape[0]))
        for i, uid in enumerate(uids):
            row = {"id": int(uid)}
            for t in model.tasks:
                row[t] = float(out[t][i].detach().cpu().item())
            rows.append(row)
    return pd.DataFrame(rows)

pred_df = predict_loader(model, test_loader)

# 4) Align with the ids of the original test.csv and save.
# The left join keeps every test id, in file order.
test_raw = pd.read_csv(P["test_csv"])[["id"]]
submission = pd.merge(test_raw, pred_df, on="id", how="left")
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")

NameError: name 'load_polymer_dataset' is not defined