In [1]:
import torch
import transformers
import datasets
import accelerate

print("Python:", __import__("sys").version)
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Transformers:", transformers.__version__)
print("Datasets:", datasets.__version__)
print("Accelerate:", accelerate.__version__)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Python: 3.12.3 (main, Nov  6 2024, 18:32:19) [GCC 13.2.0]
Torch: 2.10.0+cu128
CUDA available: True
Transformers: 4.36.2
Datasets: 2.16.1
Accelerate: 0.26.1


In [2]:
import sys
!{sys.executable} -m pip uninstall transformer_engine -y

[0m

In [3]:

import json
import math
import numpy as np
from pathlib import Path
from typing import Dict

import torch
from torch import nn

from datasets import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from scipy.stats import spearmanr


In [5]:
def load_json_dict(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

train_raw = load_json_dict("data/train.json")
dev_raw   = load_json_dict("data/dev.json")
test_raw  = load_json_dict("data/test.json")

print(len(train_raw), len(dev_raw), len(test_raw))


2280 588 930


In [6]:

def flatten(raw_dict, is_test=False):
    rows = []
    for k, v in raw_dict.items():
        row = {
            "id": k,
            "precontext": v.get("precontext", ""),
            "sentence": v["sentence"],
            "ending": v.get("ending", ""),
            "judged_meaning": v["judged_meaning"],
            "example_sentence": v["example_sentence"],
        }
        if not is_test:
            row["label"] = float(v["average"])
        rows.append(row)
    return rows

train_ds = Dataset.from_list(flatten(train_raw))
dev_ds   = Dataset.from_list(flatten(dev_raw))
test_ds  = Dataset.from_list(flatten(test_raw, is_test=True))


In [7]:

def build_story_text(ex):
    parts = [
        ex["precontext"],
        ex["sentence"],
        ex["ending"],
    ]
    return " ".join(p for p in parts if p.strip())

def build_sense_text(ex):
    return f"Meaning: {ex['judged_meaning']} Example: {ex['example_sentence']}"


In [26]:
def tokenize(batch):
    story = []
    sense = []

    for i in range(len(batch["sentence"])):
        story_parts = [
            batch["precontext"][i],
            batch["sentence"][i],
            batch["ending"][i],
        ]
        story_text = " ".join(p for p in story_parts if p and p.strip())

        sense_text = (
            f"Meaning: {batch['judged_meaning'][i]} "
            f"Example: {batch['example_sentence'][i]}"
        )

        story.append(story_text)
        sense.append(sense_text)

    return tokenizer(
        story,
        sense,
        truncation=True,
        max_length=512,
    )


In [27]:
train_ds = train_ds.map(tokenize, batched=True)
dev_ds   = dev_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

train_ds = train_ds.rename_column("label", "labels")
train_ds.set_format("torch")
dev_ds.set_format("torch")
test_ds.set_format("torch")





ap: 100%|██████████| 930/930 [00:00<00:00, 972.56 examples/s]

ValueError: Original column name label not in the dataset. Current columns in the dataset: ['id', 'precontext', 'sentence', 'ending', 'judged_meaning', 'example_sentence', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']

dict_keys(['id', 'precontext', 'sentence', 'ending', 'judged_meaning', 'example_sentence', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'])

In [17]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1,
    problem_type="regression",
)

model.to("cuda")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [18]:

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = preds.squeeze()

    spearman = spearmanr(preds, labels).correlation

    std = np.std(labels)
    acc_std = np.mean(np.abs(preds - labels) <= std)

    return {
        "spearman": spearman,
        "acc_within_std": acc_std,
    }


In [None]:
args = TrainingArguments(
    output_dir="./wsd-bert",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=8,
    fp16=True,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="spearman",
    greater_is_better=True,
    report_to="none",
)


In [20]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [21]:
trainer.train()


Epoch,Training Loss,Validation Loss,Spearman,Acc Within Std
0,1.4012,1.374101,0.237313,0.60034
2,0.7596,1.420795,0.353873,0.690476
3,0.5287,1.441202,0.36023,0.680272


TrainOutput(global_step=568, training_loss=1.1678872427470248, metrics={'train_runtime': 59.2692, 'train_samples_per_second': 153.874, 'train_steps_per_second': 9.583, 'total_flos': 489176615298720.0, 'train_loss': 1.1678872427470248, 'epoch': 3.99})

In [22]:
preds = trainer.predict(dev_ds)
print(compute_metrics((preds.predictions, preds.label_ids)))


{'spearman': 0.36022993567267286, 'acc_within_std': 0.6802721088435374}


In [23]:
dev_preds = preds.predictions.squeeze()
dev_labels = preds.label_ids

# Simple linear calibration: y' = a*y + b
a, b = np.polyfit(dev_preds, dev_labels, 1)

def calibrate(x):
    return a * x + b


In [28]:
test_preds = trainer.predict(test_ds).predictions.squeeze()
test_preds = calibrate(test_preds)
test_preds = np.clip(test_preds, 1.0, 5.0)


In [29]:
out_path = Path("predictions.jsonl")

with out_path.open("w", encoding="utf-8") as f:
    for idx, pred in zip(test_ds["id"], test_preds):
        line = {
            "id": idx,
            "prediction": float(pred),
        }
        f.write(json.dumps(line) + "\n")

print("Saved:", out_path)


Saved: predictions.jsonl
