In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import json
import os
from glob import glob
from collections import Counter, defaultdict

In [3]:
from typing import List, Dict

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EvalPrediction,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch
# Sanity check: report whether PyTorch can see a CUDA device before any training starts.
torch.cuda.is_available()

True

In [5]:
from datetime import datetime

def time():
    """Return the current local time as a ``YYYY-MM-DD_HH-MM-SS`` string.

    The format contains only digits, dashes and one underscore, which is why
    it is used both for log lines and inside folder names.
    """
    now = datetime.now()
    return f"{now:%Y-%m-%d_%H-%M-%S}"


# Name of the dataset variant this run is labelled with; embedded in SAVE_FOLDER.
DATASET = "hard"
# Output folder for the final model; the timestamp is taken once, when this cell runs.
SAVE_FOLDER = f"bertweet_large_cr_{DATASET}_{time()}"

# Timestamped progress marker.
print(time(), "STARTING IMPORTS")

2025-04-30_03-10-48 STARTING IMPORTS


In [6]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import Dataset
from transformers import pipeline

In [7]:
# METRICS
def compute_metrics(eval_pred):
    """
    Score predictions for the Hugging-Face Trainer.

    Args:
        eval_pred: an object that unpacks into the pair (logits, labels).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(raw_scores, axis=-1)

    scores = {"accuracy": accuracy_score(gold, predicted)}

    # average="binary" fits this 0/1 task; switch to "macro" for multi-class.
    # The fourth element of the result is not reported, so it is sliced off.
    prf = precision_recall_fscore_support(
        gold, predicted, average="binary", zero_division=0
    )
    scores.update(zip(("precision", "recall", "f1"), prf[:3]))
    return scores

In [8]:
class ChangeDetectionDataset(Dataset):
    """Sentence-pair dataset built from consecutive lines of problem files.

    Line ``i`` of a ``problem-*.txt`` file is paired with line ``i + 1`` and
    labelled with entry ``i`` of the ``"changes"`` list stored in the matching
    ``truth-problem-*.json`` file.
    """

    def __init__(self, root_dir: str, tokenizer, max_length: int = 512):
        """
        Collect every labelled sentence pair found in ``root_dir``.

        Args:
            root_dir: folder to scan, e.g. "easy/train" or "hard/validation".
                Expects files: problem-*.txt and truth-problem-*.json
            tokenizer: callable invoked as
                ``tokenizer(sent1, sent2, truncation=True, max_length=...)``.
            max_length: truncation limit forwarded to the tokenizer.
        """
        self.examples = []
        self.tokenizer = tokenizer
        self.max_length = max_length

        # glob() yields paths in an arbitrary, filesystem-dependent order;
        # sorting makes the example order identical from run to run.
        for txt_path in sorted(glob(os.path.join(root_dir, "problem-*.txt"))):
            base = os.path.splitext(os.path.basename(txt_path))[0]  # e.g. "problem-3"
            json_path = os.path.join(root_dir, f"truth-{base}.json")
            if not os.path.exists(json_path):
                # No ground truth for this problem: skip it.
                continue

            # Read the sentences, one per line, dropping blank lines.
            with open(txt_path, encoding="utf-8") as f:
                lines = [text for text in (raw.strip() for raw in f) if text]
            # Read the labels.
            with open(json_path, encoding="utf-8") as f:
                changes: List[int] = json.load(f)["changes"]

            # Build pairs (sent_i, sent_{i+1}). zip() stops at the shortest
            # input, so labels without a following sentence are ignored.
            for sent1, sent2, label in zip(lines, lines[1:], changes):
                self.examples.append({
                    "sent1": sent1,
                    "sent2": sent2,
                    "label": label,
                })

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and attach its label under the key "labels"."""
        ex = self.examples[idx]
        # Truncation happens here; no padding is requested from the tokenizer,
        # so padding is left to the batch collator.
        enc = self.tokenizer(
            ex["sent1"],
            ex["sent2"],
            truncation=True,
            max_length=self.max_length,
        )
        enc["labels"] = torch.tensor(ex["label"], dtype=torch.long)
        return enc

In [9]:
# Load vinai/bertweet-large with a 2-label sequence-classification head.
# The load-time message under this cell reports that the classifier weights are
# newly initialized, so the model has to be fine-tuned before its predictions mean anything.
bertweet = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-large", num_labels=2, 
                                                              problem_type="single_label_classification",
                                                              id2label={0: "NEG", 1: "POS"}, label2id={"NEG": 0, "POS": 1},
                                                             )
# Matching tokenizer for the same checkpoint.
# NOTE(review): confirm that the fast tokenizer actually honours `normalization`
# and `add_prefix_space` for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large", use_fast=True, 
                                          normalization=True, add_prefix_space=True,)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Build the splits from the DATASET constant rather than a hard-coded folder,
# so the data being trained on always matches the label baked into SAVE_FOLDER.
train_ds = ChangeDetectionDataset(f"{DATASET}/train", tokenizer)
eval_ds  = ChangeDetectionDataset(f"{DATASET}/validation", tokenizer)

In [11]:
# lst = [train_ds[i]['attention_mask'].count(1) for i in range(20000)]
# plt.hist(lst)

In [12]:
# Batch-time padding: the dataset returns unpadded encodings, so every batch is
# padded here instead of at tokenization time.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,      # supplies the pad-token ID
    return_tensors="pt",      # the default, spelled out for clarity
    pad_to_multiple_of=8,     # optional; keeps tensor shapes tensor-core friendly
)

In [13]:
# The recorded run of this notebook ended with a CUDA out-of-memory error, so
# the per-device batch is reduced and gradient accumulation raised to match:
# 16 x 8 = 128 sequences per optimizer step, the same as the former 64 x 2.
training_args = TrainingArguments(
    output_dir=f"bertweet_large_crossentropy_{DATASET}",  # follows the DATASET constant
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,     # lower further if you still OOM; raise if you have VRAM
    gradient_accumulation_steps=8,      # multiplies the effective batch size without extra memory
    auto_find_batch_size=True,          # NOTE(review): did not rescue the recorded OOM run
    per_device_eval_batch_size=32,      # batch size does not change the metrics, only memory use
    num_train_epochs=10,
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,                          # mixed precision: faster and lighter on memory on supported GPUs
    logging_steps=50,
)

In [14]:
# Wire the model, data, collator and metrics into a Trainer.
# `tokenizer=` is deprecated for Trainer.__init__ in recent transformers releases
# (presumably the warning whose residue is recorded under this cell);
# `processing_class=` is its replacement.
# NOTE(review): requires a transformers version that accepts `processing_class`.
trainer = Trainer(
    model=bertweet,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    processing_class=tokenizer,     # handed to the Trainer in place of the deprecated `tokenizer=`
    data_collator=data_collator,    # passed explicitly so pad_to_multiple_of=8 is kept
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [15]:
# Fine-tune the model using the settings in training_args.
# NOTE(review): the output recorded under this cell ends with a CUDA
# out-of-memory error and no "ENDING TRAINING" line, so the run failed inside
# trainer.train() and nothing below it executed.
trainer.train()
print(time(), "ENDING TRAINING")

# Final evaluation on eval_ds; the returned metrics are printed as-is.
metrics = trainer.evaluate()
print(metrics)

# Save the fine-tuned model and the tokenizer into the same SAVE_FOLDER/final directory.
trainer.save_model(f"{SAVE_FOLDER}/final")
tokenizer.save_pretrained(f"{SAVE_FOLDER}/final")
print(time(), "MODELS SAVED")

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
