In [1]:
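# Uncomment to hide all GPUs from PyTorch and force CPU execution.
# This must run before CUDA is first initialised to take effect.
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = ""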

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import json
import os
from glob import glob
from collections import Counter, defaultdict

In [4]:
from typing import List, Dict

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EvalPrediction,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Sanity check: report whether PyTorch can see a CUDA device.
# (`torch` is already imported in an earlier cell; re-importing is harmless.)
import torch
torch.cuda.is_available()

True

In [7]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [8]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

# Bare BERTweet encoder (no classification head) and its matching tokenizer.
# NOTE(review): `bertweet` is not referenced by any later cell visible in this
# notebook (the model that gets fine-tuned is loaded separately), so this load
# may only be costing memory — confirm before removing.
bertweet = AutoModel.from_pretrained("vinai/bertweet-large")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from torch.utils.data import Dataset

class ChangeDetectionDataset(Dataset):
    """Sentence-pair dataset for change detection.

    Every example is a pair of consecutive non-empty lines taken from a
    ``problem-*.txt`` file, labelled with the entry at the same position in the
    ``"changes"`` list of the matching ``truth-problem-*.json`` file.
    Tokenization happens lazily in ``__getitem__``.
    """

    def __init__(self, root_dir: str, tokenizer, max_length: int = 512, padding=False):
        """
        Args:
            root_dir: directory such as "easy/train" or "hard/validation".
                Expects files: problem-*.txt and truth-problem-*.json
            tokenizer: callable invoked as
                ``tokenizer(sent1, sent2, truncation=..., max_length=..., padding=...)``
                that returns a mutable mapping of model inputs.
            max_length: maximum length of a tokenized pair; longer pairs are truncated.
            padding: padding strategy forwarded to the tokenizer. The default
                ``False`` leaves examples unpadded so that a padding collator
                (e.g. ``DataCollatorWithPadding``) can pad each batch only to its
                longest member. Pass ``"max_length"`` to pad every example to
                ``max_length`` instead.
        """
        import warnings

        self.examples = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

        # glob returns paths in arbitrary order; sort so the example order is
        # identical from run to run.
        for txt_path in sorted(glob(os.path.join(root_dir, "problem-*.txt"))):
            base = os.path.splitext(os.path.basename(txt_path))[0]  # e.g. "problem-3"
            json_path = os.path.join(root_dir, f"truth-{base}.json")
            if not os.path.exists(json_path):
                # No ground truth for this problem: nothing to learn from.
                continue

            # Sentences: one per line, blank lines dropped.
            with open(txt_path, encoding="utf-8") as f:
                lines = [text for text in (raw.strip() for raw in f) if text]
            # Labels: one per pair of consecutive sentences.
            with open(json_path, encoding="utf-8") as f:
                changes: List[int] = json.load(f)["changes"]

            expected = max(len(lines) - 1, 0)
            if len(changes) != expected:
                # Keep the best-effort behaviour (surplus lines/labels are
                # ignored) but make the mismatch visible.
                warnings.warn(
                    f"{txt_path}: {len(lines)} non-empty lines but {len(changes)} labels "
                    f"(expected {expected}); surplus lines/labels are ignored.",
                    stacklevel=2,
                )

            # Pair (sent_i, sent_{i+1}) with changes[i]; zip stops at the shortest input.
            for first, second, label in zip(lines, lines[1:], changes):
                self.examples.append({
                    "sent1": first,
                    "sent2": second,
                    "label": label,
                })

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and attach its label under the ``"labels"`` key."""
        ex = self.examples[idx]
        # Truncation happens here; padding is left to the collator unless the
        # dataset was built with an explicit ``padding`` strategy.
        enc = self.tokenizer(
            ex["sent1"],
            ex["sent2"],
            truncation=True,
            max_length=self.max_length,
            padding=self.padding,
        )
        enc["labels"] = torch.tensor(ex["label"], dtype=torch.long)
        return enc

In [10]:
# Build the sentence-pair datasets for the "hard" split. The paths are relative
# to the notebook's working directory.
train_ds = ChangeDetectionDataset("hard/train", tokenizer)
eval_ds  = ChangeDetectionDataset("hard/validation", tokenizer)

In [11]:
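# Optional sanity check over the tokenized training set (tokenizes every example, so it is slow).
# for i in range(len(train_ds)):
#     assert min(train_ds[i]['input_ids']) >= 0
#     assert set(train_ds[i]['attention_mask']) <= {0, 1}
#     assert max(train_ds[i]['input_ids']) < len(tokenizer)  # upper bound needs max(), not min()
#     assert train_ds[i]['labels'] in [0,1]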
    

In [12]:
class CrossEncoderClassifier(nn.Module):
    """Sentence-pair classifier: pretrained encoder, dropout, then a linear head.

    The representation of the first token of the encoded pair is fed to the
    classification head.
    """

    def __init__(self, model_name="vinai/bertweet-large", num_labels=2, dropout_prob=0.1):
        """
        Args:
            model_name: checkpoint name or path passed to ``AutoModel.from_pretrained``.
            num_labels: number of output classes.
            dropout_prob: dropout probability applied before the linear head.
        """
        super().__init__()
        self.bertweet = AutoModel.from_pretrained(model_name)
        hidden_size = self.bertweet.config.hidden_size
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Encode the pair and classify it.

        Args:
            input_ids: token ids of the encoded pair(s).
            attention_mask: mask forwarded to the encoder alongside ``input_ids``.
            labels: optional class indices; when given, a cross-entropy loss is computed.

        Returns:
            ``SequenceClassifierOutput`` with ``logits`` and ``loss``
            (``loss`` is ``None`` when ``labels`` is not supplied).
        """
        # This name is not imported anywhere else in the notebook; without this
        # import the return statement raises NameError on the first forward pass.
        from transformers.modeling_outputs import SequenceClassifierOutput

        outputs = self.bertweet(input_ids=input_ids, attention_mask=attention_mask)
        # Take the first ([CLS]-position) token of every sequence.
        cls_rep = outputs.last_hidden_state[:, 0, :]
        cls_rep = self.dropout(cls_rep)
        logits = self.classifier(cls_rep)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [13]:
# The classification head is not part of the checkpoint and is initialised
# randomly, so seed first to make runs start from the same head weights.
# 42 matches the default `seed` of TrainingArguments.
torch.manual_seed(42)

# BERTweet with a sequence-classification head for the binary change/no-change task.
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-large",
    num_labels=2
)
# Use the GPU when one is visible; fall back to CPU instead of raising
# (e.g. when CUDA_VISIBLE_DEVICES is set to "").
model.to("cuda" if torch.cuda.is_available() else "cpu")
# Pads each batch dynamically to its longest example.
data_collator = DataCollatorWithPadding(tokenizer)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Fine-tuning configuration: 3 epochs, evaluating and checkpointing after each one.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    # Mixed precision saves memory and boosts throughput, but requesting it on a
    # CPU-only machine is rejected, so enable it only when a GPU is visible.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,  # load data in parallel
)

In [15]:
def compute_metrics(eval_pred: EvalPrediction) -> Dict[str, float]:
    """Turn raw evaluation logits into accuracy, precision, recall and F1.

    Precision, recall and F1 use the binary average, i.e. they are reported
    for label 1.
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs after the logits; keep the logits only.
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    labels = eval_pred.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# Without compute_metrics the per-epoch evaluation would only report the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Trainer device:", trainer.model.device)

Trainer device: cuda:0


In [None]:
# Run fine-tuning. With the TrainingArguments defined above, evaluation and
# checkpointing happen once per epoch.
trainer.train()

In [2]:
import gc
import torch

# Collect first: objects kept alive only by reference cycles are destroyed here,
# which returns any GPU blocks they held to PyTorch's caching allocator.
freed_objects = gc.collect()
# Only now can empty_cache() hand those unused blocks back to the driver.
torch.cuda.empty_cache()
freed_objects  # number of unreachable objects the collector found

20