In [1]:
import torch
import torch.nn as nn

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256-2
xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path), # split up the data 
    T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)), # give the data indexes according to the model
    T.Truncate(max_seq_len), # shortens up the data, interms of lengths, stf...
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
)


from torch.utils.data import DataLoader

In [3]:
# text_transform = XLMR_BASE_ENCODER.transform()


In [4]:
T.SentencePieceTokenizer(xlmr_spm_model_path)("testing this transformer")

['▁testing', '▁this', '▁transform', 'er']

In [5]:
T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path))(T.SentencePieceTokenizer(xlmr_spm_model_path)("testing this transformer"))

[134234, 903, 27198, 56]

In [6]:
T.Truncate(max_seq_len)(T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path))(T.SentencePieceTokenizer(xlmr_spm_model_path)("testing this transformer")))

[134234, 903, 27198, 56]

In [7]:
T.AddToken(token=bos_idx, begin=True)(T.Truncate(max_seq_len - 2)(T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path))(T.SentencePieceTokenizer(xlmr_spm_model_path)("testing this transformer"))))

[0, 134234, 903, 27198, 56]

In [8]:
T.AddToken(token=eos_idx, begin=False)(T.AddToken(token=bos_idx, begin=True)(T.Truncate(max_seq_len - 2)(T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path))(T.SentencePieceTokenizer(xlmr_spm_model_path)("testing this transformer")))))

[0, 134234, 903, 27198, 56, 2]

In [10]:
from torchtext.datasets import SST2

batch_size = 16

train_datapipe = SST2(split="dev")
dev_datapipe = SST2(split="dev")


# Transform the raw dataset using non-batched API (i.e apply transformation line by line)
def apply_transform(x):
    return text_transform(x[0]), x[1]


train_datapipe = train_datapipe.map(apply_transform)
train_datapipe = train_datapipe.batch(batch_size)
train_datapipe = train_datapipe.rows2columnar(["token_ids", "target"])
train_dataloader = DataLoader(train_datapipe, batch_size=None)

dev_datapipe = dev_datapipe.map(apply_transform)
dev_datapipe = dev_datapipe.batch(batch_size)
dev_datapipe = dev_datapipe.rows2columnar(["token_ids", "target"])
dev_dataloader = DataLoader(dev_datapipe, batch_size=None)

In [11]:
# for x in train_dataloader:
#     print(x)

In [12]:
def batch_transform(x):
    return {"token_ids": text_transform(x["text"]), "target": x["label"]}


train_datapipe = train_datapipe.batch(batch_size).rows2columnar(["text", "label"])
train_datapipe = train_datapipe.map(lambda x: batch_transform)
dev_datapipe = dev_datapipe.batch(batch_size).rows2columnar(["text", "label"])
dev_datapipe = dev_datapipe.map(lambda x: batch_transform)

In [13]:
num_classes = 2
input_dim = 768

from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
model.to(DEVICE)

RobertaModel(
  (encoder): RobertaEncoder(
    (transformer): TransformerEncoder(
      (token_embedding): Embedding(250002, 768, padding_idx=1)
      (layers): TransformerEncoder(
        (layers): ModuleList(
          (0-11): 12 x TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (linear1): Linear(in_features=768, out_features=3072, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear2): Linear(in_features=3072, out_features=768, bias=True)
            (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.1, inplace=False)
            (dropout2): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (positional_embedding): PositionalEmbedding(
        (embedding): Embedding(5

In [14]:
import torchtext.functional as F
from torch.optim import AdamW

learning_rate = 1e-5
optim = AdamW(model.parameters(), lr=learning_rate)
criteria = nn.CrossEntropyLoss()


def train_step(input, target):
    output = model(input)
    loss = criteria(output, target)
    optim.zero_grad()
    loss.backward()
    optim.step()


def eval_step(input, target):
    output = model(input)
    loss = criteria(output, target).item()
    return float(loss), (output.argmax(1) == target).type(torch.float).sum().item()


def evaluate():
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    counter = 0
    with torch.no_grad():
        for batch in dev_dataloader:
            input = F.to_tensor(batch["token_ids"], padding_value=padding_idx).to(DEVICE)
            target = torch.tensor(batch["target"]).to(DEVICE)
            loss, predictions = eval_step(input, target)
            total_loss += loss
            correct_predictions += predictions
            total_predictions += len(target)
            counter += 1

    return total_loss / counter, correct_predictions / total_predictions

In [15]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f1e67525e90>

In [16]:
num_epochs = 1

for e in range(num_epochs):
    for batch in train_dataloader:
        input = F.to_tensor(batch["token_ids"], padding_value=padding_idx).to(DEVICE)
        target = torch.tensor(batch["target"]).to(DEVICE)
        train_step(input, target)

    loss, accuracy = evaluate()
    print("Epoch = [{}], loss = [{}], accuracy = [{}]".format(e, loss, accuracy))

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Epoch = [0], loss = [0.6923663139343261], accuracy = [0.5091743119266054]


In [22]:
import numpy as np

In [25]:
# batch['token_ids']

In [18]:
F.to_tensor(batch['token_ids']).shape

ValueError: expected sequence of length 19 at dim 1 (got 29)

In [20]:
F.to_tensor(batch["token_ids"], padding_value=padding_idx)

tensor([[     0,    115,   1181,      6,      4,   6897,   9393,      6,      4,
            136,  79775,   4745,    538,      6,  93457,     71,      6,      5,
              2,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1],
        [     0,    347,  13400,  10792,  18788,     53,    242,      7,   7477,
             47,  39544,      6,      4,     70,  24911,   5245,    621,  68403,
              6,      4,    959,   5045,  11476,     23,     70,  26499,      6,
              5,      2,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1],
        [     0,     70,   2060,  22690,   1543,    186,  79929,     47,  16916,
              6,      4,   1284,   1295,  14922,  22443,     47,  26866,      7,
            111,     70,  26498,      6,      4,    903,   1346,     83,  