In [1]:
! pip install torch
! pip install transformers
! pip install tqdm 
! pip install pandas
! pip install torchmetrics
! pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
   ---------------------------------------- 0.0/163.3 kB ? eta -:--:--
   -- ------------------------------------- 10.2/163.3 kB ? eta -:--:--
   ---------------------------------------- 163.3/163.3 kB 2.5 MB/s eta 0:00:00
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.6.1


In [337]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import tqdm
import pandas as pd
from torchmetrics.regression import PearsonCorrCoef
from sentence_transformers import SentenceTransformer, util, InputExample, losses, models, evaluation

### Loading the data

In [338]:
def load_data(file_path):
    data = pd.read_table(file_path)
    # check if any missing values
    print(data.isnull().sum())
    key = data.keys()
    # some values were missing in sentence2 column, so did the below (sentence1 didnt split properly)
    # iterate through the rows in dataframe which have missing values
    for index, row in data[data.isnull().any(axis=1)].iterrows():
        if pd.isnull(row[key[2]]):
            if(len(row[key[1]].split('\t')) > 2 or len(row[key[1]].split('\t')) < 2):
                data.drop(index, inplace=True)
                continue
            # split the sentence1 into words into 2 parts based on \t and assign to sentence1 and sentence2
            sentence1, sentence2 = row[key[1]].split('\t')
            score = row[key[0]]
            # assign to the row
            data.at[index, key[1]] = sentence1
            data.at[index, key[2]] = sentence2
            data.at[index, key[0]] = score
    #rescale every score in data from 0-5 to 0-1
    data[key[0]] = data[key[0]]/5
    return data

In [339]:
train_data = load_data('train.csv')
valid_data = load_data('dev.csv')

score        0
sentence1    0
sentence2    5
dtype: int64
score        0
sentence1    0
sentence2    2
dtype: int64


In [340]:
valid_data.tail(10)

Unnamed: 0,score,sentence1,sentence2
1460,0.4,New UN peacekeeping chief named for Central Af...,UN takes over peacekeeping in Central African ...
1461,1.0,Oil falls in Asian trade,Oil prices down in Asian trade
1462,0.6,Israeli forces detain Palestinian MP in Hebron,Israeli forces detain 2 Palestinians in overni...
1463,0.6,Israeli police clash with Palestinian proteste...,Israel Police Clash With Palestinians in Jerus...
1464,0.0,"3 killed, 4 injured in Los Angeles shootings",Five killed in Saudi Arabia shooting
1465,0.4,Scientists prove there is water on Mars,Has Nasa discovered water on Mars?
1466,0.0,Pranab stresses need to strive for peace by na...,WTO: India regrets action of developed nations
1467,0.4,Volkswagen skids into red in wake of pollution...,"Volkswagen's ""gesture of goodwill"" to diesel o..."
1468,0.0,Obama is right: Africa deserves better leadership,Obama waiting for midterm to name attorney gen...
1469,0.0,New video shows US police officers beating men...,New York police officer critically wounded in ...


In [341]:
print(train_data.shape)
print(train_data['score'].value_counts())

(5709, 3)
score
0.0000    367
0.8000    351
0.6000    308
1.0000    265
0.7600    263
         ... 
0.0134      1
0.1454      1
0.3400      1
0.3556      1
0.8660      1
Name: count, Length: 139, dtype: int64


### Defining device variable

In [16]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


### Task 1A: using BERT to perform regression

### Creating a dataset class

In [198]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, data, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['sentence1'] + ' [SEP] ' + self.data.iloc[idx]['sentence2']
        inputs = self.tokenizer(text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors='pt')
        inputs['labels'] = torch.tensor(self.data.iloc[idx]['score'], dtype=torch.float)
        inputs = {key: inputs[key].squeeze() for key in inputs}
        return inputs

In [199]:
max_length = 50

In [200]:
# Load pre-trained model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
sep_token = '[SEP]'

# Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Defining the dataset and dataloader class:

In [242]:
train_dataset = TextDataset(tokenizer, train_data, max_length)
valid_dataset = TextDataset(tokenizer, valid_data, max_length)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=False)

In [21]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
pearson = PearsonCorrCoef()

### wandb setup

In [72]:
import wandb
wandb.login(relogin=True)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [73]:
model_config = dict(
    task = 1,
    part = 'A',
    model_name = 'bert-base-uncased',
    max_length = 50,
    batch_size = 8,
    learning_rate = 1e-5,
    optimizer = 'Adam',
    criterion = 'MSELoss',
    epochs = 5
)

In [74]:
wandb.init(project='assignment-3', entity= 'nlp-assignments', config=model_config)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112954300001043, max=1.0…

### Defining the model architecture:

In [75]:
class Model(torch.nn.Module):
    def __init__(self, model):
        super(Model, self).__init__()
        self.model = model

    def forward(self, input_ids, attention_mask, labels):
        outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits
    
model = Model(model)
model.to(device)


Model(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

### Defining train and evaluation loops

In [23]:
def train(model, loader, optimizer, epochs=1, valid_loader=None):
    # wandb.define_metric('epoch')
    # wandb.define_metric('training_loss', step_metric='epoch')
    # wandb.define_metric('validation_loss', step_metric='epoch')
    # wandb.define_metric('Pearson Correlation', step_metric='epoch')
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm.tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            loss, preds = model(input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch: {epoch + 1}, Training loss: {total_loss / len(loader)}')
        epoch_log = {}
        if(valid_loader is not None):
            loss, corr = evaluate(model, valid_loader)
        #     epoch_log['validation_loss'] = loss
        #     epoch_log['Pearson Correlation'] = corr
        # epoch_log['epoch'] = epoch
        # epoch_log['training_loss'] = total_loss / len(loader)
        # wandb.log(epoch_log)
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    total_loss = 0
    predicted = torch.tensor([])
    all_labels = torch.tensor([])
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            loss, preds = model(input_ids, attention_mask, labels)
            total_loss += loss.item()
            for i in preds:
                i = i.cpu()
                predicted = torch.cat((predicted, i))
            for i in labels:
                i = i.cpu()
                i = torch.tensor([i])
                all_labels = torch.cat((all_labels, i))
    print(f'Validation loss: {total_loss / len(loader)}, Pearson correlation: {pearson(predicted, all_labels)}')
    return total_loss / len(loader), pearson(predicted, all_labels).item()

In [77]:
train(model, train_loader, optimizer, epochs=5, valid_loader=valid_loader)
# evaluate(model, valid_loader)

100%|██████████| 714/714 [00:51<00:00, 13.98it/s]


Epoch: 1, Training loss: 0.06139531917283077


100%|██████████| 184/184 [00:05<00:00, 35.88it/s]


Validation loss: 0.034116377368184694, Pearson correlation: 0.8023785948753357


100%|██████████| 714/714 [00:50<00:00, 14.17it/s]


Epoch: 2, Training loss: 0.020675181214246347


100%|██████████| 184/184 [00:05<00:00, 35.79it/s]


Validation loss: 0.02697314087898754, Pearson correlation: 0.8386620879173279


100%|██████████| 714/714 [00:50<00:00, 14.15it/s]


Epoch: 3, Training loss: 0.010633578259042655


100%|██████████| 184/184 [00:05<00:00, 35.41it/s]


Validation loss: 0.026718384039628763, Pearson correlation: 0.8420818448066711


100%|██████████| 714/714 [00:50<00:00, 14.17it/s]


Epoch: 4, Training loss: 0.005479094146731954


100%|██████████| 184/184 [00:05<00:00, 35.66it/s]


Validation loss: 0.02651181103568018, Pearson correlation: 0.8404891490936279


100%|██████████| 714/714 [00:50<00:00, 14.15it/s]


Epoch: 5, Training loss: 0.003137762849714805


100%|██████████| 184/184 [00:05<00:00, 35.88it/s]

Validation loss: 0.030492372087040996, Pearson correlation: 0.8379072546958923





0.003137762849714805

In [79]:
wandb.finish()

In [81]:
torch.save(model.state_dict(), '1A_model.pt')

In [22]:
#load data from sample_demo.csv
test_data = load_data('/kaggle/input/nlp-a3/sample_demo.csv')
test_dataset = TextDataset(tokenizer, test_data, max_length)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

id           0

score        0

setence1     0

sentence2    0

dtype: int64


In [23]:
model.load_state_dict(torch.load('model.pt'))
evaluate(model, test_loader)

FileNotFoundError: [Errno 2] No such file or directory: 'model.pt'

### Part 1B

In [356]:
# Load the BERT base model
bert_model = models.Transformer('bert-base-uncased')

# Define the pooling layer
pooling_model = models.Pooling(bert_model.get_word_embedding_dimension())

# Create the Sentence Transformer model
model = SentenceTransformer(modules=[bert_model, pooling_model])


In [357]:
class SentenceDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence1 = self.data.iloc[idx, 1]
        sentence2 = self.data.iloc[idx, 2]
        score = self.data.iloc[idx, 0]
        return [sentence1, sentence2], score
    

In [358]:
def validation(model, valid_loader):
    model.eval()
    all_scores = []
    all_targets = []
    with torch.no_grad():
        for batch in tqdm.tqdm(valid_loader):
            sentences, targets = batch
            # print(len(sentences[0]), len(targets))
            for i in range(len(targets)):
                sentence1_features = model.encode(sentences[0][i], convert_to_tensor=True)
                sentence2_features = model.encode(sentences[1][i], convert_to_tensor=True)
                score = util.pytorch_cos_sim(sentence1_features, sentence2_features)
                all_scores.append(score.item())
                all_targets.append(targets[i])
    return pearson(torch.tensor(all_scores), torch.tensor(all_targets))

In [359]:
train_dataset = SentenceDataset(train_data)
valid_dataset = SentenceDataset(valid_data)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=False)

model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [360]:
validation(model, valid_loader)

100%|██████████| 184/184 [03:44<00:00,  1.22s/it]


tensor(0.5855)

### Part 1C

In [350]:
class SentenceDataset(torch.utils.data.Dataset):
    # def __init__(self, data, max_length):
    #     self.data = data
    #     self.max_length = max_length

    # def __len__(self):
    #     return len(self.data)

    # def __getitem__(self, idx):
    #     return self.data.iloc[idx]['sentence1'], self.data.iloc[idx]['sentence2'], self.data.iloc[idx]['score'] 
    def __init__(self, tokenizer, data, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence1 = self.data.iloc[idx]['sentence1']
        sentence2 = self.data.iloc[idx]['sentence2']
        score = self.data.iloc[idx]['score']
        #tokenize
        inputs = self.tokenizer(sentence1, sentence2, max_length=self.max_length, padding="max_length", truncation=True, return_tensors='pt')
        inputs['labels'] = torch.tensor(score, dtype=torch.float)
        inputs = {key: inputs[key].squeeze() for key in inputs}
        return inputs


In [351]:
transformed_train_data = [InputExample(texts=[train_data.iloc[i]['sentence1'], train_data.iloc[i]['sentence2']], label=train_data.iloc[i]['score']) for i in range(len(train_data))]
transformed_valid_data = [InputExample(texts=[valid_data.iloc[i]['sentence1'], valid_data.iloc[i]['sentence2']], label=valid_data.iloc[i]['score']) for i in range(len(valid_data))]

In [352]:
def get_data(data):
    sentence1 = data['sentence1'].tolist()
    sentence2 = data['sentence2'].tolist()
    score = data['score'].tolist()
    return sentence1, sentence2, torch.tensor(score)

In [353]:
def evaluate_sentence(model, dataloader):
    model.eval()
    predicted = torch.tensor([])
    all_labels = torch.tensor([])
    for i in tqdm.tqdm(dataloader):
        sentences_features, labels = i
        sentences_features = {key: value.to(device) for key, value in sentences_features.items()}
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(**sentences_features, labels=labels)
        for i in outputs.logits:    
            i = i.cpu()
            predicted = torch.cat((predicted, i))
        for i in labels:
            i = i.cpu()
            all_labels = torch.cat((all_labels, i))
    print(f'Pearson correlation: {pearson(predicted, all_labels)}')
    return pearson(predicted, all_labels).item()

In [354]:
class CosineLoss(torch.nn.Module):
    def __init__(self, model, loss_fct= torch.nn.MSELoss(), cos_score_transformation= torch.nn.Identity()):
        super(CosineLoss, self).__init__()
        self.model = model
        self.loss_fct = loss_fct
        self.cos_score_transformation = cos_score_transformation

    def forward(self, sentence1, sentence2, score):
        sentence1 = self.model.encode(sentence1, convert_to_tensor=True, device=device)
        sentence2 = self.model.encode(sentence2, convert_to_tensor=True, device=device)
        output = self.cos_score_transformation(torch.cosine_similarity(sentence1, sentence2))
        return self.loss_fct(output, score)

In [355]:
train_dataset = SentenceDataset(tokenizer, train_data, max_length)
valid_dataset = SentenceDataset(tokenizer, valid_data, max_length)

# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
# valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=False)
train_loader = torch.utils.data.DataLoader(transformed_train_data, shuffle=True, batch_size=8)
valid_loader = torch.utils.data.DataLoader(transformed_valid_data, shuffle=False, batch_size=8)
train_loader.collate_fn = model.smart_batching_collate
valid_loader.collate_fn = model.smart_batching_collate

word_embedding_model = models.Transformer('bert-base-uncased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss = losses.CosineSimilarityLoss(model)
train_loss = CosineSimilarityLoss(model=model)
valid_loss = CosineSimilarityLoss(model=model)
# evaluator = evaluation.EmbeddingSimilarityEvaluator(valid_loader)

model.to(device)
pearson_1b = evaluate_sentence(model, valid_loader)
# wandb.init(project='assignment-3', entity= 'nlp-assignments', config=model_config)

# model.fit(train_objectives=[(train_loader, loss)], epochs=5, optimizer_params={'lr': 1e-5})


  0%|          | 0/184 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'items'

In [None]:
print(pearson_1b)

0.10537926107645035


### Part 1C

In [None]:
def trainer(model, train_loader, loss_fn, optimizer, epochs=2, valid_loader = None):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for i in tqdm.tqdm(train_loader):
            sentence1, sentence2, score = i
            embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
            output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1]))
            score = torch.tensor(score, dtype=torch.float).to(device)
            loss = loss_fn(sentence1, sentence2, score)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch: {epoch + 1}, Training loss: {total_loss / len(train_loader)}')
        if(valid_loader is not None):
            pearson = evaluate_sentence(model, valid_loader)
            print(f'Pearson correlation: {pearson}')
    
trainer(model, train_loader, loss, optimizer, epochs=2, valid_loader=valid_loader)

  0%|          | 0/714 [00:00<?, ?it/s]

('GOP candidate Romney to call to arm Syrian rebels', '3 suspected extremists were released on bail.', 'Two dogs are playing around in the dirt.', 'How would Martin not know that that was not the case?', 'Lay had argued that handing over the documents would be a violation of his Fifth Amendment rights against self-incrimination.', 'Ex-Virginia governor Bob McDonnell charged with corruption', 'photo of a television screen showing a movie.', 'a white dog running in the snow')
('US helping get arms to Syria rebels', '1 suspected extremist was provisionally released without bail.', 'A dancer posing for the camera in a red and white dress.', 'How would Martin or anyone else know his intentions based on his actions?', 'Lay had refused to turn over the papers, asserting his Fifth Amendment right against self-incrimination.', "Virginia's ex-Gov. Bob McDonnell, wife charged with corruption", 'The wall-mounted flat-screen TV is showing a movie.', 'The dogs are running in the snow.')


  0%|          | 0/714 [00:00<?, ?it/s]


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [295]:
import torch
from torch import nn, Tensor
from typing import Iterable, Dict


class CosineSimilarityLoss(nn.Module):
    def __init__(self, model: SentenceTransformer, loss_fct=nn.MSELoss(), cos_score_transformation=nn.Identity()):
        """
        CosineSimilarityLoss expects that the InputExamples consists of two texts and a float label. It computes the
        vectors ``u = model(sentence_A)`` and ``v = model(sentence_B)`` and measures the cosine-similarity between the two.
        By default, it minimizes the following loss: ``||input_label - cos_score_transformation(cosine_sim(u,v))||_2``.

        :param model: SentenceTransformer model
        :param loss_fct: Which pytorch loss function should be used to compare the ``cosine_similarity(u, v)`` with the input_label?
            By default, MSE is used: ``||input_label - cosine_sim(u, v)||_2``
        :param cos_score_transformation: The cos_score_transformation function is applied on top of cosine_similarity.
            By default, the identify function is used (i.e. no change).

        References:
            - `Training Examples > Semantic Textual Similarity <../../examples/training/sts/README.html>`_

        Requirements:
            1. Sentence pairs with corresponding similarity scores in range `[0, 1]`

        Relations:
            - :class:`CoSENTLoss` seems to produce a stronger training signal than CosineSimilarityLoss. In our experiments, CoSENTLoss is recommended.
            - :class:`AnglELoss` is :class:`CoSENTLoss` with ``pairwise_angle_sim`` as the metric, rather than ``pairwise_cos_sim``. It also produces a stronger training signal than CosineSimilarityLoss.

        Inputs:
            +--------------------------------+------------------------+
            | Texts                          | Labels                 |
            +================================+========================+
            | (sentence_A, sentence_B) pairs | float similarity score |
            +--------------------------------+------------------------+

        Example:
            ::

                from sentence_transformers import SentenceTransformer, InputExample, losses
                from torch.utils.data import DataLoader

                model = SentenceTransformer('distilbert-base-nli-mean-tokens')
                train_examples = [
                    InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
                    InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)
                ]
                train_batch_size = 1
                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
                train_loss = losses.CosineSimilarityLoss(model=model)

                model.fit(
                    [(train_dataloader, train_loss)],
                    epochs=10,
                )
        """
        super(CosineSimilarityLoss, self).__init__()
        self.model = model
        self.loss_fct = loss_fct
        self.cos_score_transformation = cos_score_transformation

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
        output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1]))
        return self.loss_fct(output, labels.view(-1))


model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [
    InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
    InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3),
    InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
    InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3),
]
train_batch_size = 4
train_dataloader = torch.utils.data.DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = CosineSimilarityLoss(model=model)

model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

[{'input_ids': tensor([[ 101, 2026, 2034, 6251,  102],
        [ 101, 2178, 3940,  102,    0],
        [ 101, 2178, 3940,  102,    0],
        [ 101, 2026, 2034, 6251,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]])}, {'input_ids': tensor([[  101,  2026,  2117,  6251,   102],
        [  101, 15142,  6251,   102,     0],
        [  101, 15142,  6251,   102,     0],
        [  101,  2026,  2117,  6251,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]])}]
tensor([0.6208, 0.5664, 0.5365, 0.6643], grad_fn=<SumBackward1>) tensor([0.8000, 0.3000, 0.3000, 0.8000])


Iteration: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Epoch:  10%|█         | 1/10 [00:00<00:07,  1.16it/s]

[{'input_ids': tensor([[ 101, 2178, 3940,  102,    0],
        [ 101, 2026, 2034, 6251,  102],
        [ 101, 2026, 2034, 6251,  102],
        [ 101, 2178, 3940,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0]])}, {'input_ids': tensor([[  101, 15142,  6251,   102,     0],
        [  101,  2026,  2117,  6251,   102],
        [  101,  2026,  2117,  6251,   102],
        [  101, 15142,  6251,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0]])}]
tensor([0.6017, 0.5778, 0.6144, 0.5989], grad_fn=<SumBackward1>) tensor([0.3000, 0.8000, 0.8000, 0.3000])


Iteration: 100%|██████████| 1/1 [00:00<00:00,  1.70it/s]
Epoch:  20%|██        | 2/10 [00:01<00:05,  1.41it/s]

[{'input_ids': tensor([[ 101, 2178, 3940,  102,    0],
        [ 101, 2178, 3940,  102,    0],
        [ 101, 2026, 2034, 6251,  102],
        [ 101, 2026, 2034, 6251,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}, {'input_ids': tensor([[  101, 15142,  6251,   102,     0],
        [  101, 15142,  6251,   102,     0],
        [  101,  2026,  2117,  6251,   102],
        [  101,  2026,  2117,  6251,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}]
tensor([0.5945, 0.5982, 0.6136, 0.6013], grad_fn=<SumBackward1>) tensor([0.3000, 0.3000, 0.8000, 0.8000])


Iteration: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]
Epoch:  30%|███       | 3/10 [00:02<00:04,  1.47it/s]

[{'input_ids': tensor([[ 101, 2026, 2034, 6251,  102],
        [ 101, 2026, 2034, 6251,  102],
        [ 101, 2178, 3940,  102,    0],
        [ 101, 2178, 3940,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0]])}, {'input_ids': tensor([[  101,  2026,  2117,  6251,   102],
        [  101,  2026,  2117,  6251,   102],
        [  101, 15142,  6251,   102,     0],
        [  101, 15142,  6251,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0]])}]
tensor([0.7006, 0.6271, 0.5876, 0.5627], grad_fn=<SumBackward1>) tensor([0.8000, 0.8000, 0.3000, 0.3000])


Iteration: 100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
Epoch:  40%|████      | 4/10 [00:02<00:03,  1.51it/s]

[{'input_ids': tensor([[ 101, 2026, 2034, 6251,  102],
        [ 101, 2026, 2034, 6251,  102],
        [ 101, 2178, 3940,  102,    0],
        [ 101, 2178, 3940,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0]])}, {'input_ids': tensor([[  101,  2026,  2117,  6251,   102],
        [  101,  2026,  2117,  6251,   102],
        [  101, 15142,  6251,   102,     0],
        [  101, 15142,  6251,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0]])}]
tensor([0.6482, 0.6376, 0.6369, 0.5850], grad_fn=<SumBackward1>) tensor([0.8000, 0.8000, 0.3000, 0.3000])


Iteration: 100%|██████████| 1/1 [00:00<00:00,  1.85it/s]
Epoch:  50%|█████     | 5/10 [00:03<00:03,  1.61it/s]

[{'input_ids': tensor([[ 101, 2178, 3940,  102,    0],
        [ 101, 2178, 3940,  102,    0],
        [ 101, 2026, 2034, 6251,  102],
        [ 101, 2026, 2034, 6251,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}, {'input_ids': tensor([[  101, 15142,  6251,   102,     0],
        [  101, 15142,  6251,   102,     0],
        [  101,  2026,  2117,  6251,   102],
        [  101,  2026,  2117,  6251,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}]
tensor([0.6247, 0.6023, 0.6234, 0.6322], grad_fn=<SumBackward1>) tensor([0.3000, 0.3000, 0.8000, 0.8000])


Iteration: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]
Epoch:  60%|██████    | 6/10 [00:03<00:02,  1.60it/s]

[{'input_ids': tensor([[ 101, 2178, 3940,  102,    0],
        [ 101, 2026, 2034, 6251,  102],
        [ 101, 2026, 2034, 6251,  102],
        [ 101, 2178, 3940,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0]])}, {'input_ids': tensor([[  101, 15142,  6251,   102,     0],
        [  101,  2026,  2117,  6251,   102],
        [  101,  2026,  2117,  6251,   102],
        [  101, 15142,  6251,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0]])}]
tensor([0.5125, 0.6705, 0.5742, 0.5438], grad_fn=<SumBackward1>) tensor([0.3000, 0.8000, 0.8000, 0.3000])


Iteration: 100%|██████████| 1/1 [00:00<00:00,  1.69it/s]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s]it/s]
Epoch:  70%|███████   | 7/10 [00:04<00:01,  1.54it/s]


[{'input_ids': tensor([[ 101, 2178, 3940,  102,    0],
        [ 101, 2026, 2034, 6251,  102],
        [ 101, 2178, 3940,  102,    0],
        [ 101, 2026, 2034, 6251,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]])}, {'input_ids': tensor([[  101, 15142,  6251,   102,     0],
        [  101,  2026,  2117,  6251,   102],
        [  101, 15142,  6251,   102,     0],
        [  101,  2026,  2117,  6251,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]])}]


KeyboardInterrupt: 