# 2022-2023 Interação em Linguagem Natural

**Grupo 4**

54859 Rita Rodrigues

39074 Ana Silva

# Installation of required libraries

In [None]:
!pip install --quiet datasets transformers[sentencepiece]
!pip install --quiet accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m93.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m128.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

# Importing dependencies

In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW, TrainingArguments, Trainer
from datasets import DatasetDict, load_dataset, load_metric

# Loading the dataset

In [None]:
# BoolQ configuration of the SuperGLUE benchmark:
# https://huggingface.co/datasets/super_glue/viewer/boolq
dataset = load_dataset(path='super_glue', name='boolq')

# list the splits that were downloaded
dataset.keys()

Downloading builder script:   0%|          | 0.00/30.7k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/38.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.8k [00:00<?, ?B/s]

Downloading and preparing dataset super_glue/boolq to /root/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed...


Downloading data:   0%|          | 0.00/4.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3245 [00:00<?, ? examples/s]

Dataset super_glue downloaded and prepared to /root/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

dict_keys(['train', 'validation', 'test'])

In [None]:
# structure of the "train" split: its column names and number of rows
dataset['train']

Dataset({
    features: ['question', 'passage', 'idx', 'label'],
    num_rows: 9427
})

In [None]:
# question text of one training example (row 4), shown as a one-element list
dataset["train"][4:5]["question"]

['is elder scrolls online the same as skyrim']

In [None]:
# passage (the context the question is asked about) of the same training example
dataset["train"][4:5]["passage"]

['The Elder Scrolls Online -- As with other games in The Elder Scrolls series, the game is set on the continent of Tamriel. The events of the game occur a millennium before those of The Elder Scrolls V: Skyrim and around 800 years before The Elder Scrolls III: Morrowind and The Elder Scrolls IV: Oblivion. It has a broadly similar structure to Skyrim, with two separate conflicts progressing at the same time, one with the fate of the world in the balance, and one where the prize is supreme power on Tamriel. In The Elder Scrolls Online, the first struggle is against the Daedric Prince Molag Bal, who is attempting to meld the plane of Mundus with his realm of Coldharbour, and the second is to capture the vacant imperial throne, contested by three alliances of the mortal races. The player character has been sacrificed to Molag Bal, and Molag Bal has stolen their soul, the recovery of which is the primary game objective.']

In [None]:
# label of the same training example
dataset["train"][4:5]["label"]

[0]

In [None]:
# distinct label values present in the "train" split
np.unique(dataset["train"]["label"])

array([0, 1])

In [None]:
# distinct label values present in the "validation" split
np.unique(dataset["validation"]["label"])

array([0, 1])

In [None]:
# distinct label values present in the original "test" split
np.unique(dataset["test"]["label"])

array([-1])

In [None]:
# The original "test" split carries no usable labels (every label is -1), so it
# cannot be used to score the model. Instead, hold out 10% of the labelled
# "train" split for the final evaluation.
# A fixed seed keeps the shuffle, and therefore the held-out examples, the same
# on every run so results are comparable.
# NOTE: this variable shadows the `train_test_split` function imported from
# sklearn at the top of the notebook; the name is kept because later cells read it.
train_test_split = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_test_split

DatasetDict({
    train: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 8484
    })
    test: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 943
    })
})

In [None]:
# gather the three splits that are used from here on into one DatasetDict
fine_datasets = DatasetDict(
    train=train_test_split["train"],  # to train the model
    validation=dataset["validation"],  # to adjust the hyperparameters
    test=train_test_split["test"],  # to evaluate the model
)

fine_datasets

DatasetDict({
    train: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 8484
    })
    validation: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3270
    })
    test: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 943
    })
})

In [None]:
# the new "test" split is labelled: both label values should show up here
np.unique(fine_datasets["test"]["label"])

array([0, 1])

# Tokenizing the dataset

In [None]:
# tokenizer that belongs to the same checkpoint as the model fine-tuned below
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize(example):
    """Encode question/passage pairs with the module-level ``tokenizer``.

    The function is applied through ``map(..., batched=True)``, so ``example``
    holds the 'question' and 'passage' columns of a whole batch rather than a
    single row. Each question is paired with its passage, over-long pairs are
    truncated, and every sequence is padded with ``padding='max_length'``.
    """
    questions, passages = example['question'], example['passage']
    return tokenizer(questions, passages, truncation=True, padding='max_length')


tokenized_dataset = fine_datasets.map(tokenize, batched=True)

# one variable per split, in the order they are used below
train_dataset, validation_dataset, test_dataset = (
    tokenized_dataset[split] for split in ("train", "validation", "test")
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/8484 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/943 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


# Defining the model

In [None]:
# https://huggingface.co/roberta-base
# Pretrained roberta-base weights with a sequence-classification head for the
# two labels (0 and 1) found in the dataset.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

# Training the model

In [None]:
def compute_metrics(eval_preds):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is computed directly with numpy, so nothing has to be loaded or
    re-created each time the trainer evaluates.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` holds one row of
            per-class scores per example; ``labels`` holds the reference class
            index of each example.

    Returns:
        A dict ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the reference label.
    """
    logits, labels = eval_preds
    # the predicted class is the index of the largest score in each row
    predictions = np.argmax(logits, axis=-1)
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,  # train for 2 epochs
    per_device_train_batch_size=12,  # batches of 12 examples per device
    per_device_eval_batch_size=12,  # batches of 12 examples per device
    # Set explicitly instead of relying on the library default: the run recorded
    # with the default reported the same validation accuracy (0.621713) at every
    # evaluation step, which suggests the model was not learning. A smaller
    # step size is the usual first remedy for that when fine-tuning.
    learning_rate=1e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # evaluate on the validation split every 100 steps, checkpoint every 200
    evaluation_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=200
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,  # used for the periodic evaluations
    compute_metrics=compute_metrics
)

trainer.train()



Step,Training Loss,Validation Loss,Accuracy
100,0.6592,0.680666,0.621713
200,0.6575,0.66378,0.621713
300,0.6566,0.663784,0.621713
400,0.6569,0.681181,0.621713
500,0.6657,0.664002,0.621713
600,0.6605,0.666087,0.621713
700,0.6624,0.66388,0.621713
800,0.6737,0.66431,0.621713
900,0.6595,0.677974,0.621713
1000,0.6734,0.663688,0.621713


TrainOutput(global_step=1414, training_loss=0.6612495357615944, metrics={'train_runtime': 3020.0596, 'train_samples_per_second': 5.618, 'train_steps_per_second': 0.468, 'total_flos': 4464468387348480.0, 'train_loss': 0.6612495357615944, 'epoch': 2.0})

# Save the model

In [None]:
# write the trainer's current model to the "project_model" directory
trainer.save_model("project_model")

# Evaluation on the test dataset

In [None]:
# score the trained model on the labelled hold-out split taken from "train"
trainer.evaluate(test_dataset)

{'eval_loss': 0.6615504026412964,
 'eval_accuracy': 0.6267232237539767,
 'eval_runtime': 28.3028,
 'eval_samples_per_second': 33.318,
 'eval_steps_per_second': 2.791,
 'epoch': 2.0}

# 'test' function

In [None]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def test(passage, question):
    """Predict the label for a single passage/question pair.

    The pair is encoded with the module-level ``tokenizer`` (question first,
    passage second, the same order used when tokenizing the training data) and
    scored by the module-level ``model``.

    Args:
        passage: Text that the question is asked about.
        question: The question to answer.

    Returns:
        The index of the highest-scoring class as a numpy integer. The example
        cells in this notebook read 1 as "yes" and 0 as "no".
    """
    encoded = tokenizer(question, passage, truncation=True, return_tensors='pt')
    # Send the inputs to whichever device the model currently lives on, so the
    # call cannot fail on a device mismatch.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    # softmax does not change which class scores highest, so argmax is taken on
    # the logits directly; the batch holds exactly one example
    return logits.argmax(dim=-1).cpu().numpy()[0]

In [None]:
# sanity check of test() on a single question/passage pair
question = "is confectionary sugar the same as powdered sugar"
passage = "Powdered sugar, also called confectioners' sugar, icing sugar, and icing cake, is a finely ground sugar produced by milling granulated sugar into a powdered state. It usually contains a small amount of anti-caking agent to prevent \
 clumping and improve flow. Although most often produced in a factory, powdered sugar can also be made by processing ordinary granulated sugar in a coffee grinder, or by crushing it by hand in a mortar and pestle."
response = test(passage, question)
# the response should be 1 (yes)
print(response)

1


In [None]:
# second sanity check of test(), this time with an expected "no" answer
question = "is saline and sodium chloride the same thing"
passage = "Saline, also known as saline solution, is a mixture of sodium chloride in water and has a number of uses in medicine. Applied to the affected area it is used to clean wounds, help remove contact lenses, and help with dry eyes. \
  By injection into a vein it is used to treat dehydration such as from gastroenteritis and diabetic ketoacidosis. It is also used to dilute other medications to be given by injection."
response = test(passage, question)
# the response should be 0 (no)
# NOTE: the recorded run printed 1 here, which disagrees with the expected label
print(response)

1
