<a href="https://colab.research.google.com/github/TheoLpr/NLI_study/blob/main/Natural_language_inference_Roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Library setup

In [None]:
!pip3 install -q --upgrade transformers
!pip3 install -q --upgrade datasets
!pip3 install -q --upgrade evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

import numpy as np
from tqdm import tqdm

Use the `AutoModelFor...` classes: they select the right architecture for the given checkpoint behind the scenes, so practically the same code can run many different models.
The same applies to `AutoTokenizer`.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# --- Model / task -----------------------------------------------------------
pretrained_name_or_path = "roberta-base"
num_classes = 3  # {"entailment", "neutral", "contradiction"}

# --- Hardware: use the GPU when one is visible, otherwise fall back to CPU ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Tokenization -----------------------------------------------------------
MAX_LENGTH = 128  # every premise/hypothesis pair is padded or truncated to this many tokens

# --- Optimization -----------------------------------------------------------
LEARNING_RATE = 2e-6
BATCH_SIZE = 4
NUM_EPOCHS = 1
# A validation pass is run each time this many training examples have been seen.
VALIDATE_EVERY_N_EXAMPLES = 10_000

# Raw string columns: never fed to the model, but handy for linking
# predictions back to the original examples.
RAW_DATA_KEYS = {"premise", "hypothesis"}

In [None]:
# Tokenizer and classifier are both resolved from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(pretrained_name_or_path)

# The classification head is sized for `num_classes` labels; the whole model
# is then moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_name_or_path,
    num_labels=num_classes,
)
model = model.to(DEVICE)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# AdamW over every model parameter (encoder and classification head alike),
# using the learning rate from the config cell.
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
def tokenize_function(data_to_tokenize):
    """Tokenize a batch of premise/hypothesis pairs.

    Args:
        data_to_tokenize: mapping with parallel "premise" and "hypothesis"
            sequences (one batch, as passed by a batched `Dataset.map`).

    Returns:
        Whatever the global `tokenizer` returns for the list of
        (premise, hypothesis) tuples, padded/truncated to `MAX_LENGTH`
        and requested as PyTorch tensors.
    """
    sentence_pairs = [
        (premise, hypothesis)
        for premise, hypothesis in zip(data_to_tokenize["premise"],
                                       data_to_tokenize["hypothesis"])
    ]
    # Fixed-length padding keeps every example the same size.
    return tokenizer(
        sentence_pairs,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

## Data loading


In [None]:
import datasets

# Load SNLI and rename the target column "label" -> "labels" in one go;
# "labels" is the key that is later forwarded to the model together with
# the tokenized inputs.
data = datasets.load_dataset("snli").rename_column("label", "labels")

Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

In [None]:
# Drop examples with invalid labels (marked with -1) from every split.
train_data, val_data, test_data = (
    data[split_name].filter(lambda _ex: _ex["labels"] != -1)
    for split_name in ("train", "validation", "test")
)

# Only the first 50,000 valid training examples are used.
train_data = train_data.select(range(50000))

# Tokenize each split in batches of 10,000 and expose the columns as torch tensors.
train_data, val_data, test_data = (
    split.map(tokenize_function, batched=True, batch_size=10000, load_from_cache_file=False).with_format("torch")
    for split in (train_data, val_data, test_data)
)

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

In [None]:
# Number of training chunks per epoch: ceiling of
# len(train_data) / VALIDATE_EVERY_N_EXAMPLES, so a final partial chunk
# still gets its own training + validation round.
NUM_SUBSETS = -(-len(train_data) // VALIDATE_EVERY_N_EXAMPLES)

## Training

In [None]:
for idx_epoch in range(NUM_EPOCHS):
    # New random visiting order of the training examples for every epoch.
    rand_indices = torch.randperm(len(train_data))
    # Running totals for the epoch-so-far average loss (not reset between subsets).
    train_loss, num_batches = 0.0, 0

    for idx_subset in range(NUM_SUBSETS):
        # Bounds of the current chunk inside the shuffled index list.
        s_sub = idx_subset * VALIDATE_EVERY_N_EXAMPLES
        e_sub = s_sub + VALIDATE_EVERY_N_EXAMPLES

        # ---------------- training on the current chunk ----------------
        model.train()

        # Subset restricts the dataset to this chunk's indices;
        # DataLoader takes care of batching.
        train_subset = Subset(train_data, rand_indices[s_sub:e_sub].tolist())
        for curr_batch in tqdm(DataLoader(train_subset, batch_size=BATCH_SIZE)):
            # Forward only the non-raw columns; the premise/hypothesis strings are skipped.
            batch_on_device = {
                name: column.to(DEVICE)
                for name, column in curr_batch.items()
                if name not in RAW_DATA_KEYS
            }

            res = model(**batch_on_device)
            loss = res.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()
            num_batches += 1

        print(f"Training loss: {train_loss / max(1, num_batches):.4f}")

        # ---------------- validation on the full validation split ----------------
        # .eval turns off dropout layers in model
        model.eval()
        val_preds = []

        # torch.inference_mode() makes sure gradients are not being computed
        # (they are only required during training and require a lot of memory)
        with torch.inference_mode():
            for curr_batch in tqdm(DataLoader(val_data, batch_size=BATCH_SIZE)):
                batch_on_device = {
                    name: column.to(DEVICE)
                    for name, column in curr_batch.items()
                    if name not in RAW_DATA_KEYS
                }
                res = model(**batch_on_device)

                # logits = class scores; normalize them to get probabilities
                probas = torch.softmax(res.logits, dim=-1)
                preds = probas.argmax(dim=-1).cpu()
                val_preds.append(preds)

        # One flat tensor of predicted class ids, in validation-set order.
        val_preds = torch.cat(val_preds)

        # TODO: compute whatever metric you wish
        accuracy = (val_preds == val_data["labels"]).sum() / len(val_data["labels"])
        print(f"Validation accuracy: {accuracy:.4f}")



100%|██████████| 2500/2500 [05:08<00:00,  8.09it/s]


Training loss: 0.8294


100%|██████████| 2461/2461 [01:13<00:00, 33.32it/s]


Validation accuracy: 0.8256


100%|██████████| 2500/2500 [05:09<00:00,  8.07it/s]


Training loss: 0.6789


100%|██████████| 2461/2461 [01:14<00:00, 33.23it/s]


Validation accuracy: 0.8469


100%|██████████| 2500/2500 [05:09<00:00,  8.08it/s]


Training loss: 0.6109


100%|██████████| 2461/2461 [01:13<00:00, 33.27it/s]


Validation accuracy: 0.8590


100%|██████████| 2500/2500 [05:09<00:00,  8.07it/s]


Training loss: 0.5691


100%|██████████| 2461/2461 [01:13<00:00, 33.34it/s]


Validation accuracy: 0.8648


100%|██████████| 2500/2500 [05:09<00:00,  8.08it/s]


Training loss: 0.5452


100%|██████████| 2461/2461 [01:14<00:00, 33.15it/s]

Validation accuracy: 0.8704





Use the HuggingFace ecosystem's built-in saving function to save your model: `<model>.save_pretrained(<path>)`. This path can be used in `.from_pretrained(<path>)` to reload the model later.

In [None]:
# Colab-only: mount Google Drive at /content/gdrive so that the model and
# prediction files written below end up in Drive storage.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os

# Create the target folder first: torch.save raises if the parent directory
# is missing, which would lose the freshly trained model.
os.makedirs("/content/gdrive/MyDrive/Colab Notebooks/MultiTask_model", exist_ok=True)

# NOTE(review): this pickles the whole model object (not just its weights),
# so the file can only be reloaded with compatible torch/transformers
# versions. `model.save_pretrained(<dir>)` is the more portable alternative.
torch.save(model, "/content/gdrive/MyDrive/Colab Notebooks/MultiTask_model/model_basic.pth")

In [None]:
# Reload the full pickled model saved by the previous cell.
# - map_location=DEVICE lets a GPU-saved model load on a CPU-only runtime
#   (and places it on the GPU when one is available).
# - weights_only=False is required because the file holds a whole model
#   object, not a plain state dict; newer PyTorch releases default to
#   weights_only=True and would refuse to unpickle it.
# SECURITY: unpickling executes arbitrary code - only load files you created yourself.
model = torch.load(
    "/content/gdrive/MyDrive/Colab Notebooks/MultiTask_model/model_basic.pth",
    map_location=DEVICE,
    weights_only=False,
)

In [None]:
# Test-set inference: predict a class id for every test example.

# .eval turns off dropout layers in model
model.eval()
test_preds = []

# torch.inference_mode() makes sure gradients are not being computed
# (they are only required during training and require a lot of memory)
with torch.inference_mode():
    for curr_batch in tqdm(DataLoader(test_data, batch_size=BATCH_SIZE)):
        # Forward only the non-raw columns; the premise/hypothesis strings are skipped.
        batch_on_device = {
            name: column.to(DEVICE)
            for name, column in curr_batch.items()
            if name not in RAW_DATA_KEYS
        }
        res = model(**batch_on_device)

        # logits = class scores; normalize them to get probabilities
        probas = torch.softmax(res.logits, dim=-1)
        preds = probas.argmax(dim=-1).cpu()
        test_preds.append(preds)

# One flat tensor of predicted class ids, in test-set order.
test_preds = torch.cat(test_preds)

# Persist the predictions to Drive for later analysis.
torch.save(test_preds, "/content/gdrive/MyDrive/Colab Notebooks/test_preds_roberta")


100%|██████████| 2456/2456 [01:14<00:00, 32.81it/s]


In [None]:
# Quick look at the tokenized training split (column names and row count).
train_data

Dataset({
    features: ['premise', 'hypothesis', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 50000
})

## Prediction on reversed data (premise and hypothesis swapped)

In [None]:
def _swap_and_retokenize(dataset):
    """Return a copy of `dataset` with premise and hypothesis exchanged.

    The two text columns are swapped through a temporary column name, the
    stale "labels", "input_ids" and "attention_mask" columns are dropped
    (they belong to the original ordering), and the swapped pairs are
    tokenized again and exposed as torch tensors.
    """
    swapped = (
        dataset
        .rename_column("hypothesis", "temp")
        .rename_column("premise", "hypothesis")
        .rename_column("temp", "premise")
        .remove_columns(["labels", "input_ids", "attention_mask"])
    )
    return swapped.map(tokenize_function, batched=True, batch_size=10000, load_from_cache_file=False).with_format("torch")


train_data2_inv = _swap_and_retokenize(train_data)
# The swapped test split is consumed by a later inference cell, so it has to
# be built here for the notebook to survive "Restart & Run All".
test_data2_inv = _swap_and_retokenize(test_data)
val_data2_inv = _swap_and_retokenize(val_data)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

In [None]:
# Raw predictions: predicted class ids for the swapped (hypothesis, premise) pairs.
# Each list ends up holding one single-element tensor per example (batch_size=1).
train_preds_inv = []
val_preds_inv = []

# .eval turns off dropout layers in model
model.eval()

# torch.inference_mode() makes sure gradients are not being computed
# (they are only required during training and require a lot of memory)
with torch.inference_mode():
    # Validation split first, then the training split.
    for inv_dataset, collected in ((val_data2_inv, val_preds_inv),
                                   (train_data2_inv, train_preds_inv)):
        for curr_batch in tqdm(DataLoader(inv_dataset, batch_size=1)):
            # Forward only the non-raw columns; the premise/hypothesis strings are skipped.
            batch_on_device = {
                name: column.to(DEVICE)
                for name, column in curr_batch.items()
                if name not in RAW_DATA_KEYS
            }
            res = model(**batch_on_device)

            # logits = class scores; normalize them to get probabilities
            probas = torch.softmax(res.logits, dim=-1)
            preds = probas.argmax(dim=-1).cpu()
            collected.append(preds)

100%|██████████| 9842/9842 [01:59<00:00, 82.58it/s]
100%|██████████| 50000/50000 [10:01<00:00, 83.14it/s]


In [None]:
# Probas: class-probability vectors for the swapped (hypothesis, premise) pairs.
# Running this cell REPLACES the contents of train_preds_inv / val_preds_inv:
# each list ends up holding one (1, num_classes) probability tensor per
# example (batch_size=1).
train_preds_inv = []
val_preds_inv = []

# .eval turns off dropout layers in model
model.eval()

# torch.inference_mode() makes sure gradients are not being computed
# (they are only required during training and require a lot of memory)
with torch.inference_mode():
    # Validation split first, then the training split.
    for inv_dataset, collected in ((val_data2_inv, val_preds_inv),
                                   (train_data2_inv, train_preds_inv)):
        for curr_batch in tqdm(DataLoader(inv_dataset, batch_size=1)):
            # Forward only the non-raw columns; the premise/hypothesis strings are skipped.
            batch_on_device = {
                name: column.to(DEVICE)
                for name, column in curr_batch.items()
                if name not in RAW_DATA_KEYS
            }
            res = model(**batch_on_device)

            # logits = class scores; normalize them to get probabilities
            probas = torch.softmax(res.logits, dim=-1)
            # Move results off the GPU (as the other prediction loops do) so
            # tens of thousands of tiny tensors do not pile up in device memory.
            collected.append(probas.cpu())

100%|██████████| 9842/9842 [01:57<00:00, 83.61it/s]
100%|██████████| 50000/50000 [09:51<00:00, 84.49it/s]


In [None]:
# Peek at the first ten predictions for the original validation pairs and
# for the swapped pairs. What val_preds_inv holds (class ids or probability
# vectors) depends on which of the two preceding inference cells ran last.
print(val_preds[:10])
print(val_preds_inv[:10])

[tensor([1]), tensor([0]), tensor([2]), tensor([0]), tensor([1]), tensor([1]), tensor([2]), tensor([1]), tensor([0]), tensor([0])]
[tensor([0]), tensor([1]), tensor([2]), tensor([1]), tensor([2]), tensor([2]), tensor([2]), tensor([1]), tensor([1]), tensor([1])]


In [None]:
# Class-probability vectors for the swapped test pairs.
# Expects `test_data2_inv` (the test split with premise and hypothesis
# exchanged) to exist in the kernel.
test_preds_inv = []

# Do not rely on the mode left behind by an earlier cell:
# .eval turns off dropout layers in model
model.eval()

# torch.inference_mode() makes sure gradients are not being computed
with torch.inference_mode():
    for curr_batch in tqdm(DataLoader(test_data2_inv, batch_size=1)):
        # Forward only the non-raw columns; the premise/hypothesis strings are skipped.
        batch_on_device = {_k: curr_batch[_k].to(DEVICE) for _k in curr_batch
                           if _k not in RAW_DATA_KEYS}
        res = model(**batch_on_device)

        # logits = class scores; normalize them to get probabilities
        probas = torch.softmax(res.logits, dim=-1)
        # Keep results on the CPU, like the other prediction loops.
        test_preds_inv.append(probas.cpu())

100%|██████████| 9824/9824 [02:18<00:00, 70.73it/s]


In [None]:

# Validation loop (one example per batch): val_preds becomes a list with one
# single-element tensor of predicted class id per validation example.
val_preds = []

# .eval turns off dropout layers in model
model.eval()

# torch.inference_mode() makes sure gradients are not being computed
# (they are only required during training and require a lot of memory)
with torch.inference_mode():
    for curr_batch in tqdm(DataLoader(val_data, batch_size=1)):
        batch_on_device = {_k: curr_batch[_k].to(DEVICE) for _k in curr_batch
                           if _k not in RAW_DATA_KEYS}
        res = model(**batch_on_device)

        # logits = class scores; normalize them to get probabilities
        probas = torch.softmax(res.logits, dim=-1)
        preds = torch.argmax(probas, dim=-1).cpu()
        val_preds.append(preds)

# Read the label column ONCE: indexing val_data["labels"] inside the loop
# would re-read the whole column for every single example.
val_labels = val_data["labels"]

# Count exact matches between gold labels and predictions.
acc = sum(int(label) == int(pred) for label, pred in zip(val_labels, val_preds))

print(acc/len(val_preds))


100%|██████████| 9842/9842 [02:26<00:00, 67.08it/s]


0.8275756959967486


In [None]:
from sklearn import metrics

In [None]:
# Gold and predicted class ids as plain Python ints, in validation-set order.
actual = [int(label) for label in val_data["labels"]]
predicted = [int(pred) for pred in val_preds]

# Arguments are passed as (true labels, predicted labels).
confusion_matrix = metrics.confusion_matrix(actual, predicted)

In [None]:
# Per scikit-learn's convention, rows are the true classes and columns the
# predicted classes (the matrix was built from (actual, predicted)).
print(confusion_matrix)


[[2934  294  101]
 [ 360 2476  399]
 [ 155  388 2735]]


## Collect predictions as a list (easier to process)

In [None]:
# Every stored prediction is a tensor with a leading batch dimension of 1;
# `.tolist()[0]` strips that dimension and yields plain Python values.
train_preds_inv2 = [per_example.tolist()[0] for per_example in train_preds_inv]

test_preds_inv2 = [per_example.tolist()[0] for per_example in test_preds_inv]

val_preds_inv2 = [per_example.tolist()[0] for per_example in val_preds_inv]

In [None]:
# Attach the model's predictions for the swapped pairs as a new "label" column.
# NOTE(review): the datasets are rebound to the same names, so this cell is
# not idempotent - presumably re-running it fails because "label" already
# exists; confirm before re-executing.
train_data2_inv=train_data2_inv.add_column("label",train_preds_inv2)
val_data2_inv=val_data2_inv.add_column("label",val_preds_inv2)


In [None]:
# Sanity check: display the newly attached "label" column of the swapped training set.
train_data2_inv['label']

tensor([[0.0013, 0.0240, 0.9748],
        [0.0012, 0.0045, 0.9942],
        [0.0023, 0.1305, 0.8672],
        ...,
        [0.0045, 0.9546, 0.0409],
        [0.0011, 0.0163, 0.9826],
        [0.0071, 0.9539, 0.0390]])

In [None]:
# Persist the swapped-pair datasets (now carrying the "label" column added
# above) to Google Drive so they can be reloaded later.
train_data2_inv.save_to_disk("/content/gdrive/MyDrive/Colab Notebooks/MultiTask_model/train_data_inv_probas")
val_data2_inv.save_to_disk("/content/gdrive/MyDrive/Colab Notebooks/MultiTask_model/val_data_inv_probas")


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9842 [00:00<?, ? examples/s]