<a href="https://colab.research.google.com/github/TheoLpr/NLI_study/blob/main/Natural_language_inference_Multilingual_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Library setup

In [None]:
!pip3 install -q --upgrade transformers
!pip3 install -q --upgrade datasets
!pip3 install -q --upgrade evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

import numpy as np
from tqdm import tqdm

Use the `AutoModelFor...` classes: they pick the right model architecture for the given checkpoint behind the scenes, so many different models can be run with practically the same code.
The same applies to `AutoTokenizer`.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:

# Fix the random seed so that the classifier-head initialisation and the
# training-order shuffle (torch.randperm) are reproducible across runs.
SEED = 42
torch.manual_seed(SEED)

# Checkpoint name passed to both AutoTokenizer and AutoModelForSequenceClassification.
pretrained_name_or_path = "bert-base-multilingual-cased"
num_classes = 3  # {"entailment", "neutral", "contradiction"}
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Every tokenized premise/hypothesis pair is padded or truncated to this many tokens.
MAX_LENGTH = 128
NUM_EPOCHS = 1
LEARNING_RATE = 2e-6
BATCH_SIZE = 4
# The training loop runs a validation pass after each chunk of this many training examples.
VALIDATE_EVERY_N_EXAMPLES = 10_000
# strings that should not be used in model, but might be useful for connecting predictions with examples
RAW_DATA_KEYS = {"premise", "hypothesis"}

In [None]:
# Tokenizer and classification model are loaded from the same checkpoint name;
# the classification head is sized for `num_classes` output labels.
tokenizer = AutoTokenizer.from_pretrained(pretrained_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_name_or_path,
    num_labels=num_classes,
)
# Move the model parameters to the selected device (GPU if available).
model = model.to(DEVICE)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# AdamW over all model parameters, using the learning rate from the config cell.
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def tokenize_function(data_to_tokenize):
    """Tokenize a batch of (premise, hypothesis) sentence pairs.

    Args:
        data_to_tokenize: mapping with equally long "premise" and
            "hypothesis" sequences (one entry per example).

    Returns:
        The tokenizer output for the pairs, as PyTorch tensors, with every
        pair padded or truncated to exactly MAX_LENGTH tokens.
    """
    sentence_pairs = [
        (premise, hypothesis)
        for premise, hypothesis in zip(data_to_tokenize["premise"],
                                       data_to_tokenize["hypothesis"])
    ]
    return tokenizer(
        sentence_pairs,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )

## Data loading


In [None]:
import datasets

# Load SNLI and rename its "label" column to "labels" in one chained step.
data = datasets.load_dataset("snli").rename_column("label", "labels")

Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

In [None]:
def _has_valid_label(_ex):
    """Return True for examples whose label is not the invalid marker -1."""
    return _ex["labels"] != -1


def _tokenize_split(split):
    """Tokenize one split in batches and expose its columns as torch tensors."""
    tokenized = split.map(tokenize_function, batched=True, batch_size=10000, load_from_cache_file=False)
    return tokenized.with_format("torch")


# Step 1: drop examples with invalid labels (the training split is additionally
# cut down to its first 50000 remaining examples).
labeled_train = data["train"].filter(_has_valid_label).select(range(50000))
labeled_val = data["validation"].filter(_has_valid_label)
labeled_test = data["test"].filter(_has_valid_label)

# Step 2: tokenize every split.
train_data = _tokenize_split(labeled_train)
val_data = _tokenize_split(labeled_val)
test_data = _tokenize_split(labeled_test)

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

In [None]:
# Display the validation split's summary (feature names and number of rows).
val_data

Dataset({
    features: ['premise', 'hypothesis', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9842
})

In [None]:
# Number of training chunks per epoch: ceiling of len(train_data) / VALIDATE_EVERY_N_EXAMPLES,
# so that a final, shorter chunk is still counted.
NUM_SUBSETS = -(-len(train_data) // VALIDATE_EVERY_N_EXAMPLES)

## Training

In [None]:
for idx_epoch in range(NUM_EPOCHS):
    # New random visiting order of the training examples for this epoch.
    shuffled_order = torch.randperm(len(train_data))
    # Running totals over the whole epoch (not reset between chunks).
    train_loss = 0.0
    num_batches = 0

    for idx_subset in range(NUM_SUBSETS):
        chunk_start = idx_subset * VALIDATE_EVERY_N_EXAMPLES
        chunk_stop = (idx_subset + 1) * VALIDATE_EVERY_N_EXAMPLES
        chunk_indices = shuffled_order[chunk_start:chunk_stop].tolist()

        # ---- Train on the current chunk ----
        model.train()

        # Subset restricts the data to this chunk, DataLoader batches it.
        train_loader = DataLoader(Subset(train_data, chunk_indices), batch_size=BATCH_SIZE)
        for train_batch in tqdm(train_loader):
            # Raw text columns are not model inputs; everything else goes to the device.
            model_inputs = {name: tensor.to(DEVICE)
                            for name, tensor in train_batch.items()
                            if name not in RAW_DATA_KEYS}

            loss = model(**model_inputs).loss
            train_loss += loss.item()
            num_batches += 1

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print(f"Training loss: {train_loss / max(1, num_batches):.4f}")

        # ---- Validate on the full validation split ----
        # .eval turns off dropout layers in model
        model.eval()
        batch_predictions = []

        # No gradients are needed for evaluation; inference_mode avoids computing them.
        with torch.inference_mode():
            val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)
            for val_batch in tqdm(val_loader):
                model_inputs = {name: tensor.to(DEVICE)
                                for name, tensor in val_batch.items()
                                if name not in RAW_DATA_KEYS}

                # logits = class scores; softmax turns them into probabilities
                class_probs = torch.softmax(model(**model_inputs).logits, dim=-1)
                batch_predictions.append(torch.argmax(class_probs, dim=-1).cpu())

        val_preds = torch.cat(batch_predictions)
        # TODO: compute whatever metric you wish
        gold_labels = val_data["labels"]
        accuracy = torch.sum(val_preds == gold_labels) / len(gold_labels)
        print(f"Validation accuracy: {accuracy:.4f}")


In [None]:
from google.colab import drive

# Make Google Drive available under /content/gdrive for saving/loading artifacts.
drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Persist the whole fine-tuned model object (not just its state dict) to Google Drive.
torch.save(obj=model, f="/content/gdrive/MyDrive/Colab Notebooks/model_multibert.pth")

In [None]:
# Reload the complete model object written by the torch.save cell above.
# - weights_only=False: the file contains a fully pickled module; recent PyTorch
#   releases default to weights_only=True, which refuses to load it.
#   NOTE: full unpickling can execute arbitrary code - only do this for
#   checkpoints you created yourself, never for untrusted files.
# - map_location=DEVICE: lets a checkpoint saved on a GPU be loaded in a
#   CPU-only session (and places the model on the device used for inference).
model2 = torch.load("/content/gdrive/MyDrive/Colab Notebooks/model_multibert.pth",
                    map_location=DEVICE,
                    weights_only=False)

In [None]:
# Display the training split's summary (feature names and number of rows).
train_data

Dataset({
    features: ['premise', 'hypothesis', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 50000
})

## Prediction on reversed data (premise and hypothesis swapped)

In [None]:
def make_reversed_pairs(dataset):
    """Return a re-tokenized copy of ``dataset`` with premise and hypothesis swapped.

    Every column except the raw text (RAW_DATA_KEYS) is dropped before
    tokenizing, so no label or tokenizer output computed for the original
    sentence order can survive in the result.

    Args:
        dataset: a tokenized split containing "premise" and "hypothesis" columns.

    Returns:
        A torch-formatted dataset whose "premise" holds the former hypothesis
        and vice versa, together with freshly computed tokenizer columns.
    """
    stale_columns = [name for name in dataset.column_names if name not in RAW_DATA_KEYS]
    swapped = (
        dataset
        # A temporary name is needed because both target names are already taken.
        .rename_column("hypothesis", "temp")
        .rename_column("premise", "hypothesis")
        .rename_column("temp", "premise")
        .remove_columns(stale_columns)
    )
    tokenized = swapped.map(tokenize_function, batched=True, batch_size=10000, load_from_cache_file=False)
    return tokenized.with_format("torch")


train_data_inv = make_reversed_pairs(train_data)
val_data_inv = make_reversed_pairs(val_data)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

In [None]:
# Display the reversed training split's summary (feature names and number of rows).
train_data_inv

Dataset({
    features: ['hypothesis', 'premise', 'token_type_ids', 'input_ids', 'attention_mask'],
    num_rows: 50000
})

In [None]:

def predict_labels(classifier, dataset, batch_size=1):
    """Predict a class index for every example of ``dataset``.

    Args:
        classifier: sequence-classification model whose output exposes ``.logits``.
        dataset: tokenized, torch-formatted dataset; raw text columns
            (RAW_DATA_KEYS) are skipped, all other columns are fed to the model.
        batch_size: number of examples per forward pass.

    Returns:
        A list with one CPU tensor of shape (1,) per example, holding the
        index of the highest-scoring class.
    """
    # Put the model that is actually used for inference into eval mode
    # (.eval turns off dropout layers).
    classifier.eval()
    predictions = []

    # torch.inference_mode() makes sure gradients are not being computed
    # (they are only required during training and require a lot of memory)
    with torch.inference_mode():
        for curr_batch in tqdm(DataLoader(dataset, batch_size=batch_size)):
            batch_on_device = {_k: curr_batch[_k].to(DEVICE) for _k in curr_batch
                               if _k not in RAW_DATA_KEYS}
            res = classifier(**batch_on_device)

            # logits = class scores; normalize them to get probabilities
            probas = torch.softmax(res.logits, dim=-1)
            preds = torch.argmax(probas, dim=-1).cpu()
            # Store one single-element tensor per example, whatever the batch size.
            predictions.extend(preds.split(1))

    return predictions


# Predictions of the reloaded model on the reversed validation and training pairs.
val_preds_inv = predict_labels(model2, val_data_inv)
train_preds_inv = predict_labels(model2, train_data_inv)



100%|██████████| 9842/9842 [02:35<00:00, 63.26it/s]
100%|██████████| 50000/50000 [12:55<00:00, 64.44it/s]


In [None]:
# First ten predictions made on the reversed training pairs.
train_preds_inv[:10]

[tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([2])]

In [None]:
# First ten gold labels of the original (non-reversed) training split.
train_data["labels"][:10]

In [None]:
# Predict one class index per validation example (original premise/hypothesis order).
val_preds = []

# .eval turns off dropout layers in model
model.eval()

# torch.inference_mode() makes sure gradients are not being computed
# (they are only required during training and require a lot of memory)
with torch.inference_mode():
    for curr_batch in tqdm(DataLoader(val_data, batch_size=1)):
        batch_on_device = {_k: curr_batch[_k].to(DEVICE) for _k in curr_batch
                            if _k not in RAW_DATA_KEYS}
        res = model(**batch_on_device)

        # logits = class scores; normalize them to get probabilities
        probas = torch.softmax(res.logits, dim=-1)
        # Keep the predicted class (on the CPU), not the 3-way probability vector:
        # the confusion-matrix cell converts every entry with int().
        preds = torch.argmax(probas, dim=-1).cpu()
        val_preds.append(preds)

 85%|████████▍ | 8360/9842 [02:47<00:23, 64.42it/s]

In [None]:
from sklearn import metrics

In [None]:
# Gold and predicted class indices as plain Python ints.
# `val_preds` is expected to hold one class index per validation example.
actual = [int(label) for label in val_data["labels"]]
predicted = [int(pred) for pred in val_preds]

# Pin the label set so the matrix is always 4x4, with index 0 reserved for
# the "-" / no-gold-label class (-1). Without this, data that no longer
# contains -1 yields a 3x3 matrix and the column selection below fails.
confusion_matrix = metrics.confusion_matrix(actual, predicted, labels=[-1, 0, 1, 2])

# Remove the "-" column -> build a second confusion matrix (4 rows x 3 columns).
confusion_matrix2 = confusion_matrix[:, 1:].astype(float)

In [None]:
# Print rows 1-3 of the 4x3 matrix, i.e. everything except row 0
# (row 0 presumably belongs to the "-" / no-gold-label class - confirm).
print(confusion_matrix2[1:4])


## Collect predictions as a list (easier to process)

In [None]:
# Each stored prediction is a one-element tensor; unwrap it into a plain Python int.
train_preds_inv2 = [single_pred.tolist()[0] for single_pred in train_preds_inv]

val_preds_inv2 = [single_pred.tolist()[0] for single_pred in val_preds_inv]

In [None]:
# Preview only the first entries: displaying the whole list prints one line
# per training example and floods the notebook output.
train_preds_inv[:10]

[tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([0]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([0]),
 tensor([1]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([0]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([1]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tensor([2]),
 tensor([2]),
 tensor([1]),
 tenso

In [None]:
# How often each class was predicted on the reversed training pairs: [class, count].
distinct_labels = set(train_preds_inv2)
[[label, train_preds_inv2.count(label)] for label in distinct_labels]

[[0, 5833], [1, 25794], [2, 18373]]

In [None]:
def _attach_predicted_labels(dataset, predicted_labels):
    """Return ``dataset`` with ``predicted_labels`` stored in a "label" column.

    An existing "label" column is replaced, so re-running this cell does not
    fail and always reflects the latest predictions.

    Raises:
        ValueError: if the number of predictions differs from the number of rows.
    """
    if len(predicted_labels) != len(dataset):
        raise ValueError(
            f"Got {len(predicted_labels)} predictions for {len(dataset)} examples"
        )
    if "label" in dataset.column_names:
        dataset = dataset.remove_columns("label")
    return dataset.add_column("label", predicted_labels)


# Store the model's predictions as the labels of the reversed pairs.
train_data_inv = _attach_predicted_labels(train_data_inv, train_preds_inv2)
val_data_inv = _attach_predicted_labels(val_data_inv, val_preds_inv2)

In [None]:
# Write both reversed, pseudo-labelled splits to Google Drive (training split first).
for dataset_to_save, target_dir in (
    (train_data_inv, "/content/gdrive/MyDrive/Colab Notebooks/train_data_inv_multibert"),
    (val_data_inv, "/content/gdrive/MyDrive/Colab Notebooks/val_data_inv_multibert"),
):
    dataset_to_save.save_to_disk(target_dir)



Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9842 [00:00<?, ? examples/s]