# Installation et importation des librairies:

In [1]:
pip install datasets transformers evaluate seqeval accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, get_scheduler, pipeline
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
import evaluate
from tqdm.auto import tqdm
from accelerate import Accelerator

2024-04-01 17:52:21.360619: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-01 17:52:21.360671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-01 17:52:21.362266: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data:

In [3]:
raw_datasets = load_dataset("conll2003")

  0%|          | 0/3 [00:00<?, ?it/s]

# Preprocessing:

In [4]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [5]:
model_checkpoint = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [6]:
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels,
                                      batched=True,
                                      remove_columns=raw_datasets["train"].column_names)

  0%|          | 0/15 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

# DataLoaders with DataCollator:

In [7]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

train_dataloader = DataLoader(tokenized_datasets["train"],
                              shuffle=True,
                              collate_fn=data_collator,
                              batch_size=8)

eval_dataloader = DataLoader(tokenized_datasets["validation"],
                             collate_fn=data_collator,
                             batch_size=8)

test_dataloader = DataLoader(tokenized_datasets["test"],
                             collate_fn=data_collator,
                             batch_size=8)

# Initialize model:

In [8]:
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names
label_names

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Optimizer and Learning Rate Scheduler:

In [9]:
optimizer = Adam(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)

In [10]:
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

# Accelerator:

In [11]:
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader, test_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader, test_dataloader)

# Postprocessing:

In [12]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()
    
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

# Finetuning:

## Evaluation metric:

In [13]:
metric = evaluate.load("seqeval")

## Training and validation:

In [14]:
training_losses = []
validation_losses = []

progress_bar = tqdm(range(num_train_epochs * len(train_dataloader)))

for epoch in range(num_train_epochs):
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        total_train_loss += loss.item()
        
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    average_train_loss = total_train_loss / len(train_dataloader)
    training_losses.append(average_train_loss)
    
    model.eval()
    total_eval_loss = 0
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
            loss = outputs.loss
            total_eval_loss += loss.item()

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        
        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)
        
        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    average_eval_loss = total_eval_loss / len(eval_dataloader)
    validation_losses.append(average_eval_loss)
    
    results = metric.compute()
    overalls = {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]}
    
    print(f"Epoch {epoch+1}, Training Loss: {average_train_loss:.4f}, Validation Loss: {average_eval_loss:.4f}, "
          f"Precision: {overalls['precision']:.4f}, Recall: {overalls['recall']:.4f}, "
          f"F1: {overalls['f1']:.4f}, Accuracy: {overalls['accuracy']:.4f}")

  0%|          | 0/5268 [00:00<?, ?it/s]

Epoch 1, Training Loss: 0.0694, Validation Loss: 0.0337, Precision: 0.9576, Recall: 0.9502, F1: 0.9539, Accuracy: 0.9921
Epoch 2, Training Loss: 0.0203, Validation Loss: 0.0288, Precision: 0.9685, Recall: 0.9601, F1: 0.9643, Accuracy: 0.9938
Epoch 3, Training Loss: 0.0099, Validation Loss: 0.0300, Precision: 0.9705, Recall: 0.9628, F1: 0.9666, Accuracy: 0.9939


# Evaluation on Test Dataset:

In [15]:
model.eval()

test_losses = []
total_test_loss = 0

with torch.no_grad():
    for batch in test_dataloader:
        outputs = model(**batch)
        
        loss = outputs.loss
        total_test_loss += loss.item()
        
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        
        predictions, labels = accelerator.gather((predictions, labels))
        
        true_predictions, true_labels = postprocess(predictions, labels)
        metric.add_batch(predictions=true_predictions, references=true_labels)

average_test_loss = total_test_loss / len(test_dataloader)
test_losses.append(average_test_loss)

results = metric.compute()
overalls = {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]}

# Affichage des résultats
print(f"Test Loss: {average_test_loss:.4f}, "
      f"Precision: {overalls['precision']:.4f}, "
      f"Recall: {overalls['recall']:.4f}, "
      f"F1: {overalls['f1']:.4f}, "
      f"Accuracy: {overalls['accuracy']:.4f}")

Test Loss: 0.1064, Precision: 0.9347, Recall: 0.9224, F1: 0.9285, Accuracy: 0.9856


# Inference:

In [16]:
token_classifier = pipeline("token-classification",
                            model=model,
                            tokenizer=tokenizer,
                            aggregation_strategy="simple")

text = "My name is Jack and I work with Clara at Meta in Paris."

results = token_classifier(text)

for result in results:
    print(result)

{'entity_group': 'PER', 'score': 0.9965789, 'word': 'Jack', 'start': 10, 'end': 15}
{'entity_group': 'PER', 'score': 0.99711335, 'word': 'Clara', 'start': 31, 'end': 37}
{'entity_group': 'ORG', 'score': 0.9002738, 'word': 'Meta', 'start': 40, 'end': 45}
{'entity_group': 'LOC', 'score': 0.9983719, 'word': 'Paris', 'start': 48, 'end': 54}
