# Installation et importation des librairies:

In [1]:
pip install datasets transformers evaluate seqeval

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m607.4 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=a907168842176d6662989f4a9e83b9b31f03fcfb0260d309dfd7711c7156ae82
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed ev

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, get_scheduler, pipeline
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertModel
from torch.optim import Adam
import evaluate
from tqdm.auto import tqdm

2024-04-01 20:31:45.692973: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-01 20:31:45.693100: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-01 20:31:45.859076: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data :

In [4]:
raw_datasets = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3454 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Preprocessing:

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [7]:
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels,
                                      batched=True,
                                      remove_columns=raw_datasets["train"].column_names)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

# DataLoaders with DataCollator :

In [8]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

train_dataloader = DataLoader(tokenized_datasets["train"],
                              shuffle=True,
                              collate_fn=data_collator,
                              batch_size=8)

eval_dataloader = DataLoader(tokenized_datasets["validation"],
                             collate_fn=data_collator,
                             batch_size=8)

test_dataloader = DataLoader(tokenized_datasets["test"],
                             collate_fn=data_collator,
                             batch_size=8)

# Définition et initialisation du modèle:

In [9]:
class BERT_LSTM_NER(nn.Module):
    def __init__(self, num_labels):
        super(BERT_LSTM_NER, self).__init__()
        self.bert = BertModel.from_pretrained(model_checkpoint)
        self.lstm = nn.LSTM(self.bert.config.hidden_size, 256, batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(512, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        bert_output = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state
        lstm_output, _ = self.lstm(bert_output)
        logits = self.classifier(lstm_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        return loss, logits

In [10]:
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names

model = BERT_LSTM_NER(num_labels=len(label_names))
model.to(device)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

BERT_LSTM_NER(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

# Optimizer et scheduler:

In [11]:
optimizer = Adam(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)

In [12]:
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

In [13]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()
    
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

# Fine tuning :

## Evaluation metric:

In [14]:
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

## Training and validation:

In [15]:
training_losses = []
validation_losses = []

progress_bar = tqdm(range(num_train_epochs * len(train_dataloader)))

for epoch in range(num_train_epochs):
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        loss, logits = model(input_ids, attention_mask, labels)
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
    average_train_loss = total_train_loss / len(train_dataloader)
    training_losses.append(average_train_loss)
    
    model.eval()
    total_eval_loss = 0
    for batch in eval_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            loss, logits = model(input_ids, attention_mask, labels)
            total_eval_loss += loss.item()
        predictions = logits.argmax(dim=-1)
        true_predictions, true_labels = postprocess(predictions, labels)
        metric.add_batch(predictions=true_predictions, references=true_labels)
    average_eval_loss = total_eval_loss / len(eval_dataloader)
    validation_losses.append(average_eval_loss)
    results = metric.compute()
    overalls = {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]}
    
    print(f"Epoch {epoch+1}, Training Loss: {average_train_loss:.4f}, Validation Loss: {average_eval_loss:.4f}, "
          f"Precision: {overalls['precision']:.4f}, Recall: {overalls['recall']:.4f}, "
          f"F1: {overalls['f1']:.4f}, Accuracy: {overalls['accuracy']:.4f}")

  0%|          | 0/5268 [00:00<?, ?it/s]

Epoch 1, Training Loss: 0.1493, Validation Loss: 0.0470, Precision: 0.9310, Recall: 0.9323, F1: 0.9316, Accuracy: 0.9885
Epoch 2, Training Loss: 0.0331, Validation Loss: 0.0375, Precision: 0.9461, Recall: 0.9427, F1: 0.9444, Accuracy: 0.9905
Epoch 3, Training Loss: 0.0179, Validation Loss: 0.0385, Precision: 0.9475, Recall: 0.9419, F1: 0.9447, Accuracy: 0.9906


# Evaluation on Test Dataset:

In [16]:
model.eval()

test_losses = []
total_test_loss = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        loss, logits = model(input_ids, attention_mask, labels)
        total_test_loss += loss.item()
        predictions = logits.argmax(dim=-1)
        true_predictions, true_labels = postprocess(predictions, labels)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    average_test_loss = total_test_loss / len(test_dataloader)
    test_losses.append(average_test_loss)
    
    results = metric.compute()
    overalls = {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]}

print(f"Test Loss: {average_test_loss:.4f}, "
      f"Precision: {overalls['precision']:.4f}, "
      f"Recall: {overalls['recall']:.4f}, "
      f"F1: {overalls['f1']:.4f}, "
      f"Accuracy: {overalls['accuracy']:.4f}")

Test Loss: 0.1056, Precision: 0.9193, Recall: 0.9044, F1: 0.9118, Accuracy: 0.9822


# Inference:

In [17]:
test_sentence = "My name is John and I work with Clara at Google in Paris."

inputs = tokenizer(test_sentence, return_tensors="pt", is_split_into_words=False, padding=True, truncation=True, max_length=128)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

model.eval()
with torch.no_grad():
    _, logits = model(input_ids=input_ids, attention_mask=attention_mask)

predictions = torch.argmax(logits, dim=-1)
predicted_label_indices = predictions.cpu().numpy()[0]

tokens = tokenizer.convert_ids_to_tokens(input_ids.cpu().numpy()[0], skip_special_tokens=False)
predicted_labels = [label_names[label_idx] for label_idx in predicted_label_indices]

aggregated_results = []
current_entity = {"word": "", "entity": ""}

for token, label in zip(tokens, predicted_labels):
    if token.startswith("[CLS]") or token.startswith("[SEP]") or token.startswith("[PAD]"):
        continue
    if label.startswith("B-") or label == "O":
        if current_entity["word"]:
            aggregated_results.append(current_entity)
            current_entity = {"word": "", "entity": ""}
    if label != "O":
        if current_entity["word"]:
            current_entity["word"] += " "
        current_entity["word"] += token.replace("##", "")
        current_entity["entity"] = label.split("-")[-1]

if current_entity["word"]:
    aggregated_results.append(current_entity)

for result in aggregated_results:
    print(result)

{'word': 'John', 'entity': 'PER'}
{'word': 'Clara', 'entity': 'PER'}
{'word': 'Google', 'entity': 'ORG'}
{'word': 'Paris', 'entity': 'LOC'}
