# Transfer learning

Ce notebook télécharge le modèle RoBERTa-base et utilise le dataset MultiNERD English pour prédire les noms de personnes sur du texte anglais. <br/>
Il faut:
- L'adapter sur du français (modèle camembert, autre dataset)
- Essayer de freeze des layers, améliorer ses performances sur le jeu "dev" 

In [32]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import torch
import transformers

In [33]:
model_name = "bert-base-multilingual-cased"

## MultiNERD data

Ce dataset est un text avec des catégories assez fines (dont nom de personne).<br>
Il est disponible [ici](https://github.com/Babelscape/multinerd)<br>
Prenez le dataset français<br>

In [34]:
with open("./Data/train_fr.tsv") as f:
    all_rows = list(line.strip().split("\t") for line in f)
    # Select  50% of the rows
    rows = all_rows[:int(len(all_rows) * 0.02 )]

rows[:10]

[['0', 'Il', 'O'],
 ['1', 'est', 'O'],
 ['2', 'incarné', 'O'],
 ['3', 'par', 'O'],
 ['4',
  'Austin',
  'B-PER',
  'bn:02525192n',
  'Q4204710',
  '7345300',
  'Austin_Stowell',
  'Austin Stowell est un acteur américain né le 24 décembre 1984 à Kensington dans le Connecticut.',
  'https://upload.wikimedia.org/wikipedia/commons/9/95/Austin_Stowell-DolphinTale.jpg'],
 ['5', 'Stowell', 'I-PER'],
 ['6', '.', 'O'],
 [''],
 ['0', 'c’', 'O'],
 ['1', 'est', 'O']]

In [35]:
def make_labelled_sentences(tagged_words):
    # Joining words until we meet a dot
    # Word's label is 1 if 'PER' is in its tag
    X = []
    y = []

    this_word = []
    this_labels = []
    for tagged_word in tagged_words:
        if len(tagged_word) < 3:
            # not a tagged word
            continue
        word = tagged_word[1]
        tag = tagged_word[2]

        if word == '.':
            X.append(this_word)
            y.append(this_labels)

            this_word = []
            this_labels = []
        else:
            this_word.append(word)
            this_labels.append(1 * tag.endswith("PER"))

    return X, y

In [36]:
sentences, labels = make_labelled_sentences(rows)

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
sentences_training, sentences_test, labels_training, labels_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [39]:
sentences_train, sentences_dev, labels_train, labels_dev = train_test_split(
    sentences_training,
    labels_training,
    test_size=0.2,
    random_state=42,
)

In [40]:
import sentencepiece
print(sentencepiece)


<module 'sentencepiece' from 'c:\\Users\\louis\\miniconda3\\lib\\site-packages\\sentencepiece\\__init__.py'>


# Applying Hugging face V2

In [41]:
from transformers import BertForTokenClassification, BertTokenizerFast

num_labels = 31  # Ajustez selon le nombre de classes dans votre tâche
model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = BertTokenizerFast.from_pretrained(model_name)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Analyse détaillée des couches du modèle
for idx, (name, layer) in enumerate(model.named_children()):
    print(f"Couche {idx}: {name}")
    print(f"Type de Couche: {type(layer).__name__}")
    print(f"Forme de Sortie: {layer.out_features if hasattr(layer, 'out_features') else 'N/A'}")
    print(f"Paramètres Entraînables: {sum(p.numel() for p in layer.parameters() if p.requires_grad)}")
    print(f"Paramètres Non-Entraînables: {sum(p.numel() for p in layer.parameters() if not p.requires_grad)}\n")


Couche 0: bert
Type de Couche: BertModel
Forme de Sortie: N/A
Paramètres Entraînables: 177262848
Paramètres Non-Entraînables: 0

Couche 1: dropout
Type de Couche: Dropout
Forme de Sortie: N/A
Paramètres Entraînables: 0
Paramètres Non-Entraînables: 0

Couche 2: classifier
Type de Couche: Linear
Forme de Sortie: 31
Paramètres Entraînables: 23839
Paramètres Non-Entraînables: 0



In [43]:
for name, param in model.named_parameters():
    print(name)


bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [44]:
'''# Geler tous les paramètres
for param in model.roberta.parameters():
    param.requires_grad = False

# Dégelez les derniers layers (par exemple, les deux dernières couches)
for param in model.roberta.encoder.layer[-2:].parameters():
    param.requires_grad = True

# Assurez-vous que le classificateur est également dégelé
for param in model.classifier.parameters():
    param.requires_grad = True'''
# Geler toutes les couches sauf les dernières dans BERT
nombre_de_couches_a_geler = 11  # Ajustez ce nombre selon vos besoins
for layer in model.bert.encoder.layer[:-nombre_de_couches_a_geler]:
    for param in layer.parameters():
        param.requires_grad = False

# Vérifier les paramètres après le gel
for name, param in model.named_parameters():
    print(f"{name} est {'gelé' if not param.requires_grad else 'dégelé'}")




bert.embeddings.word_embeddings.weight est dégelé
bert.embeddings.position_embeddings.weight est dégelé
bert.embeddings.token_type_embeddings.weight est dégelé
bert.embeddings.LayerNorm.weight est dégelé
bert.embeddings.LayerNorm.bias est dégelé
bert.encoder.layer.0.attention.self.query.weight est gelé
bert.encoder.layer.0.attention.self.query.bias est gelé
bert.encoder.layer.0.attention.self.key.weight est gelé
bert.encoder.layer.0.attention.self.key.bias est gelé
bert.encoder.layer.0.attention.self.value.weight est gelé
bert.encoder.layer.0.attention.self.value.bias est gelé
bert.encoder.layer.0.attention.output.dense.weight est gelé
bert.encoder.layer.0.attention.output.dense.bias est gelé
bert.encoder.layer.0.attention.output.LayerNorm.weight est gelé
bert.encoder.layer.0.attention.output.LayerNorm.bias est gelé
bert.encoder.layer.0.intermediate.dense.weight est gelé
bert.encoder.layer.0.intermediate.dense.bias est gelé
bert.encoder.layer.0.output.dense.weight est gelé
bert.encoder

In [45]:
'''def tokenize_and_align_labels(sentences, ner_tags):
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(ner_tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs'''
    
def tokenize_and_align_labels(sentences, ner_tags):
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(ner_tags):
        print(f"Processing sentence {i}, Length of labels: {len(label)}")  # Débogage

        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            print(f"Word index: {word_idx}")  # Débogage

            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs


In [46]:
print(len(sentences_train), len(labels_train))


1766 1766


In [47]:
for i in range(5):
    print(sentences_train[i])
    print(labels_train[i])
    print()


['Le', ',', 'la', 'sous', '-', 'préfecture', 'est', 'supprimée', ',', 'dans', 'le', 'cadre', 'du', 'plan', 'd', '’', 'économies', 'de', 'Poincaré', ',', 'puis', 'est', 'rétablie', 'par', 'le', 'Régime', 'de', 'Vichy', 'le']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Le', '10', 'avril', '2015', ',', 'il', 'annonce', 'qu', "'", 'il', 'rejoindra', 'le', 'Feyenoord', 'Rotterdam', 'à', 'l', "'", 'issue', 'de', 'la', 'saison']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Beaucoup', 'de', '"', 'pa', '"', '(', 'villages', 'fortifiés', ')', 'y', 'furent', 'construits', ',', 'particulièrement', 'sur', 'les', 'volcans', '(', 'aujourd', "'", 'hui', ',', 'il', 'est', 'toujours', 'possible', 'd', "'", 'observer', 'les', 'traces', 'de', '"', 'pa', '"', ',', 'notamment', 'sur', 'les', 'anciens', 'volcans', 'que', 'sont', 'le', 'Mont', 'Éden', 'et', 'le', 'One', 'Tree', 'Hill', ')']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [48]:
for i, sentence in enumerate(sentences_train):
    if not sentence or isinstance(sentence, str):
        print(f"Problème avec la phrase à l'indice {i}: {sentence}")


Problème avec la phrase à l'indice 948: []


In [49]:
filtered_sentences_train = []
filtered_labels_train = []

for sentence, label in zip(sentences_train, labels_train):
    if sentence:  # Vérifiez si la liste de mots n'est pas vide
        filtered_sentences_train.append(sentence)
        filtered_labels_train.append(label)


In [50]:
#tokenized_train = tokenize_and_align_labels(sentences_train, labels_train)
tokenized_train = tokenize_and_align_labels(filtered_sentences_train, filtered_labels_train)


Processing sentence 0, Length of labels: 29
Word index: None
Word index: 0
Word index: 1
Word index: 2
Word index: 3
Word index: 4
Word index: 5
Word index: 6
Word index: 7
Word index: 7
Word index: 7
Word index: 7
Word index: 8
Word index: 9
Word index: 10
Word index: 11
Word index: 12
Word index: 13
Word index: 14
Word index: 15
Word index: 16
Word index: 16
Word index: 17
Word index: 18
Word index: 18
Word index: 18
Word index: 18
Word index: 19
Word index: 20
Word index: 21
Word index: 22
Word index: 22
Word index: 22
Word index: 22
Word index: 23
Word index: 24
Word index: 25
Word index: 25
Word index: 25
Word index: 26
Word index: 27
Word index: 28
Word index: None
Processing sentence 1, Length of labels: 21
Word index: None
Word index: 0
Word index: 1
Word index: 2
Word index: 3
Word index: 4
Word index: 5
Word index: 6
Word index: 7
Word index: 8
Word index: 9
Word index: 10
Word index: 10
Word index: 10
Word index: 10
Word index: 11
Word index: 12
Word index: 13
Word index: 14

In [51]:
tokenized_test = tokenize_and_align_labels(sentences_test, labels_test)

Processing sentence 0, Length of labels: 18
Word index: None
Word index: 0
Word index: 1
Word index: 2
Word index: 3
Word index: 4
Word index: 5
Word index: 6
Word index: 7
Word index: 8
Word index: 9
Word index: 10
Word index: 11
Word index: 12
Word index: 13
Word index: 14
Word index: 14
Word index: 14
Word index: 15
Word index: 16
Word index: 17
Word index: 17
Word index: 17
Word index: None
Processing sentence 1, Length of labels: 35
Word index: None
Word index: 0
Word index: 1
Word index: 2
Word index: 3
Word index: 4
Word index: 4
Word index: 5
Word index: 5
Word index: 5
Word index: 6
Word index: 7
Word index: 8
Word index: 8
Word index: 8
Word index: 9
Word index: 10
Word index: 11
Word index: 11
Word index: 11
Word index: 11
Word index: 12
Word index: 12
Word index: 13
Word index: 14
Word index: 15
Word index: 16
Word index: 17
Word index: 18
Word index: 18
Word index: 19
Word index: 19
Word index: 19
Word index: 20
Word index: 21
Word index: 22
Word index: 23
Word index: 24
W

In [52]:
from datasets import Dataset

dataset_train = Dataset.from_dict(tokenized_train)
dataset_test = Dataset.from_dict(tokenized_test)

In [53]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [54]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

labels = [0, 1]
label_list = ["0", "1"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [55]:
import torch
print(torch.cuda.is_available())
print(torch.__version__)
print(torch.version.cuda)


True
2.1.2
11.8


In [56]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1650 with Max-Q Design'

In [57]:
import torch
print(torch.cuda.is_available())
print(torch.__version__)
print(torch.version.cuda)


True
2.1.2
11.8


In [58]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=2
)
model = model.to("cuda")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
for name, _ in model.base_model.named_parameters():
  print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [60]:
N = 11  # exemple : geler les 6 premières couches
for i, (name, param) in enumerate(model.base_model.named_parameters()):
    layer_num = name.split(".")[2] if "layer" in name else None
    if layer_num and int(layer_num) < N:
        param.requires_grad = False

In [61]:
#def plots metrics

import matplotlib.pyplot as plt

def plot_metrics(history):
    plt.figure(figsize=(15, 5))

    # Précision
    plt.subplot(1, 4, 1)
    plt.plot(history['precision'], label='Précision')
    plt.title('Précision par époque')
    plt.legend()

    # Rappel
    plt.subplot(1, 4, 2)
    plt.plot(history['recall'], label='Rappel')
    plt.title('Rappel par époque')
    plt.legend()

    # Score F1
    plt.subplot(1, 4, 3)
    plt.plot(history['f1'], label='Score F1')
    plt.title('Score F1 par époque')
    plt.legend()

    # Exactitude
    plt.subplot(1, 4, 4)
    plt.plot(history['accuracy'], label='Exactitude')
    plt.title('Exactitude par époque')
    plt.legend()

    plt.show()

# Initialisation d'un dictionnaire pour stocker les métriques
metrics_history = {'precision': [], 'recall': [], 'f1': [], 'accuracy': []}


** Validation loss **
- Only last layer learnt: 0.001911

In [62]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Enregistrement des métriques après chaque époque
for log in trainer.state.log_history:
    if 'eval_loss' in log.keys():
        metrics_history['precision'].append(log['eval_precision'])
        metrics_history['recall'].append(log['eval_recall'])
        metrics_history['f1'].append(log['eval_f1'])
        metrics_history['accuracy'].append(log['eval_accuracy'])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
                                                      

[A[A                                         
  1%|          | 130/11230 [29:14<20:10:08,  6.54s/it]
[A
[A

{'eval_loss': 0.05701379477977753, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9776996183491731, 'eval_runtime': 115.1877, 'eval_samples_per_second': 4.801, 'eval_steps_per_second': 0.304, 'epoch': 1.0}




KeyboardInterrupt: 

In [None]:
# Affichage des graphiques
plot_metrics(metrics_history)