# Transfer learning

Ce notebook télécharge le modèle RoBERTa-base et utilise le dataset MultiNERD English pour prédire les noms de personnes sur du texte anglais. <br/>
Il faut:
- L'adapter sur du français (modèle camembert, autre dataset)
- Essayer de freeze des layers, améliorer ses performances sur le jeu "dev" 

In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import torch
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name ="roberta-base"

## MultiNERD data

Ce dataset est un text avec des catégories assez fines (dont nom de personne).<br>
Il est disponible [ici](https://github.com/Babelscape/multinerd)<br>
Prenez le dataset français<br>

In [3]:
with open("./Data/train_en.tsv") as f:
    rows = list(line.strip().split("\t") for line in f)

rows[:10]

[['0', 'The', 'O'],
 ['1', 'type', 'O'],
 ['2', 'locality', 'O'],
 ['3', 'is', 'O'],
 ['4',
  'Kīlauea',
  'B-LOC',
  'bn:02858748n',
  'Q188698',
  '350666',
  'Kīlauea',
  'Kīlauea is an active shield volcano in the Hawaiian Islands.',
  'https://upload.wikimedia.org/wikipedia/commons/b/b8/Puu_Oo_looking_up_Kilauea_-_edit.jpg'],
 ['5', '.', 'O'],
 [''],
 ['0', 'Common', 'O'],
 ['1', 'components', 'O'],
 ['2', 'of', 'O']]

In [4]:
def make_labelled_sentences(tagged_words):
    # Joining words until we meet a dot
    # Word's label is 1 if 'PER' is in its tag
    X = []
    y = []

    this_word = []
    this_labels = []
    for tagged_word in tagged_words:
        if len(tagged_word) < 3:
            # not a tagged word
            continue
        word = tagged_word[1]
        tag = tagged_word[2]

        if word == '.':
            X.append(this_word)
            y.append(this_labels)

            this_word = []
            this_labels = []
        else:
            this_word.append(word)
            this_labels.append(1 * tag.endswith("PER"))

    return X, y

In [5]:
sentences, labels = make_labelled_sentences(rows)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
sentences_training, sentences_test, labels_training, labels_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [8]:
sentences_train, sentences_dev, labels_train, labels_dev = train_test_split(
    sentences_training,
    labels_training,
    test_size=0.2,
    random_state=42,
)

# Applying Hugging face V2

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [10]:
def tokenize_and_align_labels(sentences, ner_tags):
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(ner_tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [11]:
tokenized_train = tokenize_and_align_labels(sentences_train, labels_train)

In [12]:
tokenized_test = tokenize_and_align_labels(sentences_test, labels_test)

In [13]:
from datasets import Dataset

dataset_train = Dataset.from_dict(tokenized_train)
dataset_test = Dataset.from_dict(tokenized_test)

In [14]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [15]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

labels = [0, 1]
label_list = ["0", "1"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [16]:
import torch
print(torch.cuda.is_available())
print(torch.__version__)
print(torch.version.cuda)


True
2.1.2
11.8


In [17]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1650 with Max-Q Design'

In [18]:
import torch
print(torch.cuda.is_available())
print(torch.__version__)
print(torch.version.cuda)


True
2.1.2
11.8


In [19]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=2
)
model = model.to("cuda")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
for name, _ in model.base_model.named_parameters():
  print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [21]:
for name, param in model.base_model.named_parameters():
  param.requires_grad = False

for name, param in model.base_model.named_parameters():
    if (
        any(layer_name in name for layer_name in ["layer.5"])
        and any(layer_type in name for layer_type in ["weight", "bias"])
        and "attention" not in name
    ):
        param.requires_grad = True

** Validation loss **
- Only last layer learnt: 0.001911

In [22]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/10506 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  5%|▍         | 501/10506 [01:35<28:11,  5.92it/s]

{'loss': 0.1032, 'learning_rate': 1.904816295450219e-05, 'epoch': 0.1}


 10%|▉         | 1001/10506 [03:09<30:04,  5.27it/s] 

{'loss': 0.0061, 'learning_rate': 1.8096325909004378e-05, 'epoch': 0.19}


 14%|█▍        | 1501/10506 [04:43<26:57,  5.57it/s]

{'loss': 0.0042, 'learning_rate': 1.714448886350657e-05, 'epoch': 0.29}


 19%|█▉        | 2000/10506 [06:21<24:05,  5.89it/s]  

{'loss': 0.0043, 'learning_rate': 1.6192651818008758e-05, 'epoch': 0.38}


 24%|██▍       | 2501/10506 [07:58<22:10,  6.02it/s]

{'loss': 0.0045, 'learning_rate': 1.5240814772510948e-05, 'epoch': 0.48}


 29%|██▊       | 3001/10506 [09:37<20:34,  6.08it/s]  

{'loss': 0.0036, 'learning_rate': 1.4288977727013136e-05, 'epoch': 0.57}


 33%|███▎      | 3500/10506 [11:21<22:39,  5.15it/s]  

{'loss': 0.0037, 'learning_rate': 1.3337140681515326e-05, 'epoch': 0.67}


 38%|███▊      | 4000/10506 [12:56<19:44,  5.49it/s]

{'loss': 0.003, 'learning_rate': 1.2385303636017514e-05, 'epoch': 0.76}


 43%|████▎     | 4501/10506 [14:34<19:06,  5.24it/s]

{'loss': 0.0024, 'learning_rate': 1.1433466590519704e-05, 'epoch': 0.86}


 48%|████▊     | 5000/10506 [16:12<18:35,  4.94it/s]

{'loss': 0.0026, 'learning_rate': 1.0481629545021894e-05, 'epoch': 0.95}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
                                                    
 50%|█████     | 5253/10506 [20:59<16:20,  5.36it/s]

{'eval_loss': 0.0021528711076825857, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9994065903378945, 'eval_runtime': 238.4151, 'eval_samples_per_second': 110.165, 'eval_steps_per_second': 6.887, 'epoch': 1.0}


 52%|█████▏    | 5500/10506 [21:47<13:46,  6.06it/s]    

{'loss': 0.0027, 'learning_rate': 9.529792499524082e-06, 'epoch': 1.05}


 57%|█████▋    | 6001/10506 [23:27<15:42,  4.78it/s]

{'loss': 0.0027, 'learning_rate': 8.577955454026272e-06, 'epoch': 1.14}


 62%|██████▏   | 6500/10506 [25:07<12:03,  5.54it/s]

{'loss': 0.0022, 'learning_rate': 7.62611840852846e-06, 'epoch': 1.24}


 67%|██████▋   | 7001/10506 [26:55<10:43,  5.44it/s]  

{'loss': 0.0027, 'learning_rate': 6.67428136303065e-06, 'epoch': 1.33}


 71%|███████▏  | 7501/10506 [28:40<10:28,  4.78it/s]

{'loss': 0.0024, 'learning_rate': 5.722444317532839e-06, 'epoch': 1.43}


 76%|███████▌  | 8000/10506 [30:22<07:36,  5.49it/s]

{'loss': 0.0028, 'learning_rate': 4.770607272035028e-06, 'epoch': 1.52}


 81%|████████  | 8501/10506 [32:01<06:30,  5.14it/s]

{'loss': 0.0026, 'learning_rate': 3.818770226537217e-06, 'epoch': 1.62}


 86%|████████▌ | 9000/10506 [33:41<04:04,  6.15it/s]

{'loss': 0.0023, 'learning_rate': 2.8669331810394065e-06, 'epoch': 1.71}


 90%|█████████ | 9501/10506 [35:19<02:50,  5.88it/s]

{'loss': 0.0023, 'learning_rate': 1.915096135541595e-06, 'epoch': 1.81}


 95%|█████████▌| 10000/10506 [36:55<01:59,  4.22it/s]

{'loss': 0.0021, 'learning_rate': 9.632590900437846e-07, 'epoch': 1.9}


100%|█████████▉| 10501/10506 [38:33<00:00,  6.46it/s]

{'loss': 0.0027, 'learning_rate': 1.142204454597373e-08, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
                                                     
100%|██████████| 10506/10506 [42:32<00:00,  5.77it/s]

{'eval_loss': 0.0020267462823539972, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.999443333970223, 'eval_runtime': 238.37, 'eval_samples_per_second': 110.186, 'eval_steps_per_second': 6.888, 'epoch': 2.0}


100%|██████████| 10506/10506 [42:34<00:00,  4.11it/s]

{'train_runtime': 2554.064, 'train_samples_per_second': 65.813, 'train_steps_per_second': 4.113, 'train_loss': 0.007862007237620497, 'epoch': 2.0}





TrainOutput(global_step=10506, training_loss=0.007862007237620497, metrics={'train_runtime': 2554.064, 'train_samples_per_second': 65.813, 'train_steps_per_second': 4.113, 'train_loss': 0.007862007237620497, 'epoch': 2.0})