### Setup: authentication, configuration and data loading



In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login. The TrainingArguments cell further down
# sets push_to_hub=True, so this is presumably what authorises that upload.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-crendential store but this isn't the helper defined on your machine.
You will have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal to set it as the default

git config --global credential.helper store[0m


In [None]:
# Notebook-wide settings that change the model behaviour.

# Per-device batch size, reused for both training and evaluation.
batch_size = 16

# True loads the "squad_v2" dataset; False loads "squad".
squad_v2 = True

In [None]:
import transformers
from datasets import load_dataset, load_metric

In [None]:
# Choose the dataset identifier from the squad_v2 flag, then load it.
if squad_v2:
    dataset_name = "squad_v2"
else:
    dataset_name = "squad"
datasets = load_dataset(dataset_name)

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

### Preprocessing

In [None]:
from transformers import AutoTokenizer
from src import utils

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
import transformers

# Stop here unless the shared tokenizer is a PreTrainedTokenizerFast instance.
tokenizer_is_fast = isinstance(utils.TOKENIZER, transformers.PreTrainedTokenizerFast)
assert tokenizer_is_fast

### Preparing the training set & computing features

In [None]:
# True when the shared tokenizer is configured to pad on the right-hand side.
tokenizer_padding_side = utils.TOKENIZER.padding_side
pad_on_right = tokenizer_padding_side == "right"

In [None]:
# Try the feature preparation on the first five training examples only.
train_sample = datasets["train"][:5]
features = utils.prepare_train_features(train_sample)

In [None]:
# Run the feature preparation over the loaded dataset in batches, removing the
# raw columns (taken from the train split's column names) from the result.
original_columns = datasets["train"].column_names
tokenized_datasets = datasets.map(
    utils.prepare_train_features,
    batched=True,
    remove_columns=original_columns,
)

  0%|          | 0/131 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

### Fine-tuning

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from src.predict import create_model

# Hub identifier passed to create_model, which returns the model together with
# an `is_reference` flag (used later to decide whether training is needed).
checkpoint_name = "mvonwyl/distilbert-base-uncased-finetuned-squad2"
model, is_reference = create_model(checkpoint_name)

Downloading:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/253M [00:00<?, ?B/s]

In [None]:
# Keep only the last path component of the checkpoint identifier and use it to
# name the output directory.
model_name = utils.MODEL_CHECKPOINT.split("/")[-1]
output_dir = f"{model_name}-finetuned-squad"

# Training configuration: evaluate once per epoch, reuse the notebook-wide
# batch size for both training and evaluation, and push results to the Hub.
args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [None]:
from transformers import default_data_collator

# Alias the library's default collator; it is handed to the Trainer below as
# its `data_collator` argument.
data_collator = default_data_collator

In [None]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# train/validation splits, the collator and the shared tokenizer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=utils.TOKENIZER,
)

Cloning https://huggingface.co/Ekael/distilbert-base-uncased-finetuned-squad into local empty directory.


In [None]:
# Training is skipped for a reference model, i.e. one that is already trained;
# any other model is trained here.
if not is_reference:
    trainer.train()

### Postprocessing

In [None]:
import pandas as pd
# Load the question/context table from its CSV file (path is relative to the
# notebook's working directory).
csv_path = 'data/dataset_with_10k.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the column index as the cell's own output instead of wrapping it in
# print(); the notebook displays the value of a cell's last expression.
df.columns

Index(['question', 'context', 'title', 'answers'], dtype='object')


In [None]:
# Keep each distinct question once, so the variable matches its name.
# drop_duplicates() keeps the first occurrence and preserves the original row
# labels, so label 0 is still present and each entry can still be matched back
# to its row in `df`.
unique_questions = df["question"].drop_duplicates()

In [None]:
from src.model_computation import k_nearest_neighbours_context

# Query the helper with the first question; the recorded output of the next
# cell shows the result is a list of context strings.
first_question = unique_questions[0]
neighbour_count = 5  # presumably the number of neighbours to return; verify against the helper
nn = k_nearest_neighbours_context(first_question, neighbour_count, df)

loading configuration file /root/.cache/torch/sentence_transformers/sentence-transformers_msmarco-distilbert-base-tas-b/config.json
Model config DistilBertConfig {
  "_name_or_path": "old_models/msmarco-distilbert-base-tas-b/0_Transformer",
  "activation": "gelu",
  "architectures": [
    "DistilBertModel"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.12.5",
  "vocab_size": 30522
}

loading weights file /root/.cache/torch/sentence_transformers/sentence-transformers_msmarco-distilbert-base-tas-b/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertModel.

All the weights of DistilBertModel were initialized from the model checkpoint at 

Batches:   0%|          | 0/369 [00:00<?, ?it/s]

In [None]:
# Display the contexts retrieved for the first question.
nn

['The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'Denmark (/ˈdɛnmɑrk/; Danish: Danmark [ˈd̥ænmɑɡ̊]) is a country in Northern Europe. The southernmost of the Nordic countries, it is located southwest of Sweden and south of Norway, and bordered to the south by Germany. Denmark forms part of the cultura

En effectuant cette opération pour chaque question, nous pouvons ensuite utiliser le trainer fine-tuné sur SQuAD v2 pour obtenir une réponse par contexte. Puis, en calculant le score de chacune de ces réponses, nous conservons celle qui obtient le score le plus élevé.