### Dependencies installation



In [None]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-crendential store but this isn't the helper defined on your machine.
You will have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal to set it as the default

git config --global credential.helper store[0m


In [None]:
#variables to change the model behaviour
squad_v2 = True
batch_size = 16

In [None]:
import transformers
from datasets import load_dataset, load_metric

In [None]:
datasets = load_dataset("squad_v2" if squad_v2 else "squad")

Reusing dataset squad_v2 (/root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

### Pré-processing

In [None]:
from transformers import AutoTokenizer
from src import utils

In [None]:
import transformers
assert isinstance(utils.TOKENIZER, transformers.PreTrainedTokenizerFast)

### Preparing train & computing features

In [None]:
pad_on_right = utils.TOKENIZER.padding_side == "right"

In [None]:
features = utils.prepare_train_features(datasets['train'][:5])

In [None]:
tokenized_datasets = datasets.map(utils.prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)

Loading cached processed dataset at /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-e1e14b86a04e0a25.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-ea1f5815843c5bee.arrow


### Fine Tuning

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from src.predict import create_model

model, is_reference = create_model("mvonwyl/distilbert-base-uncased-finetuned-squad2")

In [None]:
model_name = utils.MODEL_CHECKPOINT.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [None]:
from transformers import default_data_collator

data_collator = default_data_collator

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=utils.TOKENIZER,
)

/content/distilbert-base-uncased-finetuned-squad is already a clone of https://huggingface.co/Surenis/distilbert-base-uncased-finetuned-squad. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# only train a model which is not trained yet
if not is_reference:
  trainer.train()

### Posprocessing

In [None]:
n_best_size = 20

In [None]:
validation_features = datasets["validation"].map(
    utils.prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-c4d3fd3ffff2b830.arrow


### Evaluation

In [None]:
raw_predictions = trainer.predict(validation_features)

The following columns in the test set  don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id.
***** Running Prediction *****
  Num examples = 12134
  Batch size = 16


In [None]:
from src import predict
final_predictions = predict.postprocess_qa_predictions(datasets["validation"], validation_features, utils.TOKENIZER, raw_predictions.predictions)

Post-processing 11873 example predictions split into 12134 features.


  0%|          | 0/11873 [00:00<?, ?it/s]

### Compute metrics

In [30]:
from src import evaluation

In [33]:
evaluation.evaluate_predictions(final_predictions, datasets)

### Results analysis

In [None]:
final_predictions

OrderedDict([('56ddde6b9a695914005b9628', 'France'),
             ('56ddde6b9a695914005b9629', '10th and 11th centuries'),
             ('56ddde6b9a695914005b962a', 'Denmark, Iceland and Norway'),
             ('56ddde6b9a695914005b962b', 'Rollo'),
             ('56ddde6b9a695914005b962c', '10th century'),
             ('5ad39d53604f3c001a3fe8d1', ''),
             ('5ad39d53604f3c001a3fe8d2', 'Normandy'),
             ('5ad39d53604f3c001a3fe8d3', ''),
             ('5ad39d53604f3c001a3fe8d4', '10th century'),
             ('56dddf4066d3e219004dad5f', 'William the Conqueror'),
             ('56dddf4066d3e219004dad60', 'Richard I'),
             ('56dddf4066d3e219004dad61', 'Catholic orthodoxy'),
             ('5ad3a266604f3c001a3fea27', 'political, cultural and military'),
             ('5ad3a266604f3c001a3fea28', 'The Normans'),
             ('5ad3a266604f3c001a3fea29', ''),
             ('5ad3a266604f3c001a3fea2a', 'Richard I'),
             ('5ad3a266604f3c001a3fea2b', ''),
        

In [None]:
references[0]

{'answers': {'answer_start': [159, 159, 159, 159],
  'text': ['France', 'France', 'France', 'France']},
 'id': '56ddde6b9a695914005b9628'}

In [None]:
formatted_predictions[0]

{'id': '56ddde6b9a695914005b9628',
 'no_answer_probability': 0.0,
 'prediction_text': 'France'}

In [None]:
false_pred = {}
for i in range(0, 200):
  reference = references[i]
  prediction = formatted_predictions[i]
  if (prediction["prediction_text"] not in reference["answers"]["text"]):
    if (reference["id"] == prediction["id"]):
      false_pred[reference["id"]] = (reference["answers"]["text"], prediction["prediction_text"])

In [None]:
false_pred

Cas relevés de fausses prédictions
- Une question sans réponse et dont la prédiction n'apporte pas de réponse non plus (cas trivial)
- Le plus fréquent : la question n'a pas de réponse mais la prédiction essaie d'en apporter une 
- Typo dans la prédiction (un guillemet en trop par exemple)
- Réponses plus développées dans la prédiction que la référence mais tout de même correctes (exemple : les titres de noblesses, avec "King Charles III" en référence mais "King Charles III of West Francia" prédit"
- Réponses totalement fausses (exemple : "Latin" alors que la réponse est "Modern English"

De fait beaucoup des résultats qui sont considérés comme des fausses prédictions lors du calcul du score sont donc en réalité correctes bien que n'étant pas strictement identiques à la référence.

In [None]:
for i in datasets["validation"]:
  if i["id"] in false_pred:
    false_pred[i["id"]] =  (i["question"],) + false_pred[i["id"]]

In [None]:
false_pred