# Tarea 2: Question Answering Fine-tuning

In [None]:
pip install -U datasets huggingface_hub fsspec
pip install evaluate

In [23]:
# Librerías

import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

import torch
print("Is CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Number of GPUs available:", torch.cuda.device_count())

from time import time
from datasets import *
from transformers import *
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.colheader_justify', 'center')

Is CUDA available: True
CUDA version: 12.4
Number of GPUs available: 1


## Dataset

El dataset de SQuAD (Stanford Question Answering Dataset) es un conjunto de datos utilizado principalmente para entrenar y evaluar modelos de comprensión lectora. Consiste en ternas de preguntas, respuestas y contexto.

Aquí la ficha del dataset para que podáis explorarla: https://huggingface.co/datasets/rajpurkar/squad

In [25]:
# No modificar esta celda
# Esta celda, celda tiene que estar ejecutada en la entrega

dataset = load_dataset("squad")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

Con el único motivo de no demorar los tiempos de entrenamiento. Filtraremos el dataset y nos quedaremos solo con los registros que tenga longitud del campo _context_ inferior a 300.

El resto de la práctica se pide trabajarla sobre la variable `ds_tarea`.

In [26]:
# # No modificar esta celda
# # Esta celda, celda tiene que estar ejecutada en la entrega

# def filtra_por_longitud(ejemplo):
#     return len(ejemplo["context"]) < 300

# ds_tarea = dataset.filter(filtra_por_longitud)

# assert len(ds_tarea['train']) == 3466
# assert len(ds_tarea['validation']) == 345

# ds_tarea

In [29]:
# You can adjust the number of examples as needed
train_subset_size = 100
validation_subset_size = 20

ds_tarea = DatasetDict({
    'train': ds_tarea['train'].select(range(min(train_subset_size, len(ds_tarea['train'])))),
    'validation': ds_tarea['validation'].select(range(min(validation_subset_size, len(ds_tarea['validation']))))
})

print("Shortened dataset:")
print(ds_tarea)

Shortened dataset:
DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 20
    })
})


## Modeling

En este apartado es donde tendréis que realizar todo el trabajo de la práctica. El formato, el análisis, el modelo escogido y cualquier proceso intermedio que consideréis es totalmente libre. Sin embargo, hay algunas pautas que tendréis que cumplir:

- La variable `model_checkpoint` debe almacenar el nombre del modelo y el tokenizador de 🤗 que vais a utilizar.
- La variable `model` y la variable `tokenizer` almacenarán, respectivamente, el modelo y el tokenizador de 🤗 que vais a utilizar.
- La variable `trainer` almacenará el _Trainer_ de 🤗 que, en la siguiente sección utilizaréis para entrenar el modelo.

In [68]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForQuestionAnswering
from datasets import DatasetDict # Import DatasetDict

model_checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    questions = examples["question"]
    contexts = examples["context"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=160,
        truncation="only_second", # Truncate only the context
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    sequence_ids_list = [] # To store sequence_ids for each feature

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        context = contexts[sample_idx]

        # Get the list of answer texts and their start positions for the current sample
        answer_texts = answer["text"]
        answer_starts = answer["answer_start"]

        sequence_ids = inputs.sequence_ids(i)
        sequence_ids_list.append(sequence_ids) # Store sequence_ids

        # If no answers are given, set the cls_index as answer.
        if len(answer_texts) == 0:
            start_positions.append(tokenizer.cls_token_id)
            end_positions.append(tokenizer.cls_token_id)
        else:
            # Find start and end character index of the first answer in the context.
            # Assuming there is at least one answer and we are using the first one.
            start_char = answer_starts[0]
            end_char = start_char + len(answer_texts[0])

            # Start token index of the context in the two texts.
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx

            # End token index of the context.
            idx = len(sequence_ids) - 1
            while sequence_ids[idx] != 1:
                idx -= 1
            context_end = idx

            # If the answer is not fully contained in the context, label it with the cls index.
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(tokenizer.cls_token_id)
                end_positions.append(tokenizer.cls_token_id)
            else:
                # Otherwise it's the start and end token indices.
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["sequence_ids"] = sequence_ids_list # Add sequence_ids to inputs
    inputs["example_id"] = examples["id"] # Add example_id to inputs
    inputs["offset_mapping"] = offset_mapping # Add offset_mapping to inputs
    return inputs

In [56]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    warmup_steps=500,
    report_to=["tensorboard"],
    seed=42,
)

In [66]:
import evaluate
from collections import defaultdict
from tqdm.auto import tqdm

metric = evaluate.load("squad_v2")

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map from features to examples.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = {}

    # Logging.
    print(f"Post-processing {len(examples)} example predictions of team")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []

        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what is used to caulate the null score.
            score = start_logits[0] + end_logits[0]
            if min_null_score is None or min_null_score < score:
                min_null_score = score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    # Don't consider answers where the context is not included.
                    # Access sequence_ids from features
                    if features[feature_index]["sequence_ids"] is not None and (features[feature_index]["sequence_ids"][start_index] != 1 or features[feature_index]["sequence_ids"][end_index] != 1):
                         continue

                    # Get the character start and end position of the answer in the original context.
                    # Need to check if offset_mapping is available and valid for the indices
                    if features[feature_index]["offset_mapping"] is not None and start_index < len(features[feature_index]["offset_mapping"]) and end_index < len(features[feature_index]["offset_mapping"]):
                        offsets = features[feature_index]["offset_mapping"][start_index]
                        start_char = offsets[0]
                        offsets = features[feature_index]["offset_mapping"][end_index]
                        end_char = offsets[1]

                        valid_answers.append({
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        })

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        # Let's pick our final answer: the best one or the null answer (only for SQuAD v2)
        if min_null_score is None or best_answer["score"] > min_null_score:
            predictions[example["id"]] = best_answer["text"]
        else:
            predictions[example["id"]] = ""

    return predictions

def compute_metrics(eval_pred):
    raw_predictions = eval_pred.predictions
    # The first element of eval_pred.label_ids contains the example_ids.
    # The second element contains the feature indices.
    eval_examples = ds_tarea['validation'] # Assuming ds_tarea is accessible here
    eval_features = tokenized_ds['validation'] # Assuming tokenized_ds is accessible here
    predictions = postprocess_qa_predictions(eval_examples, eval_features, raw_predictions)
    # Reformat the references to match the expected format by the metric.
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in eval_examples]
    return metric.compute(predictions=predictions, references=references)

In [63]:
tokenized_ds = ds_tarea.map(preprocess_function, batched=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  trainer = Trainer(


## Training

In [69]:
# No modificar esta celda
# Esta celda, celda tiene que estar ejecutada en la entrega

start = time()

trainer.train()

end = time()
print(f">>>>>>>>>>>>> elapsed time: {(end-start)/60:.0f}m")

Post-processing 20 example predictions of team


  0%|          | 0/20 [00:00<?, ?it/s]

KeyError: 'offset_mapping'

## Evaluation

In [None]:
# No modificar esta celda
# Esta celda, celda tiene que estar ejecutada en la entrega

print(f"**** EVALUACIÓN ****")
print(f"********\nTokenizer config:\n{tokenizer}")
print(f"\n\n********\nModel config:\n{model.config}")
print(f"\n\n********\nTrainer arguments:\n{trainer.args}")

In [None]:
# No modificar esta celda
# Esta celda, celda tiene que estar ejecutada en la entrega

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)

In [None]:
# No modificar esta celda
# Esta celda, celda tiene que estar ejecutada en la entrega

assert len(ds_tarea['train']) == 3466
assert len(ds_tarea['validation']) == 345

def calculate_sentence_similarity(sentence1, sentence2):
    sentence1 = re.sub(r'[^a-zA-Z0-9\s]', '', sentence1).lower()
    sentence2 = re.sub(r'[^a-zA-Z0-9\s]', '', sentence2).lower()
    words1 = set(sentence1.lower().split())
    words2 = set(sentence2.lower().split())
    matches = len(words1.intersection(words2))
    total_words = len(words1.union(words2))
    if total_words == 0:
        return 0.0
    return (matches / total_words) * 100

samples = [324,342,249,176,70,168,120,58,90,192,278,289,197,146,323,248,260,273,112,211]
evaluation_list = []

for ii in samples:
    context = ds_tarea['validation'][ii]['context']
    question = ds_tarea['validation'][ii]['question']
    answer = ds_tarea['validation'][ii]['answers']
    answers = [f"{tt}" for ii, tt in enumerate(answer['text'])]
    prediction = question_answerer(context=context, question=question)['answer']
    match = max([calculate_sentence_similarity(w, prediction) for w in answers])
    evaluation_list.append((ii,context,question,answers,prediction,match))

print(f"*** evaluation_df ***")
evaluation_df = pd.DataFrame(evaluation_list, columns=['sample', 'context', 'question', 'real_answers', 'predicted_answer', 'match'])
evaluation_df[['sample','real_answers','predicted_answer', 'match']]

### Criterio de evaluación

La **nota final de la tarea2** estará relacionada con el resultado de las predicciones de vuestro modelo.

El criterio de evaluación será el siguiente:

- La tarea2 se aprobará si el notebook se entrega sin fallos y con un modelo entrenado (independientemente de sus predicciones).
- Se ponderará en función de la columna _match_, que otorga 100% de acierto si todas las palabras coinciden y bajará gradualmente el porcentaje de acierto en función del número de palabras que no coincidan.
    
Nota: La nota que se calcula a continuación es orientativa y podría verse reducida en función del código de la entrega.

In [None]:
print(f"Tu nota de la tarea2 es: {max(np.ceil(evaluation_df['match'].sum() / len(evaluation_df) / 10), 5.0)}")