Для закрепления материала модуля предлагаем вам решить задачу QA для датасета SberQuad, используя любые доступные вам средства.

Для достижения наилучшего результата уделите внимание подбору гиперпараметров как в плане архитектуры, так и в плане обучения модели.

Критерии оценивания проекта:

* общее качество кода и следование PEP-8;


* использование рекуррентных сетей;
* использованы варианты архитектур, близкие к state of the art для данной задачи;
* произведен подбор гиперпараметров;
* использованы техники изменения learning rate (lr scheduler);
* использована адекватная задаче функция потерь;
* использованы техники регуляризации;
* корректно проведена валидация модели;
* использованы техники ensemble;
* использованы дополнительные данные;
* итоговое значение метрики качества > 0.75 (f1).

---


Выполните задание в Google Colab и в поле для ответа ниже вставьте ссылку на ваше решение. Не забудьте открыть доступ!



In [None]:
!pip install datasets



In [None]:
!pip install transformers[torch]



In [None]:
import datasets

In [None]:
from transformers import AutoTokenizer,  AutoModelForQuestionAnswering, TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from datasets import DatasetDict
from sklearn.metrics import f1_score


In [None]:
# Select the compute device: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the dataset registered under the name "sberquad"
dataset = load_dataset("sberquad")

In [None]:
# Display the splits, column names and row counts of the loaded dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5036
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 23936
    })
})

In [None]:
# Inspect the question of training example 15
dataset['train']['question'][15]

'У кого Россия арендует этот комплекс?'

In [None]:
# Inspect the answer record (text and character offset) of training example 15
dataset['train']['answers'][15]

{'text': ['у Казахстана'], 'answer_start': [93]}

In [None]:
# Inspect the context paragraph of training example 15
dataset['train']['context'][15]

'Город Байконур и космодром Байконур вместе образуют комплекс Байконур , арендованный Россией у Казахстана на период до 2050 года. Эксплуатация космодрома стоит около 9 млрд рублей в год (стоимость аренды комплекса Байконур составляет 115 млн долларов — около 7,4 млрд рублей в год; ещё около 1,5 млрд рублей в год Россия тратит на поддержание объектов космодрома), что составляет 4,2 % от общего бюджета Роскосмоса на 2012 год. Кроме того, из федерального бюджета России в бюджет города Байконура ежегодно осуществляется безвозмездное поступление в размере 1,16 млрд рублей (по состоянию на 2012 год). В общей сложности космодром и город обходятся бюджету России в 10,16 млрд рублей в год.'

In [None]:
# Checkpoint identifier; the tokenizer is loaded from it here
model_name = "timpal0l/mdeberta-v3-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Tokenization settings read as globals by the preprocessing functions
max_length = 384  # passed to the tokenizer as max_length (tokens per feature)
stride = 128  # passed to the tokenizer as stride (overlap between windows)


In [None]:
# Preprocessing function for the training data


def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and attach answer token labels.

    Contexts longer than the window are split into overlapping features, so
    one example may produce several rows.  Reads the module-level
    ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: Batch dict with ``question``, ``context`` and ``answers``
            columns.  Each answer is a dict with ``text`` and
            ``answer_start`` lists; only the first entry is used.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions`` (one label pair per feature).
    """
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    offset_mapping = inputs.pop('offset_mapping')
    sample_map = inputs.pop('overflow_to_sample_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_map[i]]
        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token of the context (sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside this window, label it (0, 0).
        # The window start is compared with start_char and the window end
        # with end_char; swapping them lets partially covered answers
        # through and labels them with a token outside the context.
        if offset[context_start][0] > start_char \
                or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that starts at or before the answer start
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token that ends at or after the answer end
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

In [None]:
# Tokenize and label the training split in batches; the original columns are
# removed so only the function's outputs remain
train_dataset = dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/45328 [00:00<?, ? examples/s]

In [None]:
# Tokenize and label the validation split with the same function as the
# training split, so it also carries start/end labels
val_dataset = dataset["validation"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

Map:   0%|          | 0/5036 [00:00<?, ? examples/s]

In [None]:
# Preprocessing function for the test data

def preprocess_test_examples(examples):
    """Tokenize a batch of test examples and keep what is needed to decode answers.

    Reads the module-level ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: Batch dict with ``id``, ``question`` and ``context`` columns.

    Returns:
        The tokenizer output (``overflow_to_sample_mapping`` removed) with an
        ``example_id`` per feature and an ``offset_mapping`` in which every
        token outside the context is replaced by ``None``.
    """
    encoded = tokenizer(
        [question.strip() for question in examples['question']],
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    feature_to_example = encoded.pop('overflow_to_sample_mapping')
    feature_ids = []

    for feature_idx, example_idx in zip(range(len(encoded['input_ids'])),
                                        feature_to_example):
        feature_ids.append(examples['id'][example_idx])

        seq_ids = encoded.sequence_ids(feature_idx)
        # Keep offsets only for context tokens (sequence id 1)
        encoded['offset_mapping'][feature_idx] = [
            pair if seq_ids[pos] == 1 else None
            for pos, pair in enumerate(encoded['offset_mapping'][feature_idx])
        ]

    encoded['example_id'] = feature_ids
    return encoded

In [None]:
# Tokenize the test split in batches; the original columns are removed so
# only the function's outputs remain
test_dataset = dataset["test"].map(
    preprocess_test_examples,
    batched=True,
    remove_columns=dataset["test"].column_names,
)

Map:   0%|          | 0/23936 [00:00<?, ? examples/s]

In [None]:
def compute_f1(predictions, label_ids):
    """Return the mean token-level F1 between predicted and reference answers.

    Args:
        predictions: Iterable of ``(first_part, second_part)`` string pairs;
            the whitespace-separated tokens of both parts are concatenated.
        label_ids: Iterable of reference pairs in the same format.

    Returns:
        The average F1 over all zipped pairs, or ``0.0`` when there are none.
    """
    # Imported locally so the function does not rely on a module-level
    # ``collections`` import.
    from collections import Counter

    def get_tokens(text):
        """Split text on whitespace; empty or missing text gives no tokens."""
        return text.split() if text else []

    f1_scores = []
    for (pred_start, pred_end), (true_start, true_end) in zip(predictions,
                                                              label_ids):
        pred_tokens = get_tokens(pred_start) + get_tokens(pred_end)
        true_tokens = get_tokens(true_start) + get_tokens(true_end)

        # Multiset intersection counts each shared token at most as often as
        # it occurs on both sides
        num_same = sum((Counter(true_tokens) & Counter(pred_tokens)).values())
        if num_same == 0:
            f1_scores.append(0.0)
            continue

        precision = num_same / len(pred_tokens)
        recall = num_same / len(true_tokens)
        f1_scores.append(2 * precision * recall / (precision + recall))

    # Avoid ZeroDivisionError when no pairs were supplied
    if not f1_scores:
        return 0.0
    return sum(f1_scores) / len(f1_scores)

In [None]:
# Определим функцию для вычисления метрики
#def compute_metrics(eval_predictions):
#    f1_scores = []

#    for prediction, reference in zip(eval_predictions, dataset["validation"]):
#        predicted_answer = prediction["predicted_text"]
#        true_answer = reference["answers"]["text"]

#        common_tokens = set(predicted_answer.lower().split()).intersection(set(true_answer.lower().split()))
#        precision = len(common_tokens) / (len(predictated_answer.split()) + 1e-8)
#        recall = len(common_tokens) / (len(true_answer.split()) + 1e-8)

#        f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
#        f1_scores.append(f1)

#    return {"f1": np.mean(f1_scores)}


In [None]:
# Log in to the Hugging Face Hub account
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the question-answering model from the checkpoint named in model_name
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
# Choose which layers to train: freeze the whole base model, then re-enable
# gradients for its last 4 encoder layers
backbone = model.base_model
for weight in backbone.parameters():
    weight.requires_grad = False
trainable_layers = backbone.encoder.layer[-4:]
for weight in trainable_layers.parameters():
    weight.requires_grad = True

In [None]:
# Define the hyperparameters
learning_rate = 2e-5
epochs = 3

In [None]:
# Define the loss function and the optimizer
# NOTE(review): neither object is passed to the Trainer created in this
# notebook, so they look unused by the Trainer-based run — confirm.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Batch collator that pads features with the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Create an EarlyStoppingCallback object for stopping the training
# Author's note: with only three epochs this step is pointless, but Colab
# resources were not sufficient to train for more epochs
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

In [None]:
# Define the TrainingArguments
# NOTE(review): output_dir is a relative path (leading "./"), so it resolves
# against the current working directory — confirm this is the intended
# Google Drive location.
training_args = TrainingArguments(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    output_dir='./content/drive/MyDrive/МФТИ/Question answering',  # Path for saving the model and results
    num_train_epochs=epochs,
    evaluation_strategy="steps",
    eval_steps=500,  # Evaluate the model every 500 steps
    save_total_limit=3,  # Maximum number of saved model checkpoints
    load_best_model_at_end=True,
    learning_rate=learning_rate,
    remove_unused_columns=False,
    report_to="tensorboard",
    push_to_hub=False
)

In [None]:
# Create the Trainer object; no compute_metrics callback is supplied (the
# argument below is commented out)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics
)

In [None]:
# Training: register the early-stopping callback, then start the run
trainer.add_callback(early_stopping)
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [None]:
# Upload the model to the Hub repository "VesleAnne/sberquad_mdeberta"
model.push_to_hub("VesleAnne/sberquad_mdeberta")

In [None]:
# Load the model
# NOTE(review): this only rebinds the name `model`; a Trainer created earlier
# keeps the model object it was constructed with — confirm which one should
# be evaluated.
model = AutoModelForQuestionAnswering.from_pretrained('VesleAnne/sberquad_mdeberta')


In [None]:
# Evaluate the model on the test dataset
results = trainer.predict(test_dataset)
# The score gets its own name: binding it to `f1_score` would overwrite the
# function of that name imported from sklearn.metrics.
# NOTE(review): compute_f1 tokenizes its inputs as strings, while
# trainer.predict presumably returns raw model outputs — confirm that a
# decoding step is not missing here.
test_f1 = compute_f1(results.predictions, results.label_ids)
print("F1 Score on Test Data:", test_f1)

In [None]:
def evaluate(model, dataset):
    """Return the mean token-overlap F1 of predicted vs. labelled answer spans.

    Every feature is turned into two binary masks over its tokens (1 inside
    the span, 0 outside): one from the labels and one from the argmax of the
    start/end logits.  The F1 of the two masks is averaged over all features.

    Args:
        model: Question-answering model exposing ``device`` and returning
            ``start_logits`` / ``end_logits``.
        dataset: Mapping with ``input_ids``, ``attention_mask``,
            ``start_positions`` and ``end_positions`` columns.

    Returns:
        The mean per-feature F1, or ``0.0`` for an empty dataset.
    """
    # Imported locally so the function does not depend on notebook-level
    # names that may be missing or rebound by other cells.
    import numpy as np
    from sklearn.metrics import f1_score as span_f1

    dataloader = DataLoader(
        dataset=list(zip(
            dataset['input_ids'],
            dataset['attention_mask'],
            dataset['start_positions'],
            dataset['end_positions'],
        )),
        batch_size=16,
    )

    scores = []
    was_training = model.training
    model.eval()  # switch off dropout for the duration of the evaluation
    try:
        with torch.no_grad():  # inference only: no autograd graph is needed
            for input_ids, attention_mask, start_positions, end_positions in dataloader:
                # Each column arrives as one tensor per token position:
                # stacking gives (seq, batch), the transpose (batch, seq)
                input_ids = torch.stack(input_ids).T.to(model.device)
                attention_mask = torch.stack(attention_mask).T.to(model.device)
                outputs = model(input_ids, attention_mask)

                pred_starts = outputs.start_logits.argmax(1).tolist()
                pred_ends = outputs.end_logits.argmax(1).tolist()
                true_starts = start_positions.tolist()
                true_ends = end_positions.tolist()

                true_seqs = np.zeros(tuple(input_ids.shape), dtype=int)
                pred_seqs = np.zeros(tuple(input_ids.shape), dtype=int)
                for i in range(true_seqs.shape[0]):
                    true_seqs[i, true_starts[i]:true_ends[i] + 1] = 1
                    pred_seqs[i, pred_starts[i]:pred_ends[i] + 1] = 1

                # Collect per-feature scores so that a smaller final batch
                # does not get the same weight as a full one
                scores.extend(
                    span_f1(true_mask, pred_mask)
                    for true_mask, pred_mask in zip(true_seqs, pred_seqs)
                )
    finally:
        # Restore whatever mode the model was in before the call
        model.train(was_training)

    if not scores:
        return 0.0
    return np.mean(scores)

In [None]:
раз

In [None]:
max_length = 356


def preprocess_data(examples):
    """Tokenize a batch of QA examples and label each feature with its answer span.

    Reads the module-level ``tokenizer`` and ``max_length``.

    Args:
        examples: Batch dict with ``question``, ``context`` and ``answers``
            columns; only the first entry of each answer is used.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions``.  A feature that does not contain the whole answer
        is labelled ``(0, 0)``.
    """
    stripped_questions = [text.strip() for text in examples["question"]]
    paragraphs = examples["context"]

    # Tokenize the questions together with their contexts
    encoded = tokenizer(
        stripped_questions,
        paragraphs,
        max_length=max_length,
        truncation=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    all_offsets = encoded.pop("offset_mapping")
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    all_answers = examples["answers"]
    span_starts, span_ends = [], []

    for feature_idx, offsets in enumerate(all_offsets):
        gold = all_answers[feature_to_example[feature_idx]]
        answer_begin = gold["answer_start"][0]
        answer_stop = gold["answer_start"][0] + len(gold["text"][0])
        seq_ids = encoded.sequence_ids(feature_idx)

        # Locate the first and last context token (sequence id 1)
        first_ctx = 0
        while seq_ids[first_ctx] != 1:
            first_ctx += 1
        last_ctx = first_ctx
        while seq_ids[last_ctx] == 1:
            last_ctx += 1
        last_ctx -= 1

        answer_inside = (
            offsets[first_ctx][0] <= answer_begin
            and offsets[last_ctx][1] >= answer_stop
        )
        if not answer_inside:
            # The answer is not fully in this feature: label it (0, 0)
            span_starts.append(0)
            span_ends.append(0)
            continue

        cursor = first_ctx
        while cursor <= last_ctx and offsets[cursor][0] <= answer_begin:
            cursor += 1
        span_starts.append(cursor - 1)

        cursor = last_ctx
        while cursor >= first_ctx and offsets[cursor][1] >= answer_stop:
            cursor -= 1
        span_ends.append(cursor + 1)

    # Attach the labels to the tokenizer output
    encoded["start_positions"] = span_starts
    encoded["end_positions"] = span_ends
    return encoded

In [None]:
def preprocess_dataset(examples):
    """Forward the batch unchanged to ``preprocess_data`` and return its result."""
    return preprocess_data(examples)

In [None]:
def preprocess_qa_data(dataset, tokenizer, max_length=356):
    """Tokenize QA examples one by one and return tensors with answer labels.

    Args:
        dataset: Iterable of examples with ``context``, ``question`` and
            ``answers`` (a dict with ``text`` and ``answer_start`` lists).
        tokenizer: Tokenizer called on each ``(question, context)`` pair; its
            result must provide ``char_to_token``.
        max_length: Length used for padding and truncation.

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` (combined with
        ``pad_sequence``) and ``start_positions`` / ``end_positions`` tensors.
        Labels refer to the first window of each example and are ``0`` when
        the answer is missing or cannot be located in that window.
    """
    input_ids = []
    attention_mask = []
    start_positions = []
    end_positions = []

    for example in dataset:
        context = example['context']
        question = example['question']
        answer = example['answers']

        inputs = tokenizer(
            question,
            context,
            padding='max_length',
            truncation='only_second',
            max_length=max_length,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        start_idx, end_idx = 0, 0
        # Skip examples without a usable answer: empty lists, empty text or
        # the -1 sentinel
        has_answer = (
            answer['answer_start']
            and answer['text']
            and answer['text'][0]
            and answer['answer_start'][0] != -1
        )
        if has_answer:
            first_char = answer['answer_start'][0]
            last_char = first_char + len(answer['text'][0]) - 1
            # sequence_index=1: the character offsets are relative to the
            # context, which is the second sequence of the pair (the default
            # of 0 would look the characters up in the question)
            start_idx = inputs.char_to_token(0, first_char, sequence_index=1)
            # Use the answer's last character: the position one past it may
            # be whitespace or lie beyond the text and then maps to no token
            end_idx = inputs.char_to_token(0, last_char, sequence_index=1)
            if start_idx is None or end_idx is None:
                # Answer is not (fully) inside the first window
                start_idx, end_idx = 0, 0

        input_ids.append(inputs.input_ids)
        attention_mask.append(inputs.attention_mask)
        start_positions.append(start_idx)
        end_positions.append(end_idx)

    # Pad the per-example tensors to a common size before combining them
    input_ids = pad_sequence(input_ids, batch_first=True)
    attention_mask = pad_sequence(attention_mask, batch_first=True)

    start_positions = torch.tensor(start_positions)
    end_positions = torch.tensor(end_positions)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'start_positions': start_positions,
        'end_positions': end_positions
    }

In [None]:
# Named split_names rather than `datasets`, which is the name of the imported
# library module
split_names = ["train", "validation", "test"]

for dataset_name in split_names:
    # Measure the size before the split is replaced by its processed version
    original_size = len(dataset[dataset_name])
    dataset[dataset_name] = dataset[dataset_name].map(
        preprocess_dataset,
        batched=True,
        remove_columns=dataset[dataset_name].column_names,
    )
    print(f"Original {dataset_name} size: {original_size}")
    print(f"Processed {dataset_name} size: {len(dataset[dataset_name])}")

In [None]:
batch_size = 32

# Create DataLoaders for the processed datasets; only the training loader shuffles
train_loader = DataLoader(dataset["train"], batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset["validation"], batch_size=batch_size)
test_loader = DataLoader(dataset["test"], batch_size=batch_size)

In [None]:
# Sanity check: take one batch and print the resulting tensor shapes
for batch in train_loader:
    # The token columns are lists of tensors: stack them along dim=1
    input_ids = torch.stack(batch["input_ids"], dim=1).to(device)
    attention_mask = torch.stack(batch["attention_mask"], dim=1).to(device)
    # as_tensor reuses a value that is already a tensor instead of copying it
    # (torch.tensor() on a tensor copies and emits a warning)
    start_positions = torch.as_tensor(batch["start_positions"]).to(device)
    end_positions = torch.as_tensor(batch["end_positions"]).to(device)

    print(input_ids.shape, attention_mask.shape, start_positions.shape, end_positions.shape)
    break

In [None]:
pip install deberta



In [None]:
def preprocess_training_data(examples, max_seq_length=356, tokenizer=None):
    """Preprocess training data for a question-answering model.

    Args:
        examples (dict): Batch with "question", "context" and "answers"; each
            answer is a dict with "text" and "answer_start" lists, of which
            only the first entry is used.
        max_seq_length (int): Maximum sequence length.
        tokenizer: Tokenizer object that converts text to tokens.

    Returns:
        dict: Prepared data including input_ids, start_positions and
        end_positions (one label pair per tokenized feature).

    Raises:
        ValueError: If no tokenizer is supplied.
    """
    if tokenizer is None:
        raise ValueError("preprocess_training_data requires a tokenizer")

    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    # return_overflowing_tokens is needed for the tokenizer to produce
    # "overflow_to_sample_mapping", which is read below
    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_seq_length,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[sample_map[i]]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = inputs.sequence_ids(i)

        # First and last token of the context (sequence id 1)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        # If the answer is not fully inside this feature, label it (0, 0)
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        context_tokens = range(context_start, context_end + 1)
        # Exactly one label per feature: the last context token starting at
        # or before the answer start ...
        start_positions.append(
            max(idx for idx in context_tokens if offsets[idx][0] <= start_char)
        )
        # ... and the first context token ending at or after the answer end
        end_positions.append(
            min(idx for idx in context_tokens if offsets[idx][1] >= end_char)
        )

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [None]:
# Apply the function to the training data
train_dataset = dataset["train"].map(
    lambda examples: preprocess_training_data(examples, max_seq_length=356, tokenizer=tokenizer),
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Print the number of examples before and after preprocessing
print(f"Train dataset: Original size = {len(dataset['train'])}, Processed size = {len(train_dataset)}")

In [None]:
# Apply the function to the validation dataset.
# The tokenizer is passed explicitly, in the same way as for the training
# split; the bare function cannot be mapped because its `tokenizer`
# parameter defaults to None.
valid_dataset = dataset["validation"].map(
    lambda examples: preprocess_training_data(examples, max_seq_length=356, tokenizer=tokenizer),
    batched=True,  # process the data in batches
    remove_columns=dataset["validation"].column_names,  # drop the original columns
)


In [None]:
# Print the number of examples before and after preprocessing
print(f"Validation dataset: Original size = {len(dataset['validation'])}, Processed size = {len(valid_dataset)}")

# Apply the function to the test dataset.
# The original statement was cut off in the middle of the call; it is
# completed here in the same form as the training-split call.
test_dataset = dataset["test"].map(
    lambda examples: preprocess_training_data(examples, max_seq_length=356, tokenizer=tokenizer),
    batched=True,
    remove_columns=dataset["test"].column_names,
)

In [None]:
def preprocess_training_data(examples, tokenizer, max_sequence_length):
    """Tokenize a batch of QA examples and label every example with its answer span.

    Args:
        examples: Batch dict with ``question``, ``context`` and ``answers``
            columns; only the first entry of each answer is used.
        tokenizer: Tokenizer called on the ``(question, context)`` pairs.
        max_sequence_length: Length used for padding and truncation.

    Returns:
        The tokenizer output (PyTorch tensors, ``offset_mapping`` removed)
        with ``start_positions`` and ``end_positions`` tensors holding exactly
        one entry per example.  An example whose answer is not fully inside
        the tokenized context is labelled ``(0, 0)``.
    """
    # Extract questions from examples and strip whitespace
    questions = [q.strip() for q in examples["question"]]

    # Tokenize questions and contexts, and pad to the maximum sequence length
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_sequence_length,
        padding="max_length",
        truncation=True,
        return_offsets_mapping=True,
        return_tensors="pt"  # Return PyTorch tensors for input_ids, attention_mask, etc.
    )

    # Plain nested lists make the integer comparisons below straightforward
    offset_mapping = inputs.pop("offset_mapping").tolist()
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Search context tokens only (sequence id 1): question tokens carry
        # offsets relative to the question text and could match by accident
        context_tokens = [
            j for j, seq_id in enumerate(inputs.sequence_ids(i)) if seq_id == 1
        ]

        start_token, end_token = 0, 0
        if context_tokens:
            first, last = context_tokens[0], context_tokens[-1]
            if offsets[first][0] <= start_char and offsets[last][1] >= end_char:
                # Last token starting at or before the answer start, and
                # first token ending at or after the answer end
                start_token = max(
                    j for j in context_tokens if offsets[j][0] <= start_char
                )
                end_token = min(
                    j for j in context_tokens if offsets[j][1] >= end_char
                )

        # Always append exactly one label pair so the labels stay aligned
        # with the examples
        start_positions.append(start_token)
        end_positions.append(end_token)

    # Convert the lists of start and end positions to tensors
    inputs["start_positions"] = torch.tensor(start_positions, dtype=torch.long)
    inputs["end_positions"] = torch.tensor(end_positions, dtype=torch.long)

    return inputs

In [None]:
# Preprocess the train, validation and test splits in one call
def preprocess_datasets(dataset, tokenizer, max_sequence_length):
    """Run ``preprocess_training_data`` over all three splits and report their sizes.

    Args:
        dataset: Mapping with "train", "validation" and "test" splits.
        tokenizer: Tokenizer forwarded to ``preprocess_training_data``.
        max_sequence_length: Sequence length forwarded to
            ``preprocess_training_data``.

    Returns:
        Tuple ``(train_dataset, valid_dataset, test_dataset)`` of the mapped
        splits, with the original columns removed.
    """
    def encode_batch(batch):
        # Bind the tokenizer and length so map() can call it with one argument
        return preprocess_training_data(batch, tokenizer, max_sequence_length)

    def convert(split):
        return dataset[split].map(
            encode_batch,
            batched=True,
            remove_columns=dataset[split].column_names,
        )

    train_dataset = convert("train")
    print("Train dataset - Original:", len(dataset["train"]), "Processed:", len(train_dataset))

    valid_dataset = convert("validation")
    print("Validation dataset - Original:", len(dataset["validation"]), "Processed:", len(valid_dataset))

    test_dataset = convert("test")
    print("Test dataset - Original:", len(dataset["test"]), "Processed:", len(test_dataset))

    return train_dataset, valid_dataset, test_dataset

# Set the maximum sequence length (in tokens) passed on to the tokenizer
max_sequence_length = 356

# Apply the preprocess_datasets function to your dataset; the three returned
# splits are bound to train_dataset, valid_dataset and test_dataset
train_dataset, valid_dataset, test_dataset = preprocess_datasets(dataset, tokenizer, max_sequence_length)

In [None]:
def preprocess_training_data(examples, tokenizer, max_sequence_length):
    """Tokenize a batch of QA examples and label each feature with its answer span.

    Args:
        examples: Batch dict with ``question``, ``context`` and ``answers``
            columns; only the first entry of each answer is used.
        tokenizer: Tokenizer called on the ``(question, context)`` pairs.
        max_sequence_length: Length used for padding and truncation.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions``.  A feature that does not contain the whole answer
        is labelled ``(0, 0)``.
    """
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=max_sequence_length,
        truncation=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    per_feature_offsets = encoded.pop("offset_mapping")
    owner_of_feature = encoded.pop("overflow_to_sample_mapping")
    gold_answers = examples["answers"]

    labels = []
    for feature_idx, token_spans in enumerate(per_feature_offsets):
        gold = gold_answers[owner_of_feature[feature_idx]]
        char_lo = gold["answer_start"][0]
        char_hi = gold["answer_start"][0] + len(gold["text"][0])
        seq_ids = encoded.sequence_ids(feature_idx)

        # Locate the first and last context token (sequence id 1)
        ctx_lo = 0
        while seq_ids[ctx_lo] != 1:
            ctx_lo += 1
        ctx_hi = ctx_lo
        while seq_ids[ctx_hi] == 1:
            ctx_hi += 1
        ctx_hi -= 1

        # Answer not fully inside this feature: label it (0, 0)
        if token_spans[ctx_lo][0] > char_lo or token_spans[ctx_hi][1] < char_hi:
            labels.append((0, 0))
            continue

        tok_lo = ctx_lo
        while tok_lo <= ctx_hi and token_spans[tok_lo][0] <= char_lo:
            tok_lo += 1

        tok_hi = ctx_hi
        while tok_hi >= ctx_lo and token_spans[tok_hi][1] >= char_hi:
            tok_hi -= 1

        labels.append((tok_lo - 1, tok_hi + 1))

    encoded["start_positions"] = [first for first, _ in labels]
    encoded["end_positions"] = [last for _, last in labels]

    return encoded


In [None]:
# Tokenize the first training example and decode it back to text
# NOTE(review): `raw_datasets` is not defined anywhere in the visible part of
# this notebook (the loaded data is bound to `dataset`) — confirm the name.
context = raw_datasets["train"][0]["context"]
question = raw_datasets["train"][0]["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

In [None]:
max_length = 384
stride = 70


def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and attach answer token labels.

    Contexts longer than the window are split into overlapping features, so
    one example may produce several rows.  Reads the module-level
    ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: Batch dict with ``question``, ``context`` and ``answers``
            columns.  Each answer is a dict with ``text`` and
            ``answer_start`` lists; only the first entry is used.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions`` (one label pair per feature).
    """
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    offset_mapping = inputs.pop('offset_mapping')
    sample_map = inputs.pop('overflow_to_sample_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_map[i]]
        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token of the context (sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside this window, label it (0, 0).
        # The window start is compared with start_char and the window end
        # with end_char; swapping them lets partially covered answers
        # through and labels them with a token outside the context.
        if offset[context_start][0] > start_char \
                or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that starts at or before the answer start
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token that ends at or after the answer end
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

In [None]:
# Tokenize and label the training split in batches, dropping the original columns
# NOTE(review): `raw_datasets` is not defined anywhere in the visible part of
# this notebook (the loaded data is bound to `dataset`) — confirm the name.
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)