In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
from transformers import DefaultDataCollator
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer

2024-05-13 06:53:26.720659: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-13 06:53:26.720756: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-13 06:53:26.843365: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data Preprocessing

In [4]:
dataset = load_dataset("arcd")

Downloading readme:   0%|          | 0.00/8.53k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 174k/174k [00:00<00:00, 977kB/s]
Downloading data: 100%|██████████| 192k/192k [00:00<00:00, 1.79MB/s]


Generating train split:   0%|          | 0/693 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/702 [00:00<?, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 693
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 702
    })
})

In [5]:
# Extract the train and validation datasets
train_dataset = dataset['train']
val_dataset = dataset['validation']

# Select the first 150 rows for validation
new_val_dataset = val_dataset.select(range(200))

# Select the remaining rows for training
remaining_val_dataset = val_dataset.select(range(200, len(val_dataset)))


new_train_dataset = concatenate_datasets([train_dataset, remaining_val_dataset])

# Create a new DatasetDict with the updated splits
dataset = DatasetDict({
    'train': new_train_dataset,
    'validation': new_val_dataset
})

# Print the information of the new DatasetDict
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1195
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 200
    })
})


In [6]:
dataset = dataset.remove_columns(["title","id"])

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 1245
    })
    validation: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 150
    })
})

In [7]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="longest",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0] if answer["answer_start"] else None
        end_char = start_char + len(answer["text"][0]) if start_char is not None else None
        sequence_ids = inputs.sequence_ids(i) if inputs.sequence_ids(i) else []

        if not start_char or not end_char or not sequence_ids:
            # Handle cases where start_char, end_char, or sequence_ids are empty
            start_positions.append(0)
            end_positions.append(0)
            continue  # Move to the next iteration

        idx = 0
        while idx < len(sequence_ids) and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx

        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if context_start >= len(offset) or context_end >= len(offset):
            # Handle cases where context indices exceed offset_mapping length
            start_positions.append(0)
            end_positions.append(0)
            continue  # Move to the next iteration

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabert")
encoded_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

tokenizer_config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/717k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

# Model

In [8]:
train_params = TrainingArguments(
    # Learning algorithms parameters
    optim = "adamw_hf",
    learning_rate=2e-4,  # Use a smaller learning rate
    weight_decay=0.02,
    lr_scheduler_type='linear',
    warmup_ratio=0.1,
    output_dir = 'QA_FineTuned_Arabertr',
    
    evaluation_strategy="steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Increase the number of epochs
    logging_steps=10,
    save_steps=50,
    seed=42,
    # Enable early stopping
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=True
)

In [9]:
model = AutoModelForQuestionAnswering.from_pretrained("aubmindlab/bert-base-arabert")

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
trainer = Trainer(
    model,
    train_params,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss,Validation Loss
10,5.518,4.357257
20,4.254,3.805361
30,3.9915,3.578841
40,3.836,3.405401
50,3.6733,3.250558
60,3.4425,3.088195
70,3.2917,2.993389
80,2.9622,3.002911
90,2.3228,3.119015
100,2.4004,2.861346




TrainOutput(global_step=225, training_loss=2.514185447692871, metrics={'train_runtime': 277.3149, 'train_samples_per_second': 12.928, 'train_steps_per_second': 0.811, 'total_flos': 702561654673920.0, 'train_loss': 2.514185447692871, 'epoch': 3.0})