## Ekhator Uwaila's Notebook

In [None]:
#!pip install transformers datasets evaluate

In [None]:
#!pip install transformers[torch]

#### Installing and Importing modules:

In [56]:
#!pip install transformers -U

In [57]:
#!pip install accelerate -U

In [2]:
import transformers
import accelerate

In [3]:
# from huggingface_hub import notebook_login

# notebook_login()

#### Load SQuAD dataset and split into train and test

In [58]:
#!pip install datasets evaluate

In [6]:
from datasets import load_dataset

#squad = load_dataset("squad", split="train[:5000]")
train = load_dataset("squad", split="train[:5000]")
test = load_dataset("squad", split="train[5000:6000]")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

#### Taking a look at an example:

In [7]:
train

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5000
})

In [8]:
train[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

#### Preparing the data:

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

Apply the preprocessing function over the entire dataset:

In [11]:
tokenized_train = train.map(preprocess_function, batched=True, remove_columns=train.column_names)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [12]:
tokenized_test = test.map(preprocess_function, batched=True, remove_columns=train.column_names)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
tokenized_train

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 5000
})

In [14]:
tokenized_test

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 1000
})

Create a batch of examples using DefaultDataCollator:

In [15]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

#### Train

Load DistilBERT with AutoModelForQuestionAnswering:

In [16]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
training_args = TrainingArguments(
    output_dir="my_q&a_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.314678
2,2.664500,1.97932
3,2.664500,1.954695


TrainOutput(global_step=939, training_loss=2.0337148713100706, metrics={'train_runtime': 619.4044, 'train_samples_per_second': 24.217, 'train_steps_per_second': 1.516, 'total_flos': 1469847375360000.0, 'train_loss': 2.0337148713100706, 'epoch': 3.0})

#### Inference

In [18]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [19]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="/content/my_q&a_model/checkpoint-500")
question_answerer(question=question, context=context)


{'score': 0.1580657958984375,
 'start': 10,
 'end': 95,
 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}

#### Prepare dataset for BLEU:

In [20]:
predictions = []
references = []

def prepare_data_for_bleu(dataset):
  for example in dataset:
    context = example["context"]
    question = example["question"]
    reference_answer = example["answers"]["text"][0]  #since we have only one reference answer

    # Generate prediction using question-answering pipeline
    prediction = question_answerer(question=question, context=context)
    predicted_answer = prediction["answer"]

    # Append predictions and reference answers
    predictions.append(predicted_answer)
    references.append(reference_answer)

In [21]:
prepare_data_for_bleu(test)

In [22]:
test[0]

{'id': '56d0875b234ae51400d9c349',
 'title': 'Solar_energy',
 'context': 'Greenhouses convert solar light to heat, enabling year-round production and the growth (in enclosed environments) of specialty crops and other plants not naturally suited to the local climate. Primitive greenhouses were first used during Roman times to produce cucumbers year-round for the Roman emperor Tiberius. The first modern greenhouses were built in Europe in the 16th century to keep exotic plants brought back from explorations abroad. Greenhouses remain an important part of horticulture today, and plastic transparent materials have also been used to similar effect in polytunnels and row covers.',
 'question': 'What is one purpose of a greenhouse?',
 'answers': {'text': ['enabling year-round production and the growth (in enclosed environments) of specialty crops'],
  'answer_start': [41]}}

In [23]:
#formatted_predictions2 = [[word for word in sentence.split()] for sentence in predictions]
formatted_references2 = [[sentence] for sentence in references]


In [24]:
predictions[0:5]

['Primitive greenhouses were first used during Roman times to produce cucumbers',
 'Primitive greenhouses were first used during Roman times',
 'Europe in the 16th century',
 'World Solar Challenge',
 'North American Solar Challenge and the planned South African Solar Challenge']

In [28]:
formatted_references2[0:5]

[['enabling year-round production and the growth (in enclosed environments) of specialty crops'],
 ['produce cucumbers year-round for the Roman emperor Tiberius'],
 ['Europe'],
 ['The World Solar Challenge'],
 ['90.87']]

In [27]:
from datasets import load_metric
bleu_metric = load_metric("bleu")

  bleu_metric = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

In [29]:
len(formatted_references2)

1000

#### Evaluate on the BLEU Metric

In [30]:
bleu_metric.compute(predictions=[predictions], references=[formatted_references2])

{'bleu': 0.0,
 'precisions': [0.351, 0.0, 0.0, 0.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1000.0,
 'translation_length': 1000,
 'reference_length': 1}

#### Using Roberta model:

In [33]:
from transformers import AutoTokenizer, RobertaForQuestionAnswering, RobertaTokenizer
import torch

#### Load dataset:

In [32]:
from datasets import load_dataset

#squad = load_dataset("squad", split="train[:5000]")
train2 = load_dataset("squad", split="train[:5000]")
test2 = load_dataset("squad", split="train[5000:6000]")

#### Prepare data:

In [37]:
from transformers import AutoTokenizer

tokenizer2 = AutoTokenizer.from_pretrained("roberta-base")

In [38]:
def preprocess_function2(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer2(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

Apply the preprocessing function over the entire dataset:

In [39]:
tokenized_train2 = train2.map(preprocess_function2, batched=True, remove_columns=train2.column_names)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [40]:
tokenized_test2 = test2.map(preprocess_function2, batched=True, remove_columns=train2.column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Model Finetuning:

In [42]:
model2 = RobertaForQuestionAnswering.from_pretrained("roberta-base")
training_args2 = TrainingArguments(
    output_dir="my_q&a2_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model2,
    args=training_args2,
    train_dataset=tokenized_train2,
    eval_dataset=tokenized_test2,
    tokenizer=tokenizer2,
    data_collator=data_collator,
)

trainer.train()


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.413927
2,1.660300,1.383823
3,1.660300,1.366131


TrainOutput(global_step=939, training_loss=1.2244310795435636, metrics={'train_runtime': 1233.6779, 'train_samples_per_second': 12.159, 'train_steps_per_second': 0.761, 'total_flos': 2939588513280000.0, 'train_loss': 1.2244310795435636, 'epoch': 3.0})

#### Inference:

In [43]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [45]:
from transformers import pipeline

question_answerer2 = pipeline("question-answering", model="/content/my_q&a2_model/checkpoint-500")
question_answerer2(question=question, context=context)

{'score': 0.3631088137626648, 'start': 93, 'end': 95, 'answer': '13'}

#### Prepare dataset for BLEU

In [46]:
predictions2 = []
references2 = []

def prepare_data_for_bleu2(dataset):
  for example in dataset:
    context = example["context"]
    question = example["question"]
    reference_answer2 = example["answers"]["text"][0]  #since we have only one reference answer

    # Generate prediction using question-answering pipeline
    prediction2 = question_answerer2(question=question, context=context)
    predicted_answer2 = prediction2["answer"]

    # Append predictions and reference answers
    predictions2.append(predicted_answer2)
    references2.append(reference_answer2)

In [47]:
prepare_data_for_bleu2(test2)

In [48]:
formatted_references3 = [[sentence] for sentence in references2]

#### Evaluate on the BLEU metric:

In [50]:
bleu_metric.compute(predictions=[predictions2], references=[formatted_references3])

{'bleu': 0.0,
 'precisions': [0.453, 0.0, 0.0, 0.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1000.0,
 'translation_length': 1000,
 'reference_length': 1}

#### Questions:
1. The default model for this assignment is Distilbert. What other model did you try? Why did you pick that model? How many parameters does that model have compared to Distilbert?

I tried the Roberta model, because i used it in the last homework and was curious to see how it would perform here. Also because i read an article where RoBERTa was said to be an improved version of BERT, and it can train on longer sequences.
[LINK TO ARTICLE](https://towardsdatascience.com/roberta-1ef07226c8d8) <br>
The total number of parameters of RoBERTa is 355M, while Robeta-base has 125M parameters which is the specific one i used. Distilbert has 66M parameters.

2. Which model worked better? Why do you think that particular model worked better?

Roberta model worked better, i think it worked better because a larger amount of text was used in its pretraining compared to Distillbert

3. What is the BLEU metric calculating?

The bleu metric finds the clipped precision for 1 gram, 2 gram, 3 gram, and 4 gram. Then it finds the geometric average of these precision scores which is basically raising each precision to power 1/4, and then multiplying them together.<br>
Then it finds the brevity penalty, which is always 1 if the length of predicted text is greater than the reference text.<br>
After doing that, it multiplies the brevity penalty with the Geometic Average Precision Scores to get the Bleu score.<br>
The Bleu metric tells us how good our predicted texts are.

4. Why is the BLEU metric useful? In what ways is it a bad metric? Provide an example where the BLEU metric might fail.

It is useful in the following ways:
- It uses clipped precision to prevent the issue of repetition, e.g:<br>if a target sentence is: 'she is riding a bike'<br>
predicted sentence: 'bike bike bike riding'<br>
The precision score would be 3/4, and this is incorrect. Clipped precision counts the repeated word just once so we get a precision of 1/4.<br>
- Most times short predictions gain a higher score e.g:<br>
if a target sentence is: 'she is riding a bike'<br>
predicted sentence: 'bike'<br>
the precision would be 1/1, which is incorrect, so Bleu score attempts to prevent this challenge by calculating the brevity penalty.
- Also by making use of the 1 gram to 4 gram, it tries to check that the sequences are also predicted correctly, however there are still challenges with this which i would answer in the next question

It is a bad metric/fails in the following ways with examples:
- It does not consider the order/ sequence of words, it just check if a particular word is present. e.g for unigram precision:<br>
target: 'she is riding a bike'
predicted: 'riding is she a bike'<br>
This would get a high precision even though the predicted sentence is incorrect.
- Another area where it fails is that, it does not conider synonyms. A human might accept if 'thermos' was predicted instead of 'flask', but the blue metric would not.

In [54]:
# num_parameters = model.num_parameters()
# print("Number of parameters in the DistilBERT model:", num_parameters)

In [55]:
# num_parameters2 = model2.num_parameters()
# print("Number of parameters in the Roberta model:", num_parameters2)

In [None]:
# # Calculate BLEU score for each question-answer pair
# bleu_scores = []
# for pred, refs in zip(formatted_predictions2, formatted_references2):
#     bleu_score = bleu_metric.compute(predictions=[pred], references=[[refs]])
#     bleu_scores.append(bleu_score["bleu"])

# # Calculate total BLEU score
# total_bleu_score = sum(bleu_scores) / len(bleu_scores)

# print("Individual BLEU scores:")
# for i, bleu_score in enumerate(bleu_scores, start=1):
#     print(f"Pair {i}: BLEU Score: {bleu_score}")

# print("\nTotal BLEU Score:", total_bleu_score)

In [None]:
# from datasets import load_metric
# from transformers import pipeline

# # Load BLEU metric
# bleu_metric = load_metric("bleu")
# predictions = []
# references = []

# def calculate_bleu_score(dataset):

#   for example in dataset:
#     context = example["context"]
#     question = example["question"]
#     reference_answer = example["answers"]["text"][0]  # Assuming only one reference answer

#     # Generate prediction using question-answering pipeline
#     prediction = question_answerer(question=question, context=context)
#     predicted_answer = prediction["answer"]

#     # Append predictions and reference answers
#     predictions.append(predicted_answer)
#     references.append(reference_answer)

# formatted_predictions2 = [[word for word in sentence.split()] for sentence in predictions]
# formatted_references2 = [[word for word in sentence.split()] for sentence in references]
# bleu_metric.compute(predictions=[predictions], references=[formatted_references2])

# calculate_bleu_score(question_answerer, test)

# # # Define a function to calculate BLEU scores
# # def calculate_bleu_score(question_answerer, dataset):
# #     predictions = []
# #     references = []

# #     for example in dataset:
# #         context = example["context"]
# #         question = example["question"]
# #         reference_answer = example["answers"]["text"][0]  # Assuming only one reference answer

# #         # Generate prediction using question-answering pipeline
# #         prediction = question_answerer(question=question, context=context)
# #         predicted_answer = prediction["answer"]

# #         # Append predictions and reference answers
# #         predictions.append(predicted_answer)
# #         references.append(reference_answer)

#     # # Format predictions and references for BLEU calculation
#     # formatted_predictions = [{"prediction_text": pred} for pred in predictions]
#     # formatted_references = [{"answers": {"text": [ref]}} for ref in references]

#     # Compute BLEU score
#     #results = bleu_metric.compute(predictions=formatted_predictions, references=formatted_references)

#     #return results["exact"]

# # Calculate BLEU score on the test set
# #bleu_score = calculate_bleu_score(question_answerer, test)

# # Print BLEU score
# #print("BLEU Score:", bleu_score)

# #print("Keys of results:", results.keys())

