In [None]:
!pip install transformers datasets accelerate -U peft evaluate

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.5.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.0 MB/s[0m eta [36m0

In [None]:
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
from collections import Counter
import numpy as np
import re
import string

dataset = load_dataset("legacy107/cpgQA")
train_dataset = dataset['train']
eval_dataset = dataset['test']


# Load the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    encodings = tokenizer(
        examples['question'],
        examples['context'],
        truncation=True,
        padding='max_length',
        max_length=512,
        return_offsets_mapping=True
    )

    start_positions = []
    end_positions = []

    for i, (context, answer, answer_start) in enumerate(zip(examples['context'], examples['answer'], examples['answer_start'])):
        start_position = None
        end_position = None

        start_idx = answer_start
        end_idx = start_idx + len(answer)

        offset_mapping = encodings['offset_mapping'][i]

        for j, (offset_start, offset_end) in enumerate(offset_mapping):
            if offset_start <= start_idx and offset_end > start_idx:
                start_position = j
            if offset_start < end_idx and offset_end >= end_idx:
                end_position = j
                break

        if start_position is not None and end_position is not None:
            start_positions.append(start_position)
            end_positions.append(end_position)
        else:
            start_positions.append(0)
            end_positions.append(0)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

# Tokenize the dataset first
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)


# Create the model
model = BertForQuestionAnswering.from_pretrained("csarron/bert-base-uncased-squad-v1")

# Training arguments
training_args = TrainingArguments(
    evaluation_strategy="epoch",
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    save_strategy="epoch",
    load_best_model_at_end=True,
    greater_is_better=True,
    fp16=True,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Fine-tune
trainer.train()

# Compute predictions
predictions = trainer.predict(eval_dataset)

# Get the predicted start and end positions
pred_start = predictions.predictions[0].argmax(axis=-1)
pred_end = predictions.predictions[1].argmax(axis=-1)

# Extract true start and end positions from the evaluation dataset
true_start_positions = [item['start_positions'] for item in eval_dataset]
true_end_positions = [item['end_positions'] for item in eval_dataset]


# Calculate F1
f1s = []
for i in range(0, len(pred_start)):
    pred_span = set(range(pred_start[i], pred_end[i] + 1))
    true_span = set(range(true_start_positions[i], true_end_positions[i] + 1))

    num_same = len(pred_span.intersection(true_span))
    if num_same == 0:
        f1s.append(0)
        continue

    precision = float(num_same) / float(len(pred_span))
    recall = float(num_same) / float(len(true_span))
    f1 = (2 * precision * recall) / (precision + recall)
    f1s.append(f1)

print('Average F1 Score: {:.3f}'.format(np.mean(f1s)))

# Calculate Exact Match
total_correct = 0
for i in range(0, len(pred_start)):
    matches = 0
    if pred_start[i] == true_start_positions[i]:
        matches += 1
    if pred_end[i] == true_end_positions[i]:
        matches += 1
    total_correct += matches

total_indeces = 2 * len(pred_start)  # because both start and end are considered
print('Correctly predicted indeces: {:,} of {:,} ({:.2%})'.format(
    total_correct,
    total_indeces,
    float(total_correct) / float(total_indeces)
))



model.save_pretrained("/content/my_model")
tokenizer.save_pretrained("/content/my_model")



Downloading readme:   0%|          | 0.00/643 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/174k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/57.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/987 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/110 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/987 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at csarron/bert-base-uncased-squad-v1 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss
1,No log,1.354217
2,No log,1.380114
3,No log,1.511015


Average F1 Score: 0.767
Correctly predicted indeces: 156 of 220 (70.91%)


('/content/my_model/tokenizer_config.json',
 '/content/my_model/special_tokens_map.json',
 '/content/my_model/vocab.txt',
 '/content/my_model/added_tokens.json',
 '/content/my_model/tokenizer.json')

In [None]:
import shutil

# Compress the model folder
shutil.make_archive("/content/my_model", 'zip', "/content/my_model")

# Download the compressed file
from google.colab import files
files.download("/content/my_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#evaluating models against SQUAD benchmark
import pandas as pd
from datasets import load_dataset
from evaluate import evaluator
from transformers import pipeline, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

models = [
    "./results/checkpoint-124",
    "./results/checkpoint-186",
    "./results/checkpoint-62"
]

data = load_dataset("squad", split="validation")
task_evaluator = evaluator("question-answering")

results = []
for model in models:
    results.append(
        task_evaluator.compute(
            model_or_pipeline=model,
            tokenizer=tokenizer,
            data=data,
            metric="squad",
            squad_v2_format=False
            )
        )

df = pd.DataFrame(results, index=models)
df[["exact_match", "f1", "total_time_in_seconds", "samples_per_second", "latency_in_seconds"]]

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Unnamed: 0,exact_match,f1,total_time_in_seconds,samples_per_second,latency_in_seconds
./results/checkpoint-124,71.059603,83.203324,169.300141,62.433498,0.016017
./results/checkpoint-186,69.86755,82.538628,167.497395,63.105459,0.015846
./results/checkpoint-62,74.947966,85.347611,167.425999,63.132369,0.01584


In [3]:
#evaluating models against SQUAD benchmark
import pandas as pd
from datasets import load_dataset
from evaluate import evaluator
from transformers import pipeline, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

models = [
    "csarron/bert-base-uncased-squad-v1"
]

data = load_dataset("squad", split="validation")
task_evaluator = evaluator("question-answering")

results = []
for model in models:
    results.append(
        task_evaluator.compute(
            model_or_pipeline=model,
            data=data,
            metric="squad",
            squad_v2_format=False
            )
        )

df = pd.DataFrame(results, index=models)
df[["exact_match", "f1", "total_time_in_seconds", "samples_per_second", "latency_in_seconds"]]

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at csarron/bert-base-uncased-squad-v1 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Unnamed: 0,exact_match,f1,total_time_in_seconds,samples_per_second,latency_in_seconds
csarron/bert-base-uncased-squad-v1,80.927152,88.23717,178.840062,59.103089,0.01692


In [9]:
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
from collections import Counter
import numpy as np
import re
import string

dataset = load_dataset("legacy107/cpgQA")
train_dataset = dataset['train']
eval_dataset = dataset['test']


# Load the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    encodings = tokenizer(
        examples['question'],
        examples['context'],
        truncation=True,
        padding='max_length',
        max_length=512,
        return_offsets_mapping=True
    )

    start_positions = []
    end_positions = []

    for i, (context, answer, answer_start) in enumerate(zip(examples['context'], examples['answer'], examples['answer_start'])):
        start_position = None
        end_position = None

        start_idx = answer_start
        end_idx = start_idx + len(answer)

        offset_mapping = encodings['offset_mapping'][i]

        for j, (offset_start, offset_end) in enumerate(offset_mapping):
            if offset_start <= start_idx and offset_end > start_idx:
                start_position = j
            if offset_start < end_idx and offset_end >= end_idx:
                end_position = j
                break

        if start_position is not None and end_position is not None:
            start_positions.append(start_position)
            end_positions.append(end_position)
        else:
            start_positions.append(0)
            end_positions.append(0)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

# Tokenize the dataset first
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)


# Create the model
model = BertForQuestionAnswering.from_pretrained("csarron/bert-base-uncased-squad-v1")

# Set up TrainingArguments and Trainer
training_args = TrainingArguments(
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    output_dir="./results",
    do_train=False,
    do_eval=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Use Trainer to get predictions
predictions = trainer.predict(eval_dataset)

# Extract logits from the predictions
start_logits, end_logits = predictions.predictions

# Get the predicted start and end positions
pred_start = np.argmax(start_logits, axis=-1)
pred_end = np.argmax(end_logits, axis=-1)

# Extract true start and end positions from the evaluation dataset
true_start_positions = [item['start_positions'] for item in eval_dataset]
true_end_positions = [item['end_positions'] for item in eval_dataset]


# Calculate F1
f1s = []
for i in range(0, len(pred_start)):
    pred_span = set(range(pred_start[i], pred_end[i] + 1))
    true_span = set(range(true_start_positions[i], true_end_positions[i] + 1))

    num_same = len(pred_span.intersection(true_span))
    if num_same == 0:
        f1s.append(0)
        continue

    precision = float(num_same) / float(len(pred_span))
    recall = float(num_same) / float(len(true_span))
    f1 = (2 * precision * recall) / (precision + recall)
    f1s.append(f1)

print('Average F1 Score: {:.3f}'.format(np.mean(f1s)))

# Calculate Exact Match
total_correct = 0
for i in range(0, len(pred_start)):
    matches = 0
    if pred_start[i] == true_start_positions[i]:
        matches += 1
    if pred_end[i] == true_end_positions[i]:
        matches += 1
    total_correct += matches

total_indeces = 2 * len(pred_start)  # because both start and end are considered
print('Correctly predicted indeces: {:,} of {:,} ({:.2%})'.format(
    total_correct,
    total_indeces,
    float(total_correct) / float(total_indeces)
))


Some weights of the model checkpoint at csarron/bert-base-uncased-squad-v1 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Average F1 Score: 0.640
Correctly predicted indeces: 124 of 220 (56.36%)
