In [73]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from datasets import load_dataset
from evaluate import load as load_metric
import time
import warnings
warnings.filterwarnings('ignore')

In [74]:
checkpoint = "./results/checkpoint-10950"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [75]:
example = {
    "context": "The Eiffel Tower is located in Paris and is one of the most visited landmarks in the world.",
    "question": "Where is the Eiffel Tower located?"
}
print(qa_pipeline(example))


{'score': 0.983989953994751, 'start': 31, 'end': 36, 'answer': 'Paris'}


In [76]:
squad = load_dataset("squad")
val_set = squad["validation"]
print(len(val_set))

10570


In [77]:
results = {}

In [78]:
metric = load_metric("squad")

preds, refs = [], []

for i in range(len(val_set)):
    ex = val_set[i]
    pred = qa_pipeline({"context": ex["context"], "question": ex["question"]})
    preds.append({"id": ex["id"], "prediction_text": pred["answer"]})
    refs.append({"id": ex["id"], "answers": ex["answers"]})

result = metric.compute(predictions=preds, references=refs)
print(result)

# Add EM and F1 into your results dict
results["Exact Match"] = round(result["exact_match"], 2)
results["F1 Score"] = round(result["f1"], 2)



{'exact_match': 77.07663197729423, 'f1': 85.11510430622388}


In [79]:
test_ex = {
    "context": val_set[0]["context"],
    "question": val_set[0]["question"]
}

qa_pipeline(test_ex)

runs = 20
start = time.time()
for _ in range(runs):
    qa_pipeline(test_ex)
end = time.time()

avg_time = (end - start) / runs
latency_ms = avg_time * 1000
throughput = 1 / avg_time

print(f"Average latency: {latency_ms:.2f} ms")
print(f"Throughput: {throughput:.2f} questions/sec")

# Store results for later presentation
results["latency"] = round(latency_ms, 2)
results["throughput"] = round(throughput, 2)

Average latency: 26.60 ms
Throughput: 37.59 questions/sec


In [80]:
import os

def get_model_size(model_dir=checkpoint):
    total_size = 0
    for dirpath, _, filenames in os.walk(model_dir):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return round(total_size / (1024*1024), 2)

model_size_mb = get_model_size(checkpoint)
print("Model size:", model_size_mb, 'MB')

results['Model_size'] = model_size_mb

Model size: 760.5 MB


In [81]:
import json

with open("M1_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("Saved metrics:", results)

Saved metrics: {'Exact Match': 77.08, 'F1 Score': 85.12, 'latency': 26.6, 'throughput': 37.59, 'Model_size': 760.5}


In [82]:
import torch
gpu_name = torch.cuda.get_device_name(0)
gpu_mem = round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 2)

results['GPU'] = f"GPU: {gpu_name} ({gpu_mem} GB VRAM)"

In [83]:
print(f'Finetuned DistilBERT Evaluation:')
print(f"Model Size: {results['Model_size']} MB")
print(f"F1 Score: {results['F1 Score']}")
print(f"Exact Match: {results['Exact Match']}")
print(f"Latency: {results['latency']} ms")
print(f"Throughput: {results['throughput']} Questions per Second")
print(f"GPU: {results['GPU']}")

Finetuned DistilBERT Evaluation:
Model Size: 760.5 MB
F1 Score: 85.12
Exact Match: 77.08
Latency: 26.6 ms
Throughput: 37.59 Questions per Second
GPU: GPU: NVIDIA GeForce GTX 1650 (4.0 GB VRAM)
