## Load the model & its tokenizer

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model = AutoModelForCausalLM.from_pretrained("distill-gpt2")
tokenizer = AutoTokenizer.from_pretrained("distill-gpt2")

# Move to M1/M2 GPU if available, else fallback to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


## The testing example (In NL)

In [2]:
test_examples = [
    {"question": "What is the capital of France?", "answer": "Paris"},
    {"question": "Who wrote Hamlet?", "answer": "William Shakespeare"},
    {"question": "What gas do humans breathe in?", "answer": "Oxygen"},
]

## Generating answer

Loop over examples:
1. Tokenize the input text to embedding
  
2. Generate the output text, under no_grad()
3. Decode the output and return the natrual text

In [10]:
# Define generation function
def generate_answer(prompt: str, max_new_tokens: int = 3) -> str:
    model.eval()
    # Encode and move all tensors to MPS
    encoded = tokenizer(prompt, return_tensors="pt")
    # encoded = {k: v.to(device) for k, v in encoded.items()}  # ✅ Ensure all inputs on MPS

    # 3. Inference
    with torch.no_grad():
        output_ids = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.3,
            eos_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [11]:
generated = []
for item in test_examples:
    prompt = item["question"]
    prediction = generate_answer(prompt)
    print(f"Q: {prompt}")
    print(f"Predicted: {prediction}")
    print(f"Expected : {item['answer']}")
    print("-" * 40)
    generated.append((prediction, item["answer"]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the capital of France?
Predicted: What is the capital of France?
In 18
Expected : Paris
----------------------------------------
Q: Who wrote Hamlet?
Predicted: Who wrote Hamlet?

Question
Expected : William Shakespeare
----------------------------------------
Q: What gas do humans breathe in?
Predicted: What gas do humans breathe in?

Question
Expected : Oxygen
----------------------------------------


In [12]:
from sklearn.metrics import accuracy_score
import evaluate
import numpy as np

bleu = evaluate.load("bleu")

def normalize(text):
    return text.strip().lower()

# Exact Match
exact_match = [
    int(normalize(pred) == normalize(label)) for pred, label in generated
]
em_score = np.mean(exact_match)

# BLEU Score
pred_texts = [normalize(pred) for pred, _ in generated]
label_texts = [[normalize(label)] for _, label in generated]

bleu_result = bleu.compute(predictions=pred_texts, references=label_texts)

print("📊 Evaluation Metrics:")
print(f"✅ Exact Match Accuracy: {em_score:.3f}")
print(f"🟦 BLEU Score: {bleu_result['bleu']:.3f}")

📊 Evaluation Metrics:
✅ Exact Match Accuracy: 0.000
🟦 BLEU Score: 0.000


In [13]:
label_texts

[['paris'], ['william shakespeare'], ['oxygen']]

In [14]:
pred_texts

['what is the capital of france?\nin 18',
 'who wrote hamlet?\n\nquestion',
 'what gas do humans breathe in?\n\nquestion']