In [2]:
import torch
import math
import time
import json
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk


MODEL_NAME = "distilgpt2"
DEVICE = "cpu"
MAX_NEW_TOKENS = 80


N_LAYERS = 6
D_MODEL = 768

PROMPTS = [
    "What are the nutritional benefits of an apple?",
    "Explain why bananas are good for the body in simple terms.",
    "How does eating an orange help your health?",
    "Describe the nutrients found in strawberries.",
]

REFERENCES = [
    "Apples are rich in fibre and vitamin C which support digestion and immunity.",
    "Bananas provide potassium which helps muscle and nerve function.",
    "Oranges are high in vitamin C which supports the immune system.",
    "Strawberries contain vitamin C, antioxidants, and fibre."
]


nltk.download("punkt")
smooth = SmoothingFunction().method1

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()
tokenizer.pad_token = tokenizer.eos_token

def compute_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        out = model(**inputs, labels=inputs["input_ids"])
    return math.exp(out.loss.item())

def compute_bleu2(reference, generated):
    return sentence_bleu(
        [reference.split()],
        generated.split(),
        weights=(0.5, 0.5),
        smoothing_function=smooth
    )

def estimate_flops(total_tokens):
    return 2 * N_LAYERS * (D_MODEL ** 2) * total_tokens


results = []
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

print("\nEvaluating DistilGPT-2 (zero-shot)...\n")

for prompt, reference in zip(PROMPTS, REFERENCES):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    input_len = inputs["input_ids"].shape[1]

    start = time.time()
    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=0.8,
            top_k=50,
            top_p=0.95,
        )
    end = time.time()

    output_text = tokenizer.decode(generated[0], skip_special_tokens=True)
    total_len = generated.shape[1]
    gen_tokens = total_len - input_len

    ppl = compute_perplexity(output_text)
    bleu2 = compute_bleu2(reference, output_text)
    flops = estimate_flops(total_len)

    print(f"PROMPT: {prompt}")
    print(f"OUTPUT: {output_text}")
    print(f"PPL: {ppl:.2f} | BLEU-2: {bleu2:.3f}")
    print(f"Tokens: {gen_tokens} | FLOPs: {flops/1e9:.2f} GFLOPs")
    print("-" * 70)

    results.append({
        "prompt": prompt,
        "output": output_text,
        "perplexity": ppl,
        "bleu2": bleu2,
        "generated_tokens": gen_tokens,
        "flops": flops
    })


outfile = f"distilgpt2_eval_{timestamp}.json"
with open(outfile, "w") as f:
    json.dump(results, f, indent=2)

print(f"\nSaved evaluation results to {outfile}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Evaluating DistilGPT-2 (zero-shot)...



`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


PROMPT: What are the nutritional benefits of an apple?
OUTPUT: What are the nutritional benefits of an apple? We know itâ€™s a lot. But when it comes to fruits and vegetables, we think we're not going to be able to do that. It's a tough question. It's a tough question. But I'm not going to say we can do that at this point.





The new report, authored by researchers from the University of Illinois at Urbana
PPL: 8.31 | BLEU-2: 0.007
Tokens: 80 | FLOPs: 0.63 GFLOPs
----------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


PROMPT: Explain why bananas are good for the body in simple terms.
OUTPUT: Explain why bananas are good for the body in simple terms.
PPL: 126.89 | BLEU-2: 0.000
Tokens: 1 | FLOPs: 0.10 GFLOPs
----------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


PROMPT: How does eating an orange help your health?
OUTPUT: How does eating an orange help your health?
















































































PPL: 36.49 | BLEU-2: 0.000
Tokens: 80 | FLOPs: 0.63 GFLOPs
----------------------------------------------------------------------
PROMPT: Describe the nutrients found in strawberries.
OUTPUT: Describe the nutrients found in strawberries.
















































































PPL: 45.59 | BLEU-2: 0.000
Tokens: 80 | FLOPs: 0.62 GFLOPs
----------------------------------------------------------------------

Saved evaluation results to distilgpt2_eval_20251215_011201.json
