In [6]:
!pip install evaluate datasets transformers nltk absl-py rouge_score --quiet

  DEPRECATION: Building 'rouge_score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge_score'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import torch
import evaluate

model_path = "./distilgpt2-wekeza-finetuned_v5_cot_lora"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): lora.Linear(
            (base_layer): Conv1D(nf=768, nx=768)
            (l

In [8]:
data = [
    {
        "question": "What is a money market fund in Kenya?",
        "reference": "A money market fund in Kenya is a low-risk collective investment scheme that invests in short-term debt instruments and offers high liquidity."
    },
    {
        "question": "Explain fixed deposits in Kenyan banks.",
        "reference": "Fixed deposits are bank accounts where money is locked for a set period in exchange for a higher interest rate than regular savings accounts."
    }
]
dataset = Dataset.from_list(data)

In [9]:
#Load ROUGE Metric
rouge = evaluate.load("rouge")

In [10]:
#predictions
predictions = []
references = []

for example in dataset:
    prompt = example["question"]
    inputs = tokenizer(prompt, return_tensors="pt")  # stays on CPU

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=60)  # CPU generation
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predictions.append(generated_text)
    references.append(example["reference"])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [13]:
#Computing ROUGE-L Score
results = rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True,
    use_aggregator=False
)
print(" ROUGE-L (F1):", results["rougeL"])


 ROUGE-L (F1): [0.375, 0.2]


In [16]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
scores = scorer.score(references[0], predictions[0])

print(f"ROUGE-L Recall: {scores['rougeL'].recall:.4f}")
print(f"ROUGE-L Precision: {scores['rougeL'].precision:.4f}")
print(f"ROUGE-L F1: {scores['rougeL'].fmeasure:.4f}")


ROUGE-L Recall: 0.2500
ROUGE-L Precision: 0.7500
ROUGE-L F1: 0.3750
