In [1]:
import os

In [2]:
# Make the parent directory the working directory.
# Not idempotent: every re-run of this cell moves up one more level.
os.chdir("../")

In [3]:
%pwd

'/Users/sanahaidar/Desktop/NLP/text-summarizer'

In [36]:
from dataclasses import dataclass
from pathlib import Path



@dataclass
class ModelEvaluationConfig:
    """Settings consumed by the model-evaluation stage.

    ``ConfigurationManager.get_model_evaluation_config`` builds an instance
    from the ``model_evaluation`` section of the YAML configuration.
    """
    # Directory for this stage; created when the config object is built.
    root_dir: Path
    # Dataset location handed to ``load_from_disk``; its "test" split is scored.
    data_path: Path
    # Location handed to ``AutoModelForSeq2SeqLM.from_pretrained``.
    model_path: Path
    # Location handed to ``AutoTokenizer.from_pretrained``.
    tokenizer_path: Path
    # CSV file the ROUGE scores are written to.
    metrics_file_name: Path

   

In [5]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories 

In [42]:
class ConfigurationManager:
    """Read the project's YAML files and build per-stage config objects.

    Construction reads both YAML files with ``read_yaml`` and passes the
    ``artifacts_root`` entry of the main config to ``create_directories``.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load the configuration and parameter files.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([Path(self.config.artifacts_root)])

    # Model Evaluation Configuration
    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the evaluation-stage config from the ``model_evaluation`` section.

        The stage's ``root_dir`` is passed to ``create_directories`` before
        the config object is returned.

        Returns:
            A ``ModelEvaluationConfig`` whose fields are ``Path`` objects.
        """
        config = self.config.model_evaluation

        create_directories([Path(config.root_dir)])

        # ModelEvaluationConfig annotates every field as Path, so convert the
        # raw YAML values instead of passing them through untouched.
        # NOTE(review): assumes model_path and tokenizer_path are filesystem
        # locations, as the field names suggest — confirm they are never
        # Hub repository IDs.
        return ModelEvaluationConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            model_path=Path(config.model_path),
            tokenizer_path=Path(config.tokenizer_path),
            metrics_file_name=Path(config.metrics_file_name),
        )

In [31]:
import torch
import evaluate
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk

 


In [None]:
class ModelEvaluation:
    """Score a seq2seq summarization model with ROUGE and save the result.

    The tokenizer, model and dataset are loaded from the locations held by
    ``config``; the scores for the dataset's ``"test"`` split are written as
    CSV to ``config.metrics_file_name``.
    """

    def __init__(self, config):
        """Keep the config and select CUDA when available, otherwise CPU."""
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """Yield consecutive slices of at most ``batch_size`` elements.

        Args:
            list_of_elements: Sliceable sequence that supports ``len()``.
            batch_size: Maximum number of elements per yielded slice.

        Raises:
            ValueError: If ``batch_size`` is not positive. Because this is a
                generator, the error surfaces when iteration starts.
        """
        # A negative step would make range() empty and silently yield nothing,
        # so the metric would be computed over no data at all.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        for start in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[start : start + batch_size]


    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                    batch_size=16,
                                    column_text="article",
                                    column_summary="highlights"):
        """Generate a summary for every text and score them against the references.

        Args:
            dataset: Object indexable by column name; the two selected columns
                must be sliceable and support ``len()``.
            metric: Object providing ``add_batch(predictions=, references=)``
                and ``compute()``.
            model: Object providing ``generate(...)``.
            tokenizer: Callable that encodes a batch of texts and provides
                ``decode(...)``.
            batch_size: Number of texts processed per generation call.
            column_text: Name of the column holding the input texts.
            column_summary: Name of the column holding the reference summaries.

        Returns:
            The value returned by ``metric.compute()`` once every batch has
            been added.
        """
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)
        ):
            # Pad only up to the longest text in the batch. Padding every
            # batch out to the 1024-token cap adds work on positions the
            # attention mask hides anyway.
            inputs = tokenizer(
                article_batch, max_length=1024, truncation=True,
                padding="longest", return_tensors="pt"
            )

            summaries = model.generate(
                input_ids=inputs["input_ids"].to(self.device),
                attention_mask=inputs["attention_mask"].to(self.device),
                length_penalty=0.8,
                num_beams=8,
                max_length=128
            )

            decoded_summaries = [
                tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for s in summaries
            ]

            metric.add_batch(predictions=decoded_summaries, references=target_batch)

        return metric.compute()


    def evaluate(self):
        """Load model, tokenizer and dataset, compute ROUGE and write the CSV."""
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(self.device)

        dataset = load_from_disk(self.config.data_path)

        rouge_metric = evaluate.load('rouge')
        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        score = self.calculate_metric_on_test_ds(
            dataset["test"],
            rouge_metric,
            model,
            tokenizer,
            batch_size=2,
            column_text="dialogue",
            column_summary="summary"
        )

        # Each score is indexed directly as a number and reported as a
        # percentage rounded to two decimals.
        rouge_dict = {rn: round(score[rn]*100 , 2) for rn in rouge_names}
        df = pd.DataFrame(rouge_dict, index=["pegasus"])
        # index=False leaves the "pegasus" row label out of the file; only the
        # four score columns are written.
        df.to_csv(self.config.metrics_file_name, index=False)



In [44]:
# Run the evaluation stage end to end: load the configuration, build the
# evaluator and compute the metrics.
# NOTE(review): `logger` is not explicitly imported in any visible cell; it
# presumably arrives through the star import or leftover kernel state —
# confirm it is defined after Restart & Run All.
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    model_evaluation = ModelEvaluation(model_evaluation_config)
    model_evaluation.evaluate()
except Exception as e:
    logger.exception(e)
    # Bare raise re-raises the active exception without adding a traceback
    # entry for this line.
    raise

[2025-07-31 10:03:16,499: INFO: common: YAML file config/config.yaml read successfully.]
[2025-07-31 10:03:16,501: INFO: common: YAML file params.yaml read successfully.]
[2025-07-31 10:03:16,502: INFO: common: Created directory: artifacts]
[2025-07-31 10:03:16,503: INFO: common: Created directory: artifacts/model_evaluation]


Downloading builder script: 6.27kB [00:00, 3.61MB/s]
100%|██████████| 410/410 [2:52:59<00:00, 25.32s/it]    

[2025-07-31 12:56:20,575: INFO: rouge_scorer: Using default tokenizer.]





[2025-07-31 12:56:20,832: ERROR: 3834945561: 'numpy.float64' object has no attribute 'mid']
Traceback (most recent call last):
  File "/var/folders/ym/9nvn6tw55w3b0k7h583z055c0000gn/T/ipykernel_42630/3834945561.py", line 5, in <module>
    model_evaluation.evaluate()
  File "/var/folders/ym/9nvn6tw55w3b0k7h583z055c0000gn/T/ipykernel_42630/695234763.py", line 64, in evaluate
    rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
                      ^^^^^^^^^^^^^
AttributeError: 'numpy.float64' object has no attribute 'mid'. Did you mean: 'min'?


AttributeError: 'numpy.float64' object has no attribute 'mid'