In [1]:
import pwd


pwd

<module 'pwd' (built-in)>

In [2]:
import os
os.chdir('../')

In [3]:
pwd

<module 'pwd' (built-in)>

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvalutionConfig:
    root_dir: Path
    data_path: Path
    model_path: Path
    tokenizer_path: Path
    metric_file_name: Path

In [5]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        
        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvalutionConfig:
        config = self.config.model_evaluation

        model_evalution_config = ModelEvalutionConfig(
            root_dir = config.root_dir,
            data_path = config.data_path,
            model_path = config.model_path,
            tokenizer_path = config.tokenizer_path,
            metric_file_name = config.metric_file_name
        )

        return model_evalution_config

In [7]:
import torch
import evaluate
import pandas as pd
from datasets import load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm

In [8]:
class ModelEvaluation:
    def __init__(self, config: ModelEvalutionConfig):
        self.config = config

    @staticmethod
    def generate_batch_sized_chunks(list_of_elements, batch_size):
        """Split dataset into batches."""
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i : i + batch_size]

    @staticmethod
    def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                                    batch_size=16, device='cpu',
                                    column_text="dialogue",
                                    column_summary="summary"):

        dialogues = dataset[column_text]
        summaries = dataset[column_summary]

        dialogue_batches = list(ModelEvaluation.generate_batch_sized_chunks(dialogues, batch_size))
        summary_batches = list(ModelEvaluation.generate_batch_sized_chunks(summaries, batch_size))

        for dialogue_batch, summary_batch in tqdm(
                zip(dialogue_batches, summary_batches), total=len(dialogue_batches)):
            
            inputs = tokenizer(dialogue_batch, max_length=512, truncation=True,
                               padding="max_length", return_tensors="pt")
            
            generated_summaries = model.generate(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                length_penalty=0.8, num_beams=4, max_length=128
            )

            decoded_preds = [
                tokenizer.decode(s, skip_special_tokens=True) 
                for s in generated_summaries
            ]

            metric.add_batch(predictions=decoded_preds, references=summary_batch)

        score = metric.compute()
        return score

    def evaluate(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)

        dataset = load_from_disk(self.config.data_path)

        # NEW evaluate library
        rouge_metric = evaluate.load("rouge")

        score = self.calculate_metric_on_test_ds(
            dataset["test"][0:10],
            rouge_metric,
            model,
            tokenizer,
            batch_size=4,
            device=device
        )

        # Convert scores
        rouge_dict = {
            "rouge1": score["rouge1"],
            "rouge2": score["rouge2"],
            "rougeL": score["rougeL"],
            "rougeLsum": score["rougeLsum"]
            }


        # Ensure output directory exists
        os.makedirs(os.path.dirname(self.config.metric_file_name), exist_ok=True)

        df = pd.DataFrame([rouge_dict])
        df.to_csv(self.config.metric_file_name, index=False)

        print("\nROUGE scores saved to:", self.config.metric_file_name)



In [9]:
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    model_evaluation = ModelEvaluation(config=model_evaluation_config)
    model_evaluation.evaluate()

    
except Exception as e:
    raise e

[2025-11-20 14:37:14,002]: INFO: common: YAML file: config/config.yaml loaded successfully
[2025-11-20 14:37:14,005]: INFO: common: YAML file: params.yaml loaded successfully
[2025-11-20 14:37:14,006]: INFO: common: Directory created at: artifacts


100%|██████████| 3/3 [00:19<00:00,  6.55s/it]

[2025-11-20 14:37:37,126]: INFO: rouge_scorer: Using default tokenizer.

ROUGE scores saved to: artifacts/model_evaluation/metrics.csv



