In [35]:
import os
import sys
from pathlib import Path

# Force correct project root
project_root = Path("C:/Users/tilak/Desktop/TEXT-SUMMARIZER").resolve()
os.chdir(project_root)
print("Current working directory set to:", os.getcwd())

# Add the `src` folder to Python path
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

print("SRC path added to sys.path:", src_path)


Current working directory set to: C:\Users\tilak\Desktop\Text-Summarizer
SRC path added to sys.path: C:\Users\tilak\Desktop\Text-Summarizer\src


In [36]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_path: Path
    model_path: Path
    tokenizer_path:Path
    metric_file_name: Path

In [37]:
from textsummarizer.constants import *
from textsummarizer.utils.common import read_yaml, create_directories

In [38]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self)-> ModelEvaluationConfig:
        config = self.config.model_evaluation
        create_directories([config.root_dir])
        model_evaluation_config = ModelEvaluationConfig(
            root_dir =config.root_dir,
            data_path = config.data_path,
            model_path=config.model_path,
            tokenizer_path = config.tokenizer_path,
            metric_file_name = config.metric_file_name
                    )
        return model_evaluation_config

In [39]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk

import torch
import pandas as pd
from tqdm import tqdm

In [40]:
class ModelEvaluation:
    def __init__(self,config:ModelEvaluationConfig):
        self.config = config

    def generate_batch_sized_chunks(self,list_of_elements, batch_size):
        """split the data into smaller batches that we can process  and Yield successive n-sized chunks from list_of_elements."""
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i:i + batch_size]

    def calculate_metrics(self,dataset,metric,model,tokenizer,batch_size=16,device='cuda' if torch.cuda.is_available() else 'cpu'):
        article_batches = list(self.generate_batch_sized_chunks(dataset['column_text'],batch_size))
        target_bathces = list(self.generate_batch_sized_chunks(dataset['column_summary'],batch_size))

        for article_batch,target_batch in tqdm(zip(article_batches,target_bathces),total =len(target_batches)):
            inputs=tokenizer(article_batch,max_length=1024, truncation=True, padding="max_length",return_tensors="pt").to(device),
            summaries = model.generate(inputs['input_ids'].to(device),
                                       attention_mask =inputs["attention_mask"].to(device),
                                       length_penalty=0.8,nu_beams=8,mmax_length=128)
            '''Parameter for length penalty ensures that the model does not gegnerate too long answers'''
            decoded_summaries = [tokenizer.decode(s,skip_special_tokens=True,clean_up_tokenization_spaces=True) for s in summaries]
            decoded_summaries = [d.replace("","") for d in decoded_summaries]
            metric.add_batch(predictions=decoded_summaries, references=target_batch)
        final_score = metric.compute()
        return final_score
    
    def evaluate(self):
        device='cuda' if torch.cuda.is_available() else "cpu"
        tokenizer =AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model_pegasus =AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)
        dataset = load_from_disk(self.config.data_path)
        rougue_names=["rouge1","rouge2","rougeL","rougeLsum"]
        metric = load_metric("rouge",rouge_names=rougue_names)
        score = self.calculate_metric_on_test_ds(dataset_samsum_pt['test'][:10],metric,model_pegasus,tokenizer,batch_size=2,column_text="dialogue",column_summary='summary')
        rouge_dict=dict((rn,score[rn].midfmeasure) for rn in rouge_names)
        df =pd.DataFrame(rouge_dict,index=['pegasus'])   
        df.to_csv(self.config.metric_file_name,index = False)

In [41]:
try:
    config = ConfigurationManager()
    model_eval_config = config.get_model_evaluation_config()
    model_eval = ModelEvaluation(config=model_eval_config)
    model_eval.evaluate()
except Exception as e:
    raise e

[2025-08-29 15:13:59,528]:INFO:common: yaml file: config\config.yaml loaded successfully]
[2025-08-29 15:13:59,531]:INFO:common: yaml file: params.yaml loaded successfully]
[2025-08-29 15:13:59,532]:INFO:common: created directory at: artifacts]
[2025-08-29 15:13:59,533]:INFO:common: created directory at: artifacts/model_evaluation]


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'artifacts/model_trainer/tokenizer'. Use `repo_type` argument if needed.