In [1]:
import os

# The paths used further down (e.g. config/config.yaml) are relative, so the
# working directory has to be the folder that contains config/. Step one level
# up only while that file is not visible from the current directory; this
# keeps the cell idempotent, so re-running it does not climb a further level.
print(f'Current directory: {os.getcwd()}')
if not os.path.exists("config/config.yaml"):
    os.chdir('../')
print(f'Changed to directory: {os.getcwd()}')
print(f'Config file exists: {os.path.exists("config/config.yaml")}')

Current directory: c:\Users\tumom\OneDrive\Desktop\END TO END -NLP\research
Changed to directory: c:\Users\tumom\OneDrive\Desktop\END TO END -NLP
Config file exists: True


In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable set of filesystem locations consumed by the model evaluation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Directory the metric files are written into.
    root_dir: Path
    # Location of the saved dataset that is loaded for evaluation.
    data_path: Path
    # Model checkpoint that is tried first when loading the model.
    model_path: Path
    # Tokenizer checkpoint that is tried first when loading the tokenizer.
    tokenizer_path: Path
    # Name of the CSV metrics file; it is joined onto root_dir when saving.
    metric_file_name: Path

In [3]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: File passed to ``read_yaml`` to obtain ``self.config``.
                Defaults to ``CONFIG_FILE_PATH``.
            params_filepath: File passed to ``read_yaml`` to obtain ``self.params``.
                Defaults to ``PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        # Not read by get_model_evaluation_config; kept as a public attribute.
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the evaluation-stage config from the ``model_evaluation`` section.

        The section's ``root_dir`` is created as a side effect.

        Returns:
            ModelEvaluationConfig: every field converted to ``pathlib.Path`` so
            the values match the dataclass's declared field types.
        """
        section = self.config.model_evaluation

        create_directories([section.root_dir])

        # The config loader hands back raw values; wrap them so callers get
        # the Path objects that ModelEvaluationConfig annotates.
        return ModelEvaluationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            model_path=Path(section.model_path),
            tokenizer_path=Path(section.tokenizer_path),
            metric_file_name=Path(section.metric_file_name),
        )

In [6]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_from_disk, load_dataset
import torch
import pandas as pd
from tqdm import tqdm
import evaluate
import json
import os

In [7]:
class ModelEvaluation:
    """Scores a summarization checkpoint with ROUGE on the dataset's "test" split."""

    # Checkpoint loaded instead when the configured tokenizer/model path fails.
    FALLBACK_CHECKPOINT = "google/pegasus-cnn_dailymail"

    def __init__(self, config: "ModelEvaluationConfig"):
        """Store the evaluation configuration; nothing is loaded until ``evaluate``."""
        self.config = config

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """Yield consecutive slices of ``list_of_elements``.

        This is a generator: each yielded slice holds at most ``batch_size``
        items, and only the last one may be shorter.

        Args:
            list_of_elements: Any sequence supporting ``len()`` and slicing.
            batch_size: Step between slice starts; must be non-zero.
        """
        for start in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[start : start + batch_size]

    def calculate_metric(self, dataset, metric, model, tokenizer, batch_size=16):
        """Generate a summary for every dialogue and score it against the reference.

        Args:
            dataset: Object whose ``"dialogue"`` and ``"summary"`` entries are
                sliceable sequences of equal length.
            metric: Object exposing ``add_batch(predictions=..., references=...)``
                and ``compute()``.
            model: Model exposing ``generate(...)`` and a ``device`` attribute.
            tokenizer: Callable producing ``input_ids`` / ``attention_mask``
                tensors, with a ``decode`` method.
            batch_size: Number of dialogues per generation call (default 16).

        Returns:
            Whatever ``metric.compute()`` returns after all batches were added.
        """
        dialogue_batches = list(
            self.generate_batch_sized_chunks(dataset["dialogue"], batch_size=batch_size)
        )
        summary_batches = list(
            self.generate_batch_sized_chunks(dataset["summary"], batch_size=batch_size)
        )

        # The tokenizer creates tensors on the CPU; generate() needs them on
        # the same device as the model's weights.
        device = model.device

        for dialogue_batch, summary_batch in tqdm(
            zip(dialogue_batches, summary_batches), total=len(dialogue_batches)
        ):
            inputs = tokenizer(
                dialogue_batch,
                max_length=1024,
                truncation=True,
                padding="max_length",
                return_tensors="pt",
            ).to(device)
            summaries = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=128,
                num_beams=8,
                length_penalty=2.0,
                early_stopping=True,
            )
            decoded_summaries = [
                tokenizer.decode(summary, skip_special_tokens=True) for summary in summaries
            ]
            metric.add_batch(predictions=decoded_summaries, references=summary_batch)

        return metric.compute()

    def _load_with_fallback(self, loader, path, label):
        """Return ``loader(path)``, or ``loader(FALLBACK_CHECKPOINT)`` if that raises.

        Only ``Exception`` subclasses trigger the fallback, so KeyboardInterrupt
        and SystemExit still stop the run. ``label`` names the artifact in the
        printed notice.
        """
        try:
            return loader(path)
        except Exception:
            print(f"Could not load {label} from {path}, using original model")
            return loader(self.FALLBACK_CHECKPOINT)

    def evaluate(self):
        """Run the evaluation and persist the ROUGE scores.

        Loads the tokenizer and model (falling back to ``FALLBACK_CHECKPOINT``
        when the configured path cannot be loaded), scores the ``"test"`` split
        of the dataset at ``config.data_path``, then writes the scores to
        ``config.metric_file_name`` (CSV) and ``rouge_scores.json`` inside
        ``config.root_dir``.

        Returns:
            dict: the ``rouge1``, ``rouge2``, ``rougeL`` and ``rougeLsum`` scores.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = self._load_with_fallback(
            AutoTokenizer.from_pretrained, self.config.tokenizer_path, "tokenizer"
        )
        model_pegasus = self._load_with_fallback(
            AutoModelForSeq2SeqLM.from_pretrained, self.config.model_path, "model"
        ).to(device)

        # Only the test split is scored.
        dataset_samsum_pt = load_from_disk(self.config.data_path)
        test_dataset = dataset_samsum_pt["test"]

        rouge_metric = evaluate.load("rouge")
        score = self.calculate_metric(test_dataset, rouge_metric, model_pegasus, tokenizer)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
        rouge_dict = {name: score[name] for name in rouge_names}

        # Single-row CSV: one column per ROUGE variant.
        metric_file_path = os.path.join(self.config.root_dir, self.config.metric_file_name)
        pd.DataFrame(rouge_dict, index=[0]).to_csv(metric_file_path, index=False)

        # Same numbers as JSON for easier reading.
        json_file_path = os.path.join(self.config.root_dir, "rouge_scores.json")
        with open(json_file_path, 'w', encoding="utf-8") as f:
            json.dump(rouge_dict, f, indent=2)

        print(f"Evaluation completed. Results saved to {metric_file_path} and {json_file_path}")
        print("ROUGE Scores:")
        for key, value in rouge_dict.items():
            print(f"{key}: {value:.4f}")

        return rouge_dict

In [8]:
# Run the evaluation stage end to end. There is deliberately no try/except
# wrapper: an exception raised by any step propagates unchanged, with its
# original traceback.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
model_evaluation = ModelEvaluation(config=model_evaluation_config)
rouge_scores = model_evaluation.evaluate()

ValueError: Unrecognized model in artifacts/model_trainer. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: aimv2, aimv2_vision_model, albert, align, altclip, arcee, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, bitnet, blenderbot, blenderbot-small, blip, blip-2, blip_2_qformer, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, cohere2_vision, colpali, colqwen2, conditional_detr, convbert, convnext, convnextv2, cpmant, csm, ctrl, cvt, d_fine, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v2, deepseek_v3, deepseek_vl, deepseek_vl_hybrid, deformable_detr, deit, depth_anything, depth_pro, deta, detr, dia, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, doge, donut-swin, dots1, dpr, dpt, efficientformer, efficientloftr, efficientnet, electra, emu3, encodec, encoder-decoder, eomt, ernie, ernie4_5, ernie4_5_moe, ernie_m, esm, evolla, exaone4, falcon, falcon_h1, falcon_mamba, fastspeech2_conformer, fastspeech2_conformer_with_hifigan, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, gemma3n, gemma3n_audio, gemma3n_text, gemma3n_vision, git, glm, glm4, glm4_moe, glm4v, glm4v_text, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gpt_oss, gptj, gptsan-japanese, granite, granite_speech, granitemoe, granitemoehybrid, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hgnet_v2, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, internvl, internvl_vision, jamba, janus, jetmoe, jukebox, kosmos-2, 
kyutai_speech_to_text, layoutlm, layoutlmv2, layoutlmv3, led, levit, lfm2, lightglue, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, minimax, mistral, mistral3, mixtral, mlcd, mllama, mm-grounding-dino, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, modernbert-decoder, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, perception_encoder, perception_lm, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_omni, qwen2_5_vl, qwen2_5_vl_text, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen2_vl_text, qwen3, qwen3_moe, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam_hq, sam_hq_vision_model, sam_vision_model, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip_vision_model, smollm3, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, t5gemma, table-transformer, tapas, textnet, time_series_transformer, timesfm, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, 
vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, vjepa2, voxtral, voxtral_encoder, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xlstm, xmod, yolos, yoso, zamba, zamba2, zoedepth