In [1]:
import os
%pwd
os.chdir("../")

In [2]:
%pwd

'/workspaces/End-End-Text-summeriser'

In [None]:
# update the config/config.yaml file for model evaluation
# evaluate the model and save the metrics in the csv file.

In [None]:
# 3. define the entity

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path # root directory of the project
    data_path: Path # path to the data
    model_path: Path # path to the model
    tokenizer_path: Path # path to the tokenizer
    metric_file_name: Path # path to the metric file

In [None]:
# 4. configuration manager in src config.

In [5]:
from textSummerizer.constants import *
from textSummerizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation # get the model evaluation config from the config.yaml file

        create_directories([config.root_dir]) # create the directories

        model_evaluation_config = ModelEvaluationConfig(
            root_dir=config.root_dir, # set the root directory
            data_path=config.data_path, # set the data path
            model_path = config.model_path, # set the model path
            tokenizer_path = config.tokenizer_path, # set the tokenizer path
            metric_file_name = config.metric_file_name # set the metric file name
           
        )

        return model_evaluation_config # return the model evaluation config

In [None]:
#5. components

In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import torch
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-19 23:18:22,749: INFO: config: PyTorch version 2.2.1 available.]


In [8]:
# define the model evaluation class
class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig): # initialize the model evaluation class
        self.config = config


    # define the method to generate batch sized chunks
    def generate_batch_sized_chunks(self,list_of_elements, batch_size):
        """split the dataset into smaller batches that we can process simultaneously
        Yield successive batch-sized chunks from list_of_elements."""
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i : i + batch_size] # yield the batch sized chunks
        print("Batch sized chunks generated")

    # define the method to calculate the metric on the test dataset
    def calculate_metric_on_test_ds(self,dataset, metric, model, tokenizer, 
                               batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu", 
                               column_text="article", 
                               column_summary="highlights"):
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size)) # generate the article batches 
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size)) # generate the target batches
        
        # iterate through the article and target batches
        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):
            # tokenize the articles
            inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                            padding="max_length", return_tensors="pt")
            # generate the summaries
            
            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                            attention_mask=inputs["attention_mask"].to(device), 
                            length_penalty=0.8, num_beams=8, max_length=128)
            ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
            
            # Finally, we decode the generated texts, 
            # replace the  token, and add the decoded texts with the references to the metric.
            # decode the summaries
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                    clean_up_tokenization_spaces=True) 
                for s in summaries]      
            # replace the empty strings with space
            decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
            
            # add the decoded summaries to the metric
            metric.add_batch(predictions=decoded_summaries, references=target_batch)
            
        #  Finally compute and return the ROUGE scores.
        score = metric.compute()
        print("calculated score:",score)
        return score


    def evaluate(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using {device}")
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        print(f"Tokenizer loaded from {self.config.tokenizer_path}")
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)
        print(f"Model loaded from {self.config.model_path}")
        #loading data 
        dataset_samsum_pt = load_from_disk(self.config.data_path)
        print(f"Dataset loaded from {self.config.data_path}")

        # define the rouge names
        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
        # load the rouge metric
        rouge_metric = load_metric('rouge')

        print("Calculating ROUGE on test dataset")

        score = self.calculate_metric_on_test_ds(
        dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'
            )
        # create the dataframe
        rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names ) # create the rouge dictionary

        df = pd.DataFrame(rouge_dict, index = ['pegasus'] )
        print(df)
        df.to_csv(self.config.metric_file_name, index=False)


In [None]:
# 6. pipeline

In [9]:
try:
    config = ConfigurationManager() # create the configuration manager
    model_evaluation_config = config.get_model_evaluation_config() # get the model evaluation config
    model_evaluation_config = ModelEvaluation(config=model_evaluation_config) # create the model evaluation class
    model_evaluation_config.evaluate() # evaluate the model
except Exception as e:
    raise e

[2024-03-19 23:29:33,777: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-03-19 23:29:33,779: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-19 23:29:33,780: INFO: common: created directory at: artifacts]
[2024-03-19 23:29:33,780: INFO: common: created directory at: artifacts/model_evaluation]
Using cpu
Tokenizer loaded from artifacts/model_trainer/tokenizer
Model loaded from artifacts/model_trainer/pegasus-samsum-model
Dataset loaded from artifacts/data_transformation/samsum_dataset


  rouge_metric = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Downloading builder script: 5.65kB [00:00, 12.0MB/s]                   


Calculating ROUGE on test dataset
Batch sized chunks generated
Batch sized chunks generated


100%|██████████| 5/5 [01:28<00:00, 17.75s/it]

[2024-03-19 23:31:09,036: INFO: rouge_scorer: Using default tokenizer.]
calculated score: {'rouge1': AggregateScore(low=Score(precision=0.0057779353088529325, recall=0.02823763624176721, fmeasure=0.009444431655557705), mid=Score(precision=0.011846186367888152, recall=0.060890209183745256, fmeasure=0.019480017419446032), high=Score(precision=0.017861862880648356, recall=0.0931487725930705, fmeasure=0.02963527480517014)), 'rouge2': AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.0, recall=0.0, fmeasure=0.0), high=Score(precision=0.0, recall=0.0, fmeasure=0.0)), 'rougeL': AggregateScore(low=Score(precision=0.0060564010409401604, recall=0.03151548679236182, fmeasure=0.010105231135216617), mid=Score(precision=0.012026005873365655, recall=0.06190476190476191, fmeasure=0.01977648805331053), high=Score(precision=0.018273476150688042, recall=0.09338969918782522, fmeasure=0.03009106532586721)), 'rougeLsum': AggregateScore(low=Score(precision=0.00633366200


