In [1]:
import os

In [2]:
%pwd

'd:\\MLOPS\\MLOPS-Text-Summarizer-Project-NLP\\research'

In [3]:
# Move up to the project root only while the kernel is still inside the
# research/ folder, so re-running this cell does not climb another level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\MLOPS\\MLOPS-Text-Summarizer-Project-NLP'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    Instances are frozen, so attributes cannot be reassigned after creation.
    The dataclass does not validate or convert the values it is given.
    """

    # Directory created for this stage; used as the training output directory
    # and as the parent of the saved model and tokenizer folders.
    root_dir: Path
    # Location handed to ``load_from_disk`` to read the training dataset.
    data_path: Path
    # Checkpoint identifier handed to ``from_pretrained``; the recorded run
    # uses a model name ("google/pegasus-cnn_dailymail"), not a filesystem path.
    model_ckpt: str
    # Values copied from the ``TrainingArguments`` section of the params file.
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Read the project YAML files and build stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: YAML file loaded into ``self.config``.
            params_filepath: YAML file loaded into ``self.params``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-trainer configuration.

        Reads the ``model_trainer`` section of the config file and the
        ``TrainingArguments`` section of the params file, and creates the
        stage's ``root_dir`` before returning.

        Returns:
            ModelTrainerConfig: Values copied from the two YAML sections.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            # Take the step count itself; this previously copied the
            # evaluation_strategy string into eval_steps.
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
        )

        return model_trainer_config

In [8]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2026-01-24 11:40:49,466: INFO: config: PyTorch version 2.4.1+cu118 available.]


In [9]:
class ModelTrainer:
    """Fine-tune the configured seq2seq checkpoint and save the result."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the configuration that ``train`` reads its paths from.

        Args:
            config: Supplies ``model_ckpt``, ``data_path`` and ``root_dir``.
        """
        self.config = config

    def train(self):
        """Run a short fine-tuning pass and save the model and tokenizer.

        Loads the tokenizer and model from ``config.model_ckpt``, reads the
        dataset from ``config.data_path``, trains on its ``"train"`` split,
        then writes the model to ``<root_dir>/pegasus-samsum-model`` and the
        tokenizer to ``<root_dir>/tokenizer``.
        """
        use_cuda = torch.cuda.is_available()
        device = "cuda" if use_cuda else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        # Load the dataset stored on disk.
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        # NOTE: the hyperparameters carried by self.config (num_train_epochs,
        # warmup_steps, weight_decay, ...) are not applied here; this run uses
        # the fixed values below.
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            max_steps=500,  # hard limit on training steps (reduces 1841 -> 500)
            per_device_train_batch_size=1,
            gradient_accumulation_steps=8,
            # Mixed precision is only requested when a CUDA device is present,
            # so the CPU fallback chosen above stays usable.
            fp16=use_cuda,
            logging_steps=20,
            save_strategy="no",
            evaluation_strategy="no",
        )

        # The validation split is passed in, but evaluation_strategy="no"
        # above means it is not evaluated during this run.
        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_samsum_pt["train"],
            eval_dataset=dataset_samsum_pt["validation"],
        )

        trainer.train()

        # Save model
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
        # Save tokenizer
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))

In [10]:
# Build the trainer configuration from the YAML files, then run fine-tuning.
# Exceptions are left to propagate so the notebook shows the full traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2026-01-24 11:41:11,262: INFO: common: yaml file: config\config.yaml loaded successfully]
[2026-01-24 11:41:11,268: INFO: common: yaml file: params.yaml loaded successfully]
[2026-01-24 11:41:11,268: INFO: common: created directory at: artifacts]
[2026-01-24 11:41:11,269: INFO: common: created directory at: artifacts/model_trainer]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model_pegasus, args=trainer_args,
max_steps is given, it will override any value given in num_train_epochs
  4%|‚ñç         | 20/500 [03:52<1:44:50, 13.11s/it]

{'loss': 2.6714, 'grad_norm': 324.7447814941406, 'learning_rate': 4.85e-05, 'epoch': 0.01}


  8%|‚ñä         | 40/500 [08:36<1:44:10, 13.59s/it]

{'loss': 2.2863, 'grad_norm': 236.42335510253906, 'learning_rate': 4.6700000000000003e-05, 'epoch': 0.02}


 12%|‚ñà‚ñè        | 60/500 [13:05<1:40:52, 13.76s/it]

{'loss': 1.9839, 'grad_norm': 54.687400817871094, 'learning_rate': 4.4800000000000005e-05, 'epoch': 0.03}


 16%|‚ñà‚ñå        | 80/500 [17:43<1:35:15, 13.61s/it]

{'loss': 1.9742, 'grad_norm': 52.704654693603516, 'learning_rate': 4.29e-05, 'epoch': 0.04}


 20%|‚ñà‚ñà        | 100/500 [22:36<1:59:05, 17.86s/it]

{'loss': 1.8206, 'grad_norm': 45.791194915771484, 'learning_rate': 4.09e-05, 'epoch': 0.05}


 24%|‚ñà‚ñà‚ñç       | 120/500 [27:24<1:31:51, 14.50s/it]

{'loss': 1.8038, 'grad_norm': 50.49692916870117, 'learning_rate': 3.8900000000000004e-05, 'epoch': 0.07}


 28%|‚ñà‚ñà‚ñä       | 140/500 [32:01<1:24:16, 14.05s/it]

{'loss': 1.8993, 'grad_norm': 45.7330207824707, 'learning_rate': 3.69e-05, 'epoch': 0.08}


 32%|‚ñà‚ñà‚ñà‚ñè      | 160/500 [37:02<1:17:43, 13.72s/it]

{'loss': 1.7838, 'grad_norm': 43.527854919433594, 'learning_rate': 3.49e-05, 'epoch': 0.09}


 36%|‚ñà‚ñà‚ñà‚ñå      | 180/500 [41:35<1:13:02, 13.69s/it]

{'loss': 1.8025, 'grad_norm': 46.78560256958008, 'learning_rate': 3.29e-05, 'epoch': 0.1}


 40%|‚ñà‚ñà‚ñà‚ñà      | 200/500 [46:12<1:12:23, 14.48s/it]

{'loss': 1.7953, 'grad_norm': 44.03913116455078, 'learning_rate': 3.09e-05, 'epoch': 0.11}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 220/500 [51:01<1:04:56, 13.92s/it]

{'loss': 1.7437, 'grad_norm': 43.5333137512207, 'learning_rate': 2.8899999999999998e-05, 'epoch': 0.12}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 240/500 [55:35<59:07, 13.64s/it]  

{'loss': 1.7009, 'grad_norm': 172.40377807617188, 'learning_rate': 2.6900000000000003e-05, 'epoch': 0.13}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 260/500 [1:00:23<56:59, 14.25s/it]

{'loss': 1.7366, 'grad_norm': 42.06732177734375, 'learning_rate': 2.4900000000000002e-05, 'epoch': 0.14}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 280/500 [1:04:59<50:15, 13.71s/it]

{'loss': 1.7386, 'grad_norm': 68.60020446777344, 'learning_rate': 2.29e-05, 'epoch': 0.15}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 300/500 [1:09:33<45:21, 13.61s/it]

{'loss': 1.7878, 'grad_norm': 52.08694839477539, 'learning_rate': 2.09e-05, 'epoch': 0.16}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 320/500 [1:16:05<44:21, 14.78s/it]  

{'loss': 1.6792, 'grad_norm': 38.5429573059082, 'learning_rate': 1.8900000000000002e-05, 'epoch': 0.17}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 340/500 [1:20:47<37:54, 14.21s/it]

{'loss': 1.7172, 'grad_norm': 38.56883239746094, 'learning_rate': 1.69e-05, 'epoch': 0.18}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 360/500 [1:25:17<32:18, 13.84s/it]

{'loss': 1.6405, 'grad_norm': 44.71620559692383, 'learning_rate': 1.49e-05, 'epoch': 0.2}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 380/500 [1:29:58<28:14, 14.12s/it]

{'loss': 1.6471, 'grad_norm': 38.61698532104492, 'learning_rate': 1.29e-05, 'epoch': 0.21}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 400/500 [1:34:33<22:24, 13.44s/it]

{'loss': 1.7011, 'grad_norm': 43.19507598876953, 'learning_rate': 1.09e-05, 'epoch': 0.22}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 420/500 [1:39:58<30:36, 22.96s/it]

{'loss': 1.6512, 'grad_norm': 39.65654754638672, 'learning_rate': 8.9e-06, 'epoch': 0.23}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 440/500 [1:44:32<13:40, 13.68s/it]

{'loss': 1.6137, 'grad_norm': 43.94072341918945, 'learning_rate': 6.900000000000001e-06, 'epoch': 0.24}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 460/500 [1:49:06<09:27, 14.19s/it]

{'loss': 1.7056, 'grad_norm': 34.07740783691406, 'learning_rate': 4.9000000000000005e-06, 'epoch': 0.25}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 480/500 [1:53:41<04:25, 13.25s/it]

{'loss': 1.6337, 'grad_norm': 75.50508880615234, 'learning_rate': 2.9e-06, 'epoch': 0.26}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [1:58:16<00:00, 14.19s/it]


{'loss': 1.68, 'grad_norm': 64.47347259521484, 'learning_rate': 9e-07, 'epoch': 0.27}
{'train_runtime': 7096.7719, 'train_samples_per_second': 0.564, 'train_steps_per_second': 0.07, 'train_loss': 1.8079171752929688, 'epoch': 0.27}
