In [34]:
import os
os.chdir(r"C:\Users\p1a2r\OneDrive\Desktop\Git Hub Projects\Text-Summarizer")

In [35]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int  
    weight_decay: float
    learning_rate: float  
    logging_steps: int
    eval_strategy: str
    eval_steps: int
    save_strategy: str 
    save_steps: int
    save_total_limit: int  
    gradient_accumulation_steps: int
    lr_scheduler_type: str  
    fp16: bool  
    load_best_model_at_end: bool  
    metric_for_best_model: str  
    greater_is_better: bool  
    prediction_loss_only: bool  
    remove_unused_columns: bool  
    report_to: str  

In [36]:
from src.text_summarizer.logger import logger
from src.text_summarizer.utils.common import read_yaml, create_directories
from src.text_summarizer.constants import *

In [37]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        # Check if config files exist
        if not os.path.exists(config_filepath):
            raise FileNotFoundError(f"Configuration file not found: {config_filepath}")
        if not os.path.exists(params_filepath):
            raise FileNotFoundError(f"Parameters file not found: {params_filepath}")
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        
        # Check if artifacts_root exists in config
        if not hasattr(self.config, 'artifacts_root'):
            raise KeyError("'artifacts_root' key not found in config.yaml. Please add: artifacts_root: artifacts")
        
        create_directories([self.config.artifacts_root])
        
        
    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        # Convert string values to appropriate types
        model_trainer_config = ModelTrainingConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            model_ckpt=config.model_ckpt,
            num_train_epochs=int(params.num_train_epochs),
            warmup_steps=int(params.warmup_steps),
            per_device_train_batch_size=int(params.per_device_train_batch_size),
            per_device_eval_batch_size=int(params.per_device_eval_batch_size),  
            weight_decay=float(params.weight_decay),
            learning_rate=float(params.learning_rate),  
            logging_steps=int(params.logging_steps),
            eval_strategy=params.eval_strategy,
            eval_steps=int(params.eval_steps),
            save_strategy=params.save_strategy,  
            save_steps=int(params.save_steps),
            save_total_limit=int(params.save_total_limit),  
            gradient_accumulation_steps=int(params.gradient_accumulation_steps),
            lr_scheduler_type=params.lr_scheduler_type,  
            fp16=bool(params.fp16),  
            load_best_model_at_end=bool(params.load_best_model_at_end),  
            metric_for_best_model=params.metric_for_best_model,  
            greater_is_better=bool(params.greater_is_better),  
            prediction_loss_only=bool(params.prediction_loss_only),  
            remove_unused_columns=bool(params.remove_unused_columns),  
            report_to=params.report_to  
        )

        return model_trainer_config

In [38]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, load_from_disk
import torch
from accelerate import Accelerator

In [39]:
class ModelTrainer:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config
        
    def train(self):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
        
        dataset_samsum_pt = load_from_disk(self.config.data_path)
        
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,  # Use specific eval batch size
            weight_decay=self.config.weight_decay,
            learning_rate=self.config.learning_rate,  # Added
            logging_steps=self.config.logging_steps,
            eval_strategy=self.config.eval_strategy,  # Note: evaluation_strategy not eval_strategy
            eval_steps=self.config.eval_steps,
            save_strategy=self.config.save_strategy,  # Added
            save_steps=self.config.save_steps,
            save_total_limit=self.config.save_total_limit,  # Added
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            lr_scheduler_type=self.config.lr_scheduler_type,  # Added
            fp16=self.config.fp16,  # Added - mixed precision training
            load_best_model_at_end=self.config.load_best_model_at_end,  # Added
            metric_for_best_model=self.config.metric_for_best_model,  # Added
            greater_is_better=self.config.greater_is_better,  # Added
            prediction_loss_only=self.config.prediction_loss_only,  # Added
            remove_unused_columns=self.config.remove_unused_columns,  # Added
            report_to=self.config.report_to,  # Added
            dataloader_pin_memory=True,  # Added for performance
            dataloader_num_workers=2,  # Added for performance
            logging_dir=os.path.join(self.config.root_dir, "logs"),  # Added for logging
        )
        
        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_samsum_pt["train"],
            eval_dataset=dataset_samsum_pt["validation"]
        )
        
        # Train the model
        trainer.train()
        
        # Save the best model (automatically loaded if load_best_model_at_end=True)
        trainer.save_model()
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus_model"))
        
        # Also save tokenizer
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))
        
        logger.info("Model training completed successfully!")
        logger.info(f"Best model saved at: {self.config.root_dir}")
        logger.info(f"Final evaluation loss: {trainer.state.log_history[-1].get('eval_loss', 'N/A')}")
        

In [40]:
try: 
    config_manager = ConfigurationManager()
    model_trainer_config = config_manager.get_model_training_config()
    model_trainer = ModelTrainer(model_trainer_config)
    model_trainer.train()
    
except Exception as e:
    logger.error(f"Error training model: {str(e)}")
    raise e

[2025-09-10 20:33:32,975: INFO: common]: yaml file: config\config.yaml loaded successfully
[2025-09-10 20:33:32,975: INFO: common]: yaml file: config\params.yaml loaded successfully
[2025-09-10 20:33:32,981: INFO: common]: Directory created at: artifacts
[2025-09-10 20:33:32,982: INFO: common]: Directory created at: artifacts/model_trainer


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-09-10 20:33:40,926: ERROR: 4202113604]: Error training model: Invalid version: 'N/A'


InvalidVersion: Invalid version: 'N/A'

In [41]:
import os
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int  
    weight_decay: float
    learning_rate: float  
    logging_steps: int
    eval_strategy: str
    eval_steps: int
    save_strategy: str 
    save_steps: int
    save_total_limit: int  
    gradient_accumulation_steps: int
    lr_scheduler_type: str  
    fp16: bool  
    load_best_model_at_end: bool  
    metric_for_best_model: str  
    greater_is_better: bool  
    prediction_loss_only: bool  
    remove_unused_columns: bool  
    report_to: str  

from src.text_summarizer.logger import logger
from src.text_summarizer.utils.common import read_yaml, create_directories
from src.text_summarizer.constants import *

class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        # Check if config files exist
        if not os.path.exists(config_filepath):
            raise FileNotFoundError(f"Configuration file not found: {config_filepath}")
        if not os.path.exists(params_filepath):
            raise FileNotFoundError(f"Parameters file not found: {params_filepath}")
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        
        # Check if artifacts_root exists in config
        if not hasattr(self.config, 'artifacts_root'):
            raise KeyError("'artifacts_root' key not found in config.yaml. Please add: artifacts_root: artifacts")
        
        create_directories([self.config.artifacts_root])
        
        
    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        # Convert string values to appropriate types
        model_trainer_config = ModelTrainingConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            model_ckpt=config.model_ckpt,
            num_train_epochs=int(params.num_train_epochs),
            warmup_steps=int(params.warmup_steps),
            per_device_train_batch_size=int(params.per_device_train_batch_size),
            per_device_eval_batch_size=int(params.per_device_eval_batch_size),  
            weight_decay=float(params.weight_decay),
            learning_rate=float(params.learning_rate),  
            logging_steps=int(params.logging_steps),
            eval_strategy=params.eval_strategy,
            eval_steps=int(params.eval_steps),
            save_strategy=params.save_strategy,  
            save_steps=int(params.save_steps),
            save_total_limit=int(params.save_total_limit),  
            gradient_accumulation_steps=int(params.gradient_accumulation_steps),
            lr_scheduler_type=params.lr_scheduler_type,  
            fp16=bool(params.fp16),  
            load_best_model_at_end=bool(params.load_best_model_at_end),  
            metric_for_best_model=params.metric_for_best_model,  
            greater_is_better=bool(params.greater_is_better),  
            prediction_loss_only=bool(params.prediction_loss_only),  
            remove_unused_columns=bool(params.remove_unused_columns),  
            report_to=params.report_to  
        )

        return model_trainer_config

# Import all required packages at the top
import torch
from datasets import load_from_disk
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    Trainer, 
    TrainingArguments, 
    DataCollatorForSeq2Seq
)

# Ensure accelerate is available before using Trainer
try:
    import accelerate
    logger.info(f"Accelerate version: {accelerate.__version__}")
except ImportError as e:
    logger.error(f"Accelerate not available: {e}")
    raise ImportError("Accelerate is required for training. Please install with: pip install accelerate>=0.26.0")

class ModelTrainer:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config
        
    def train(self):
        try:
            # Set up device
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            logger.info(f"Using device: {device}")
            
            # Load tokenizer and model
            logger.info(f"Loading model from: {self.config.model_ckpt}")
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
            model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
            
            # Data collator
            seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
            
            # Load dataset
            logger.info(f"Loading dataset from: {self.config.data_path}")
            dataset_samsum_pt = load_from_disk(self.config.data_path)
            
            # Create logs directory
            logs_dir = os.path.join(self.config.root_dir, "logs")
            os.makedirs(logs_dir, exist_ok=True)
            
            # Training arguments
            trainer_args = TrainingArguments(
                output_dir=self.config.root_dir,
                num_train_epochs=self.config.num_train_epochs,
                warmup_steps=self.config.warmup_steps,
                per_device_train_batch_size=self.config.per_device_train_batch_size,
                per_device_eval_batch_size=self.config.per_device_eval_batch_size,
                weight_decay=self.config.weight_decay,
                learning_rate=self.config.learning_rate,
                logging_steps=self.config.logging_steps,
                eval_strategy=self.config.eval_strategy,
                eval_steps=self.config.eval_steps,
                save_strategy=self.config.save_strategy,
                save_steps=self.config.save_steps,
                save_total_limit=self.config.save_total_limit,
                gradient_accumulation_steps=self.config.gradient_accumulation_steps,
                lr_scheduler_type=self.config.lr_scheduler_type,
                fp16=self.config.fp16,
                load_best_model_at_end=self.config.load_best_model_at_end,
                metric_for_best_model=self.config.metric_for_best_model,
                greater_is_better=self.config.greater_is_better,
                prediction_loss_only=self.config.prediction_loss_only,
                remove_unused_columns=self.config.remove_unused_columns,
                report_to=self.config.report_to,
                dataloader_pin_memory=False,  # Set to False to avoid potential issues
                dataloader_num_workers=0,     # Set to 0 to avoid multiprocessing issues
                logging_dir=logs_dir,
                # Explicitly set ddp_find_unused_parameters to False to avoid warnings
                ddp_find_unused_parameters=False,
            )
            
            # Initialize trainer
            logger.info("Initializing Trainer...")
            trainer = Trainer(
                model=model_pegasus,
                args=trainer_args,
                tokenizer=tokenizer,
                data_collator=seq2seq_data_collator,
                train_dataset=dataset_samsum_pt["train"],
                eval_dataset=dataset_samsum_pt["validation"]
            )
            
            # Start training
            logger.info("Starting training...")
            trainer.train()
            
            # Save the model
            logger.info("Saving model...")
            trainer.save_model()
            
            # Save model and tokenizer separately
            model_dir = os.path.join(self.config.root_dir, "pegasus_model")
            tokenizer_dir = os.path.join(self.config.root_dir, "tokenizer")
            
            os.makedirs(model_dir, exist_ok=True)
            os.makedirs(tokenizer_dir, exist_ok=True)
            
            model_pegasus.save_pretrained(model_dir)
            tokenizer.save_pretrained(tokenizer_dir)
            
            logger.info("Model training completed successfully!")
            logger.info(f"Model saved at: {model_dir}")
            logger.info(f"Tokenizer saved at: {tokenizer_dir}")
            
            # Log final metrics
            if trainer.state.log_history:
                final_metrics = trainer.state.log_history[-1]
                logger.info(f"Final training metrics: {final_metrics}")
        
        except Exception as e:
            logger.error(f"Error during training: {str(e)}")
            raise

# Main execution
try: 
    config_manager = ConfigurationManager()
    model_trainer_config = config_manager.get_model_training_config()
    model_trainer = ModelTrainer(model_trainer_config)
    model_trainer.train()
    
except Exception as e:
    logger.error(f"Error training model: {str(e)}")
    raise e

[2025-09-10 20:33:49,749: INFO: 2779877579]: Accelerate version: 1.10.1
[2025-09-10 20:33:49,752: INFO: common]: yaml file: config\config.yaml loaded successfully
[2025-09-10 20:33:49,755: INFO: common]: yaml file: config\params.yaml loaded successfully
[2025-09-10 20:33:49,756: INFO: common]: Directory created at: artifacts
[2025-09-10 20:33:49,757: INFO: common]: Directory created at: artifacts/model_trainer
[2025-09-10 20:33:49,758: INFO: 2779877579]: Using device: cpu
[2025-09-10 20:33:49,759: INFO: 2779877579]: Loading model from: google/pegasus-cnn_dailymail


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-09-10 20:33:56,478: INFO: 2779877579]: Loading dataset from: artifacts\data_transformation\samsum_dataset
[2025-09-10 20:33:56,496: ERROR: 2779877579]: Error during training: Invalid version: 'N/A'
[2025-09-10 20:33:56,497: ERROR: 2779877579]: Error training model: Invalid version: 'N/A'


InvalidVersion: Invalid version: 'N/A'

In [42]:
import os
import sys
from dataclasses import dataclass
from pathlib import Path

# Set environment variables before importing transformers
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'

@dataclass(frozen=True)
class ModelTrainingConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int  
    weight_decay: float
    learning_rate: float  
    logging_steps: int
    eval_strategy: str
    eval_steps: int
    save_strategy: str 
    save_steps: int
    save_total_limit: int  
    gradient_accumulation_steps: int
    lr_scheduler_type: str  
    fp16: bool  
    load_best_model_at_end: bool  
    metric_for_best_model: str  
    greater_is_better: bool  
    prediction_loss_only: bool  
    remove_unused_columns: bool  
    report_to: str  

from src.text_summarizer.logger import logger
from src.text_summarizer.utils.common import read_yaml, create_directories
from src.text_summarizer.constants import *

class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        # Check if config files exist
        if not os.path.exists(config_filepath):
            raise FileNotFoundError(f"Configuration file not found: {config_filepath}")
        if not os.path.exists(params_filepath):
            raise FileNotFoundError(f"Parameters file not found: {params_filepath}")
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        
        # Check if artifacts_root exists in config
        if not hasattr(self.config, 'artifacts_root'):
            # Add artifacts_root if it doesn't exist
            self.config.artifacts_root = "artifacts"
        
        create_directories([self.config.artifacts_root])
        
        
    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        # Convert string values to appropriate types
        model_trainer_config = ModelTrainingConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            model_ckpt=config.model_ckpt,
            num_train_epochs=int(params.num_train_epochs),
            warmup_steps=int(params.warmup_steps),
            per_device_train_batch_size=int(params.per_device_train_batch_size),
            per_device_eval_batch_size=int(params.per_device_eval_batch_size),  
            weight_decay=float(params.weight_decay),
            learning_rate=float(params.learning_rate),  
            logging_steps=int(params.logging_steps),
            eval_strategy=params.eval_strategy,
            eval_steps=int(params.eval_steps),
            save_strategy=params.save_strategy,  
            save_steps=int(params.save_steps),
            save_total_limit=int(params.save_total_limit),  
            gradient_accumulation_steps=int(params.gradient_accumulation_steps),
            lr_scheduler_type=params.lr_scheduler_type,  
            fp16=bool(params.fp16),  
            load_best_model_at_end=bool(params.load_best_model_at_end),  
            metric_for_best_model=params.metric_for_best_model,  
            greater_is_better=bool(params.greater_is_better),  
            prediction_loss_only=bool(params.prediction_loss_only),  
            remove_unused_columns=bool(params.remove_unused_columns),  
            report_to=params.report_to  
        )

        return model_trainer_config

# Import required libraries
import torch
from datasets import load_from_disk

# Test accelerate import and force proper detection
try:
    import accelerate
    logger.info(f"Accelerate version: {accelerate.__version__}")
    
    # Force the transformers library to recognize accelerate properly
    import transformers
    from transformers.utils import is_accelerate_available
    
    # If still not detected, we'll use a workaround
    if not is_accelerate_available():
        logger.warning("Accelerate not detected by transformers, applying workaround...")
        
        # Import required classes with a different approach
        from transformers import (
            AutoTokenizer, 
            AutoModelForSeq2SeqLM,
            DataCollatorForSeq2Seq
        )
        
        # Use a custom trainer or manual training loop
        use_manual_training = True
    else:
        logger.info("Accelerate properly detected by transformers")
        from transformers import (
            AutoTokenizer, 
            AutoModelForSeq2SeqLM,
            Trainer, 
            TrainingArguments, 
            DataCollatorForSeq2Seq
        )
        use_manual_training = False
        
except ImportError as e:
    logger.error(f"Failed to import required packages: {e}")
    raise

class ModelTrainer:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config
        
    def train(self):
        try:
            # Set up device
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            logger.info(f"Using device: {device}")
            
            # Load tokenizer and model
            logger.info(f"Loading model from: {self.config.model_ckpt}")
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
            model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
            
            # Data collator
            seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
            
            # Load dataset
            logger.info(f"Loading dataset from: {self.config.data_path}")
            dataset_samsum_pt = load_from_disk(self.config.data_path)
            
            # Check if we need to use manual training or can use Trainer
            try:
                # Try to use the standard Trainer
                self._train_with_trainer(model_pegasus, tokenizer, seq2seq_data_collator, dataset_samsum_pt, device)
            except Exception as trainer_error:
                logger.warning(f"Standard Trainer failed: {trainer_error}")
                logger.info("Falling back to manual training loop...")
                self._train_manually(model_pegasus, tokenizer, seq2seq_data_collator, dataset_samsum_pt, device)
            
        except Exception as e:
            logger.error(f"Error during training: {str(e)}")
            raise

    def _train_with_trainer(self, model, tokenizer, data_collator, dataset, device):
        """Standard training with HuggingFace Trainer"""
        from transformers import Trainer, TrainingArguments
        
        # Create logs directory
        logs_dir = os.path.join(self.config.root_dir, "logs")
        os.makedirs(logs_dir, exist_ok=True)
        
        # Training arguments
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            warmup_steps=self.config.warmup_steps,
            weight_decay=self.config.weight_decay,
            logging_steps=self.config.logging_steps,
            eval_strategy=self.config.eval_strategy,
            eval_steps=self.config.eval_steps,
            save_strategy=self.config.save_strategy,
            save_steps=self.config.save_steps,
            prediction_loss_only=True,
            remove_unused_columns=False,
            dataloader_num_workers=0,
            dataloader_pin_memory=False,
            logging_dir=logs_dir,
        )
        
        # Initialize trainer
        logger.info("Initializing Trainer...")
        trainer = Trainer(
            model=model,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=data_collator,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"]
        )
        
        # Start training
        logger.info("Starting training with Trainer...")
        trainer.train()
        
        # Save the model
        logger.info("Saving model...")
        trainer.save_model()
        
        self._save_model_and_tokenizer(model, tokenizer)
        logger.info("Training with Trainer completed successfully!")

    def _train_manually(self, model, tokenizer, data_collator, dataset, device):
        """Manual training loop as fallback"""
        from torch.utils.data import DataLoader
        from transformers import get_linear_schedule_with_warmup
        from tqdm import tqdm
        
        # Create data loaders
        train_dataloader = DataLoader(
            dataset["train"], 
            batch_size=self.config.per_device_train_batch_size,
            shuffle=True,
            collate_fn=data_collator,
            num_workers=0
        )
        
        val_dataloader = DataLoader(
            dataset["validation"], 
            batch_size=self.config.per_device_eval_batch_size,
            shuffle=False,
            collate_fn=data_collator,
            num_workers=0
        )
        
        # Optimizer and scheduler
        optimizer = torch.optim.AdamW(
            model.parameters(), 
            lr=self.config.learning_rate,
            weight_decay=self.config.weight_decay
        )
        
        total_steps = len(train_dataloader) * self.config.num_train_epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=total_steps
        )
        
        # Training loop
        model.train()
        global_step = 0
        
        logger.info("Starting manual training...")
        for epoch in range(self.config.num_train_epochs):
            logger.info(f"Epoch {epoch + 1}/{self.config.num_train_epochs}")
            
            epoch_loss = 0
            progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}")
            
            for batch_idx, batch in enumerate(progress_bar):
                # Move batch to device
                batch = {k: v.to(device) for k, v in batch.items()}
                
                # Forward pass
                outputs = model(**batch)
                loss = outputs.loss
                
                # Backward pass
                loss.backward()
                
                # Update weights
                if (batch_idx + 1) % self.config.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    
                    global_step += 1
                    
                    # Logging
                    if global_step % self.config.logging_steps == 0:
                        logger.info(f"Step {global_step}, Loss: {loss.item():.4f}")
                
                epoch_loss += loss.item()
                
                # Update progress bar
                progress_bar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'avg_loss': f'{epoch_loss/(batch_idx+1):.4f}'
                })
                
                # Evaluation
                if global_step % self.config.eval_steps == 0:
                    eval_loss = self._evaluate_manually(model, val_dataloader, device)
                    logger.info(f"Evaluation Loss at step {global_step}: {eval_loss:.4f}")
                    model.train()  # Back to training mode
        
        # Save model
        self._save_model_and_tokenizer(model, tokenizer)
        logger.info("Manual training completed successfully!")

    def _evaluate_manually(self, model, dataloader, device):
        """Manual evaluation"""
        model.eval()
        total_loss = 0
        num_batches = 0
        
        with torch.no_grad():
            for batch in dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                total_loss += outputs.loss.item()
                num_batches += 1
        
        return total_loss / num_batches if num_batches > 0 else 0

    def _save_model_and_tokenizer(self, model, tokenizer):
        """Save model and tokenizer"""
        model_dir = os.path.join(self.config.root_dir, "pegasus_model")
        tokenizer_dir = os.path.join(self.config.root_dir, "tokenizer")
        
        os.makedirs(model_dir, exist_ok=True)
        os.makedirs(tokenizer_dir, exist_ok=True)
        
        model.save_pretrained(model_dir)
        tokenizer.save_pretrained(tokenizer_dir)
        
        logger.info(f"Model saved at: {model_dir}")
        logger.info(f"Tokenizer saved at: {tokenizer_dir}")

# Main execution

try: 
    config_manager = ConfigurationManager()
    model_trainer_config = config_manager.get_model_training_config()
    model_trainer = ModelTrainer(model_trainer_config)
    model_trainer.train()
    
except Exception as e:
    logger.error(f"Error training model: {str(e)}")
    raise e

[2025-09-10 20:35:06,320: INFO: 2211133503]: Accelerate version: 1.10.1


InvalidVersion: Invalid version: 'N/A'

In [43]:
import os
import sys
from dataclasses import dataclass
from pathlib import Path

# Set environment variables before importing anything
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'

# Import accelerate first and patch the version issue
import accelerate
print(f"Accelerate version: {accelerate.__version__}")

# Now patch the transformers import_utils BEFORE importing transformers
import transformers.utils.import_utils as import_utils

# Store the original version
_original_accelerate_version = accelerate.__version__

# Monkey patch the version detection to return the actual version
import_utils._accelerate_available = True
import_utils._accelerate_version = _original_accelerate_version

# Also patch the version parsing function to handle our case
original_is_accelerate_available = import_utils.is_accelerate_available

def patched_is_accelerate_available(min_version="0.26.0"):
    """Always return True since we know accelerate is installed"""
    try:
        # Import packaging.version here to avoid circular imports
        from packaging import version
        return version.parse(_original_accelerate_version) >= version.parse(min_version)
    except:
        # If version parsing fails, just return True since we know it's installed
        return True

# Apply the patch
import_utils.is_accelerate_available = patched_is_accelerate_available

@dataclass(frozen=True)
class ModelTrainingConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int  
    weight_decay: float
    learning_rate: float  
    logging_steps: int
    eval_strategy: str
    eval_steps: int
    save_strategy: str 
    save_steps: int
    save_total_limit: int  
    gradient_accumulation_steps: int
    lr_scheduler_type: str  
    fp16: bool  
    load_best_model_at_end: bool  
    metric_for_best_model: str  
    greater_is_better: bool  
    prediction_loss_only: bool  
    remove_unused_columns: bool  
    report_to: str  

from src.text_summarizer.logger import logger
from src.text_summarizer.utils.common import read_yaml, create_directories
from src.text_summarizer.constants import *

class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        # Check if config files exist
        if not os.path.exists(config_filepath):
            raise FileNotFoundError(f"Configuration file not found: {config_filepath}")
        if not os.path.exists(params_filepath):
            raise FileNotFoundError(f"Parameters file not found: {params_filepath}")
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        
        # Check if artifacts_root exists in config
        if not hasattr(self.config, 'artifacts_root'):
            # Add artifacts_root if it doesn't exist
            self.config.artifacts_root = "artifacts"
        
        create_directories([self.config.artifacts_root])
        
        
    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        # Convert string values to appropriate types
        model_trainer_config = ModelTrainingConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            model_ckpt=config.model_ckpt,
            num_train_epochs=int(params.num_train_epochs),
            warmup_steps=int(params.warmup_steps),
            per_device_train_batch_size=int(params.per_device_train_batch_size),
            per_device_eval_batch_size=int(params.per_device_eval_batch_size),  
            weight_decay=float(params.weight_decay),
            learning_rate=float(params.learning_rate),  
            logging_steps=int(params.logging_steps),
            eval_strategy=params.eval_strategy,
            eval_steps=int(params.eval_steps),
            save_strategy=params.save_strategy,  
            save_steps=int(params.save_steps),
            save_total_limit=int(params.save_total_limit),  
            gradient_accumulation_steps=int(params.gradient_accumulation_steps),
            lr_scheduler_type=params.lr_scheduler_type,  
            fp16=bool(params.fp16),  
            load_best_model_at_end=bool(params.load_best_model_at_end),  
            metric_for_best_model=params.metric_for_best_model,  
            greater_is_better=bool(params.greater_is_better),  
            prediction_loss_only=bool(params.prediction_loss_only),  
            remove_unused_columns=bool(params.remove_unused_columns),  
            report_to=params.report_to  
        )

        return model_trainer_config

# Now we can safely import transformers
import torch
from datasets import load_from_disk
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    Trainer, 
    TrainingArguments, 
    DataCollatorForSeq2Seq
)

# Verify the patch worked
from transformers.utils import is_accelerate_available
logger.info(f"Accelerate available after patch: {is_accelerate_available()}")

class ModelTrainer:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config
        
    def train(self):
        try:
            # Set up device
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            logger.info(f"Using device: {device}")
            
            # Load tokenizer and model
            logger.info(f"Loading model from: {self.config.model_ckpt}")
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
            model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
            
            # Data collator
            seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
            
            # Load dataset
            logger.info(f"Loading dataset from: {self.config.data_path}")
            dataset_samsum_pt = load_from_disk(self.config.data_path)
            
            # Create logs directory
            logs_dir = os.path.join(self.config.root_dir, "logs")
            os.makedirs(logs_dir, exist_ok=True)
            
            # Training arguments - simplified to ensure they work
            trainer_args = TrainingArguments(
                output_dir=self.config.root_dir,
                num_train_epochs=self.config.num_train_epochs,
                per_device_train_batch_size=self.config.per_device_train_batch_size,
                per_device_eval_batch_size=self.config.per_device_eval_batch_size,
                warmup_steps=self.config.warmup_steps,
                weight_decay=self.config.weight_decay,
                learning_rate=self.config.learning_rate,
                logging_steps=self.config.logging_steps,
                eval_strategy=self.config.eval_strategy,
                eval_steps=self.config.eval_steps,
                save_strategy=self.config.save_strategy,
                save_steps=self.config.save_steps,
                save_total_limit=self.config.save_total_limit,
                prediction_loss_only=self.config.prediction_loss_only,
                remove_unused_columns=self.config.remove_unused_columns,
                dataloader_num_workers=0,  # Avoid multiprocessing issues
                dataloader_pin_memory=False,  # Avoid memory issues
                logging_dir=logs_dir,
                report_to=self.config.report_to if self.config.report_to != "tensorboard" else "none",
                # Disable some features that might cause issues
                push_to_hub=False,
                hub_model_id=None,
                gradient_checkpointing=False,
                fp16=False,  # Disable FP16 to avoid potential issues
                bf16=False,  # Disable BF16
            )
            
            # Initialize trainer
            logger.info("Initializing Trainer...")
            trainer = Trainer(
                model=model_pegasus,
                args=trainer_args,
                tokenizer=tokenizer,
                data_collator=seq2seq_data_collator,
                train_dataset=dataset_samsum_pt["train"],
                eval_dataset=dataset_samsum_pt["validation"]
            )
            
            # Start training
            logger.info("Starting training...")
            trainer.train()
            
            # Save the model
            logger.info("Saving model...")
            trainer.save_model()
            
            # Save model and tokenizer separately
            model_dir = os.path.join(self.config.root_dir, "pegasus_model")
            tokenizer_dir = os.path.join(self.config.root_dir, "tokenizer")
            
            os.makedirs(model_dir, exist_ok=True)
            os.makedirs(tokenizer_dir, exist_ok=True)
            
            model_pegasus.save_pretrained(model_dir)
            tokenizer.save_pretrained(tokenizer_dir)
            
            logger.info("Model training completed successfully!")
            logger.info(f"Model saved at: {model_dir}")
            logger.info(f"Tokenizer saved at: {tokenizer_dir}")
            
            # Log final metrics if available
            if hasattr(trainer, 'state') and trainer.state.log_history:
                try:
                    final_metrics = trainer.state.log_history[-1]
                    logger.info(f"Final training metrics: {final_metrics}")
                except:
                    logger.info("Training completed but final metrics not available")
            
        except Exception as e:
            logger.error(f"Error during training: {str(e)}")
            raise

# Main execution

try: 
    logger.info("Starting model training pipeline...")
    config_manager = ConfigurationManager()
    model_trainer_config = config_manager.get_model_training_config()
    model_trainer = ModelTrainer(model_trainer_config)
    model_trainer.train()
    logger.info("Model training pipeline completed successfully!")
    
except Exception as e:
    logger.error(f"Error training model: {str(e)}")
    raise e

Accelerate version: 1.10.1
[2025-09-10 20:37:14,240: INFO: 3835435508]: Accelerate available after patch: True
[2025-09-10 20:37:14,241: INFO: 3835435508]: Starting model training pipeline...
[2025-09-10 20:37:14,244: INFO: common]: yaml file: config\config.yaml loaded successfully
[2025-09-10 20:37:14,244: INFO: common]: yaml file: config\params.yaml loaded successfully
[2025-09-10 20:37:14,244: INFO: common]: Directory created at: artifacts
[2025-09-10 20:37:14,244: INFO: common]: Directory created at: artifacts/model_trainer
[2025-09-10 20:37:14,244: INFO: 3835435508]: Using device: cpu
[2025-09-10 20:37:14,250: INFO: 3835435508]: Loading model from: google/pegasus-cnn_dailymail


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-09-10 20:37:20,440: INFO: 3835435508]: Loading dataset from: artifacts\data_transformation\samsum_dataset
[2025-09-10 20:37:20,456: ERROR: 3835435508]: Error during training: name 'AcceleratorConfig' is not defined
[2025-09-10 20:37:20,456: ERROR: 3835435508]: Error training model: name 'AcceleratorConfig' is not defined


NameError: name 'AcceleratorConfig' is not defined

In [48]:
import os
import json
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from dataclasses import dataclass
from pathlib import Path
from tqdm import tqdm

@dataclass(frozen=True)
class ModelTrainingConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int  
    weight_decay: float
    learning_rate: float  
    logging_steps: int
    eval_strategy: str
    eval_steps: int
    save_strategy: str 
    save_steps: int
    save_total_limit: int  
    gradient_accumulation_steps: int
    lr_scheduler_type: str  
    fp16: bool  
    load_best_model_at_end: bool  
    metric_for_best_model: str  
    greater_is_better: bool  
    prediction_loss_only: bool  
    remove_unused_columns: bool  
    report_to: str  

from src.text_summarizer.logger import logger
from src.text_summarizer.utils.common import read_yaml, create_directories
from src.text_summarizer.constants import *

class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        if not os.path.exists(config_filepath):
            raise FileNotFoundError(f"Configuration file not found: {config_filepath}")
        if not os.path.exists(params_filepath):
            raise FileNotFoundError(f"Parameters file not found: {params_filepath}")
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        
        if not hasattr(self.config, 'artifacts_root'):
            self.config.artifacts_root = "artifacts"
        
        create_directories([self.config.artifacts_root])
        
    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainingConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            model_ckpt=config.model_ckpt,
            num_train_epochs=int(params.num_train_epochs),
            warmup_steps=int(params.warmup_steps),
            per_device_train_batch_size=int(params.per_device_train_batch_size),
            per_device_eval_batch_size=int(params.per_device_eval_batch_size),  
            weight_decay=float(params.weight_decay),
            learning_rate=float(params.learning_rate),  
            logging_steps=int(params.logging_steps),
            eval_strategy=params.eval_strategy,
            eval_steps=int(params.eval_steps),
            save_strategy=params.save_strategy,  
            save_steps=int(params.save_steps),
            save_total_limit=int(params.save_total_limit),  
            gradient_accumulation_steps=int(params.gradient_accumulation_steps),
            lr_scheduler_type=params.lr_scheduler_type,  
            fp16=bool(params.fp16),  
            load_best_model_at_end=bool(params.load_best_model_at_end),  
            metric_for_best_model=params.metric_for_best_model,  
            greater_is_better=bool(params.greater_is_better),  
            prediction_loss_only=bool(params.prediction_loss_only),  
            remove_unused_columns=bool(params.remove_unused_columns),  
            report_to=params.report_to  
        )

        return model_trainer_config

# Import only what we need - no Trainer class
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    get_linear_schedule_with_warmup
)
from datasets import load_from_disk

class ManualModelTrainer:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def train(self):
        try:
            logger.info(f"Using device: {self.device}")

            # Load model and tokenizer
            logger.info(f"Loading model from: {self.config.model_ckpt}")
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
            model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(self.device)

            # Load dataset
            logger.info(f"Loading dataset from: {self.config.data_path}")
            dataset = load_from_disk(self.config.data_path)

            # Debug: Check dataset structure
            logger.info("Dataset structure:")
            for split in dataset.keys():
                logger.info(f"  {split}: {len(dataset[split])} examples")

            # Check a sample from training data
            sample = dataset["train"][0]
            logger.info("Sample keys: %s", list(sample.keys()))
            for key, value in sample.items():
                if isinstance(value, (list, torch.Tensor)):
                    logger.info("  %s: length=%d type=%s", key, len(value), type(value).__name__)
                else:
                    logger.info("  %s: %s", key, type(value).__name__)

            # Verify tokenizer has pad token
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
                logger.info("Set pad_token to eos_token")

            # Remove non-tensor columns completely
            columns_to_remove = ['id', 'dialogue', 'summary']
            for split in dataset.keys():
                # Check which columns actually exist
                existing_columns = set(dataset[split].column_names)
                columns_to_remove_in_split = [col for col in columns_to_remove if col in existing_columns]
                
                if columns_to_remove_in_split:
                    dataset[split] = dataset[split].remove_columns(columns_to_remove_in_split)
                    logger.info("Removed from %s: %s", split, columns_to_remove_in_split)

            logger.info("Remaining columns: %s", list(dataset["train"].column_names))

            # Data collator
            data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

            # Create data loaders
            train_dataloader = DataLoader(
                dataset["train"], 
                batch_size=self.config.per_device_train_batch_size,
                shuffle=True,
                collate_fn=data_collator,
                num_workers=0,
                pin_memory=False
            )

            val_dataloader = DataLoader(
                dataset["validation"], 
                batch_size=self.config.per_device_eval_batch_size,
                shuffle=False,
                collate_fn=data_collator,
                num_workers=0,
                pin_memory=False
            )

            
            # Optimizer and scheduler
            optimizer = torch.optim.AdamW(
                model.parameters(), 
                lr=self.config.learning_rate,
                weight_decay=self.config.weight_decay
            )
            
            total_steps = len(train_dataloader) * self.config.num_train_epochs
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.config.warmup_steps,
                num_training_steps=total_steps
            )
            
            logger.info(f"Starting training for {self.config.num_train_epochs} epochs...")
            logger.info(f"Total training steps: {total_steps}")
            
            # Training metrics tracking
            training_history = []
            best_eval_loss = float('inf')
            global_step = 0
            
            # Training loop
            model.train()
            for epoch in range(self.config.num_train_epochs):
                logger.info(f"=== Epoch {epoch + 1}/{self.config.num_train_epochs} ===")
                
                epoch_loss = 0
                num_batches = 0
                
                progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}")
                
                for batch_idx, batch in enumerate(progress_bar):
                    # Move batch to device
                    batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
                    
                    # Forward pass
                    outputs = model(**batch)
                    loss = outputs.loss / self.config.gradient_accumulation_steps
                    
                    # Backward pass
                    loss.backward()
                    
                    epoch_loss += loss.item() * self.config.gradient_accumulation_steps
                    num_batches += 1
                    
                    # Update weights
                    if (batch_idx + 1) % self.config.gradient_accumulation_steps == 0:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
                        optimizer.step()
                        scheduler.step()
                        optimizer.zero_grad()
                        
                        global_step += 1
                        
                        # Logging
                        if global_step % self.config.logging_steps == 0:
                            current_lr = scheduler.get_last_lr()[0]
                            logger.info(f"Step {global_step}/{total_steps} | Loss: {loss.item() * self.config.gradient_accumulation_steps:.4f} | LR: {current_lr:.2e}")
                        
                        # Evaluation
                        if global_step % self.config.eval_steps == 0:
                            eval_loss = self.evaluate(model, val_dataloader)
                            logger.info(f"Evaluation at step {global_step} | Eval Loss: {eval_loss:.4f}")
                            
                            # Save best model
                            if eval_loss < best_eval_loss:
                                best_eval_loss = eval_loss
                                logger.info(f"New best model! Eval loss: {eval_loss:.4f}")
                                self.save_checkpoint(model, tokenizer, global_step, eval_loss, is_best=True)
                            
                            # Record metrics
                            training_history.append({
                                'step': global_step,
                                'epoch': epoch + 1,
                                'train_loss': loss.item() * self.config.gradient_accumulation_steps,
                                'eval_loss': eval_loss,
                                'learning_rate': current_lr
                            })
                            
                            model.train()  # Back to training mode
                    
                    # Update progress bar
                    progress_bar.set_postfix({
                        'loss': f'{loss.item() * self.config.gradient_accumulation_steps:.4f}',
                        'avg_loss': f'{epoch_loss/num_batches:.4f}',
                        'lr': f'{scheduler.get_last_lr()[0]:.2e}'
                    })
                
                avg_epoch_loss = epoch_loss / num_batches
                logger.info(f"Epoch {epoch + 1} completed | Average Loss: {avg_epoch_loss:.4f}")
                
                # End of epoch evaluation
                eval_loss = self.evaluate(model, val_dataloader)
                logger.info(f"End of Epoch {epoch + 1} | Eval Loss: {eval_loss:.4f}")
                
                if eval_loss < best_eval_loss:
                    best_eval_loss = eval_loss
                    logger.info(f"New best model at end of epoch {epoch + 1}! Eval loss: {eval_loss:.4f}")
                    self.save_checkpoint(model, tokenizer, global_step, eval_loss, is_best=True)
            
            # Final save
            logger.info("Training completed! Saving final model...")
            self.save_final_model(model, tokenizer, training_history, best_eval_loss)
            
            logger.info("="*50)
            logger.info("TRAINING SUMMARY")
            logger.info(f"Total epochs: {self.config.num_train_epochs}")
            logger.info(f"Total steps: {global_step}")
            logger.info(f"Best evaluation loss: {best_eval_loss:.4f}")
            logger.info(f"Final learning rate: {scheduler.get_last_lr()[0]:.2e}")
            logger.info("="*50)
            
        except Exception as e:
            logger.error(f"Error during training: {str(e)}")
            raise

    def evaluate(self, model, dataloader):
        """Evaluate the model"""
        model.eval()
        total_loss = 0
        num_batches = 0
        
        logger.info("Running evaluation...")
        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Evaluating", leave=False):
                batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
                outputs = model(**batch)
                total_loss += outputs.loss.item()
                num_batches += 1
        
        avg_loss = total_loss / num_batches if num_batches > 0 else 0
        return avg_loss

    def save_checkpoint(self, model, tokenizer, step, eval_loss, is_best=False):
        """Save model checkpoint"""
        checkpoint_dir = os.path.join(self.config.root_dir, f"checkpoint-{step}")
        if is_best:
            checkpoint_dir = os.path.join(self.config.root_dir, "best_model")
        
        os.makedirs(checkpoint_dir, exist_ok=True)
        
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        
        # Save checkpoint info
        checkpoint_info = {
            'step': step,
            'eval_loss': eval_loss,
            'is_best': is_best
        }
        
        with open(os.path.join(checkpoint_dir, 'checkpoint_info.json'), 'w') as f:
            json.dump(checkpoint_info, f, indent=2)

    def save_final_model(self, model, tokenizer, training_history, best_eval_loss):
        """Save final model and training artifacts"""
        # Save final model
        model_dir = os.path.join(self.config.root_dir, "final_model")
        tokenizer_dir = os.path.join(self.config.root_dir, "tokenizer")
        
        os.makedirs(model_dir, exist_ok=True)
        os.makedirs(tokenizer_dir, exist_ok=True)
        
        model.save_pretrained(model_dir)
        tokenizer.save_pretrained(tokenizer_dir)
        
        # Save training history
        with open(os.path.join(self.config.root_dir, "training_history.json"), "w") as f:
            json.dump(training_history, f, indent=2)
        
        # Save training summary
        summary = {
            "best_eval_loss": best_eval_loss,
            "total_epochs": self.config.num_train_epochs,
            "model_name": self.config.model_ckpt,
            "training_completed": True
        }
        
        with open(os.path.join(self.config.root_dir, "training_summary.json"), "w") as f:
            json.dump(summary, f, indent=2)
        
        logger.info(f"Final model saved to: {model_dir}")
        logger.info(f"Tokenizer saved to: {tokenizer_dir}")
        logger.info(f"Training history saved to: {os.path.join(self.config.root_dir, 'training_history.json')}")

# Main execution

try:
    logger.info("Starting manual model training pipeline...")
    logger.info("This approach bypasses HuggingFace Trainer to avoid dependency issues")
    config_manager = ConfigurationManager()
    model_trainer_config = config_manager.get_model_training_config()
    model_trainer = ManualModelTrainer(model_trainer_config)
    model_trainer.train()
    logger.info("Manual model training pipeline completed successfully!")
    
except Exception as e:
    logger.error(f"Error training model: {str(e)}")
    import traceback
    traceback.print_exc()
    raise e

[2025-09-10 23:28:56,903: INFO: 654578579]: Starting manual model training pipeline...
[2025-09-10 23:28:56,903: INFO: 654578579]: This approach bypasses HuggingFace Trainer to avoid dependency issues
[2025-09-10 23:28:56,918: INFO: common]: yaml file: config\config.yaml loaded successfully
[2025-09-10 23:28:56,934: INFO: common]: yaml file: config\params.yaml loaded successfully
[2025-09-10 23:28:56,936: INFO: common]: Directory created at: artifacts
[2025-09-10 23:28:56,937: INFO: common]: Directory created at: artifacts/model_trainer
[2025-09-10 23:28:56,939: INFO: 654578579]: Using device: cpu
[2025-09-10 23:28:56,940: INFO: 654578579]: Loading model from: google/pegasus-cnn_dailymail


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-09-10 23:29:08,346: INFO: 654578579]: Loading dataset from: artifacts\data_transformation\samsum_dataset
[2025-09-10 23:29:08,431: INFO: 654578579]: Dataset structure:
[2025-09-10 23:29:08,432: INFO: 654578579]:   train: 14732 examples
[2025-09-10 23:29:08,432: INFO: 654578579]:   test: 819 examples
[2025-09-10 23:29:08,433: INFO: 654578579]:   validation: 818 examples
[2025-09-10 23:29:08,435: INFO: 654578579]: Sample keys: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels']
[2025-09-10 23:29:08,435: INFO: 654578579]:   id: str
[2025-09-10 23:29:08,436: INFO: 654578579]:   dialogue: str
[2025-09-10 23:29:08,436: INFO: 654578579]:   summary: str
[2025-09-10 23:29:08,437: INFO: 654578579]:   input_ids: length=28 type=list
[2025-09-10 23:29:08,438: INFO: 654578579]:   attention_mask: length=28 type=list
[2025-09-10 23:29:08,439: INFO: 654578579]:   labels: length=11 type=list
[2025-09-10 23:29:08,440: INFO: 654578579]: Removed from train: ['id', 'dialogue', 'summ

Training Epoch 1:   4%|‚ñç         | 142/3683 [22:29<9:21:02,  9.51s/it, loss=11.7244, avg_loss=11.7455, lr=3.50e-06] 


KeyboardInterrupt: 