In [1]:
import os

In [2]:
%pwd

'c:\\Users\\sahil\\OneDrive\\Desktop\\AutoSummaryAI\\AutoSummaryAI\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\sahil\\OneDrive\\Desktop\\AutoSummaryAI\\AutoSummaryAI'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int
    learning_rate: float
    load_best_model_at_end: bool
    metric_for_best_model: str
    greater_is_better: bool

In [6]:
from AutoSummaryAI.constants import *
from AutoSummaryAI.utils.common import read_yaml, create_directories

In [7]:
import torch
torch.cuda.empty_cache()

In [8]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [9]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            per_device_eval_batch_size=params.per_device_eval_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
            learning_rate=params.learning_rate,
            load_best_model_at_end=params.load_best_model_at_end,
            metric_for_best_model=params.metric_for_best_model,
            greater_is_better=params.greater_is_better
    )

        return model_trainer_config

In [10]:
%pip install -U bitsandbytes

Note: you may need to restart the kernel to use updated packages.


In [11]:
%pip install peft

Note: you may need to restart the kernel to use updated packages.


In [12]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-30 09:01:57,954: INFO: config: PyTorch version 2.6.0+cu126 available.]


In [15]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        """
        Train the model with extreme GPU memory optimization techniques for 4GB GPUs
        using Parameter-Efficient Fine-Tuning (PEFT) with LoRA and ROUGE metrics
        """
        import torch
        import gc
        import numpy as np
        from transformers import BitsAndBytesConfig
        from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
        
        # First, install required packages if not already installed
        try:
            import peft
            import rouge_score
        except ImportError:
            import subprocess
            import sys
            print("Installing required packages...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", "peft", "rouge-score"])
            import peft
            import rouge_score
            
        # Force aggressive garbage collection and memory cleanup
        gc.collect()
        torch.cuda.empty_cache()
        
        # Report initial memory state
        if torch.cuda.is_available():
            print(f"Initial GPU memory: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated")
        
        # Set device to CUDA if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        
        # Load tokenizer first (uses less memory)
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        
        # Setup 4-bit quantization configuration (more aggressive than 8-bit)
        print("Configuring 4-bit quantization...")
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,              # Enable 4-bit quantization (more memory efficient)
            bnb_4bit_use_double_quant=True, # Use nested quantization for 4-bit weights
            bnb_4bit_quant_type="nf4",      # Use normalized float 4 format
            bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation with 4-bit
        )
        
        # Load model with extreme memory optimization
        print("Loading model with 4-bit quantization...")
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(
            self.config.model_ckpt,
            quantization_config=quantization_config,
            device_map="auto",
            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16
        )
        
        # Immediately prepare model for k-bit training
        print("Preparing model for training with PEFT/LoRA...")
        model_pegasus = prepare_model_for_kbit_training(model_pegasus)
        
        # Define LoRA configuration with memory-efficient parameters
        print("Configuring LoRA with reduced parameters...")
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM,
            r=4,                        # Reduced rank (was 8)
            lora_alpha=16,              # Reduced alpha (was 32)
            lora_dropout=0.05,          # Reduced dropout (was 0.1)
            # Apply to fewer modules for memory efficiency
            target_modules=["q_proj", "v_proj"],
            bias="none"
        )
        
        # Apply LoRA adapter
        print("Applying LoRA adapter to the model...")
        model_pegasus = get_peft_model(model_pegasus, lora_config)
        
        # Enable gradient checkpointing (trades computation for memory)
        print("Enabling gradient checkpointing...")
        model_pegasus.gradient_checkpointing_enable()
        
        # Print trainable parameters information
        self.print_trainable_parameters(model_pegasus)
        
        # Set extreme small batch sizes for 4GB GPU
        train_batch_size = 1  # Can't go lower than 1
        eval_batch_size = 1
        
        # Use higher gradient accumulation
        gradient_accumulation_steps = 32  # Increased from 16 to 32
        
        print(f"GPU memory after loading model: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated")
        
        # Data collator that will handle dynamic padding
        print("Creating data collator...")
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus, padding="longest")
        
        # Load dataset
        print("Loading dataset...")
        dataset_samsum_pt = load_from_disk(self.config.data_path)
        
        # Use a smaller subset of training and validation data for testing
        # This helps with memory usage and speeds up initial validation
        train_subset_size = min(1000, len(dataset_samsum_pt["train"]))
        print(f"Using {train_subset_size} examples for training (out of {len(dataset_samsum_pt['train'])})")
        train_dataset = dataset_samsum_pt["train"].select(range(train_subset_size))
        
        # Use very small validation set
        if "validation" in dataset_samsum_pt:
            val_subset_size = min(50, len(dataset_samsum_pt["validation"]))
            print(f"Using {val_subset_size} examples for validation (out of {len(dataset_samsum_pt['validation'])})")
            val_dataset = dataset_samsum_pt["validation"].select(range(val_subset_size))
        else:
            val_dataset = None
        
        # Define compute_metrics function for ROUGE evaluation
        def compute_metrics(pred):
            """
            Compute ROUGE metrics for summarization model evaluation
            with minimal memory usage.
            """
            from rouge_score import rouge_scorer
            
            # Process predictions in small batches to save memory
            labels = []
            predictions = []
            
            # Process in batches of 8 to avoid memory spikes
            batch_size = 8
            
            for i in range(0, len(pred.predictions), batch_size):
                # Get batch
                pred_batch = pred.predictions[i:i + batch_size]
                label_batch = pred.label_ids[i:i + batch_size]
                
                # Decode predictions
                pred_texts = tokenizer.batch_decode(
                    pred_batch, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                predictions.extend(pred_texts)
                
                # Process labels - replace -100 padding
                proc_labels = []
                for label in label_batch:
                    # Replace -100 with pad token ID
                    label = np.where(label != -100, label, tokenizer.pad_token_id)
                    proc_labels.append(label)
                
                # Decode labels
                label_texts = tokenizer.batch_decode(
                    proc_labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                labels.extend(label_texts)
                
                # Force garbage collection after each batch
                if i % (batch_size * 4) == 0:
                    gc.collect()
                    torch.cuda.empty_cache()
            
            # Calculate ROUGE scores
            scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
            
            # Calculate scores for each prediction and reference pair
            rouge1_scores = []
            rouge2_scores = []
            rougeL_scores = []
            
            # Process in batches to avoid memory spikes
            for i in range(len(predictions)):
                # Handle empty predictions or references gracefully
                if not predictions[i].strip() or not labels[i].strip():
                    rouge1_scores.append(0)
                    rouge2_scores.append(0)
                    rougeL_scores.append(0)
                    continue
                    
                # Calculate scores
                try:
                    scores = scorer.score(labels[i], predictions[i])
                    rouge1_scores.append(scores['rouge1'].fmeasure)
                    rouge2_scores.append(scores['rouge2'].fmeasure)
                    rougeL_scores.append(scores['rougeL'].fmeasure)
                except Exception as e:
                    print(f"Error calculating ROUGE: {e}")
                    rouge1_scores.append(0)
                    rouge2_scores.append(0)
                    rougeL_scores.append(0)
                
                # Force garbage collection periodically
                if i % 20 == 0:
                    gc.collect()
            
            # Return the average scores
            results = {
                'rouge1': float(np.mean(rouge1_scores)),
                'rouge2': float(np.mean(rouge2_scores)),
                'rougeL': float(np.mean(rougeL_scores))
            }
            print(f"ROUGE Scores: {results}")
            return results
            
        # Set up training arguments with extreme memory optimization
        print(f"Setting up trainer with batch size: {train_batch_size}, grad_accum: {gradient_accumulation_steps}")
        
        # Set shorter training length for testing
        num_epochs = 1  # Start with 1 epoch to verify setup
        
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=num_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            weight_decay=self.config.weight_decay,
            logging_steps=25,  # More frequent logging
            eval_strategy="steps" if val_dataset else "no",
            eval_steps=100,    # More frequent evaluation
            save_steps=500,    # Save less frequently to reduce disk I/O
            gradient_accumulation_steps=gradient_accumulation_steps,
            learning_rate=float(self.config.learning_rate),
            load_best_model_at_end=True if val_dataset else False,
            metric_for_best_model="rouge1",  # Use ROUGE1 for model selection
            greater_is_better=True,          # Higher ROUGE is better
            fp16=True,                      # Use mixed precision
            fp16_full_eval=True,            # Use mixed precision for eval too
            gradient_checkpointing=True,    # Enable gradient checkpointing
            optim="adamw_torch",            # Use AdamW optimizer
            max_grad_norm=0.3,              # Lower gradient clipping threshold
            ddp_find_unused_parameters=False,
            dataloader_pin_memory=False,    # Save CPU memory
            report_to="none",               # Disable reporting
            # These settings help with memory issues during training
            dataloader_num_workers=0,       # Don't use multiprocessing
            group_by_length=True,           # Group similar length sequences
            lr_scheduler_type="cosine",     # Cosine learning rate schedule
            # The following are to avoid "deadlocks"
            use_cpu=False,                  # Don't use CPU for training
            seed=42,                        # Fixed seed for reproducibility
            # Debug settings - uncomment to help diagnose issues
            # debug="underflow_overflow",
        )
        
        # Create trainer with compute_metrics
        print("Creating trainer with ROUGE metrics...")
        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            data_collator=seq2seq_data_collator,
            train_dataset=train_dataset,  # Using the subset
            eval_dataset=val_dataset,     # Using the subset
            compute_metrics=compute_metrics  # Add the ROUGE metrics calculation
        )
        
        # Train model
        print("Starting training...")
        try:
            trainer.train()
            
            # Save model and tokenizer
            print("Saving model and tokenizer...")
            # Save the LoRA adapter only to save space
            model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model-lora"))
            tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))
            print("Training complete!")
            
        except Exception as e:
            print(f"Error during training: {e}")
            import traceback
            traceback.print_exc()
            
            # Try to save checkpoint even if training fails
            try:
                print("Attempting to save checkpoint after error...")
                model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "checkpoint-error"))
                tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer-error"))
            except:
                print("Could not save checkpoint after error.")
    
    def print_trainable_parameters(self, model):
        """
        Prints the number of trainable parameters in the model.
        """
        trainable_params = 0
        all_params = 0
        for _, param in model.named_parameters():
            all_params += param.numel()
            if param.requires_grad:
                trainable_params += param.numel()
        print(
            f"trainable params: {trainable_params:,} || all params: {all_params:,} || trainable%: {100 * trainable_params / all_params:.2f}%"
        )

In [16]:
try:
    # Make sure CUDA memory is clean before starting
    import torch
    import gc
    import os
    
    # First make sure the rouge-score is installed
    try:
        import rouge_score
    except ImportError:
        import subprocess
        import sys
        print("Installing rouge-score...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "rouge-score"])
    
    # Force aggressive garbage collection and empty CUDA cache
    gc.collect()
    torch.cuda.empty_cache()
    
    # Set memory efficient options
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:32"
    
    # Check available GPU memory
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        print(f"Currently allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"Max allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
    
    # Fix potential deadlocks in dataloader
    torch.multiprocessing.set_sharing_strategy('file_system')
    
    # Get configuration and create trainer
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    
    # Train with all optimizations enabled
    model_trainer.train()
    
except Exception as e:
    print(f"Error occurred: {e}")
    # Print the full traceback for debugging
    import traceback
    traceback.print_exc()

GPU: NVIDIA GeForce GTX 1650
Total GPU memory: 4.00 GB
Currently allocated: 0.00 GB
Max allocated: 0.83 GB
[2025-04-30 09:06:21,767: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-04-30 09:06:21,770: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-30 09:06:21,771: INFO: common: created directory at: artifacts]
[2025-04-30 09:06:21,771: INFO: common: created directory at: artifacts/model_trainer]
Initial GPU memory: 0.00 MB allocated
Using device: cuda
Loading tokenizer...
Configuring 4-bit quantization...
Loading model with 4-bit quantization...
[2025-04-30 09:06:24,111: INFO: modeling: We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Preparing model for training with PEFT/LoRA...
Configuring LoRA with reduced parameters...
Applying LoRA adapter to the model...
Enabling gradient checkpointing...
trainable params: 786,432 || all params: 336,702,464 || trainable%: 0.23%
GPU memory after loading model: 620.61 MB allocated
Creating data collator...
Loading dataset...
Using 1000 examples for training (out of 14732)
Using 50 examples for validation (out of 818)
Setting up trainer with batch size: 1, grad_accum: 32
Creating trainer with ROUGE metrics...
Starting training...


Step,Training Loss,Validation Loss


Saving model and tokenizer...
Training complete!
