In [3]:
import os 


In [4]:
# Display the kernel's current working directory before changing it.
os.getcwd()

'/media/puzan/NewVolume/textsummarizer/Text-summarizer/research'

In [5]:
# Move the working directory up one level so later relative paths resolve
# from the parent of the directory the notebook was started in.
# NOTE(review): this is relative to the *current* directory, so it is not
# idempotent - re-running the cell moves up one more level each time.
os.chdir("../")

In [6]:
# Display the working directory again to confirm the move up one level.
os.getcwd()

'/media/puzan/NewVolume/textsummarizer/Text-summarizer'

In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    """Immutable settings bundle for the model-training stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    Field order is significant for positional construction and is kept as-is.
    """

    # --- locations and checkpoint -------------------------------------
    root_dir: Path      # output directory for the training stage
    data_path: Path     # location of the dataset loaded for training
    model_ckpt: Path    # checkpoint handed to the tokenizer/model loaders

    # --- training hyperparameters ---------------------------------------
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [8]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml,create_directories


In [9]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds stage configurations."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and hand the artifacts root to ``create_directories``.

        Args:
            config_filepath: Location of the YAML file holding stage settings.
            params_filepath: Location of the YAML file holding hyperparameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainingConfig:
        """Assemble the training-stage configuration.

        Locations come from the ``model_trainer`` section of the config file;
        hyperparameters come from the ``TrainingArguments`` section of the
        params file.

        Returns:
            A ``ModelTrainingConfig`` populated from the two sections.
        """
        trainer_section = self.config.model_trainer
        hyperparams = self.params.TrainingArguments

        # Hyperparameters are copied one-to-one by name, in this order.
        hyperparam_names = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
        )

        return ModelTrainingConfig(
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
            model_ckpt=trainer_section.model_ckpt,
            **{name: getattr(hyperparams, name) for name in hyperparam_names},
        )

In [10]:
from transformers import TrainingArguments,Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
from datasets import load_dataset,load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2024-09-12 21:04:00,363:INFO:config:PyTorch version 2.4.0 available.


In [11]:
class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint with the Hugging Face ``Trainer``."""

    def __init__(self, config: ModelTrainingConfig):
        """Keep the training configuration for later use by ``train``.

        Args:
            config: Locations and hyperparameters for the training run.
        """
        self.config = config

    def train(self):
        """Run a small training job and save the model and tokenizer.

        Loads the tokenizer and model from ``config.model_ckpt``, loads the
        dataset from ``config.data_path``, trains on a two-example subset and
        writes the model to ``<root_dir>/pegasus-samsum-model`` and the
        tokenizer to ``<root_dir>/tokenizer``.

        Any exception raised by the underlying libraries (for example a CUDA
        out-of-memory error) propagates to the caller.
        """
        use_cuda = torch.cuda.is_available()
        device = "cuda" if use_cuda else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        # Load the dataset from disk.
        dataset_sum_pt = load_from_disk(self.config.data_path)

        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            # The evaluation batch size deliberately reuses the training value;
            # the configuration has no separate setting for it.
            per_device_eval_batch_size=self.config.per_device_train_batch_size,
            weight_decay=self.config.weight_decay,
            logging_steps=self.config.logging_steps,
            evaluation_strategy=self.config.evaluation_strategy,
            eval_steps=self.config.eval_steps,
            save_steps=self.config.save_steps,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            # fp16 mixed precision is a CUDA feature; enabling it only when a
            # GPU is present keeps the CPU fallback chosen above usable.
            fp16=use_cuda,
        )
        print(dataset_sum_pt)

        # Tiny subsets keep the run short: the first 2 examples of each split.
        # NOTE(review): the training subset is drawn from the *test* split;
        # use dataset_sum_pt["train"] for a real training run.
        small_train_dataset = dataset_sum_pt["test"].select(range(2))
        small_eval_dataset = dataset_sum_pt["validation"].select(range(2))

        # Pass the smaller datasets to the Trainer.
        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=small_train_dataset,
            eval_dataset=small_eval_dataset,
        )

        # Run the training with the small dataset.
        trainer.train()

        # Save the model.
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
        # Save the tokenizer.
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))



In [12]:
# Pipeline: load the configuration, build the trainer and run training.
# Errors propagate unchanged, so no re-raising try/except wrapper is needed.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()

# The trainer gets its own name, so model_trainer_config keeps referring to
# the configuration object rather than being rebound to a ModelTrainer.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()
    


[2024-09-12 21:04:00,571:INFO:common:yaml_file:config/config.yaml loaded successfully
[2024-09-12 21:04:00,574:INFO:common:yaml_file:params.yaml loaded successfully
[2024-09-12 21:04:00,575:INFO:common:created directory at: artifacts


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})


  0%|          | 0/1 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 376.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 344.44 MiB is free. Including non-PyTorch memory, this process has 3.45 GiB memory in use. Of the allocated memory 3.35 GiB is allocated by PyTorch, and 17.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

/bin/bash: -c: line 1: syntax error near unexpected token `newline'
/bin/bash: -c: line 1: `kill -9 <PID>'


In [13]:
!nvidia-smi


Thu Sep 12 21:04:57 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1650        Off | 00000000:01:00.0  On |                  N/A |
| N/A   49C    P8               2W /  50W |   3559MiB /  4096MiB |     20%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    