In [1]:
import os
from text_summarizer.logging import logging
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    The first three fields locate the stage's inputs and outputs; the rest are
    training hyper-parameters. Instances are frozen, so values cannot be
    reassigned after construction. Field order is part of the interface
    (positional construction) and must stay as declared.
    """

    # --- locations -----------------------------------------------------
    root_dir: Path    # directory of the training stage (created by the config manager)
    data_path: Path   # location of the dataset to train on
    model_ckpt: Path  # checkpoint identifier handed to `from_pretrained`

    # --- hyper-parameters ----------------------------------------------
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    optim: str
    eval_steps: int
    save_steps: int
    # NOTE: the spelling "accumlation" is the public field name; keep it.
    gradient_accumlation_steps: int

In [2]:
from text_summarizer.constants import *
from text_summarizer.utils.comon import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML config and params files and builds stage configs."""

    def __init__(
        self,
        config_file_path=PROJECT_DIR.joinpath(CONFIG_FILE_PATH),
        params_file_path=PROJECT_DIR.joinpath(PARAMS_FILE_PATH),
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_file_path: Path of the config YAML; defaults to
                ``CONFIG_FILE_PATH`` joined onto ``PROJECT_DIR``.
            params_file_path: Path of the params YAML; defaults to
                ``PARAMS_FILE_PATH`` joined onto ``PROJECT_DIR``.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        artifacts_dir = PROJECT_DIR.joinpath(self.config.artifacts_root)
        create_directories([artifacts_dir])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the training-stage configuration.

        Creates the stage's ``root_dir`` under ``PROJECT_DIR`` and combines the
        ``model_trainer`` section of the config file with the
        ``TrainingArguments`` section of the params file. Numeric params are
        cast explicitly to ``int``/``float``.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        stage_cfg = self.config.model_trainer
        train_args = self.params.TrainingArguments

        stage_dir = PROJECT_DIR.joinpath(stage_cfg.root_dir)
        create_directories([stage_dir])

        # Path-like values are passed through exactly as stored in the config.
        return ModelTrainerConfig(
            root_dir=stage_cfg.root_dir,
            data_path=stage_cfg.data_path,
            model_ckpt=stage_cfg.model_ckpt,
            num_train_epochs=int(train_args.num_train_epochs),
            warmup_steps=int(train_args.warmup_steps),
            per_device_train_batch_size=int(train_args.per_device_train_batch_size),
            per_device_eval_batch_size=int(train_args.per_device_eval_batch_size),
            weight_decay=float(train_args.weight_decay),
            logging_steps=int(train_args.logging_steps),
            evaluation_strategy=train_args.evaluation_strategy,
            eval_steps=int(train_args.eval_steps),
            save_steps=int(train_args.save_steps),
            gradient_accumlation_steps=int(train_args.gradient_accumlation_steps),
            optim=train_args.optim,
        )

In [3]:
import torch
from datasets import load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer




  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint on a dataset stored on disk and saves it."""

    def __init__(self, config:ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Paths and hyper-parameters for this training run.
        """
        self.config = config

    @staticmethod
    def _report_gpu_memory(header):
        """Print allocated and reserved memory of GPU 0 beneath ``header``.

        Skipped entirely when CUDA is unavailable, so CPU-only runs do not
        print GPU statistics.

        Args:
            header: Line printed before the two memory figures.
        """
        if not torch.cuda.is_available():
            return
        gib = 1024**3
        print(header)
        print('Allocated:', round(torch.cuda.memory_allocated(0)/gib,1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(0)/gib,1), 'GB')

    def train(self):
        """Run fine-tuning and persist the resulting model and tokenizer.

        Steps: pick the device, load tokenizer and model from
        ``config.model_ckpt``, load the dataset from ``config.data_path``,
        train with the configured ``TrainingArguments``, then save the model
        to ``<root_dir>/pegasus_model_samsum`` and the tokenizer to
        ``<root_dir>/tokenizer`` (both relative to ``PROJECT_DIR``).
        """
        # Select the device and release cached GPU memory before loading.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print("Using device:", device)
        if device == "cuda":
            torch.cuda.empty_cache()
        self._report_gpu_memory('Memory Usage Befor load model to device:')

        # Load tokenizer and model, then build the padding collator.
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer,model=model_pegasus)

        self._report_gpu_memory('Memory Usage After load model to device:')

        # Resolve stage paths once; the libraries below are given plain strings
        # because their path parameters are documented as ``str``.
        output_dir = PROJECT_DIR.joinpath(self.config.root_dir)
        dataset_dir = PROJECT_DIR.joinpath(self.config.data_path)

        dataset_samsum_pt = load_from_disk(str(dataset_dir))

        # NOTE(review): ``config.optim`` is loaded from params but deliberately
        # not forwarded here, matching the existing behaviour -- confirm
        # whether it should be passed as ``optim=``.
        trainer_args = TrainingArguments(
            output_dir=str(output_dir),
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            weight_decay=self.config.weight_decay,
            logging_steps=self.config.logging_steps,
            evaluation_strategy=self.config.evaluation_strategy,
            eval_steps=self.config.eval_steps,
            save_steps=self.config.save_steps,
            gradient_accumulation_steps=self.config.gradient_accumlation_steps,
        )

        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_samsum_pt["train"],
            eval_dataset=dataset_samsum_pt["validation"],
        )
        trainer.train()

        # Persist the fine-tuned weights and the tokenizer next to each other.
        model_pegasus.save_pretrained(str(output_dir / "pegasus_model_samsum"))
        tokenizer.save_pretrained(str(output_dir / "tokenizer"))


In [5]:
# Run the model-training stage end to end. Exceptions are left to propagate
# unchanged so the notebook shows the full traceback of any failure.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2023-06-05 21:46:39,190: text_summarizer_logger - INFO:  comon: line 33, yaml file: G:\currentProjects\AI\text_summarizer\config\config.yaml loaded successfully]
[2023-06-05 21:46:39,204: text_summarizer_logger - INFO:  comon: line 33, yaml file: G:\currentProjects\AI\text_summarizer\params.yaml loaded successfully]
[2023-06-05 21:46:39,209: text_summarizer_logger - INFO:  comon: line 64, created directory: G:\currentProjects\AI\text_summarizer\artifacts]
[2023-06-05 21:46:39,213: text_summarizer_logger - INFO:  comon: line 64, created directory: G:\currentProjects\AI\text_summarizer\artifacts\model_trainer]
Using device: cuda
Memory Usage Befor load model to device:
Allocated: 0.0 GB
Cached:    0.0 GB
Memory Usage After load model to device:
Allocated: 2.1 GB
Cached:    2.1 GB


  0%|          | 0/51 [00:00<?, ?it/s]You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 3.47 GiB already allocated; 0 bytes free; 3.51 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Shell escape: print the NVIDIA driver's report of GPU state and memory usage.
!nvidia-smi

Mon Jun  5 21:17:20 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.79                 Driver Version: 531.79       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce MX150          WDDM | 00000000:01:00.0 Off |                  N/A |
| N/A   78C    P8               N/A /  N/A|   4018MiB /  4096MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    