In [1]:
import os

In [2]:
%pwd

'/workspaces/endtoend/research'

In [3]:
os.chdir("/workspaces/endtoend")

In [10]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    updated_base_model_path: Path
    training_data: Path
    size: Path
    data_size: int
    max_steps: int
    language: str
    task: str
    metric: str
    learning_rate: float
    max_steps: int
    generation_max_length: int
    push_to_hub: bool

In [11]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [18]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        training = self.config.training
        prepare_base_model = self.config.prepare_base_model
        params = self.params
        training_data = os.path.join(self.config.data_ingestion.root_dir, "data.hf")
        print(training_data)
        create_directories([
            Path(training.root_dir)
        ])

        training_config = TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            updated_base_model_path=Path(prepare_base_model.updated_base_model_path),
            training_data=Path(training_data),
            size = params.SIZE,
            data_size=params.DATA_SIZE,
            max_steps=params.MAX_STEPS,
            language=params.LANGUAGE,
            task=params.TASK,
            metric=params.METRIC,
            learning_rate=params.LEARNING_RATE,
            generation_max_length=params.GENERATION_MAX_LENGTH,
            push_to_hub=params.PUSH_TO_HUB
        )

        return training_config

In [19]:
from transformers import (WhisperTokenizer, 
                          WhisperFeatureExtractor, 
                          WhisperProcessor,
                          WhisperForConditionalGeneration)

import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from cnnClassifier.utils.common import DataCollatorSpeechSeq2SeqWithPadding
from cnnClassifier.utils.common import compute_metrics
import evaluate
from transformers import Seq2SeqTrainingArguments
from datasets import load_dataset
from transformers import Seq2SeqTrainer

In [34]:
class Training:
    def __init__(self, config: TrainingConfig):
        self.config = config

    
    def get_base_model(self):
        self.tokenizer = WhisperTokenizer.from_pretrained(self.config.size, 
                                                          language=self.config.language, 
                                                          task=self.config.task)
        
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(self.config.size)

        self.processor = WhisperProcessor.from_pretrained(self.config.size, 
                                                          language=self.config.language, 
                                                          task=self.config.task)
        
        self.model = WhisperForConditionalGeneration.from_pretrained(self.config.updated_base_model_path)

    def configure_trainig_arguments(self):

        self.model.generation_config.language = self.config.language
        self.model.generation_config.task = self.config.task
        self.model.generation_config.forced_decoder_ids = None
        
        self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
            processor=self.processor,
            decoder_start_token_id=self.model.config.decoder_start_token_id
        )
        
        self.metric = evaluate.load(self.config.metric)

        self.training_args = Seq2SeqTrainingArguments(
            output_dir=self.config.trained_model_path,
            learning_rate=1e-5,
            max_steps=self.config.max_steps,
            generation_max_length=self.config.generation_max_length,
            metric_for_best_model=self.config.metric,
            push_to_hub=self.config.push_to_hub
        )

        self.dataset = load_dataset(str(self.config.training_data))
        

    def configure_trainer(self):

        dataset = self.dataset
        data_size = self.config.data_size
        indices = [i for i in range(0, data_size)]

        dataset['train'] = dataset['train'].select(indices)
        dataset['test'] = dataset['train'].select(indices)

        self.trainer = Seq2SeqTrainer(
            args=self.training_args,
            model=self.model,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            data_collator=self.data_collator,
            compute_metrics=compute_metrics,
            tokenizer=self.processor.feature_extractor,
)

    @staticmethod
    def save_model(path: Path, model: WhisperForConditionalGeneration):
        model.save_pretrained(path, from_pt=True)

    
    def train(self):
        
        self.trainer.train()

        self.save_model(
            path=self.config.trained_model_path,
            model=self.model
        )

In [35]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.get_base_model()
    training.configure_trainig_arguments()
    training.configure_trainer()
    training.train()
    
except Exception as e:
    raise e

[2024-08-12 03:14:17,437: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-08-12 03:14:17,445: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-12 03:14:17,446: INFO: common: created directory at: artifacts]
artifacts/data_ingestion/data.hf
[2024-08-12 03:14:17,448: INFO: common: created directory at: artifacts/training]


max_steps is given, it will override any value given in num_train_epochs
Trainer is attempting to log a value of "[1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362]" for key "suppress_tokens" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute. You can use `MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters and avoid this message.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `En

Step,Training Loss


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635