In [1]:
import os

In [2]:
%pwd

'/workspaces/endtoend/research'

In [3]:
os.chdir("/workspaces/endtoend")

In [5]:
from datasets import load_dataset
dataset = load_dataset('vladimir7542/test_dataset')
dataset.save_to_disk("artifacts/data_ingestion/test.hf")

Downloading readme:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.66M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    updated_base_model_path: Path
    training_data: Path
    size: Path
    data_size: int
    max_steps: int
    language: str
    task: str
    metric: str
    learning_rate: float
    max_steps: int
    generation_max_length: int
    push_to_hub: bool

In [7]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        training = self.config.training
        prepare_base_model = self.config.prepare_base_model
        params = self.params
        training_data = os.path.join(self.config.data_ingestion.root_dir, "test.hf")
        print(training_data)
        create_directories([
            Path(training.root_dir)
        ])

        training_config = TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            updated_base_model_path=Path(prepare_base_model.updated_base_model_path),
            training_data=Path(training_data),
            size = params.SIZE,
            data_size=params.DATA_SIZE,
            max_steps=params.MAX_STEPS,
            language=params.LANGUAGE,
            task=params.TASK,
            metric=params.METRIC,
            learning_rate=params.LEARNING_RATE,
            generation_max_length=params.GENERATION_MAX_LENGTH,
            push_to_hub=params.PUSH_TO_HUB
        )

        return training_config

In [10]:
from transformers import (WhisperTokenizer, 
                          WhisperFeatureExtractor, 
                          WhisperProcessor,
                          WhisperForConditionalGeneration)

import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from cnnClassifier.utils.common import DataCollatorSpeechSeq2SeqWithPadding
from cnnClassifier.utils.common import compute_metrics
import evaluate
from transformers import Seq2SeqTrainingArguments
from datasets import load_dataset
from transformers import Seq2SeqTrainer

In [11]:
class Training:
    def __init__(self, config: TrainingConfig):
        self.config = config

    
    def get_base_model(self):
        self.tokenizer = WhisperTokenizer.from_pretrained(self.config.size, 
                                                          language=self.config.language, 
                                                          task=self.config.task)
        
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(self.config.size)

        self.processor = WhisperProcessor.from_pretrained(self.config.size, 
                                                          language=self.config.language, 
                                                          task=self.config.task)
        
        self.model = WhisperForConditionalGeneration.from_pretrained(self.config.updated_base_model_path)

    def configure_trainig_arguments(self):

        self.model.generation_config.language = self.config.language
        self.model.generation_config.task = self.config.task
        self.model.generation_config.forced_decoder_ids = None
        
        self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
            processor=self.processor,
            decoder_start_token_id=self.model.config.decoder_start_token_id
        )
        
        self.metric = evaluate.load(self.config.metric)

        self.training_args = Seq2SeqTrainingArguments(
            output_dir=self.config.trained_model_path,
            learning_rate=1e-5,
            max_steps=self.config.max_steps,
            generation_max_length=self.config.generation_max_length,
            metric_for_best_model=self.config.metric,
            push_to_hub=self.config.push_to_hub
        )

        self.dataset = load_dataset(str(self.config.training_data))
        

    def configure_trainer(self):

        dataset = self.dataset
        data_size = self.config.data_size
        indices = [i for i in range(0, data_size)]

        dataset['train'] = dataset['train'].select(indices)
        dataset['test'] = dataset['train'].select(indices)

        self.trainer = Seq2SeqTrainer(
            args=self.training_args,
            model=self.model,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            data_collator=self.data_collator,
            compute_metrics=compute_metrics,
            tokenizer=self.processor.feature_extractor,
        )

    @staticmethod
    def save_model(path: Path, model: WhisperForConditionalGeneration):
        model.save_pretrained(path, from_pt=True)

    
    def train(self):
        
        self.trainer.train()

        self.save_model(
            path=self.config.trained_model_path,
            model=self.model
        )

In [12]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.get_base_model()
    training.configure_trainig_arguments()
    training.configure_trainer()
    training.train()
    
except Exception as e:
    raise e

[2024-08-12 05:53:47,943: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-08-12 05:53:47,950: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-12 05:53:47,953: INFO: common: created directory at: artifacts]
artifacts/data_ingestion/test.hf
[2024-08-12 05:53:47,955: INFO: common: created directory at: artifacts/training]


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [_data_files, _format_kwargs, _output_all_columns, _split, _format_columns, _fingerprint, _format_type]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.