In [None]:
import os
import torch
import torchaudio
import pandas as pd
from torch.utils.data import Dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from dataclasses import dataclass
from typing import Any, Dict, List
import gdown

# Mount Google Drive so the audio files stored there are reachable from this runtime
from google.colab import drive
drive.mount('/content/drive')

# Download the CSV that maps audio file names to transcriptions
csv_file_id = "1oFlf6m0p5_bS8Cnv7q3Mj6cRUAC-oAID"
csv_file_path = "/content/your_dataset.csv"
downloaded_csv = gdown.download(f"https://drive.google.com/uc?id={csv_file_id}", csv_file_path, quiet=False)

# Fail here, with a clear message, instead of later inside pd.read_csv
if downloaded_csv is None or not os.path.isfile(csv_file_path):
    raise FileNotFoundError(f"Dataset CSV could not be downloaded to {csv_file_path}")

# Path to audio files
audio_dir = "/content/drive/My Drive/audio_dataset/"

# os.listdir below needs a directory, so check for one rather than for any path
if not os.path.isdir(audio_dir):
    print("Audio directory does not exist!")
    raise FileNotFoundError(f"Audio directory not found at {audio_dir}")

print("Audio directory exists!")
audio_files = os.listdir(audio_dir)
print(f"Number of audio files: {len(audio_files)}")
print(f"First few files: {audio_files[:5]}")


class WhisperDataset(Dataset):
    """Dataset of (audio file, transcription) pairs prepared for Whisper.

    Rows come from a CSV read with ISO-8859-1 encoding. Column 0 is joined
    onto ``audio_dir`` to locate the audio file; column 1 is used as the
    transcription text.
    """

    def __init__(self, csv_file, audio_dir, processor):
        """Read the CSV and remember where audio lives and how to process it.

        Args:
            csv_file: Path to the CSV listing audio file names and transcriptions.
            audio_dir: Directory that the file names in column 0 are relative to.
            processor: Callable used both for audio (positional input) and for
                text (``text=`` keyword), e.g. a ``WhisperProcessor``.
        """
        self.data = pd.read_csv(csv_file, encoding='ISO-8859-1')
        self.audio_dir = audio_dir
        self.processor = processor

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    @staticmethod
    def _prepare_waveform(waveform, sample_rate, target_rate=16000):
        """Resample to ``target_rate`` and collapse the result to a 1-D signal.

        Assumes ``waveform`` is channels-first, as produced by ``torchaudio.load``.
        """
        # Only build a resampler when the rates actually differ.
        if sample_rate != target_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
            waveform = resampler(waveform)

        # squeeze() only drops a size-1 channel axis, so stereo audio would stay
        # 2-D; averaging over the channel axis handles mono and multi-channel alike.
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0)
        return waveform

    def __getitem__(self, idx):
        """Load one example and return its model inputs.

        Returns:
            A dict with ``"input_features"`` (processor output for the audio,
            batch axis removed) and ``"labels"`` (token ids for the
            transcription, batch axis removed).
        """
        audio_path = os.path.join(self.audio_dir, self.data.iloc[idx, 0])
        transcription = self.data.iloc[idx, 1]

        # Load audio and bring it to mono 16 kHz
        waveform, sample_rate = torchaudio.load(audio_path)
        waveform = self._prepare_waveform(waveform, sample_rate)

        # Process audio input
        inputs = self.processor(waveform.numpy(), return_tensors="pt", sampling_rate=16000)

        # Process transcription labels
        labels = self.processor(text=transcription, return_tensors="pt").input_ids

        return {
            "input_features": inputs.input_features.squeeze(0),
            "labels": labels.squeeze(0)
        }

# Load processor and model
pretrained_model_name = "openai/whisper-small"
language = "en"
# Pass the task together with the language so the tokenizer's label prefix
# names the transcription task instead of leaving it unset.
processor = WhisperProcessor.from_pretrained(pretrained_model_name, language=language, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(pretrained_model_name)

# Initialize the dataset
dataset = WhisperDataset(csv_file=csv_file_path, audio_dir=audio_dir, processor=processor)

# Custom data collator
@dataclass
class DataCollatorForWhisper:
    """Collate per-example dicts into one padded training batch.

    Each feature must provide ``"input_features"`` and ``"labels"`` tensors.
    Label padding uses ``-100`` so padded positions are ignored by the loss.
    """

    # Processor whose tokenizer supplies the start-of-transcript token id.
    processor: WhisperProcessor

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad and stack a list of examples.

        Args:
            features: Examples, each with ``"input_features"`` and ``"labels"``.

        Returns:
            A dict with batched ``"input_features"`` and ``"labels"`` tensors.
        """
        input_features = [feature["input_features"] for feature in features]
        labels = [feature["labels"] for feature in features]

        batch_input_features = torch.nn.utils.rnn.pad_sequence(input_features, batch_first=True, padding_value=0)

        # Pad labels with -100 (the index ignored by cross-entropy) instead of the
        # pad token id, so padding never contributes to the loss.
        batch_labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

        # The model prepends the decoder start token itself when it derives the
        # decoder inputs from the labels, so a start token already present at
        # position 0 of every label sequence would end up duplicated. Drop it.
        start_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|startoftranscript|>")
        if (
            start_token_id is not None
            and batch_labels.size(1) > 0
            and bool((batch_labels[:, 0] == start_token_id).all())
        ):
            batch_labels = batch_labels[:, 1:]

        return {
            "input_features": batch_input_features,
            "labels": batch_labels
        }

data_collator = DataCollatorForWhisper(processor)

# Training arguments
# The schedule is sized for a small dataset: the run recorded below had 50
# clips, which at an effective batch size of 16 over 3 epochs is only about a
# dozen optimizer steps. Step-based thresholds in the hundreds never trigger.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # NOTE(review): 1e-4 is on the high side for full fine-tuning; consider
    # lowering it now that warmup actually completes.
    learning_rate=1e-4,
    # A ratio scales with the run length; a fixed 500-step warmup would keep the
    # learning rate near zero for the whole of a short run.
    warmup_ratio=0.1,
    num_train_epochs=3,
    # No evaluation strategy is set because the Trainer below receives no
    # eval_dataset; requesting step-wise evaluation without one is an error.
    save_strategy="steps",
    save_steps=500,
    # Log every step so the loss of a short run is actually reported.
    logging_steps=1,
    predict_with_generate=True,
    save_total_limit=2,
    report_to="none",  # disable external experiment trackers
)

# Instantiate the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor.tokenizer,  # Use tokenizer
    data_collator=data_collator,  # Use the custom data collator
)

# Train the model
trainer.train()

# Save the fine-tuned model and processor in separate folders
model_dir = "./whisper-finetuned-model"
processor_dir = "./whisper-finetuned-processor"

model.save_pretrained(model_dir)
processor.save_pretrained(processor_dir)

print(f"Training completed successfully. Model saved in {model_dir} and processor saved in {processor_dir}.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Downloading...
From: https://drive.google.com/uc?id=1oFlf6m0p5_bS8Cnv7q3Mj6cRUAC-oAID
To: /content/your_dataset.csv
100%|██████████| 3.10k/3.10k [00:00<00:00, 3.45MB/s]


Audio directory exists!
Number of audio files: 50
First few files: ['login_02.wav', 'login_03.wav', 'login_01.wav', 'login_04.wav', 'login_05.wav']


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]



Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618

Training completed successfully. Model saved in ./whisper-finetuned-model and processor saved in ./whisper-finetuned-processor.


In [None]:
# Bundle the fine-tuned model and processor as ZIP archives and download them

import shutil
from google.colab import files

# Directories written by the training cell
model_dir = "./whisper-finetuned-model"
processor_dir = "./whisper-finetuned-processor"
export_dirs = (model_dir, processor_dir)

# Create "<dir>.zip" next to each directory, containing that directory's contents
for export_dir in export_dirs:
    shutil.make_archive(export_dir, 'zip', export_dir)

# Trigger a browser download for each archive, in the same order
for export_dir in export_dirs:
    files.download(f"{export_dir}.zip")
