In [1]:
# Use the %pip magic so the package is installed into the running kernel's environment.
%pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [2]:
# Project dependencies, installed with %pip so they land in the kernel's environment.
%pip install -q transformers datasets librosa evaluate jiwer gradio bitsandbytes accelerate
# NOTE(review): peft comes from the moving `main` branch; consider pinning a release tag or commit for reproducibility.
%pip install -q git+https://github.com/huggingface/peft.git@main

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m25.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#!/usr/bin/env python
# Whisper Model Fine-Tuning Script

# Import required libraries
import os
import sys
import torch
from huggingface_hub import login, Repository
from datasets import load_dataset, DatasetDict, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Configuration Parameters

# Base checkpoint to fine-tune and the Hub dataset providing the train/test splits.
MODEL_NAME = "openai/whisper-small.en"
DATASET_NAME = "Tarakeshwaran/Whisper-train-data"

# Language and task handed to the Whisper tokenizer and processor.
LANGUAGE, TASK = "English", "transcribe"

# Local working directory, and the Hub repository that is cloned into it.
OUTPUT_DIR = "./whisper-finetuned"
HUB_MODEL_ID = "Tarakeshwaran/whisper-small-en-finetuned"

def setup_git_config():
    """
    Set the global Git identity (user.email and user.name) before repository operations.

    Runs one `git config --global` command per setting. If a command cannot be
    started or exits with a non-zero status, a warning with manual setup
    instructions is printed and the process exits with status 1.
    """
    identity = (
        ("user.email", "tarakeshwaran.sampath@gmail.com"),
        ("user.name", "Tarakeshwaran"),
    )
    try:
        import subprocess

        for setting, value in identity:
            # check=True turns a failing git command into an exception handled below.
            subprocess.run(["git", "config", "--global", setting, value], check=True)
    except Exception as err:
        help_lines = [
            f"Warning: Could not set up Git configuration: {err}",
            "Please set up Git configuration manually using:",
            "git config --global user.email 'your_email@example.com'",
            "git config --global user.name 'Your Name'",
        ]
        print("\n".join(help_lines))
        sys.exit(1)

# Log in to Hugging Face using HF_TOKEN, prompting for the token when it is not set
def huggingface_login():
    """
    Authenticate with the Hugging Face Hub.

    The token is taken from the ``HF_TOKEN`` environment variable. When that
    variable is missing or empty, the user is prompted for the token with
    ``getpass`` so the secret is not echoed into the notebook output.

    If the prompt or the login call raises, the error is printed and the
    process exits with status 1.
    """
    import getpass

    try:
        token = os.environ.get('HF_TOKEN')
        if not token:
            # getpass hides the typed value; input() would leave the token visible in the cell output.
            token = getpass.getpass("Enter HF_TOKEN: ")
        login(token=token)
    except Exception as e:
        print(f"Hugging Face login failed: {e}")
        sys.exit(1)

# Set CUDA_VISIBLE_DEVICES so that only GPU index 0 is exposed to this process.
# NOTE(review): this assignment runs after `import torch`; presumably it still
# takes effect because CUDA has not been initialised yet at this point — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load and prepare dataset
def load_and_prepare_dataset():
    """
    Load the "train" and "test" splits of DATASET_NAME and prepare their columns.

    Returns:
        A DatasetDict holding both splits, with the "start" and "end" columns
        removed and the "audio" column cast to an Audio feature at 16000 Hz.
    """
    splits = DatasetDict()
    for split_name in ("train", "test"):
        splits[split_name] = load_dataset(DATASET_NAME, split=split_name)

    # Drop the "start"/"end" columns, then cast the audio column to 16 kHz.
    return (
        splits
        .remove_columns(["start", "end"])
        .cast_column("audio", Audio(sampling_rate=16000))
    )

# Initialize processor components
def initialize_processor():
    """
    Load the Whisper feature extractor, tokenizer and processor for MODEL_NAME.

    The tokenizer and the processor are both loaded with LANGUAGE and TASK.

    Returns:
        A tuple ``(feature_extractor, tokenizer, processor)``.
    """
    text_settings = {"language": LANGUAGE, "task": TASK}
    return (
        WhisperFeatureExtractor.from_pretrained(MODEL_NAME),
        WhisperTokenizer.from_pretrained(MODEL_NAME, **text_settings),
        WhisperProcessor.from_pretrained(MODEL_NAME, **text_settings),
    )

# Prepare dataset for training
def prepare_dataset(batch, feature_extractor, tokenizer):
    """
    Add model inputs and labels to a single example, modifying it in place.

    Args:
        batch: Mapping with an "audio" entry (providing "array" and
            "sampling_rate") and a "text" entry.
        feature_extractor: Callable applied to the audio array and sampling
            rate; the first item of its ``input_features`` is stored under
            "input_features".
        tokenizer: Callable applied to the text; its ``input_ids`` are stored
            under "labels".

    Returns:
        The same ``batch`` object, with "input_features" and "labels" set.
    """
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    encoded = tokenizer(batch["text"])
    batch["labels"] = encoded.input_ids
    return batch

# Custom Data Collator
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Collate examples holding "input_features" and "labels" into one padded batch.

    Attributes:
        processor: Object exposing ``feature_extractor.pad``, ``tokenizer.pad``
            and ``tokenizer.bos_token_id``.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Pad audio features and labels separately and merge them into one batch.

        Label positions where the attention mask is not 1 are replaced by -100.
        If every label row starts with the tokenizer's BOS token id, that
        first column is dropped.

        Returns:
            The padded feature batch with the processed labels under "labels".
        """
        tokenizer = self.processor.tokenizer

        # Audio inputs and text labels are padded by different components.
        audio_items = [{"input_features": item["input_features"]} for item in features]
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        label_items = [{"input_ids": item["labels"]} for item in features]
        padded_labels = tokenizer.pad(label_items, return_tensors="pt")

        # Positions outside the attention mask are set to -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the leading BOS column only when every row starts with it.
        starts_with_bos = labels[:, 0] == tokenizer.bos_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Compute Metrics
_WER_METRIC_CACHE = {}


def _get_wer_metric():
    """Return the WER metric object, loading it through `evaluate` only on first use."""
    if "wer" not in _WER_METRIC_CACHE:
        _WER_METRIC_CACHE["wer"] = evaluate.load("wer")
    return _WER_METRIC_CACHE["wer"]


def compute_metrics(pred, tokenizer):
    """
    Compute the word error rate (as a percentage) for a set of predictions.

    Args:
        pred: Object with ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, where -100 marks positions to
            ignore). ``label_ids`` is modified in place: every -100 is
            replaced by the tokenizer's pad token id.
        tokenizer: Object providing ``pad_token_id`` and ``batch_decode``.

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the metric's result.
    """
    # The metric is loaded once and reused; this function runs at every evaluation step.
    wer_metric = _get_wer_metric()
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 with pad token so the labels can be decoded
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Decode predictions and labels
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER
    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Main Training Function
def train_whisper_model():
    """
    Fine-tune MODEL_NAME with LoRA on DATASET_NAME and push the result to HUB_MODEL_ID.

    Steps, in order:
      1. Configure Git and log in to Hugging Face.
      2. Clone HUB_MODEL_ID into OUTPUT_DIR.
      3. Load the dataset and convert each example to input features and labels.
      4. Load the model in 8-bit, prepare it for k-bit training and wrap it
         with LoRA adapters on the "q_proj" and "v_proj" modules.
      5. Train with Seq2SeqTrainer, evaluating with WER.
      6. Save the processor components, model config and model inside the
         cloned repository and push it.

    The final artifacts are written under OUTPUT_DIR, the working tree of the
    cloned repository, so that ``repo.push_to_hub()`` includes them. Saving
    them relative to the current working directory would place them outside
    the repository and they would never be uploaded.
    """
    # Setup Git and Hugging Face login first
    setup_git_config()
    huggingface_login()

    # Create a local repository (clone of the Hub repo) in OUTPUT_DIR
    repo = Repository(local_dir=OUTPUT_DIR, clone_from=HUB_MODEL_ID)

    # Load dataset
    whisper_data = load_and_prepare_dataset()

    # Initialize processor components
    feature_extractor, tokenizer, processor = initialize_processor()

    # Prepare dataset: compute features/labels and drop the original columns
    def prepare_fn(batch):
        """Bind the feature extractor and tokenizer for use with `map`."""
        return prepare_dataset(batch, feature_extractor, tokenizer)

    whisper_data = whisper_data.map(
        prepare_fn,
        remove_columns=whisper_data.column_names["train"],
        num_proc=2
    )

    # Initialize data collator
    data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

    # Load model with 8-bit quantization
    model = WhisperForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        load_in_8bit=True,
        device_map="auto"
    )

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # Forward hook on the encoder's first conv layer that marks its output as
    # requiring grad.
    def make_inputs_require_grad(module, input, output):
        output.requires_grad_(True)
    model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

    # Configure LoRA
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none"
    )
    model = get_peft_model(model, peft_config)

    # Training Arguments
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
    training_args = Seq2SeqTrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        learning_rate=1e-3,
        warmup_steps=50,
        max_steps=500,
        fp16=True,
        evaluation_strategy="steps",
        save_steps=10,
        eval_steps=10,
        logging_steps=10,
        per_device_eval_batch_size=8,
        generation_max_length=128,
        predict_with_generate=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        remove_unused_columns=False,
        label_names=["labels"],
    )

    # Initialize Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=whisper_data["train"],
        eval_dataset=whisper_data["test"],
        data_collator=data_collator,
        compute_metrics=lambda pred: compute_metrics(pred, tokenizer),
        tokenizer=processor.tokenizer,
    )

    # Disable cache for training
    model.config.use_cache = False

    # Train the model
    trainer.train()

    # Save and Push Components to Hub
    # Everything is saved inside the cloned repository (OUTPUT_DIR) so that the
    # push below actually uploads it.
    # 1. Save Processor
    feature_extractor.save_pretrained(os.path.join(OUTPUT_DIR, "feature_extractor"))
    tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "tokenizer"))
    processor.save_pretrained(os.path.join(OUTPUT_DIR, "processor"))

    # 2. Save Model Configuration
    model.config.save_pretrained(os.path.join(OUTPUT_DIR, "config"))

    # 3. Save and Push Entire Model
    model.save_pretrained(os.path.join(OUTPUT_DIR, "model"), save_config=True)
    repo.push_to_hub()

    print(f"Model successfully trained and pushed to {HUB_MODEL_ID}")

# Run the full training pipeline when this code is executed as the main module
if __name__ == "__main__":
    train_whisper_model()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/./whisper-finetuned is already a clone of https://huggingface.co/Tarakeshwaran/whisper-small-en-finetuned. Make sure you pull the latest changes with `repo.git_pull()`.


KeyboardInterrupt: 