# Whisper LoRA Fine-tuning for Heve AI

This notebook fine-tunes OpenAI Whisper using LoRA (Low-Rank Adaptation) on your collected speech data.

**Requirements:**
- GPU runtime (T4, V100, or A100 recommended)
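- 100+ corrected transcription samples recommended (the notebook stops if fewer than 10 are available)
- Upload your `training/data/` folder (containing `transcriptions.csv` and an `audio/` subfolder) to this environment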

## 1. Install Dependencies

In [None]:
!pip install transformers datasets peft bitsandbytes accelerate librosa soundfile evaluate jiwer

## 2. Import Libraries

In [None]:
import pandas as pd
import torch
from pathlib import Path
from datasets import Dataset, Audio
from transformers import (
    WhisperProcessor, 
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import numpy as np

## 3. Load and Prepare Data

In [None]:
# Load your training data
data_dir = Path("training/data")
csv_file = data_dir / "transcriptions.csv"
audio_dir = data_dir / "audio"

# Read CSV and filter for corrected transcriptions.
# fillna/strip also rejects whitespace-only corrections, which would
# otherwise become empty training targets.
df = pd.read_csv(csv_file)
has_correction = (
    df['corrected_transcription'].fillna('').astype(str).str.strip() != ''
)
corrected_df = df[has_correction]

print(f"Total samples: {len(df)}")
print(f"Corrected samples: {len(corrected_df)}")

# Prepare dataset: keep only rows whose audio recording is actually present
training_data = []
for _, row in corrected_df.iterrows():
    audio_name = row['audio_file']
    # A blank CSV cell is read back as NaN (a float), which cannot be joined
    # onto a path; an empty name would resolve to the audio directory itself.
    if not isinstance(audio_name, str) or not audio_name.strip():
        continue
    audio_path = audio_dir / audio_name
    # is_file() rather than exists(): a directory is not a usable recording
    if audio_path.is_file():
        training_data.append({
            'audio': str(audio_path),
            'sentence': row['corrected_transcription']
        })

print(f"Training samples with audio: {len(training_data)}")

# Validate AFTER the audio filter so the threshold applies to samples that
# can really be trained on, not just to rows present in the CSV.
if len(training_data) < 10:
    raise ValueError("Need at least 10 corrected samples for training")

## 4. Create Dataset

In [None]:
# Build the Hugging Face dataset and declare the "audio" column as an Audio
# feature with a 16 kHz sampling rate
dataset = Dataset.from_list(training_data).cast_column(
    "audio", Audio(sampling_rate=16000)
)

# Hold out 20% of the samples for validation; the fixed seed keeps the split
# the same from run to run
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = dataset['train'], dataset['test']

print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")

## 5. Load Model with LoRA

In [None]:
# Load processor and model
model_name = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(model_name)

# Quantization config for memory efficiency: base weights are stored as
# 4-bit NF4 while matrix multiplications are computed in float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load model with quantization
model = WhisperForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# LoRA configuration
# task_type is deliberately left unset: the task-specific PEFT wrappers
# forward an `input_ids` argument to the base model, which Whisper's
# forward() (driven by `input_features`) does not accept. Without a task
# type PEFT uses its generic wrapper, which passes arguments through as-is.
lora_config = LoraConfig(
    r=32,  # rank
    lora_alpha=64,  # scaling parameter
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],
    lora_dropout=0.1,
    bias="none",
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 6. Data Preprocessing

In [None]:
def preprocess_function(examples):
    """Convert one dataset row into model inputs and label token ids.

    Args:
        examples: A single (un-batched) row holding an "audio" entry with
            "array" and "sampling_rate", and a "sentence" string.

    Returns:
        The same mapping with "input_features" (the processor's features for
        the recording) and "labels" (token ids of the sentence) added.
    """
    audio = examples["audio"]

    # Process audio
    inputs = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt"
    )

    # Process text. Only one sentence is tokenized per call, so no padding is
    # applied here; padding (and masking it with -100) belongs in the batch
    # collator. Masking by pad_token_id at this point is harmful for Whisper:
    # its pad token is the same id as the end-of-text token, so the mask would
    # erase the real EOS from every label and the model would never be
    # trained to stop.
    labels = processor.tokenizer(
        examples["sentence"],
        truncation=True,
        max_length=448
    ).input_ids

    # The processor returns a batch of one; store the single item
    examples["input_features"] = inputs.input_features[0]
    examples["labels"] = labels

    return examples

# Run the preprocessing over both splits (train first, then eval), dropping
# each split's original columns so only the function's new fields remain
train_dataset, eval_dataset = (
    split.map(preprocess_function, remove_columns=split.column_names)
    for split in (train_dataset, eval_dataset)
)

## 7. Training Configuration

In [None]:
# Training arguments
# load_best_model_at_end requires evaluation to be enabled and to run on the
# same schedule as checkpointing; with the default (no evaluation) the
# arguments are rejected at construction time. Per-epoch scheduling is used
# because a dataset of ~100 samples finishes in far fewer than 500 optimizer
# steps, so step-based intervals of 500 would never evaluate or checkpoint.
# `eval_strategy` is the current argument name; older transformers releases
# call it `evaluation_strategy`.
# NOTE(review): warmup_steps=100 and logging_steps=50 may also exceed the
# total step count on small datasets - confirm against your sample count.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-lora-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    num_train_epochs=5,
    warmup_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=2,
    remove_unused_columns=False,  # keep "input_features"/"labels" for the PEFT-wrapped model
    label_names=["labels"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval loss is better
    fp16=True,
    dataloader_pin_memory=False,
)

# Evaluation metric
wer_metric = evaluate.load("wer")

def compute_metrics(eval_pred):
    """Compute word error rate for one evaluation pass.

    Args:
        eval_pred: A (predictions, labels) pair. Predictions may be token ids,
            raw logits with a trailing vocabulary axis, or a tuple of model
            outputs whose first element is the logits.

    Returns:
        A dict with a single "wer" entry.
    """
    predictions, labels = eval_pred
    pad_id = processor.tokenizer.pad_token_id

    # When evaluation does not run generation, the trainer hands over model
    # outputs rather than token ids: take the logits and pick the most likely
    # token at each position.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # Negative ids (e.g. -100 padding) cannot be decoded
    predictions = np.where(predictions < 0, pad_id, predictions)

    # Decode predictions and labels
    decoded_preds = processor.tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute WER
    wer = wer_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {"wer": wer}

## 8. Train Model

In [None]:
def collate_speech_batch(features):
    """Assemble preprocessed rows into one padded batch.

    Args:
        features: List of rows, each with "input_features" and "labels".

    Returns:
        A dict with a float "input_features" tensor and a long "labels"
        tensor in which padded positions are -100 so the loss ignores them.
    """
    input_features = torch.tensor(
        np.asarray([row["input_features"] for row in features]),
        dtype=torch.float32,
    )

    # Label sequences differ in length, so pad them to the longest one
    label_rows = [torch.as_tensor(row["labels"], dtype=torch.long) for row in features]
    labels = torch.nn.utils.rnn.pad_sequence(
        label_rows, batch_first=True, padding_value=-100
    )

    # The model prepends the decoder start token itself when it derives the
    # decoder inputs from the labels; drop it here if the tokenizer already
    # added it, otherwise it would appear twice.
    start_id = model.config.decoder_start_token_id
    if labels.size(1) > 0 and (labels[:, 0] == start_id).all().item():
        labels = labels[:, 1:]

    return {"input_features": input_features, "labels": labels}


def logits_to_token_ids(logits, labels):
    """Reduce logits to token ids so full-vocabulary logits are not accumulated."""
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Create trainer
# An explicit collator is required because label sequences have different
# lengths. The feature extractor is no longer passed as `tokenizer`: it is
# not a tokenizer, and the argument was only used to pick a default collator
# and to save the preprocessor (done explicitly below).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_speech_batch,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=logits_to_token_ids,
)

# Start training
print("🚀 Starting LoRA fine-tuning...")
trainer.train()

# Save the final model, plus the processor so the output directory is
# self-contained for inference
trainer.save_model()
processor.save_pretrained(training_args.output_dir)
print("✅ Training complete! Model saved to ./whisper-lora-finetuned")

## 9. Test Fine-tuned Model

In [None]:
# Test on a sample
test_sample = eval_dataset[0]

# Dataset rows are returned as plain Python lists, so rebuild a tensor, add
# the batch dimension and move it to the device the model lives on
input_features = (
    torch.as_tensor(test_sample["input_features"], dtype=torch.float32)
    .unsqueeze(0)
    .to(model.device)
)
device_type = input_features.device.type

# Generate prediction (eval mode turns off the LoRA dropout; autocast lets the
# float32 features run through half-precision layers on GPU)
model.eval()
with torch.no_grad(), torch.autocast(device_type=device_type, enabled=device_type == "cuda"):
    predicted_ids = model.generate(input_features=input_features, max_length=448)

# Decode prediction
transcription = processor.tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
print(f"Fine-tuned prediction: {transcription}")

# Compare with original label; -100 marks ignored positions and is not a
# real token id, so drop it before decoding
label_ids = np.asarray(test_sample["labels"])
label_ids = label_ids[label_ids != -100].tolist()
original_label = processor.tokenizer.decode(label_ids, skip_special_tokens=True)
print(f"Ground truth: {original_label}")

## 10. Download Model for Local Use

In [None]:
# Create a zip file with the LoRA weights
import shutil

# Zip the model directory; writes whisper-lora-finetuned.zip into the current
# working directory
shutil.make_archive("whisper-lora-finetuned", "zip", "./whisper-lora-finetuned")

print("📦 Model packaged as whisper-lora-finetuned.zip")
print("💾 Download this file and extract it to your local Heve AI directory")
print("🔧 Update src/asr.py to load the fine-tuned model:")
# The adapter must be attached to the same Hugging Face model class it was
# trained on, loaded with from_pretrained
print("   from peft import PeftModel")
print("   from transformers import WhisperForConditionalGeneration")
print("   base_model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-base')")
print("   model = PeftModel.from_pretrained(base_model, './whisper-lora-finetuned')")

## Results Summary

**Training completed!** Once every cell above has run without errors, your personalized Whisper LoRA adapter is saved in `./whisper-lora-finetuned`.

**Next steps:**
1. Download `whisper-lora-finetuned.zip`
2. Extract to your Heve AI directory
3. Update `src/asr.py` to use the fine-tuned model
4. Enjoy improved accuracy for your voice and vocabulary!
