In [1]:
!pip install -q transformers datasets librosa soundfile jiwer accelerate evaluate peft
!pip install -q bitsandbytes  # For 8-bit training if needed
!pip install -q av ffmpeg-python  # For audio decoding support
!apt-get -qq install -y ffmpeg  # System-level ffmpeg

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m102.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m 

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import librosa
import soundfile as sf
from pathlib import Path
from tqdm import tqdm
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import re

from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import Dataset, Audio
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import evaluate

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
print(f"CUDA Version: {torch.version.cuda if torch.cuda.is_available() else 'N/A'}")

# Set paths
TRAIN_PATH = "/kaggle/input/shobdotori/Train"
TRAIN_ANNOTATION_PATH = "/kaggle/input/shobdotori/Train_annotation"
TEST_PATH = "/kaggle/input/shobdotori/Test"
OUTPUT_DIR = "./whisper-bengali-lora"
SUBMISSION_FILE = "submission.csv"

2025-11-16 10:20:19.333213: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763288419.497909      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763288419.544218      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Using device: cuda
GPU: Tesla P100-PCIE-16GB
CUDA Version: 12.4


In [3]:
def load_training_data():
    """Load all training data from regional folders and their annotations"""
    all_data = []

    # Get all region folders
    region_folders = [f for f in os.listdir(TRAIN_PATH) if os.path.isdir(os.path.join(TRAIN_PATH, f))]

    print(f"Found {len(region_folders)} regions")

    for region in tqdm(region_folders, desc="Loading regions"):
        # Load annotation CSV for this region
        annotation_file = os.path.join(TRAIN_ANNOTATION_PATH, f"{region}.csv")

        if not os.path.exists(annotation_file):
            print(f"Warning: No annotation file for {region}")
            continue

        # Read annotations
        annotations_df = pd.read_csv(annotation_file)

        # Strip whitespace from column names
        annotations_df.columns = annotations_df.columns.str.strip()

        # Get audio folder path
        audio_folder = os.path.join(TRAIN_PATH, region)

        # Process each annotation
        for idx, row in annotations_df.iterrows():
            audio_file = row['audio'].strip() if 'audio' in annotations_df.columns else row.iloc[0]
            text = row['text'].strip() if 'text' in annotations_df.columns else row.iloc[1]

            audio_path = os.path.join(audio_folder, audio_file)

            if os.path.exists(audio_path):
                all_data.append({
                    'audio': audio_path,
                    'text': text,
                    'region': region
                })

    print(f"\nTotal training samples: {len(all_data)}")
    return pd.DataFrame(all_data)

# Load training data
train_df = load_training_data()
print(f"\nSample data:")
print(train_df.head())

Found 20 regions


Loading regions: 100%|██████████| 20/20 [00:08<00:00,  2.43it/s]


Total training samples: 3350

Sample data:
                                               audio  \
0  /kaggle/input/shobdotori/Train/Mymensingh/fema...   
1  /kaggle/input/shobdotori/Train/Mymensingh/fema...   
2  /kaggle/input/shobdotori/Train/Mymensingh/fema...   
3  /kaggle/input/shobdotori/Train/Mymensingh/fema...   
4  /kaggle/input/shobdotori/Train/Mymensingh/male...   

                              text      region  
0  আজ সকালে আমি বাজারে গিয়েছিলাম।  Mymensingh  
1       তুমি কি নতুন বই পড়তে চাও?  Mymensingh  
2            আকাশে আজ মেঘ জমে আছে।  Mymensingh  
3   আমি বন্ধুদের সাথে খেলা খেলেছি।  Mymensingh  
4   দরজাটা ধীরে ধীরে বন্ধ করে দাও।  Mymensingh  





In [15]:
!pip install -q transformers[sentencepiece] datasets torchaudio librosa sacrebleu evaluate tqdm


In [18]:
import os
import time
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torchaudio
import librosa
import re

from transformers import (
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)

# ---------------------------
# Reproducibility & device
# ---------------------------
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ---------------------------
# Paths / config (শব্দতরী)
# ---------------------------
DATA_DIR = "/kaggle/input/shobdotori"   # <- tumi ja bolechho

# NOTE: folder nameগুলো screenshot অনুযায়ী capital:
TRAIN_AUDIO_DIR = os.path.join(DATA_DIR, "Train")
TEST_AUDIO_DIR  = os.path.join(DATA_DIR, "Test")
TRAIN_ANN_DIR   = os.path.join(DATA_DIR, "Train_annotation")

SAMPLE_SUB_PATH = os.path.join(DATA_DIR, "sample_submission.csv")

print("\nChecking paths...")
print("TRAIN_AUDIO_DIR :", TRAIN_AUDIO_DIR, os.path.exists(TRAIN_AUDIO_DIR))
print("TEST_AUDIO_DIR  :", TEST_AUDIO_DIR, os.path.exists(TEST_AUDIO_DIR))
print("TRAIN_ANN_DIR   :", TRAIN_ANN_DIR, os.path.exists(TRAIN_ANN_DIR))
print("SAMPLE_SUB_PATH :", SAMPLE_SUB_PATH, os.path.exists(SAMPLE_SUB_PATH))

if not os.path.exists(TRAIN_ANN_DIR):
    raise FileNotFoundError(f"Train_annotation dir not found at: {TRAIN_ANN_DIR}")

if not os.path.exists(SAMPLE_SUB_PATH):
    raise FileNotFoundError(f"sample_submission.csv not found at: {SAMPLE_SUB_PATH}")

# ---------------------------
# Load all Train_annotation/*.csv and concat
# ---------------------------
train_dfs = []
for fname in sorted(os.listdir(TRAIN_ANN_DIR)):
    if not fname.lower().endswith(".csv"):
        continue
    csv_path = os.path.join(TRAIN_ANN_DIR, fname)
    df_reg = pd.read_csv(csv_path)

    # region name from file, e.g. "Barisal.csv" -> "Barisal"
    region = os.path.splitext(fname)[0]
    df_reg["region"] = region

    train_dfs.append(df_reg)

train_df = pd.concat(train_dfs, ignore_index=True)

print("\n✅ Loaded Train_annotation CSVs:", len(train_dfs), "files")
print("Columns:", train_df.columns.tolist())
print("Total rows in train_df:", len(train_df))
print(train_df.head())

# sample_submission for test list
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)
print("\n✅ Loaded sample_submission.csv with", len(sample_sub), "rows")
print(sample_sub.head())


Using device: cuda

Checking paths...
TRAIN_AUDIO_DIR : /kaggle/input/shobdotori/Train True
TEST_AUDIO_DIR  : /kaggle/input/shobdotori/Test True
TRAIN_ANN_DIR   : /kaggle/input/shobdotori/Train_annotation True
SAMPLE_SUB_PATH : /kaggle/input/shobdotori/sample_submission.csv True

✅ Loaded Train_annotation CSVs: 20 files
Columns: ['audio', 'text', 'region']
Total rows in train_df: 3350
                  audio                             text   region
0  female_barisal_1.wav  আজ সকালে আমি বাজারে গিয়েছিলাম।  Barisal
1  female_barisal_3.wav            আকাশে আজ মেঘ জমে আছে।  Barisal
2  female_barisal_4.wav   আমি বন্ধুদের সাথে খেলা খেলেছি।  Barisal
3  female_barisal_5.wav   দরজাটা ধীরে ধীরে বন্ধ করে দাও।  Barisal
4  female_barisal_6.wav    তুমি কি আমাকে পানি দিতে পারো?  Barisal

✅ Loaded sample_submission.csv with 450 rows
          audio          text
0  test_001.wav  আমি ভাত খাই।
1  test_002.wav  আমি ভাত খাই।
2  test_003.wav  আমি ভাত খাই।
3  test_004.wav  আমি ভাত খাই।
4  test_005.wav  আমি

In [None]:
MODEL_NAME = "openai/whisper-medium"

print(f"Loading model: {MODEL_NAME}")
print("This may take a few minutes...")

# Load processor
processor = WhisperProcessor.from_pretrained(
    MODEL_NAME,
    language="bengali",
    task="transcribe"
)


model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True
)


model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="bn",   # short code ব্যবহার করো
    task="transcribe"
)
model.config.suppress_tokens = []
model.config.use_cache = True

# Prepare model for training
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

# Apply LoRA - Target only attention layers
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],  # একটু বেশি capacity
    lora_dropout=0.05,
    bias="none",
)

# Apply LoRA to model.model (inner Whisper model)
model.model = get_peft_model(model.model, lora_config)
model = model.to(device)

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"\n✓ LoRA applied successfully")
print(f"  Trainable params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
print(f"  Total params: {total_params:,}")

In [None]:
def prepare_dataset(df, processor, test_size=0.1):
    """Prepare dataset for training"""

    def prepare_dataset_batch(batch):
        """Process a single batch"""
        try:
            audio_path = batch["audio"]
            # audio_array, sampling_rate = librosa.load(audio_path, sr=16000, mono=True)
            audio_array, sampling_rate = librosa.load(audio_path, sr=16000, mono=True)
            audio_array, _ = librosa.effects.trim(audio_array, top_db=20)
            audio_array = audio_array / (np.max(np.abs(audio_array)) + 1e-8)


            # Compute input features
            input_features = processor.feature_extractor(
                audio_array,
                sampling_rate=16000
            ).input_features[0]

            # Encode target text
            labels = processor.tokenizer(batch["text"]).input_ids

            return {
                "input_features": input_features,
                "labels": labels
            }
        except Exception as e:
            print(f"Error processing {batch['audio']}: {e}")
            return {
                "input_features": np.zeros((80, 3000)),
                "labels": [processor.tokenizer.pad_token_id]
            }

    dataset = Dataset.from_pandas(df[['audio', 'text']])
    dataset = dataset.train_test_split(test_size=test_size, seed=42)

    print("Processing training dataset...")
    train_dataset = dataset["train"].map(
        prepare_dataset_batch,
        remove_columns=dataset["train"].column_names,
        desc="Processing train",
    )

    print("Processing validation dataset...")
    val_dataset = dataset["test"].map(
        prepare_dataset_batch,
        remove_columns=dataset["test"].column_names,
        desc="Processing validation",
    )

    return {"train": train_dataset, "test": val_dataset}

print("Preparing dataset...")
dataset = prepare_dataset(train_df, processor)
print(f"\nTrain samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['test'])}")  

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
print("✓ Data collator initialized")

In [None]:
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute WER metric"""
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

print("✓ Metrics function initialized")

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2, # REDUCED from 8
    gradient_accumulation_steps=8,  # INCREASED from 2
    learning_rate=1e-4,  # Higher LR for faster convergence
    warmup_ratio=0.05,  # REDUCED
    num_train_epochs=2,  # REDUCED from 5 for faster training
    gradient_checkpointing=True,  # ENABLED
    fp16=True,  # DISABLED - use FP32 for training stability
    eval_strategy="steps",
    # per_device_eval_batch_size=4,  # REDUCED
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=400,
    eval_steps=400,
    logging_steps=50,
    logging_first_step=True,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    save_total_limit=2,  # REDUCED to save space
    dataloader_num_workers=2,
    remove_unused_columns=False,
    label_names=["labels"],
    optim="adamw_torch",  # Explicit optimizer
)

print("✓ Training arguments configured")

In [None]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor,
)

print("✓ Trainer initialized successfully")
print(f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  Total training steps: ~{len(dataset['train']) * training_args.num_train_epochs // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)}")

In [None]:
import sys
import gc
import torch
from transformers import logging as hf_logging

hf_logging.set_verbosity_info()

print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)

try:
    sys.stdout.flush()

    # Clear cache before training (only if CUDA is available)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    # Train the model
    train_result = trainer.train()

    print("\n" + "="*60)
    print("✓ TRAINING COMPLETED!")
    print("="*60)
    # Some Trainer versions use train_result.training_loss; fallback to metrics if needed
    tr_loss = getattr(train_result, "training_loss", None)
    if tr_loss is None and hasattr(train_result, "metrics"):
        tr_loss = train_result.metrics.get("train_loss", None)
    if tr_loss is not None:
        print(f"Training loss: {tr_loss:.4f}")

    # Save the model
    print("\nSaving model...")
    trainer.save_model(OUTPUT_DIR)
    processor.save_pretrained(OUTPUT_DIR)
    print(f"✓ Model and processor saved to {OUTPUT_DIR}")

except Exception as e:
    print("\n" + "="*60)
    print(" TRAINING ERROR")
    print("="*60)
    print(f"Error: {str(e)}")
    import traceback
    traceback.print_exc()
    raise 

In [None]:
def clean_bengali_text(text):
    """Clean and normalize Bengali text"""
    text = re.sub(r'\s+', ' ', text)   
    text = text.strip() 
    text = text.replace('।।', '।')
    return text  

def transcribe_audio_enhanced(audio_path, model, processor, device):
    """FIXED: Enhanced transcription with proper dtype handling"""
    try:
        # Load and preprocess audio
        audio_input, sr = librosa.load(audio_path, sr=16000, mono=True)
        audio_input, _ = librosa.effects.trim(audio_input, top_db=20)
        audio_input = audio_input / (np.max(np.abs(audio_input)) + 1e-8)

        # Process - returns dict with input_features
        inputs = processor(
            audio_input,
            sampling_rate=16000,
            return_tensors="pt"
        )


        input_features = inputs.input_features.to(device)
        if model.dtype == torch.float16:
            input_features = input_features.half()
        elif model.dtype == torch.float32:
            input_features = input_features.float()

        # Set forced decoder IDs for Bengali
        forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

        # Generate with proper settings
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features,
                forced_decoder_ids=forced_decoder_ids,
                max_length=225,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3,
                temperature=0.0,  # Deterministic
            )

        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return clean_bengali_text(transcription) if transcription else "আমি ভাত খাই।"

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return "আমি ভাত খাই।"

print("✓ Transcription function ready")

In [None]:
print("\n" + "="*60)
print("PREPARING FOR INFERENCE")
print("="*60)

model = model.to(torch.float16)
print("✓ Model converted to FP16 for inference")

# Clear cache
torch.cuda.empty_cache()
gc.collect()

print("\n" + "="*60)
print("GENERATING PREDICTIONS FOR TEST SET")
print("="*60)

test_files = sorted([f for f in os.listdir(TEST_PATH) if f.endswith('.wav')])
print(f"\nFound {len(test_files)} test files to process")

if len(test_files) == 0:
    print("No test files found!")
else:
    results = []
    model.eval()

    print("\nProcessing test files...")
    for test_file in tqdm(test_files, desc="Transcribing audio"):
        audio_path = os.path.join(TEST_PATH, test_file)
        transcription = transcribe_audio_enhanced(audio_path, model, processor, device)
        results.append({'audio': test_file, 'text': transcription})

    # Create submission dataframe
    submission_df = pd.DataFrame(results)
    submission_df.to_csv(SUBMISSION_FILE, index=False, encoding='utf-8')

    print(f"\n{'='*60}")
    print(f"✓ SUBMISSION FILE CREATED: {SUBMISSION_FILE}")
    print(f"{'='*60}")
    print(f"Total predictions: {len(submission_df)}")
    print(f"\nSample predictions (first 10):")
    print(submission_df.head(10).to_string(index=False))

    # Statistics
    fallback_count = (submission_df['text'] == "আমি ভাত খাই।").sum()
    print(f"\n{'='*60}")
    print(f"Statistics:")
    print(f"  Total files: {len(submission_df)}")
    print(f"  Successful: {len(submission_df) - fallback_count}")
    print(f"  Fallback: {fallback_count}")
    print(f"  Success rate: {100 * (len(submission_df) - fallback_count) / len(submission_df):.1f}%")

    # Text length statistics
    submission_df['text_length'] = submission_df['text'].str.len()
    print(f"\nText length statistics:")
    print(f"  Mean: {submission_df['text_length'].mean():.1f} characters")
    print(f"  Min: {submission_df['text_length'].min()}")
    print(f"  Max: {submission_df['text_length'].max()}")
    print(f"{'='*60}")
    print("✓ Ready for submission!")

In [None]:
# import shutil
# from pathlib import Path

# # Path to Kaggle working directory
# working_dir = Path("/kaggle/working")

# # Confirm current contents before deleting
# print("Files and folders before cleanup:")
# for item in working_dir.iterdir():
#     print(" -", item.name)

# # Remove all files and subdirectories inside /kaggle/working
# for item in working_dir.iterdir():
#     try:
#         if item.is_file() or item.is_symlink():
#             item.unlink()
#         elif item.is_dir():
#             shutil.rmtree(item)
#     except Exception as e:
#         print(f"Failed to remove {item}: {e}")

# print("\n Cleanup complete. Current contents:")
# print(list(working_dir.iterdir()))