In [None]:
# Install required dependencies
!pip install --upgrade huggingface_hub transformers librosa torchaudio -q

In [None]:
import os
import torch
import torchaudio
import librosa
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm

from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Device used is: {}".format(device))

In [None]:
# Load the BanglaASR checkpoint (a Whisper-style model) identified by model_path
model_path = "bangla-speech-processing/BanglaASR"

# Pre-/post-processing components, all taken from the same checkpoint
processor = WhisperProcessor.from_pretrained(model_path)
tokenizer = WhisperTokenizer.from_pretrained(model_path)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path)

# Instantiate the network, then move it to the selected device
model = WhisperForConditionalGeneration.from_pretrained(model_path)
model = model.to(device)

print("Model loaded on {}".format(device))

In [None]:
# Demo: run inference on a sample clip hosted on HuggingFace
mp3_path = "https://huggingface.co/bangla-speech-processing/BanglaASR/resolve/main/mp3/common_voice_bn_31515636.mp3"

# Decode the MP3, keep only the first channel, and resample it to 16 kHz
speech_array, sampling_rate = torchaudio.load(mp3_path, format="mp3")
speech_array = librosa.resample(
    np.asarray(speech_array[0].numpy()),
    orig_sr=sampling_rate,
    target_sr=16000,
)

# Convert the waveform into the model's input features
input_features = feature_extractor(
    speech_array,
    sampling_rate=16000,
    return_tensors="pt",
).input_features

# Generate token ids, keep the first returned sequence, and decode it to text
predicted_ids = model.generate(inputs=input_features.to(device))[0]
transcription = processor.decode(predicted_ids, skip_special_tokens=True)

print("Demo Transcription Result:", transcription, sep="\n")

In [None]:
# Directory that holds the test-set recordings
test_audio_dir = "/kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition/transcription/transcription/test/audio"

# Collect every .wav file directly inside that directory, ordered by path
wav_pattern = os.path.join(test_audio_dir, "*.wav")
audio_files = glob(wav_pattern)
audio_files.sort()

print("Found {} audio files".format(len(audio_files)))

In [None]:
# Inference settings
SAMPLING_RATE = 16_000  # Hz; audio is resampled to this rate for Whisper
CHUNK_DURATION = 30     # length of each transcription chunk, in seconds

def load_audio(audio_path):
    """
    Read an audio file and return its waveform prepared for Whisper.

    The preprocessing is delegated to librosa.load:
    - the signal is resampled to SAMPLING_RATE
    - multi-channel audio is downmixed to mono

    Args:
        audio_path: Path to the audio file

    Returns:
        The audio samples returned by librosa.load
    """
    # librosa.load yields (samples, sample_rate); only the samples are needed
    waveform, _rate = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
    return waveform

def transcribe_audio_chunk(audio_chunk):
    """
    Run the BanglaASR model on one chunk of audio and return its text.

    Args:
        audio_chunk: audio samples at SAMPLING_RATE (16kHz), e.g. a numpy array

    Returns:
        The decoded transcript with surrounding whitespace removed
    """
    # Waveform -> model input features (PyTorch tensors)
    encoded = feature_extractor(
        audio_chunk,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",
    )
    features_on_device = encoded.input_features.to(device)

    # Generate token ids and keep the first returned sequence
    generated = model.generate(inputs=features_on_device)
    first_sequence = generated[0]

    # Token ids -> text, leaving out special tokens
    text = processor.decode(first_sequence, skip_special_tokens=True)
    return text.strip()

def transcribe_long_audio(audio_path, chunk_duration=30):
    """
    Transcribe long audio by splitting it into consecutive, non-overlapping chunks.

    Args:
        audio_path: Path to the audio file
        chunk_duration: Duration of each chunk in seconds (int or float).
            NOTE(review): Whisper feature extractors presumably truncate input
            beyond 30 seconds - confirm before using a larger value.

    Returns:
        Full transcript as a single string. Chunk transcripts are joined with
        single spaces; chunks that yield no text are left out.

    Raises:
        ValueError: If chunk_duration does not cover at least one sample.
    """
    # Whole number of samples per chunk, so float durations also work with range()
    chunk_samples = int(SAMPLING_RATE * chunk_duration)
    if chunk_samples < 1:
        raise ValueError(
            f"chunk_duration must cover at least one sample, got {chunk_duration!r}"
        )

    # Load and preprocess audio
    audio = load_audio(audio_path)
    total_samples = len(audio)

    # Chunks shorter than 0.5 seconds are skipped
    min_samples = SAMPLING_RATE * 0.5

    transcripts = []
    for start in range(0, total_samples, chunk_samples):
        # Slicing clamps at the end of the audio, so the last chunk may be shorter
        audio_chunk = audio[start:start + chunk_samples]
        if len(audio_chunk) < min_samples:
            continue

        # Transcribe chunk using BanglaASR
        transcript = transcribe_audio_chunk(audio_chunk)

        # Drop empty results so the join below does not produce doubled spaces
        if transcript:
            transcripts.append(transcript)

    # Join all chunk transcripts
    return " ".join(transcripts)

In [None]:
# Smoke test: transcribe the first audio file before processing the full set
if audio_files:
    test_file = audio_files[0]
    base_name = os.path.basename(test_file)
    filename = os.path.splitext(base_name)[0]
    duration = librosa.get_duration(path=test_file)

    print("Testing on: {} ({:.1f} mins)".format(filename, duration / 60))
    test_transcript = transcribe_long_audio(test_file, chunk_duration=CHUNK_DURATION)
    print("\nTranscript Preview:")
    # Show at most the first 500 characters, marking the cut with an ellipsis
    if len(test_transcript) > 500:
        print(test_transcript[:500] + "...")
    else:
        print(test_transcript)

In [None]:
# Run inference on all audio files
results = []
failed_files = []  # names of files whose transcription raised an error

for audio_path in tqdm(audio_files, desc="Transcribing files"):
    # Extract filename without extension (e.g., "test_001" from "test_001.wav")
    filename = os.path.splitext(os.path.basename(audio_path))[0]

    try:
        # Get audio duration for progress info
        duration = librosa.get_duration(path=audio_path)
        # tqdm.write prints the message without garbling the active progress bar
        tqdm.write(f"\nProcessing {filename} ({duration/60:.1f} mins)...")

        transcript = transcribe_long_audio(audio_path, chunk_duration=CHUNK_DURATION)
    except Exception as e:
        # Best effort: keep processing the remaining files, but record the
        # failure so the empty transcript is not overlooked afterwards
        tqdm.write(f"Error processing {filename}: {e}")
        failed_files.append(filename)
        transcript = ""

    results.append({
        "filename": filename,
        "transcript": transcript
    })

print(f"\nTranscribed {len(results)} files")
if failed_files:
    # Surface failures once at the end instead of leaving them buried in the log
    print(f"WARNING: {len(failed_files)} file(s) failed and have empty transcripts: {failed_files}")

In [None]:
# Create the submission DataFrame. Naming the columns explicitly keeps the
# expected header in the CSV even when `results` is empty.
df = pd.DataFrame(results, columns=["filename", "transcript"])

# Save to CSV
output_path = "submission.csv"
df.to_csv(output_path, index=False)

print(f"Saved {len(df)} transcriptions to {output_path}")
df.head(10)