# Bangla Long-Form ASR Baseline

Minimal baseline using **wav2vec2-large-xlsr-53** with CTC decoding.

- **Input**: Long Bangla .wav files
- **Output**: Bangla text transcription
- **Method**: 25-second chunking, greedy CTC decoding

In [1]:
# Install dependencies
!pip install -q 'nemo_toolkit[asr]'
import warnings
warnings.filterwarnings("ignore")

^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
# Imports
import os
import glob
import numpy as np
import pandas as pd
import torch
import librosa
import torchaudio
from tqdm import tqdm
import nemo.collections.asr as nemo_asr

# Configuration
BASE_INPUT_DIR = "/kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition/transcription/transcription"
BASE_OUTPUT_DIR = "/kaggle/working/"
TEST_AUDIO_DIR = os.path.join(BASE_INPUT_DIR, "test")
SUBMISSION_PATH = os.path.join(BASE_OUTPUT_DIR, "submission.csv")

MODEL_NAME = "hishab/titu_stt_bn_fastconformer"  # NeMo FastConformer model

SAMPLE_RATE = 16000
CHUNK_LENGTH_SEC = 15

# Spectral Gating Configuration
ENABLE_DENOISING = True  # Toggle denoising on/off
NOISE_GATE_THRESHOLD_K = 2.0  # Conservative: 1.5-2.5 (higher = less aggressive)
STFT_WIN_LENGTH_MS = 25  # ~25ms window
STFT_HOP_LENGTH_MS = 10  # ~10ms hop
SOFT_MASK_MIN = 0.1  # Minimum mask value (no hard zeroing)

# Create output directory
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

# Device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {DEVICE}")
print(f"Denoising enabled: {ENABLE_DENOISING}")

2026-02-02 20:24:45.501633: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770063885.873101     102 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770063886.009890     102 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770063886.876627     102 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770063886.876668     102 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770063886.876670     102 computation_placer.cc:177] computation placer alr

Device: cuda
Denoising enabled: True


In [None]:
print(f"Loading NeMo model: {MODEL_NAME}")
asr_model = nemo_asr.models.ASRModel.from_pretrained(MODEL_NAME)

# Move model to GPU if available
if torch.cuda.is_available():
    asr_model = asr_model.to(DEVICE)
    print(f"Model moved to {DEVICE}")
else:
    print("Running on CPU")

asr_model.eval()
print("Model loaded successfully")

Loading processor: arijitx/wav2vec2-xls-r-300m-bengali


preprocessor_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

Loading model: arijitx/wav2vec2-xls-r-300m-bengali


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Model loaded successfully


In [None]:
class SpectralGatingDenoiser:
    """
    Conservative spectral gating denoiser optimized for ASR (not audio quality).
    
    Uses soft masking with no hard zeroing to preserve phonetic content.
    Designed for Bangla speech where consonant articulation is critical.
    
    Args:
        sample_rate: Audio sample rate (default: 16000)
        threshold_k: Noise gate threshold multiplier (default: 2.0)
                     Higher = more conservative (less denoising)
        win_length_ms: STFT window length in milliseconds
        hop_length_ms: STFT hop length in milliseconds
        soft_mask_min: Minimum mask value to prevent hard zeroing
    """
    
    def __init__(
        self,
        sample_rate=16000,
        threshold_k=2.0,
        win_length_ms=25,
        hop_length_ms=10,
        soft_mask_min=0.1
    ):
        self.sample_rate = sample_rate
        self.threshold_k = threshold_k
        self.soft_mask_min = soft_mask_min
        
        # Convert ms to samples
        self.win_length = int(win_length_ms * sample_rate / 1000)
        self.hop_length = int(hop_length_ms * sample_rate / 1000)
        
        # Ensure win_length is valid
        if self.win_length % 2 == 1:
            self.win_length += 1
            
    def __call__(self, waveform):
        """
        Apply spectral gating to waveform.
        
        Args:
            waveform: numpy array or torch tensor of shape (n_samples,)
            
        Returns:
            Denoised waveform as numpy array
        """
        # Convert to numpy if tensor
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.cpu().numpy()
            
        # Ensure mono
        if waveform.ndim > 1:
            waveform = waveform.mean(axis=0)
            
        # Skip if audio too short
        if len(waveform) < self.win_length:
            return waveform
            
        # Step 1: Compute STFT with Hann window
        stft = librosa.stft(
            waveform,
            n_fft=self.win_length,
            hop_length=self.hop_length,
            window='hann'
        )
        
        magnitude = np.abs(stft)
        phase = np.angle(stft)
        
        # Step 2: Estimate noise profile from low-energy frames
        # Use bottom 20% of frames by energy as noise estimate
        frame_energy = np.sum(magnitude ** 2, axis=0)
        noise_threshold_percentile = 20
        noise_frames_mask = frame_energy <= np.percentile(frame_energy, noise_threshold_percentile)
        
        # Ensure we have some noise frames
        if noise_frames_mask.sum() < 5:
            # Fallback: use lowest 5 frames
            noise_frame_indices = np.argsort(frame_energy)[:5]
            noise_frames_mask = np.zeros_like(noise_frames_mask, dtype=bool)
            noise_frames_mask[noise_frame_indices] = True
            
        noise_magnitude = magnitude[:, noise_frames_mask]
        
        # Step 3: Compute per-frequency noise statistics
        noise_mean = np.mean(noise_magnitude, axis=1, keepdims=True)
        noise_std = np.std(noise_magnitude, axis=1, keepdims=True)
        
        # Add small epsilon to prevent division by zero
        noise_std = np.maximum(noise_std, 1e-8)
        
        # Step 4: Compute gating threshold
        # threshold(f) = noise_mean(f) + k * noise_std(f)
        threshold = noise_mean + self.threshold_k * noise_std
        
        # Step 5: Apply soft spectral mask
        # Mask = min(1.0, max(soft_mask_min, magnitude / threshold))
        mask = magnitude / (threshold + 1e-8)
        mask = np.clip(mask, self.soft_mask_min, 1.0)
        
        # Apply mask to magnitude
        denoised_magnitude = magnitude * mask
        
        # Step 6: Reconstruct with original phase
        denoised_stft = denoised_magnitude * np.exp(1j * phase)
        
        # Inverse STFT
        denoised_waveform = librosa.istft(
            denoised_stft,
            hop_length=self.hop_length,
            window='hann',
            length=len(waveform)  # Ensure same length as input
        )
        
        return denoised_waveform


print("SpectralGatingDenoiser class defined")

SpectralGatingDenoiser class defined


In [None]:
class ASRDataset:
    """
    Memory-safe dataset for long-form ASR with on-the-fly denoising.
    
    Features:
    - Lazy audio loading (no preloading)
    - Chunk-wise processing with temporary file creation for NeMo
    - Optional spectral gating denoising
    """
    
    def __init__(
        self,
        audio_paths,
        chunk_length_sec=15,
        sample_rate=16000,
        denoiser=None,
        temp_dir="/tmp/audio_chunks"
    ):
        self.audio_paths = audio_paths
        self.chunk_length_sec = chunk_length_sec
        self.sample_rate = sample_rate
        self.denoiser = denoiser
        self.temp_dir = temp_dir
        
        # Create temp directory for chunks
        os.makedirs(temp_dir, exist_ok=True)
        
        # Precompute chunk info for each audio file
        self.chunk_info = []  # [(audio_idx, chunk_idx, total_chunks)]
        
        for audio_idx, audio_path in enumerate(audio_paths):
            # Get audio duration without loading full file
            info = torchaudio.info(audio_path)
            duration_sec = info.num_frames / info.sample_rate
            
            # Calculate number of chunks
            chunk_samples = int(chunk_length_sec * sample_rate)
            total_chunks = int(np.ceil(duration_sec / chunk_length_sec))
            
            for chunk_idx in range(total_chunks):
                self.chunk_info.append((audio_idx, chunk_idx, total_chunks))
                
    def __len__(self):
        return len(self.chunk_info)
    
    def get_chunk_path(self, audio_idx, chunk_idx):
        """Create a temporary chunk file and return its path."""
        audio_path = self.audio_paths[audio_idx]
        
        # Load audio chunk
        chunk_samples = int(self.chunk_length_sec * self.sample_rate)
        start_frame = chunk_idx * chunk_samples
        
        waveform, sr = torchaudio.load(
            audio_path,
            frame_offset=start_frame,
            num_frames=chunk_samples
        )
        
        # Convert to mono
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
            
        # Resample if needed
        if sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            waveform = resampler(waveform)
            
        # Normalize
        max_val = torch.max(torch.abs(waveform))
        if max_val > 0:
            waveform = waveform / max_val
            
        # Apply denoising if enabled
        if self.denoiser is not None:
            waveform_np = waveform.squeeze(0).numpy()
            denoised_np = self.denoiser(waveform_np)
            waveform = torch.from_numpy(denoised_np).unsqueeze(0)
            
        # Skip if too short
        if waveform.shape[1] < self.sample_rate // 2:
            return None
            
        # Save to temporary file
        temp_path = os.path.join(
            self.temp_dir, 
            f"chunk_{audio_idx}_{chunk_idx}.wav"
        )
        torchaudio.save(temp_path, waveform, self.sample_rate)
        
        return temp_path, audio_path, chunk_idx


print("ASRDataset defined")

ASRDataset and collate_fn defined


In [None]:
def clean_text(text):
    """Minimal text cleanup: remove extra spaces and empty tokens."""
    # Remove special tokens that might appear
    text = text.replace("<s>", "").replace("</s>", "")
    text = text.replace("<pad>", "").replace("<unk>", "")
    # Normalize whitespace
    text = " ".join(text.split())
    return text.strip()


print("Text cleaning function defined")

Transcription helper functions defined


In [None]:
# Initialize denoiser (if enabled)
denoiser = None
if ENABLE_DENOISING:
    denoiser = SpectralGatingDenoiser(
        sample_rate=SAMPLE_RATE,
        threshold_k=NOISE_GATE_THRESHOLD_K,
        win_length_ms=STFT_WIN_LENGTH_MS,
        hop_length_ms=STFT_HOP_LENGTH_MS,
        soft_mask_min=SOFT_MASK_MIN
    )
    print(f"Denoiser initialized (threshold_k={NOISE_GATE_THRESHOLD_K})")
else:
    print("Denoising disabled")

# Get test audio files
test_files = sorted(glob.glob(os.path.join(TEST_AUDIO_DIR, "audio", "*.wav")))
print(f"\nFound {len(test_files)} test audio files")

# Create dataset
dataset = ASRDataset(
    audio_paths=test_files,
    chunk_length_sec=CHUNK_LENGTH_SEC,
    sample_rate=SAMPLE_RATE,
    denoiser=denoiser
)

print(f"Dataset created: {len(dataset)} total chunks")
print(f"Processing chunks...")

# Process all chunks and aggregate by file
from collections import defaultdict
file_transcriptions = defaultdict(list)

total_chunks = len(dataset)
for idx in range(total_chunks):
    result = dataset.get_chunk_path(*dataset.chunk_info[idx][:2])
    
    if result is None:
        continue
        
    chunk_path, audio_path, chunk_idx = result
    filename = os.path.basename(audio_path)
    
    try:
        # Transcribe using NeMo (returns Hypothesis object)
        hypothesis = asr_model.transcribe([chunk_path])[0]
        transcription = hypothesis.text if hasattr(hypothesis, 'text') else str(hypothesis)
        
        if transcription.strip():
            file_transcriptions[filename].append((chunk_idx, transcription))
    finally:
        # Clean up temporary chunk file
        if os.path.exists(chunk_path):
            os.remove(chunk_path)
    
    # Simple progress update
    if (idx + 1) % 10 == 0 or (idx + 1) == total_chunks:
        print(f"\rProgress: {idx + 1}/{total_chunks} chunks", end="")

print()  # New line after progress

# Merge transcriptions for each file
results = []
for filename in sorted(file_transcriptions.keys()):
    # Sort by chunk index and merge
    chunks = sorted(file_transcriptions[filename], key=lambda x: x[0])
    transcriptions = [clean_text(t) for _, t in chunks]
    full_text = " ".join(transcriptions)
    
    results.append({
        "filename": filename,
        "transcript": full_text
    })

print(f"\nProcessed {len(results)} files")

Denoiser initialized (threshold_k=2.0)

Found 24 test audio files
Dataset created: 5341 total chunks
Processing with batch_size=4



Processing audio chunks: 100%|██████████| 1336/1336 [11:41<00:00,  1.90it/s]


Processed 24 files





In [None]:
SUBMISSION_PATH = "/kaggle/working/"
# Create submission DataFrame
submission_df = pd.DataFrame(results)
submission_df = submission_df[["filename", "transcript"]]

# Remove .wav extension from filenames
submission_df["filename"] = submission_df["filename"].str.replace(r"\.wav", "", regex=True)

# Fill any empty transcriptions
submission_df["transcript"] = submission_df["transcript"].fillna("")

# Save submission
submission_df.to_csv(SUBMISSION_PATH + "submission.csv", index=False, encoding="utf-8")
print(f"Submission saved to: {SUBMISSION_PATH}submission.csv")

# Display preview
print(f"\nSubmission preview ({len(submission_df)} rows):")
print(submission_df.head(10))

Submission saved to: /kaggle/working/submission.csv

Submission preview (24 rows):
       filename                                         transcript
0  test_001.wav  এআক্সক্ষির এটেনিকেপকা আপনাকেদে ালোমানার মিডিগি...
1  test_002.wav  মিন্তু রচ্ছাধারীনা আগিন সৈনাআমি সব দিল পমলা এব...
2  test_003.wav  গল্পুটির সত্য আনন্দ পাবলিশাস প্রাইভেটলিমেটে কো...
3  test_004.wav  যে কোনো জায়গায় যেতে রাতের ট্রেনি আমাদের প্সব...
4  test_005.wav  বেচি নিবেদন ফ্রাইডেইক্লাসেক্স পরে বকিমেগ গাকছে...
5  test_006.wav  আাদের খুব প্রন্দ হইছে আা ছেলে পছন্দমি ইলে আপনা...
6  test_008.wav  বাদরির পোচাগর এই রকম াঙি দুগডবাটা পড সইে যাওয়...
7  test_009.wav  দ এব দি কেকদ মৃত্যাগত গলতে সরায কেবে িতঅমিবভাজ...
8  test_010.wav  আাই একটা তাড়াতালি ক ইে ফোন জলে আসছে আরে এতারা...
9  test_011.wav  বেচি নিবেদন ফ্রাইডে ক্লাসেক্স ই জুই মন্টু গিয়...


In [None]:
# Verify submission file
final_df = pd.read_csv(SUBMISSION_PATH + "submission.csv")
print("Submission verification:")
print(f"  - Total rows: {len(final_df)}")
print(f"  - Columns: {list(final_df.columns)}")
print(f"  - Empty transcripts: {(final_df['transcript'] == '').sum()}")
print(f"  - Sample filenames: {final_df['filename'].head(3).tolist()}")
print("\nDone!")

IsADirectoryError: [Errno 21] Is a directory: '/kaggle/working/'