In [None]:
# Install required dependencies
!pip install --upgrade huggingface_hub transformers librosa torchaudio noisereduce scipy -q

In [None]:
import os
import torch
import torchaudio
import librosa
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import noisereduce as nr
from scipy import signal

from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

In [None]:
# =============================================================================
# MEMORY OPTIMIZATION SETTINGS
# =============================================================================
import gc
import os

# Limit CPU threads to reduce memory overhead
NUM_CPU_THREADS = 2

# These variables are typically read once, when the OpenMP/MKL runtime starts.
# They are kept for libraries that have not initialised yet and for child
# processes.
os.environ["OMP_NUM_THREADS"] = str(NUM_CPU_THREADS)
os.environ["MKL_NUM_THREADS"] = str(NUM_CPU_THREADS)

# torch is imported before this cell runs, so the environment variables alone
# may come too late for it; set its intra-op thread count explicitly.
torch.set_num_threads(NUM_CPU_THREADS)


def clear_memory():
    """
    Aggressively clear memory.

    Runs the Python garbage collector and, when CUDA is available, waits for
    queued GPU work to finish and then releases PyTorch's cached GPU blocks.

    Returns:
        None
    """
    gc.collect()
    if torch.cuda.is_available():
        # Synchronize first so pending kernels are done before the cache is emptied.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()


print("Memory optimization settings applied")

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device used is: {device}")

In [None]:
# Load the Wav2Vec2-XLS-R-300M Bengali checkpoint with memory-saving options
model_path = "arijitx/wav2vec2-xls-r-300m-bengali"

print("Loading processor...")
processor = Wav2Vec2Processor.from_pretrained(model_path)

print("Loading model with memory optimizations...")
# - low_cpu_mem_usage lowers the peak RAM needed while the weights are read
# - weights are float16 when a GPU is present, float32 otherwise
# The freshly loaded model is moved to the target device and switched to
# evaluation mode (inference behaviour for layers such as dropout).
model = (
    Wav2Vec2ForCTC.from_pretrained(
        model_path,
        torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16,
        low_cpu_mem_usage=True,
    )
    .to(device)
    .eval()
)

# Drop temporary objects left over from loading
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Report where the model lives and in which precision
print(f"‚úÖ Model loaded on {device}")
print(f"   Model dtype: {next(model.parameters()).dtype}")
if torch.cuda.is_available():
    print(f"   GPU Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.1f} MB")

In [None]:
# Directory that holds the competition's test recordings
test_audio_dir = "/kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition/transcription/transcription/test/audio"

# Collect every .wav file in that directory, in sorted order
audio_files = glob(os.path.join(test_audio_dir, "*.wav"))
audio_files.sort()

print(f"Found {len(audio_files)} audio files")

In [None]:
# =============================================================================
# SIMPLE MEMORY-EFFICIENT AUDIO PROCESSING - NO MS-LEVEL PREPROCESSING
# =============================================================================
#
# Strategy: keep preprocessing minimal and memory use low
# - no noise-reduction pass (memory intensive)
# - no frame-level processing (not needed for wav2vec2)
# - audio is handled 30 seconds at a time
# - per chunk: load -> normalize -> transcribe -> clear, then repeat
#
# =============================================================================

SAMPLING_RATE = 16_000  # sample rate (Hz) required by Wav2Vec2

# Chunk sizes are expressed in whole seconds (not ms)
CHUNK_DURATION_SECONDS = 30  # amount of audio handled per step
OVERLAP_SECONDS = 2          # small overlap so words are not cut at chunk edges

print("=" * 60, "SIMPLE MEMORY-EFFICIENT APPROACH", "=" * 60, sep="\n")
print(f"Processing in {CHUNK_DURATION_SECONDS}-second chunks")
print("No complex preprocessing - just normalize and transcribe")
print("=" * 60)

In [None]:
# =============================================================================
# STREAMLINED TRANSCRIPTION - PROCESS ONE CHUNK AT A TIME
# =============================================================================

# Shortest input, in samples, that is sent to the model. 400 samples (25 ms at
# 16 kHz) is the receptive field of the standard wav2vec2 convolutional
# feature encoder; anything shorter cannot produce a single output frame.
# NOTE(review): derived from the default wav2vec2 conv kernel/stride layout -
# confirm against this checkpoint's config.
_MIN_CHUNK_SAMPLES = 400


def _predict_chunk_token_ids(audio_chunk, trim_left_seconds=0.0, trim_right_seconds=0.0):
    """
    Run greedy CTC inference on one chunk and return its per-frame token ids.

    Args:
        audio_chunk: 1-D array of mono samples at SAMPLING_RATE.
        trim_left_seconds: Seconds of audio at the start of the chunk whose
            output frames are discarded (context shared with the previous chunk).
        trim_right_seconds: Seconds of audio at the end of the chunk whose
            output frames are discarded (context shared with the next chunk).

    Returns:
        1-D CPU LongTensor with one token id per kept output frame. Empty when
        the chunk is too short to process or the chunk hit a CUDA
        out-of-memory error (the chunk is skipped in that case).

    Raises:
        RuntimeError: Any runtime error other than out-of-memory.
    """
    no_ids = torch.empty(0, dtype=torch.long)

    audio_chunk = np.asarray(audio_chunk, dtype=np.float32)
    if audio_chunk.size < _MIN_CHUNK_SAMPLES:
        # Covers empty input too, where np.max would raise.
        return no_ids

    # Simple peak normalization only
    max_val = np.max(np.abs(audio_chunk))
    if max_val > 0:
        audio_chunk = audio_chunk / max_val

    kept_ids = no_ids
    hit_oom = False
    inputs = input_values = logits = frame_ids = None
    try:
        inputs = processor(
            audio_chunk,
            sampling_rate=SAMPLING_RATE,
            return_tensors="pt",
            padding=True
        )

        # The processor emits float32, while the model may hold float16
        # weights on GPU; cast the input to the model's own dtype so the
        # forward pass does not fail on a dtype mismatch.
        model_dtype = next(model.parameters()).dtype
        input_values = inputs.input_values.to(device=device, dtype=model_dtype)

        with torch.no_grad():
            logits = model(input_values).logits
            frame_ids = torch.argmax(logits, dim=-1)[0]

        # Convert the trims from seconds to output frames using the actual
        # frame/sample ratio of this chunk.
        n_frames = frame_ids.shape[0]
        frames_per_second = n_frames * SAMPLING_RATE / input_values.shape[-1]
        skip_left = int(round(max(0.0, trim_left_seconds) * frames_per_second))
        skip_right = int(round(max(0.0, trim_right_seconds) * frames_per_second))
        # Never let the end index drop below the start (or go negative).
        keep_end = max(skip_left, n_frames - skip_right)
        kept_ids = frame_ids[skip_left:keep_end].cpu().clone()
    except RuntimeError as err:
        if "out of memory" not in str(err).lower():
            raise
        hit_oom = True

    if hit_oom:
        print(f"    ‚ö†Ô∏è OOM error - skipping chunk")

    # Drop tensor references on both paths before emptying the CUDA cache,
    # otherwise the memory they hold cannot be released.
    del inputs, input_values, logits, frame_ids
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    return kept_ids


def _decode_token_ids(token_ids):
    """Decode a 1-D tensor of per-frame CTC token ids into stripped text."""
    if token_ids.numel() == 0:
        return ""
    return processor.batch_decode(token_ids.unsqueeze(0))[0].strip()


def transcribe_chunk_simple(audio_chunk):
    """
    Transcribe one chunk with minimal memory footprint.

    Only peak normalization is applied before the audio is handed to the
    wav2vec2 processor and model.

    Args:
        audio_chunk: 1-D array of mono samples at SAMPLING_RATE.

    Returns:
        The stripped transcript, or "" when the chunk is empty/too short or
        was skipped because of a CUDA out-of-memory error.

    Raises:
        RuntimeError: Any runtime error other than out-of-memory.
    """
    return _decode_token_ids(_predict_chunk_token_ids(audio_chunk))


def transcribe_audio_streaming(audio_path):
    """
    Stream-process audio in chunks to minimize memory usage.

    Process:
    1. Get audio duration without loading full file
    2. Load ONLY current chunk (extended backwards by the overlap)
    3. Run inference on the chunk and keep its per-frame token ids
    4. Clear memory
    5. Move to next chunk

    Neighbouring chunks share OVERLAP_SECONDS of audio. Each side of a seam
    discards the output frames for its half of the shared region, so every
    moment of audio contributes to the transcript once and still has context
    on both sides. The kept ids of all chunks are decoded in one pass, which
    lets words that straddle a seam be assembled normally.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcript of the whole file ("" for zero-length audio).
    """
    # Get total duration without loading audio
    duration = librosa.get_duration(path=audio_path)

    chunk_duration = CHUNK_DURATION_SECONDS
    overlap = OVERLAP_SECONDS

    # The window advances by chunk_duration per step; the overlap only
    # extends a chunk backwards and does not add steps.
    total_chunks = int(np.ceil(duration / chunk_duration))

    print(f"  Duration: {duration:.1f}s ‚Üí {total_chunks} chunks of {chunk_duration}s")

    id_segments = []
    current_time = 0
    chunk_num = 0

    while current_time < duration:
        chunk_num += 1

        # Calculate chunk boundaries (the first chunk has no left overlap)
        start_time = max(0, current_time - overlap)
        end_time = min(current_time + chunk_duration, duration)

        # Split each shared region down the middle between the two chunks.
        trim_left = (current_time - start_time) / 2
        if end_time < duration:
            next_start = max(0, end_time - overlap)
            trim_right = (end_time - next_start) / 2
        else:
            trim_right = 0.0

        # Load ONLY this chunk (memory efficient!)
        audio_chunk, _ = librosa.load(
            audio_path,
            sr=SAMPLING_RATE,
            offset=start_time,
            duration=end_time - start_time,
            mono=True
        )

        chunk_ids = _predict_chunk_token_ids(audio_chunk, trim_left, trim_right)
        if chunk_ids.numel() > 0:
            id_segments.append(chunk_ids)

        # Free chunk memory immediately
        del audio_chunk
        gc.collect()

        # Move to next chunk
        current_time = end_time

        # Progress indicator
        if chunk_num % 5 == 0 or chunk_num == total_chunks:
            print(f"    ‚Üí Processed {chunk_num}/{total_chunks} chunks")

    # Decode all kept frames in one pass
    final_transcript = _decode_token_ids(torch.cat(id_segments)) if id_segments else ""

    del id_segments
    gc.collect()

    return final_transcript

# Confirm that the transcription functions are defined and summarize the approach
print(
    "‚úÖ Simple streaming transcription ready",
    "   ‚Ä¢ Loads one chunk at a time",
    "   ‚Ä¢ No complex preprocessing",
    "   ‚Ä¢ Aggressive memory cleanup",
    sep="\n",
)

In [None]:
# Smoke-test the streaming pipeline on the first audio file

if not audio_files:
    print("‚ùå No audio files found")
else:
    test_file = audio_files[0]
    filename = os.path.splitext(os.path.basename(test_file))[0]

    print("=" * 60, "TESTING STREAMING TRANSCRIPTION", "=" * 60, sep="\n")
    print(f"\nüìÅ File: {filename}")

    # Start from a clean slate
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"\nüîÑ Processing...")
    transcript = transcribe_audio_streaming(test_file)

    # Release whatever the run left behind
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"\n‚úÖ Done!")
    print(f"üìä Length: {len(transcript)} chars, {len(transcript.split())} words")
    print(f"\n{'‚îÄ' * 60}")
    print("TRANSCRIPT:")
    print('‚îÄ' * 60)
    # Show at most the first 300 characters of the transcript
    if len(transcript) > 300:
        print(transcript[:300] + "...")
    else:
        print(transcript)
    print('‚îÄ' * 60)

In [None]:
import pandas as pd

# Save the transcript of the test file from the previous cell for inspection.
# `transcript` only exists when at least one audio file was found, and a single
# string has to be wrapped in a list to form a one-row DataFrame.
if len(audio_files) > 0:
    df = pd.DataFrame({"transcript": [transcript]})
    df.to_csv("test_transcript.csv", index=False)
else:
    print("No test transcript to save")


In [None]:
# Transcribe every test file with the streaming approach, collecting one
# record (file stem + transcript) per file
results = []

print(f"\nProcessing {len(audio_files)} files with streaming approach...")
print("=" * 60)

for i, audio_path in enumerate(audio_files):
    # File name without directory and extension
    filename = os.path.splitext(os.path.basename(audio_path))[0]

    print(f"\n[{i+1}/{len(audio_files)}] {filename}")

    # Start each file with as much free memory as possible
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # A failing file is reported and recorded with an empty transcript so the
    # remaining files are still processed
    try:
        transcript = transcribe_audio_streaming(audio_path)
        print(f"  ‚úÖ {len(transcript)} chars, {len(transcript.split())} words")
    except Exception as e:
        print(f"  ‚ùå Error: {e}")
        transcript = ""

    results.append(dict(filename=filename, transcript=transcript))

    # Release memory held by this file before moving on
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"\n{'=' * 60}")
print(f"‚úÖ Completed {len(results)} files")

In [None]:
# Create the submission table. The columns are named explicitly so that the
# CSV still gets its header row when `results` is empty.
df = pd.DataFrame(results, columns=["filename", "transcript"])

# Save to CSV
output_path = "submission.csv"
df.to_csv(output_path, index=False)

print(f"Saved {len(df)} transcriptions to {output_path}")
df.head(10)