In [1]:
import librosa
import soundfile as sf
import numpy as np
import torch
import torchaudio
from scipy import signal
import noisereduce as nr
from pathlib import Path
import multiprocessing as mp
from typing import Tuple, Dict, List, Optional
import logging
from dataclasses import dataclass
from concurrent.futures import ProcessPoolExecutor, as_completed
import warnings
warnings.filterwarnings('ignore')
from pyannote.audio import Pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import numpy as np
import librosa
import soundfile as sf

# Try to import noisereduce
try:
    import noisereduce as nr
    NOISEREDUCE_AVAILABLE = True
except ImportError:
    NOISEREDUCE_AVAILABLE = False
    print("Warning: noisereduce not installed. Install with: pip install noisereduce")

class SimpleAudioCleaner:
    """
    Simple audio processor that ONLY:
    1. Converts to mono
    2. Removes noise
    3. Removes silence
    4. Downsamples for transcription
    """
    
    def __init__(self, silence_threshold_db=-40, min_silence_duration_ms=500, 
                 noise_reduce_strength=0.6, target_sample_rate=16000):
        """
        Args:
            silence_threshold_db: Volume level below which audio is considered silence (default -40dB)
            min_silence_duration_ms: Minimum duration of silence to remove (default 500ms)
            noise_reduce_strength: Strength of noise reduction 0.0-1.0 (default 0.6)
            target_sample_rate: Target sample rate for transcription (default 16000Hz, set to None to keep original)
        """
        self.silence_threshold_db = silence_threshold_db
        self.min_silence_duration_ms = min_silence_duration_ms
        self.noise_reduce_strength = noise_reduce_strength
        self.target_sample_rate = target_sample_rate

    
    def convert_to_mono(self, audio):
        """Convert stereo to mono by averaging channels"""
        if audio.ndim == 2:
            # Average the channels
            mono = np.mean(audio, axis=0)
            print(f"   ✓ Converted stereo to mono")
            return mono
        else:
            print(f"   ✓ Already mono")
            return audio
    
    def remove_noise(self, audio, sr):
        """Remove background noise with adjustable strength"""
        if not NOISEREDUCE_AVAILABLE:
            print(f"   ⚠ Skipping noise reduction (noisereduce not installed)")
            return audio
        
        try:
            # Gentle noise reduction with adjustable parameters
            cleaned = nr.reduce_noise(
                y=audio, 
                sr=sr,
                prop_decrease=self.noise_reduce_strength,  # How much to reduce noise (0.0-1.0)
                stationary=True,  # Assume noise is consistent throughout
                freq_mask_smooth_hz=500,  # Smoothing in frequency domain (prevents artifacts)
                time_mask_smooth_ms=50    # Smoothing in time domain (prevents artifacts)
            )
            print(f"   ✓ Removed background noise (strength: {self.noise_reduce_strength})")
            return cleaned
        except Exception as e:
            print(f"   ⚠ Noise reduction failed: {e}")
            return audio
    
    def remove_silence(self, audio, sr):
        """Remove silent parts from audio"""
        
        # Use librosa to detect non-silent intervals
        intervals = librosa.effects.split(
            audio,
            top_db=-self.silence_threshold_db,
            frame_length=2048,
            hop_length=512
        )
        
        # Concatenate non-silent parts
        non_silent_parts = []
        
        for start, end in intervals:
            non_silent_parts.append(audio[start:end])
        
        if non_silent_parts:
            # Join all non-silent parts
            cleaned = np.concatenate(non_silent_parts)
            
            # Calculate how much was removed
            original_duration = len(audio) / sr
            new_duration = len(cleaned) / sr
            removed_duration = original_duration - new_duration
            removed_percent = (removed_duration / original_duration) * 100
            
            print(f"   ✓ Removed {removed_duration:.1f}s of silence ({removed_percent:.1f}%)")
            return cleaned
        else:
            print(f"   ⚠ No audio remained after silence removal, keeping original")
            return audio
    
    def downsample_audio(self, audio, original_sr, target_sr):
        """Downsample audio to target sample rate for transcription"""
        if target_sr is None or original_sr == target_sr:
            print(f"   ✓ Keeping original sample rate: {original_sr}Hz")
            return audio, original_sr
        
        try:
            # Resample using librosa's high-quality resampler
            resampled = librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)
            print(f"   ✓ Downsampled: {original_sr}Hz → {target_sr}Hz")
            return resampled, target_sr
        except Exception as e:
            print(f"   ⚠ Downsampling failed: {e}, keeping original sample rate")
            return audio, original_sr
    
    def process(self, input_path, output_path):
        """
        Process audio file: convert to mono, remove noise, remove silence, downsample
        
        Args:
            input_path: Path to input audio file
            output_path: Path to save processed audio
        """
        
        print(f"\n{'='*50}")
        print(f"Processing: {input_path}")
        print(f"{'='*50}")
        
        # Step 1: Load audio (keeping original sample rate)
        print(f"\n1. Loading audio...")
        audio, sr = librosa.load(input_path, sr=None, mono=False)
        original_duration = len(audio.flatten()) / sr if audio.ndim == 2 else len(audio) / sr
        print(f"   ✓ Loaded: {original_duration:.1f}s, {sr}Hz")
        
        # Step 2: Convert to mono
        print(f"\n2. Converting to mono...")
        audio = self.convert_to_mono(audio)
        
        # Step 3: Remove noise
        print(f"\n3. Removing noise...")
        audio = self.remove_noise(audio, sr)
        
        # Step 4: Remove silence
        print(f"\n4. Removing silence...")
        audio = self.remove_silence(audio, sr)
        
        # Step 5: Downsample for transcription
        print(f"\n5. Downsampling for transcription...")
        audio, sr = self.downsample_audio(audio, sr, self.target_sample_rate)
        
        # Step 6: Save
        print(f"\n6. Saving processed audio...")
        sf.write(output_path, audio, sr, subtype='PCM_16')
        
        # Final stats
        final_duration = len(audio) / sr
        print(f"\n{'='*50}")
        print(f"✓ COMPLETE!")
        print(f"✓ Saved to: {output_path}")
        print(f"✓ Sample rate: {sr}Hz (optimized for transcription)")
        print(f"✓ Original: {original_duration:.1f}s → Final: {final_duration:.1f}s")
        print(f"✓ Total removed: {original_duration - final_duration:.1f}s")
        print(f"{'='*50}\n")
        
        return audio, sr

# Simple usage
if __name__ == "__main__":
    
    # Create cleaner with custom settings if needed
    cleaner = SimpleAudioCleaner(
        silence_threshold_db=-40,  # Adjust this if it's removing too much/little
        min_silence_duration_ms=700,  # Minimum silence duration to remove
        noise_reduce_strength=0.6,  # 0.0-1.0: Lower = gentler noise reduction
        target_sample_rate=16000  # 16kHz is optimal for most speech-to-text models
                                   # Set to None to keep original sample rate
    )
    
    # Process your file
    input_file = "interview_audio.wav"
    output_file = "interview_cleaned.wav"
    
    try:
        audio, sr = cleaner.process(input_file, output_file)
        print("✓ Audio cleaning complete!")
        print(f"✓ Audio is ready for transcription at {sr}Hz")
        
    except FileNotFoundError:
        print(f"Error: Could not find {input_file}")
    except Exception as e:
        print(f"Error: {e}")


Processing: interview_audio.wav

1. Loading audio...
   ✓ Loaded: 394.0s, 48000Hz

2. Converting to mono...
   ✓ Converted stereo to mono

3. Removing noise...
   ✓ Removed background noise (strength: 0.6)

4. Removing silence...
   ✓ Removed 39.5s of silence (20.1%)

5. Downsampling for transcription...
   ✓ Downsampled: 48000Hz → 16000Hz

6. Saving processed audio...

✓ COMPLETE!
✓ Saved to: interview_cleaned.wav
✓ Sample rate: 16000Hz (optimized for transcription)
✓ Original: 394.0s → Final: 157.5s
✓ Total removed: 236.5s

✓ Audio cleaning complete!
✓ Audio is ready for transcription at 16000Hz
