In [1]:
import librosa, soundfile as sf
import noisereduce as nr
import numpy as np
from pydub import AudioSegment, silence

# Tunable parameters for this preprocessing cell.
TARGET_SAMPLE_RATE = 16000  # Hz; used for both the pydub resample and the librosa load
MIN_SILENCE_MS = 800        # a pause must last at least this long to be cut out
SILENCE_MARGIN_DB = 35      # silence threshold = clip's average loudness minus this margin
KEEP_SILENCE_MS = 200       # padding kept around each retained chunk

# 1️⃣ Load audio and convert it to mono at the target sample rate
audio = AudioSegment.from_file("sample.mp3")
audio = audio.set_channels(1).set_frame_rate(TARGET_SAMPLE_RATE)

# 2️⃣ Remove long silences (pauses >= 800 ms that are more than 35 dB quieter
#    than the clip's average level). Note the threshold is RELATIVE to
#    audio.dBFS, not an absolute -35 dBFS.
nonsilent = silence.split_on_silence(
    audio,
    min_silence_len=MIN_SILENCE_MS,
    silence_thresh=audio.dBFS - SILENCE_MARGIN_DB,
    keep_silence=KEEP_SILENCE_MS,  # keep some padding
)
# Fail early with a clear message instead of exporting a zero-length WAV and
# passing it on to the denoiser when nothing but silence was detected.
if not nonsilent:
    raise ValueError(
        "No non-silent audio found in 'sample.mp3'; "
        "nothing to denoise (check the file or relax the silence threshold)."
    )

# Stitch the retained chunks back together. AudioSegment.empty() is a
# zero-duration seed, so the result takes its format from the chunks.
cleaned_audio = sum(nonsilent, AudioSegment.empty())
cleaned_audio.export("temp_nosilence.wav", format="wav")

# 3️⃣ Noise reduction (spectral gating)
# The silence-stripped audio is handed over via a WAV file and reloaded as a
# NumPy array at the same sample rate for noisereduce.
y, sr = librosa.load("temp_nosilence.wav", sr=TARGET_SAMPLE_RATE)
reduced_noise = nr.reduce_noise(y=y, sr=sr)

sf.write("sample_denoised.wav", reduced_noise, sr)
print("✅ Saved denoised, silence-removed audio as 'sample_denoised.wav'")

✅ Saved denoised, silence-removed audio as 'sample_denoised.wav'


In [1]:
from faster_whisper import WhisperModel

# Load the "medium" Whisper checkpoint on the GPU using int8 compute.
model = WhisperModel(
    "medium",
    device="cuda",
    compute_type="int8",
)

# Transcribe the denoised file; `info` is kept alongside the segments even
# though this cell only prints the segments.
segments, info = model.transcribe("sample_denoised.wav")

# Emit one line per segment: "[start -> end] text", times to two decimals.
line_template = "[%.2fs -> %.2fs] %s"
for segment in segments:
    timed_line = line_template % (segment.start, segment.end, segment.text)
    print(timed_line)

: 