### noise reduction using deepfilternet

In [None]:
from df.enhance import enhance, init_df, load_audio, save_audio
from df.utils import download_file

if __name__ == "__main__":
    # Load default model
    model, df_state, _ = init_df()
    # Download and open some audio file. You use your audio files here
    # audio_path = download_file(
    #     "https://github.com/Rikorose/DeepFilterNet/raw/e031053/assets/noisy_snr0.wav",
    #     download_dir=".",
    # )
    audio_path = 'audio1.wav'
    audio, _ = load_audio(audio_path, sr=df_state.sr())
    # Denoise the audio
    enhanced = enhance(model, df_state, audio)
    # Save for listening
    save_audio("deepfilternet_output/audio1.wav", enhanced, df_state.sr())

### normalization/DRC

In [3]:
import os
import subprocess
from pathlib import Path

# ==== SETTINGS ====
INPUT_DIR = "deepfilternet_output/"
OUTPUT_DIR = "output/"
APPLY_NORMALIZATION = True  # Set to False if you only want compression

# ==== ENSURE OUTPUT DIRECTORY EXISTS ====
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# ==== COMPRESSION AND NORMALIZATION SETTINGS ====
compress_filter = "acompressor=threshold=-30dB:ratio=6:attack=10:release=1000"
normalize_filter = "loudnorm=I=-16:TP=-1.5:LRA=11"

# ==== PROCESS EACH .WAV FILE ====
for filename in os.listdir(INPUT_DIR):
    if not filename.lower().endswith(".wav"):
        continue

    input_path = os.path.join(INPUT_DIR, filename)
    temp_path = os.path.join(OUTPUT_DIR, f"temp_{filename}")
    output_path = os.path.join(OUTPUT_DIR, filename)

    print(f"Processing: {filename}")

    # Step 1: Apply Dynamic Range Compression
    subprocess.run([
        "ffmpeg", "-y", "-i", input_path,
        "-af", compress_filter,
        temp_path
    ], check=True)

    # Step 2: Apply Loudness Normalization (Optional)
    if APPLY_NORMALIZATION:
        subprocess.run([
            "ffmpeg", "-y", "-i", temp_path,
            "-af", normalize_filter,
            output_path
        ], check=True)
        os.remove(temp_path)  # Clean up intermediate file
    else:
        os.rename(temp_path, output_path)

    print(f"Saved to: {output_path}")

print("✅ All files processed.")


Processing: input2.wav


ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --ena

Saved to: output/input2.wav
Processing: input1.wav


[out#0/wav @ 0x55cd441b9a80] video:0kB audio:4354kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.001750%
size=    4354kB time=00:00:23.21 bitrate=1536.6kbits/s speed= 127x    
ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --

Saved to: output/input1.wav
Processing: test2.wav


[out#0/wav @ 0x558dac2a3c80] video:0kB audio:5211kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.001462%
size=    5212kB time=00:00:27.77 bitrate=1537.0kbits/s speed= 129x    
ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --

Saved to: output/test2.wav
Processing: audio1.wav


[aist#0:0/pcm_s16le @ 0x563ed04aa480] Guessed Channel Layout: stereo
Input #0, wav, from 'deepfilternet_output/audio1.wav':
  Metadata:
    encoder         : Lavf60.16.100
  Duration: 00:00:25.61, bitrate: 1536 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, 2 channels, s16, 1536 kb/s
Stream mapping:
  Stream #0:0 -> #0:0 (pcm_s16le (native) -> pcm_s16le (native))
Press [q] to stop, [?] for help
Output #0, wav, to 'output/temp_audio1.wav':
  Metadata:
    ISFT            : Lavf60.16.100
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, stereo, s16, 1536 kb/s
    Metadata:
      encoder         : Lavc60.31.102 pcm_s16le
[out#0/wav @ 0x563ed0483a80] video:0kB audio:4802kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.001586%
size=    4802kB time=00:00:25.60 bitrate=1536.7kbits/s speed= 109x    
ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configur

Saved to: output/audio1.wav
✅ All files processed.


[out#0/wav @ 0x55bf5eee7b40] video:0kB audio:19209kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000519%
size=   19209kB time=00:00:22.80 bitrate=6901.7kbits/s speed=21.8x    


In [None]:
import whisper
from pydub import AudioSegment
import os
import json

# === Configuration ===
model = whisper.load_model("medium.en")  # or "large-v3" if available
audio_path = "data/filler_word_audio.wav"
output_audio_path = "fwr/filler_word_audio.wav"
output_text_path = "fwr/filler_word_audio.txt"
output_json_path = "fwr/filler_word_audio.json"

filler_words = {"um", "uh", "ah", "erm"}
buffer_sec = 0.15  # seconds before/after each filler word to remove
fade_ms = 20       # fade duration in milliseconds

# === Step 1: Transcribe audio ===
print("🎙️ Transcribing audio...")
result = model.transcribe(audio_path, word_timestamps=True)

# Save full transcription
full_text = result["text"].strip()
os.makedirs("fwr", exist_ok=True)
with open(output_text_path, "w", encoding="utf-8") as f:
    f.write(full_text)
print(f"\n📝 Transcription saved to: {output_text_path}")
print(f"\n📝 Full Transcription:\n{full_text}")

# Save word-level timestamps
words = []
for segment in result["segments"]:
    for word_info in segment.get("words", []):
        word_entry = {
            "word": word_info["word"].strip(),
            "start": round(word_info["start"], 2),
            "end": round(word_info["end"], 2)
        }
        words.append(word_entry)

with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(words, f, indent=2)
print(f"\n🕒 Word timestamps saved to: {output_json_path}")
print(f"\n🔍 Timestamp preview:\n{words[:5]}")

# === Step 2: Remove filler words ===
print("\n🚫 Removing filler words with buffer and fades...")

audio = AudioSegment.from_wav(audio_path)
filler_intervals = []
for word_info in words:
    word = word_info["word"].lower()
    if word in filler_words:
        start = max(0, word_info["start"] - buffer_sec)
        end = min(audio.duration_seconds, word_info["end"] + buffer_sec)
        filler_intervals.append((start, end))

# Merge overlapping intervals
def merge_intervals(intervals):
    if not intervals:
        return []
    intervals.sort()
    merged = [intervals[0]]
    for current in intervals[1:]:
        last = merged[-1]
        if current[0] <= last[1]:
            merged[-1] = (last[0], max(last[1], current[1]))
        else:
            merged.append(current)
    return merged

filler_intervals = merge_intervals(filler_intervals)

# Build keep intervals
keep_intervals = []
prev_end = 0
for start, end in filler_intervals:
    if start > prev_end:
        keep_intervals.append((prev_end, start))
    prev_end = end
if prev_end < audio.duration_seconds:
    keep_intervals.append((prev_end, audio.duration_seconds))

# Stitch audio
clean_audio = AudioSegment.empty()
for start, end in keep_intervals:
    segment = audio[start * 1000 : end * 1000]
    segment = segment.fade_in(fade_ms).fade_out(fade_ms)
    clean_audio += segment

# Export final cleaned audio
clean_audio.export(output_audio_path, format="wav")
print(f"\n✅ Cleaned audio saved to: {output_audio_path}")


🎙️ Transcribing audio...

📝 Transcription saved to: fwr/filler_word_audio.txt

📝 Full Transcription:
Do you know the theories? This is, look, some things, everything you know can be accepted from somewhere, everywhere. But I think we are already, with Jason, lots of people are talking about it and they are going to die.

🕒 Word timestamps saved to: fwr/filler_word_audio.json

🔍 Timestamp preview:
[{'word': 'Do', 'start': 0.0, 'end': 0.0}, {'word': 'you', 'start': 0.0, 'end': 0.0}, {'word': 'know', 'start': 0.0, 'end': 0.0}, {'word': 'the', 'start': 0.0, 'end': 0.0}, {'word': 'theories?', 'start': 0.0, 'end': 0.52}]

🚫 Removing filler words with buffer and fades...

✅ Cleaned audio saved to: fwr/filler_word_audio.wav


In [None]:
import whisper
from pydub import AudioSegment
import os
import json

def generate_transcription_and_timestamps(model, audio_path):
    """
    Transcribe audio using Whisper with word timestamps.
    Returns (full_text, words) where words is a list of dicts with 'word', 'start', 'end'.
    """
    print("🎙️ Transcribing audio...")
    result = model.transcribe(audio_path, word_timestamps=True)
    full_text = result["text"].strip()

    words = []
    for segment in result["segments"]:
        for word_info in segment.get("words", []):
            words.append({
                "word": word_info["word"].strip(),
                "start": round(word_info["start"], 2),
                "end": round(word_info["end"], 2)
            })

    return full_text, words

def save_transcript_text(text, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"📝 Transcription saved to: {path}")

def save_word_timestamps(words, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(words, f, indent=2)
    print(f"🕒 Word timestamps saved to: {path}")

def remove_non_word_regions(audio_path, words, filler_words, buffer_sec=0.05, fade_ms=20):
    """
    Keep only audio segments corresponding to non-filler words.
    Remove all other parts of the audio.
    """
    print("\n🔍 Keeping only non-filler word segments...")
    audio = AudioSegment.from_wav(audio_path)

    keep_intervals = []
    for w in words:
        word = w["word"].strip().lower()
        if word not in filler_words:
            start = max(0, w["start"] - buffer_sec)
            end = min(audio.duration_seconds, w["end"] + buffer_sec)
            keep_intervals.append((start, end))

    # Merge close intervals
    def merge_intervals(intervals, gap_threshold=0.1):
        if not intervals:
            return []
        intervals.sort()
        merged = [intervals[0]]
        for current in intervals[1:]:
            last = merged[-1]
            if current[0] <= last[1] + gap_threshold:
                merged[-1] = (last[0], max(last[1], current[1]))
            else:
                merged.append(current)
        return merged

    merged_intervals = merge_intervals(keep_intervals)

    # Build clean audio
    clean_audio = AudioSegment.empty()
    for start, end in merged_intervals:
        segment = audio[start * 1000: end * 1000]
        segment = segment.fade_in(fade_ms).fade_out(fade_ms)
        clean_audio += segment

    return clean_audio

def main():
    model_name = "medium.en"  
    audio_path = "data/filler_word_audio.wav"
    output_audio_path = "fwr/cleaned_audio.wav"
    output_text_path = "fwr/transcript.txt"
    output_json_path = "fwr/word_timestamps.json"
    filler_words = {"um", "uh", "ah", "erm", "mm", "hmm"}
    buffer_sec = 0.05  
    fade_ms = 20

    # Load Whisper model
    model = whisper.load_model(model_name)

    # Generate transcription & timestamps
    full_text, words = generate_transcription_and_timestamps(model, audio_path)

    # Save transcript and timestamps
    save_transcript_text(full_text, output_text_path)
    save_word_timestamps(words, output_json_path)

    # Remove non-word regions and get cleaned audio
    clean_audio = remove_non_word_regions(audio_path, words, filler_words, buffer_sec, fade_ms)

    # Export cleaned audio
    os.makedirs(os.path.dirname(output_audio_path), exist_ok=True)
    clean_audio.export(output_audio_path, format="wav")
    print(f"\n✅ Cleaned audio saved to: {output_audio_path}")

if __name__ == "__main__":
    main()


🎙️ Transcribing audio...
📝 Transcription saved to: fwr/transcript.txt
🕒 Word timestamps saved to: fwr/word_timestamps.json

🔍 Keeping only non-filler word segments...

✅ Cleaned audio saved to: fwr/cleaned_audio.wav


: 