In [5]:
!pip install numpy scipy

import numpy as np
import scipy.io.wavfile as wav




In [6]:
# Paths of the seven recordings to analyse: vm1_output.wav through vm7_output.wav.
files = [f"/content/vm{index}_output.wav" for index in range(1, 8)]

def _frame_starts(n_samples, window, hop):
    """Return the start index of every full analysis window of ``window`` samples."""
    # ``+ 1`` keeps the final window that ends exactly on the last sample.
    return range(0, n_samples - window + 1, hop)


def detect_voicemail_start(
    filepath,
    *,
    window_s=0.05,
    beep_band=(900, 1400),
    beep_ratio=3.5,
    beep_tail_s=0.1,
    silence_rms=0.02,
    min_silence_s=0.8,
):
    """Estimate when a caller can start speaking in a voicemail recording.

    The audio is read from ``filepath``, mixed down to mono, peak-normalised
    and analysed in half-overlapping windows. Two detectors run in order:

    1. **Beep** - for each window, the mean spectral magnitude inside
       ``beep_band`` is divided by the mean magnitude over the whole
       spectrum. If any window exceeds ``beep_ratio``, the start time of the
       *last* such window plus ``beep_tail_s`` is returned.
    2. **Silence** - otherwise, the first run of consecutive windows whose
       RMS is below ``silence_rms`` and that spans at least
       ``min_silence_s`` is located; the start time of the window that
       completes the run is returned.

    Args:
        filepath: Path (or file object) accepted by ``scipy.io.wavfile.read``.
        window_s: Analysis window length in seconds; the hop is half a window.
        beep_band: ``(low_hz, high_hz)`` inclusive frequency band of the beep.
        beep_ratio: Band-to-total magnitude ratio above which a window counts
            as a beep.
        beep_tail_s: Seconds added to the last beep window's start time.
        silence_rms: RMS threshold (on the peak-normalised signal) below
            which a window counts as silent.
        min_silence_s: Minimum duration of continuous silence, in seconds.

    Returns:
        ``(time_in_seconds, method)`` where ``method`` is ``"beep"`` or
        ``"silence"`` and the time is rounded to two decimals, or
        ``(None, "unknown")`` when neither detector fires or the file is
        shorter than one window.

    Note:
        The beep detector uses the last matching window anywhere in the
        file, so any later window dominated by ``beep_band`` content moves
        the result.
    """
    sr, data = wav.read(filepath)

    # Work in float64: ``np.abs`` on raw int16 overflows for -32768, which
    # would corrupt the peak used for normalisation.
    data = np.asarray(data, dtype=np.float64)
    if data.ndim > 1:
        data = data.mean(axis=1)

    if data.size == 0:
        return None, "unknown"

    # Leave all-zero audio untouched instead of dividing by zero (NaNs would
    # make every threshold comparison false and hide genuine silence).
    peak = np.max(np.abs(data))
    if peak > 0:
        data = data / peak

    window = int(window_s * sr)
    if window < 1 or len(data) < window:
        return None, "unknown"
    hop = max(1, window // 2)  # a zero hop would make range() raise
    starts = _frame_starts(len(data), window, hop)

    # Every window has the same length, so the frequency axis and band mask
    # are loop-invariant.
    freqs = np.fft.rfftfreq(window, 1 / sr)
    in_band = (freqs >= beep_band[0]) & (freqs <= beep_band[1])

    last_beep_start = None
    if in_band.any():  # band lies above Nyquist otherwise; nothing to measure
        for start in starts:
            spectrum = np.abs(np.fft.rfft(data[start:start + window]))
            # Small epsilon keeps the ratio finite for all-zero windows.
            ratio = spectrum[in_band].mean() / (spectrum.mean() + 1e-6)
            if ratio > beep_ratio:
                last_beep_start = start

    if last_beep_start is not None:
        return round(last_beep_start / sr + beep_tail_s, 2), "beep"

    # Fallback: first sufficiently long run of low-RMS windows.
    silence_frames = int(min_silence_s / (hop / sr))
    quiet_run = 0
    for start in starts:
        segment = data[start:start + window]
        rms = np.sqrt(np.mean(segment ** 2))
        if rms < silence_rms:
            quiet_run += 1
            if quiet_run >= silence_frames:
                return round(start / sr, 2), "silence"
        else:
            quiet_run = 0

    return None, "unknown"


In [7]:
# Run the detector on every recording and print one summary line per file:
# "<file name> => <time in seconds or None> | <detection method>".
print("Starting voicemail detection...\n")

for wav_path in files:
    start_time, how = detect_voicemail_start(wav_path)
    # Show only the file name, i.e. the part after the last "/".
    file_name = wav_path.rsplit("/", 1)[-1]
    print(file_name, "=>", start_time, "|", how)

print("\nDone.")


Starting voicemail detection...

vm1_output.wav => 0.78 | silence
vm2_output.wav => 7.92 | beep
vm3_output.wav => 15.47 | beep
vm4_output.wav => 3.5 | beep
vm5_output.wav => 11.88 | beep
vm6_output.wav => 3.62 | beep
vm7_output.wav => 13.3 | silence

Done.
