In [None]:
import numpy as np
from scipy.signal import butter, sosfilt, sosfiltfilt, hilbert, resample
import ipywidgets as widgets
from IPython.display import display, Audio
from scipy.io.wavfile import read

In [143]:
def envelope(signal):
    """Return the amplitude envelope of ``signal``.

    The envelope is taken as the magnitude of the analytic signal that
    ``scipy.signal.hilbert`` builds from the input.

    Parameters
    ----------
    signal : array_like
        Real-valued samples.

    Returns
    -------
    numpy.ndarray
        Non-negative envelope with the same shape as ``signal``.
    """
    return np.abs(hilbert(signal))

def audio_widget(data, fs):
    """Build an ``ipywidgets.Output`` that contains an audio player.

    The player is rendered inside the Output widget's capture context so it
    can be placed in a widget layout (e.g. a ``VBox``).

    Parameters
    ----------
    data : array_like
        Audio samples handed to ``IPython.display.Audio``.
    fs : int
        Sample rate in Hz, passed as ``rate``.

    Returns
    -------
    ipywidgets.Output
        The widget holding the rendered player.
    """
    container = widgets.Output()
    with container:
        player = Audio(data, rate=fs)
        display(player)
    return container

def frame_signal(x, frame_size, hop_size):
    """Split a 1-D signal into overlapping, fixed-length frames.

    Frame ``i`` holds ``x[i * hop_size : i * hop_size + frame_size]``.
    Trailing samples that do not fill a complete frame are dropped.

    Parameters
    ----------
    x : array_like
        1-D signal.
    frame_size : int
        Number of samples per frame (must be >= 1).
    hop_size : int
        Step between consecutive frame starts (must be >= 1).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_frames, frame_size)``. When ``x`` is
        shorter than one frame the result has shape ``(0, frame_size)``.

    Raises
    ------
    ValueError
        If ``frame_size`` or ``hop_size`` is smaller than 1.
    """
    if frame_size < 1 or hop_size < 1:
        raise ValueError("frame_size and hop_size must be >= 1")

    x = np.asarray(x)
    # A signal shorter than one frame would otherwise yield a negative frame
    # count and make np.zeros() fail; return an empty frame matrix instead.
    if len(x) < frame_size:
        return np.zeros((0, frame_size))

    num_frames = 1 + (len(x) - frame_size) // hop_size
    # Row i of the index matrix lists the sample positions of frame i.
    starts = hop_size * np.arange(num_frames)
    sample_idx = starts[:, None] + np.arange(frame_size)

    frames = np.zeros((num_frames, frame_size))
    frames[...] = x[sample_idx]
    return frames

def zero_crossing_rate(frame):
    """Mean absolute first difference of the sign sequence of ``frame``.

    A direct flip between a positive and a negative sample contributes 2 to
    the sum (not 1), so an alternating signal scores 2.0 rather than 1.0.
    Thresholds applied to this value must use the same scale.
    """
    signs = np.sign(frame)
    sign_steps = np.diff(signs)
    return np.abs(sign_steps).mean()

def autocorr_pitch(frame, fs, fmin=50, fmax=400):
    """Estimate the pitch of ``frame`` from its autocorrelation peak.

    The autocorrelation is searched for its maximum over the lags that
    correspond to frequencies between ``fmax`` and ``fmin``; the pitch is
    ``fs`` divided by the winning lag.

    Parameters
    ----------
    frame : array_like
        1-D block of samples.
    fs : float
        Sample rate in Hz.
    fmin, fmax : float
        Lowest / highest pitch to consider, in Hz (``0 < fmin < fmax``).

    Returns
    -------
    float
        Estimated fundamental frequency in Hz.

    Raises
    ------
    ValueError
        If the frequency range is invalid, or the frame is too short to
        contain any lag inside the search range.
    """
    if not 0 < fmin < fmax:
        raise ValueError("fmin and fmax must satisfy 0 < fmin < fmax")

    corr = np.correlate(frame, frame, mode='full')
    corr = corr[len(corr)//2:]  # keep non-negative lags only

    # Lag 0 always holds the autocorrelation maximum, so it must never be part
    # of the search range (it would give f0 = fs / 0).
    min_lag = max(1, int(fs / fmax))
    # The search range is [min_lag, max_lag); it cannot extend past the frame.
    max_lag = min(int(fs / fmin), len(corr))
    if max_lag <= min_lag:
        raise ValueError(
            f"frame of {len(corr)} samples has no lag in the search range "
            f"for fs={fs}, fmin={fmin}, fmax={fmax}"
        )

    lag = int(np.argmax(corr[min_lag:max_lag])) + min_lag
    return fs / lag

def voiced_unvoiced(frame, energy_thresh=1e-3, zcr_thresh=0.1):
    """Label a frame as ``"voiced"`` or ``"unvoiced"``.

    A frame counts as voiced only when its energy (sum of squared samples)
    is above ``energy_thresh`` *and* its ``zero_crossing_rate`` is below
    ``zcr_thresh``; every other frame is unvoiced.
    """
    frame_energy = np.sum(frame**2)
    crossing_rate = zero_crossing_rate(frame)
    is_voiced = frame_energy > energy_thresh and crossing_rate < zcr_thresh
    return "voiced" if is_voiced else "unvoiced"


def generate_excitation(frame, fs):
    """Create a carrier excitation for one frame.

    Voiced frames get a unit impulse train whose period is ``int(fs / f0)``
    samples, with ``f0`` estimated by ``autocorr_pitch``; the train always
    starts at the first sample of the frame. Unvoiced frames get Gaussian
    noise from NumPy's global random state, scaled by 0.1.

    Returns
    -------
    tuple
        ``(excitation, label)`` where ``excitation`` has ``len(frame)``
        samples and ``label`` is ``"voiced"`` or ``"unvoiced"``.
    """
    label = voiced_unvoiced(frame)
    n_samples = len(frame)

    if label != "voiced":
        # Hiss for unvoiced frames
        return np.random.randn(n_samples) * 0.1, label

    # Buzz for voiced frames: one pulse per estimated pitch period
    pitch_hz = autocorr_pitch(frame, fs)
    period = int(fs / pitch_hz)
    pulses = np.zeros(n_samples)
    pulses[::period] = 1.0
    return pulses, label

def make_bands(fs,
               n_bands=8,
               fmin=200.0,
               fmax=None,
               log=True,
               min_bandwidth_hz=1.0,
               top_margin=0.999):
    """
    Create n_bands contiguous (low, high) frequency tuples between fmin and fmax.

    Parameters
    ----------
    fs : float
        Sample rate in Hz.
    n_bands : int
        Number of bands to return.
    fmin : float
        Lower edge of the first band in Hz (must be > 0).
    fmax : float or None
        Upper edge of the last band in Hz; defaults to Nyquist. Values above
        ``top_margin * Nyquist`` are clamped down to that limit.
    log : bool
        Logarithmically spaced edges when True, linearly spaced otherwise.
    min_bandwidth_hz : float
        Minimum width enforced on every band to avoid degenerate filters.
    top_margin : float
        Fraction of Nyquist used as the top cap (must satisfy 0 < value < 1).

    Returns
    -------
    list of (float, float)
        Band edges in Hz, lowest band first.

    Raises
    ------
    ValueError
        If ``top_margin`` is outside (0, 1), ``fmin <= 0``, ``fmax <= fmin``
        (before or after clamping to the top cap), or a band ends up with
        ``low >= high``.
    """
    nyq = fs / 2.0
    if fmax is None:
        fmax = nyq

    top_margin = float(top_margin)
    if not 0.0 < top_margin < 1.0:
        raise ValueError("top_margin must satisfy 0 < top_margin < 1")
    if fmin <= 0:
        raise ValueError("fmin must be > 0 Hz")
    if fmax <= fmin:
        raise ValueError("fmax must be > fmin")

    # Ensure top edge is strictly below Nyquist
    max_allowed = nyq * top_margin
    fmax = min(fmax, max_allowed)
    # Clamping can push fmax to or below fmin (e.g. fmin close to Nyquist).
    # Without this check the edges would run backwards and every band would
    # silently collapse onto the same sliver just under the cap.
    if fmax <= fmin:
        raise ValueError(
            f"fmin={fmin} Hz is not below the usable top edge {fmax} Hz for fs={fs}"
        )

    if log:
        edges = np.logspace(np.log10(fmin), np.log10(fmax), n_bands + 1)
    else:
        edges = np.linspace(fmin, fmax, n_bands + 1)

    bands = []
    for i in range(n_bands):
        low = float(edges[i])
        high = float(edges[i + 1])
        if high - low < min_bandwidth_hz:
            # Expand tiny bands slightly to avoid degenerate filters
            high = low + min_bandwidth_hz
            if high > max_allowed:
                # No room above: shift the band down so it ends at the cap
                high = max_allowed
                low = high - min_bandwidth_hz
        # final safety clamp
        low = max(low, 1e-6)
        high = min(high, max_allowed)
        if low >= high:
            raise ValueError(f"Invalid band edges: low={low} >= high={high}")
        bands.append((low, high))
    return bands


def bandpass_filter(x, lowcut, highcut, fs, order=4, zero_phase=True):
    """
    Butterworth band-pass filter implemented with second-order sections.

    ``lowcut`` / ``highcut`` are given in Hz and are clamped into the open
    interval (0, Nyquist) before being normalised. With ``zero_phase=True``
    the signal is filtered forward and backward (``sosfiltfilt``); otherwise
    a single causal pass (``sosfilt``) is applied.

    Raises ValueError when the normalised edges do not satisfy
    0 < low < high < 1.
    """
    nyquist = fs / 2.0

    # Keep the edges away from 0 Hz and from Nyquist itself
    low = max(lowcut, 1e-6) / nyquist
    high = min(highcut, nyquist * 0.999999) / nyquist

    edges_ok = 0.0 < low < high < 1.0
    if not edges_ok:
        raise ValueError(f"Normalized band edges must satisfy 0 < low < high < 1, got low={low}, high={high}")

    # SOS form is numerically more stable than (b, a) coefficients
    sections = butter(order, [low, high], btype='band', output='sos')
    apply_filter = sosfiltfilt if zero_phase else sosfilt
    return apply_filter(sections, x)

def vocode(signal, fs, carrier=None, n_bands=4, frame_size=400, hop_size=160, log_bands=True):
    """Channel vocoder: impose the per-band envelope of ``signal`` on a carrier.

    The speech is peak-normalised, split into ``n_bands`` band-pass channels,
    and the Hilbert envelope of each speech band modulates the same band of
    the carrier. The channel outputs are summed and scaled to unit RMS.

    Parameters
    ----------
    signal : array_like
        1-D speech samples (integer PCM or float).
    fs : int
        Sample rate in Hz.
    carrier : array_like or None
        External carrier. When given, the output is truncated to the shorter
        of ``signal`` and ``carrier`` and frames classified as unvoiced are
        silenced. When None, a carrier is synthesised frame by frame with
        ``generate_excitation`` and no voiced/unvoiced mask is applied.
    n_bands : int
        Number of analysis/synthesis bands.
    frame_size, hop_size : int
        Frame length and hop (in samples) for the frame-wise analysis. When
        ``hop_size < frame_size`` the frames overlap: mask values written by
        a later frame overwrite earlier ones, and synthesised excitations
        are summed in the overlapping region.
    log_bands : bool
        Logarithmic band spacing when True, linear otherwise.

    Returns
    -------
    tuple
        ``(vocoder_out, carrier)``: the unit-RMS vocoded signal (float64)
        and the carrier that was used (the caller's object when supplied,
        otherwise the synthesised one).
    """
    # Record this before a carrier is synthesised below; afterwards `carrier`
    # is never None, so it cannot be used to decide whether to apply the mask.
    external_carrier = carrier is not None

    # Determine output length
    L_out = min(len(signal), len(carrier)) if external_carrier else len(signal)

    # Work in float64 from the start: integer PCM would otherwise be
    # normalised in reduced precision, and np.abs can overflow on int16.
    speech = np.asarray(signal[:L_out], dtype=float)
    peak = np.max(np.abs(speech)) if speech.size else 0.0
    if peak > 0:
        # Skipped for an all-zero input, which would otherwise become NaN
        speech = speech / peak

    if len(speech) >= frame_size:
        frames = frame_signal(speech, frame_size=frame_size, hop_size=hop_size)
    else:
        # Too short for a single frame: nothing to classify or excite
        frames = np.zeros((0, frame_size))

    vu_mask = np.ones_like(speech)
    if external_carrier:
        # Voiced/unvoiced mask: pass voiced frames, silence unvoiced ones.
        # Samples after the last complete frame keep the default of 1.0.
        for i, frame in enumerate(frames):
            start = i * hop_size
            is_voiced = voiced_unvoiced(frame) == "voiced"
            vu_mask[start:start + len(frame)] = 1.0 if is_voiced else 0.0
        carrier_sig = np.asarray(carrier[:L_out], dtype=float)
    else:
        # Generate carrier if not provided
        carrier = np.zeros_like(speech)
        for i, frame in enumerate(frames):
            excitation, _ = generate_excitation(frame, fs)
            start = i * hop_size
            carrier[start:start + len(frame)] += excitation
        carrier_sig = carrier

    # Make frequency bands
    bands = make_bands(fs, n_bands=n_bands, fmin=200, fmax=fs/2, log=log_bands)

    out = np.zeros_like(speech)
    for low, high in bands:
        speech_band = bandpass_filter(speech, low, high, fs)
        carrier_band = bandpass_filter(carrier_sig, low, high, fs)

        # Apply the speech-band envelope to the matching carrier band
        modulated = carrier_band * envelope(speech_band)
        if external_carrier:
            modulated = modulated * vu_mask  # silence unvoiced segments
        out += modulated

    # Normalize output RMS
    rms = np.sqrt(np.mean(out**2))
    vocoder_out = out / rms if rms > 0 else out

    return vocoder_out, carrier

# Simulate first vocoder

In this experiment, we construct a carrier signal that is a pulse train at the estimated fundamental frequency ($F_0$) for voiced portions of speech, and a hiss (white noise) for unvoiced segments. First, we use linearly spaced frequency bands, as the first vocoder did.

In [146]:
# Load the WAV file; `read` returns (sample_rate, samples)
fs, speech = read("audio/The distant future.wav")

# Vocode with linearly spaced bands at increasing band counts. No carrier is
# passed, so vocode() synthesises one from the speech (pulses for voiced
# frames, noise for unvoiced frames) and returns it alongside the output.
vocoder_out, carrier = vocode(speech, fs, log_bands=False)
vocoder_8band, carrier_8band = vocode(speech, fs, n_bands=8, log_bands=False)
vocoder_20band, carrier_20band = vocode(speech, fs, n_bands=20, log_bands=False)
vocoder_40band, carrier_40band = vocode(speech, fs, n_bands=40, log_bands=False)


# --- Audio players ---
# audio_widget() is defined once with the other helper functions; it is
# deliberately not redefined in this cell.
print("Play audio:")

speech_w = audio_widget(speech, fs)
carrier_w = audio_widget(carrier, fs)
vocoder_w = audio_widget(vocoder_out, fs)
vocoder8_w = audio_widget(vocoder_8band, fs)
vocoder20_w = audio_widget(vocoder_20band, fs)
vocoder40_w = audio_widget(vocoder_40band, fs)

ui = widgets.VBox([
    widgets.Label("Speech (source)"), speech_w,
    widgets.Label("Carrier"), carrier_w,
    widgets.Label("4- band Vocoder output"), vocoder_w,
    widgets.Label("8-band Vocoder output"), vocoder8_w,
    widgets.Label("20-band Vocoder output"), vocoder20_w,
    widgets.Label("40-band Vocoder output"), vocoder40_w,
])

display(ui)



Play audio:


VBox(children=(Label(value='Speech (source)'), Output(), Label(value='Carrier'), Output(), Label(value='4- ban…

Now hear the difference when the frequency bands are spaced on a logarithmic scale, which is closer to the way our ears perceive sound:

In [147]:
# Load the WAV file; `read` returns (sample_rate, samples)
fs, speech = read("audio/The distant future.wav")

# Same experiment with vocode()'s default logarithmic band spacing. The
# carrier is again synthesised from the speech because none is passed.
vocoder_out, carrier = vocode(speech, fs)
vocoder_8band, carrier_8band = vocode(speech, fs, n_bands=8)
vocoder_20band, carrier_20band = vocode(speech, fs, n_bands=20)
vocoder_40band, carrier_40band = vocode(speech, fs, n_bands=40)


# --- Audio players ---
# audio_widget() is defined once with the other helper functions; it is
# deliberately not redefined in this cell.
print("Play audio:")

speech_w = audio_widget(speech, fs)
carrier_w = audio_widget(carrier, fs)
vocoder_w = audio_widget(vocoder_out, fs)
vocoder8_w = audio_widget(vocoder_8band, fs)
vocoder20_w = audio_widget(vocoder_20band, fs)
vocoder40_w = audio_widget(vocoder_40band, fs)

ui = widgets.VBox([
    widgets.Label("Speech (source)"), speech_w,
    widgets.Label("Carrier"), carrier_w,
    widgets.Label("4- band Vocoder output"), vocoder_w,
    widgets.Label("8-band Vocoder output"), vocoder8_w,
    widgets.Label("20-band Vocoder output"), vocoder20_w,
    widgets.Label("40-band Vocoder output"), vocoder40_w,
])

display(ui)



Play audio:


VBox(children=(Label(value='Speech (source)'), Output(), Label(value='Carrier'), Output(), Label(value='4- ban…

# Vocoding a basic synth chord progression

In [149]:
# --- Basic synth functions ---
def midi_to_freq(midi_note):
    """Return the frequency in Hz of a MIDI note number.

    Uses twelve-tone equal temperament with MIDI note 69 tuned to 440 Hz,
    so every 12 note numbers double the frequency.
    """
    octaves_from_reference = (midi_note - 69) / 12.0
    return 440.0 * (2.0 ** octaves_from_reference)

def saw_wave(freq, duration, fs):
    """Generate a sawtooth wave in [-1, 1) for a given frequency and duration.

    Parameters
    ----------
    freq : float
        Frequency in Hz.
    duration : float
        Length in seconds (must be >= 0).
    fs : float
        Sample rate in Hz.

    Returns
    -------
    numpy.ndarray
        ``floor(fs * duration)`` samples taken at ``t = k / fs``.

    Raises
    ------
    ValueError
        If ``duration`` is negative.
    """
    if duration < 0:
        raise ValueError("duration must be non-negative")

    # The small epsilon absorbs float round-off such as 0.29 * 100 evaluating
    # to 28.999999999999996, which would otherwise lose one sample.
    n_samples = int(np.floor(fs * duration + 1e-9))
    # Sample exactly every 1/fs seconds. Spreading n_samples over `duration`
    # instead would detune the wave whenever fs * duration is not an integer.
    t = np.arange(n_samples) / fs
    phase = t * freq
    return 2 * (phase - np.floor(0.5 + phase))

def chord_wave(notes, duration, fs):
    """Render a chord as the average of one sawtooth wave per MIDI note.

    ``notes`` is an iterable of MIDI note numbers; dividing the sum by the
    number of notes keeps the result within the range of a single voice.
    """
    voices = []
    for note in notes:
        voices.append(saw_wave(midi_to_freq(note), duration, fs))
    return np.sum(voices, axis=0) / len(voices)

def match_rms(x, target_rms):
    """Scale ``x`` so that its RMS level is (approximately) ``target_rms``.

    A tiny constant is added to the measured RMS so an all-zero input is
    returned unchanged instead of dividing by zero.
    """
    current_rms = np.sqrt(np.mean(x**2))
    gain = target_rms / (current_rms + 1e-9)
    return x * gain

# --- Define some MIDI chords ---
chords = {
    "C" : [60, 64, 67],   # C major
    "Am": [69, 72, 76],   # A minor
    "G7" : [55, 59, 62, 66],   # G major 7 (G, B, D, F#)
}

# --- Load the speech and synthesize the chord progression ---
# The sample rate is taken from the WAV file so that the synth carrier is
# rendered at the same rate as the speech it will be combined with.
fs, speech = read("audio/What did you say.wav")

# (chord name, duration in seconds), played back to back
notes = [("C", 2.4), ("Am", 1.0), ("G7", 1.0)]
synth_audio = np.concatenate([chord_wave(chords[name], dur, fs) for name, dur in notes])


In [150]:

# Vocode the speech with the synth chord progression as an external carrier.
# The returned carrier is the one passed in, so it is discarded here.
vocoder_out,  _ = vocode(speech, fs, carrier=synth_audio)
vocoder_8band, _ = vocode(speech, fs, carrier=synth_audio, n_bands=8)
vocoder_20band, _ = vocode(speech, fs, carrier=synth_audio, n_bands=20)
vocoder_40band, _ = vocode(speech, fs, carrier=synth_audio, n_bands=40)
vocoder_100band, _ = vocode(speech, fs, carrier=synth_audio, n_bands=100)


# --- Mix the original speech with the 40-band vocoder output ---
L = min(len(speech), len(vocoder_40band))

# Peak-normalise each signal by its OWN peak so the mix ratios below are
# meaningful (the vocoder output must not be scaled by another rendition's peak).
speech_norm = speech[:L] / np.max(np.abs(speech[:L]) + 1e-9)
vocoder_norm = vocoder_40band[:L] / np.max(np.abs(vocoder_40band[:L]) + 1e-9)

mix_ratio_speech = 0.45  # Controls how much of the original speech you hear
mix_ratio_vocoder = 1 - mix_ratio_speech # Controls how much of the vocoded effect you hear

combined = (speech_norm * mix_ratio_speech) + (vocoder_norm * mix_ratio_vocoder)

# Final normalization to prevent clipping and set a good playback level
combined /= np.max(np.abs(combined) + 1e-9)

# --- Audio players ---
# audio_widget() is defined once with the other helper functions; it is
# deliberately not redefined in this cell.
print("Play audio:")

speech_w = audio_widget(speech, fs)
carrier_w = audio_widget(synth_audio, fs)
vocoder_w = audio_widget(vocoder_out, fs)
vocoder8_w = audio_widget(vocoder_8band, fs)
vocoder20_w = audio_widget(vocoder_20band, fs)
vocoder40_w = audio_widget(vocoder_40band, fs)
vocoder100_w = audio_widget(vocoder_100band, fs)
combined_w = audio_widget(combined, fs)


ui = widgets.VBox([
    widgets.Label("Speech (source)"), speech_w,
    widgets.Label("Carrier (Synth chords)"), carrier_w,
    widgets.Label("4- band Vocoder output"), vocoder_w,
    widgets.Label("8-band Vocoder output"), vocoder8_w,
    widgets.Label("20-band Vocoder output"), vocoder20_w,
    widgets.Label("40-band Vocoder output"), vocoder40_w,
    widgets.Label("100-band Vocoder output"), vocoder100_w,
    widgets.Label("Combined original + 40-band vocoder"), combined_w,
])

display(ui)

Play audio:


VBox(children=(Label(value='Speech (source)'), Output(), Label(value='Carrier (Synth chords)'), Output(), Labe…