In [None]:
import pyrubberband as pyrb
import soundfile as sf
import librosa
import madmom
from PIL import Image
import matplotlib.pyplot as plt
# Show which pyrubberband version is installed in this kernel
print(pyrb.__version__)

# Load Audio and Detect Downbeats

In [None]:
# Relative path of the track analysed in this notebook
filename = "../Songs/bob_marley--redemption_song.mp3"
# sr=None asks librosa to keep the file's native sampling rate instead of resampling
y, sr = librosa.load(filename, sr=None)

In [None]:
# Process the file with madmom's RNN downbeat processor.
# NOTE: despite its name, `proc` holds the processor's output for this file, not the processor object.
proc = madmom.features.downbeats.RNNDownBeatProcessor()(filename)

# Decode that output with a DBN (bars of 3 or 4 beats, 100 frames per second)
# to get a sequence of [beat time, beat number] rows
beats = madmom.features.downbeats.DBNDownBeatTrackingProcessor(beats_per_bar=[3, 4],
                                                               fps=100)(proc)

# beats is an array of shape (N, 2):
#   [:,0] = beat time (s)
#   [:,1] = beat number inside the bar (1 = downbeat; 2, 3, ... = the other beats)

# Extract all beat times, and the times of the beats numbered 1 (the downbeats)
beat_times = beats[:,0]
downbeat_times = beats[beats[:,1] == 1, 0]

# Make Beat-aligned Spectrograms

In [None]:
import numpy as np
import librosa

def make_downbeat_aligned_images(y, sr, downbeat_times, time_bins_per_downbeat=32, downbeats_in_image=4,
                                 hop_length=512, n_mels=128):
    """
    Create log-mel spectrogram images aligned to downbeats.

    The spectrogram is cut at every downbeat, each downbeat-to-downbeat
    interval is linearly resampled to a fixed number of time bins, and
    consecutive intervals are concatenated into fixed-width images.

    Parameters
    ----------
    y : np.ndarray
        Audio signal.
    sr : int
        Sampling rate.
    downbeat_times : list of float
        Times (in seconds) of detected downbeats.
    time_bins_per_downbeat : int
        Number of spectrogram frames allocated between two downbeats.
    downbeats_in_image : int
        How many downbeats per image.
    hop_length : int
        Hop length (in samples) used both to compute the spectrogram and to
        convert its frame indices to times.
    n_mels : int
        Number of mel bands requested from librosa.

    Returns
    -------
    images : list of np.ndarray
        Each image has shape (n_mels, time_bins_per_downbeat * downbeats_in_image).

    Raises
    ------
    ValueError
        If `time_bins_per_downbeat`, `downbeats_in_image` or `hop_length`
        is smaller than 1.

    Notes
    -----
    Intervals covering fewer than two frames are dropped before grouping, so
    an image may join intervals that were not adjacent in the audio. Intervals
    left over after the last complete group are discarded.
    """
    if time_bins_per_downbeat < 1:
        raise ValueError("time_bins_per_downbeat must be >= 1")
    if downbeats_in_image < 1:
        raise ValueError("downbeats_in_image must be >= 1")
    if hop_length < 1:
        raise ValueError("hop_length must be >= 1")

    # Compute the mel spectrogram with the SAME hop length that is used below
    # for the frame -> time conversion, so both always agree
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
    S_db = librosa.power_to_db(S, ref=np.max)

    # Map downbeat times -> spectrogram frames
    # (index of the first frame whose time is >= the downbeat time)
    frame_times = librosa.frames_to_time(np.arange(S_db.shape[1]), sr=sr, hop_length=hop_length)
    downbeat_frames = np.searchsorted(frame_times, downbeat_times)

    # Target sample positions are the same for every interval
    x_new = np.linspace(0, 1, time_bins_per_downbeat)

    segments = []
    for start, end in zip(downbeat_frames[:-1], downbeat_frames[1:]):
        segment = S_db[:, start:end]

        # At least two frames are needed to interpolate along time
        if segment.shape[1] < 2:
            continue

        # Resample along time axis to exactly time_bins_per_downbeat
        x_old = np.linspace(0, 1, segment.shape[1])
        segments.append(np.vstack([np.interp(x_new, x_old, row) for row in segment]))

    # Group consecutive segments into images; an incomplete trailing group is dropped
    return [
        np.hstack(segments[i:i + downbeats_in_image])
        for i in range(0, len(segments) - downbeats_in_image + 1, downbeats_in_image)
    ]



# Example usage on the track loaded above: one downbeat interval per image, 128 time bins each
# (to try another file: y, sr = librosa.load("song.wav", sr=None))
# (dummy downbeats for a quick test: downbeat_times = [0.0, 1.0, 2.0, 3.0, 4.0])
imgs = make_downbeat_aligned_images(y, sr, downbeat_times, time_bins_per_downbeat=128, downbeats_in_image=1)
print(len(imgs), imgs[0].shape)  # -> N (128, 128)


### Test Spectrogram Output

In [None]:
# Preview the third image (index 2)
arr = imgs[2]

# Float arrays (e.g. values in [0,1] or arbitrary floats) are min-max scaled to 0-255
if np.issubdtype(arr.dtype, np.floating):
    value_range = arr.max() - arr.min()
    if value_range > 0:
        arr = (255 * (arr - arr.min()) / value_range).astype(np.uint8)
    else:
        # Constant image: the usual formula would divide 0 by 0, so emit black instead
        arr = np.zeros(arr.shape, dtype=np.uint8)

pil_img = Image.fromarray(arr)
pil_img

### Save Image Chunks

In [None]:
import os

# Create the output folder up front so the first save() cannot fail on a missing directory
os.makedirs("../Results", exist_ok=True)

for i, arr in enumerate(imgs):
    # Float arrays (e.g. values in [0,1] or arbitrary floats) are min-max scaled to 0-255
    if np.issubdtype(arr.dtype, np.floating):
        value_range = arr.max() - arr.min()
        if value_range > 0:
            arr = (255 * (arr - arr.min()) / value_range).astype(np.uint8)
        else:
            # Constant image: the usual formula would divide 0 by 0, so emit black instead
            arr = np.zeros(arr.shape, dtype=np.uint8)

    pil_img = Image.fromarray(arr)
    pil_img.save(f"../Results/spectrogram_{i}.png")

# Make Beat-aligned Chromagrams

In [None]:
import numpy as np
import librosa

def make_downbeat_aligned_chromagrams(y, sr, downbeat_times, time_bins_per_downbeat=128, downbeats_in_image=1):
    """
    Build chromagram images whose time axis is aligned to the downbeats.

    The chromagram is cut at every downbeat, each downbeat-to-downbeat
    interval is linearly stretched or squeezed to a fixed number of time
    bins, and consecutive intervals are concatenated into images.

    Parameters
    ----------
    y : np.ndarray
        Audio signal.
    sr : int
        Sampling rate.
    downbeat_times : list of float
        Times (in seconds) of detected downbeats.
    time_bins_per_downbeat : int
        Number of chroma frames every downbeat interval is resampled to.
    downbeats_in_image : int
        Number of downbeat intervals concatenated into one image.

    Returns
    -------
    images : list of np.ndarray
        Each image has shape (12, time_bins_per_downbeat * downbeats_in_image).
        Intervals shorter than two frames are dropped, and so are the
        intervals left over after the last complete image.
    """
    hop = 512

    # Chromagram of the whole signal
    chromagram = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop)

    # Locate every downbeat on the chroma frame grid
    frame_index = np.arange(chromagram.shape[1])
    times_of_frames = librosa.frames_to_time(frame_index, sr=sr, hop_length=hop)
    bar_edges = np.searchsorted(times_of_frames, downbeat_times)

    bars = []
    for left, right in zip(bar_edges[:-1], bar_edges[1:]):
        bar = chromagram[:, left:right]
        n_cols = bar.shape[1]

        # Interpolation needs at least two source frames
        if n_cols >= 2:
            src_pos = np.linspace(0, 1, n_cols)
            dst_pos = np.linspace(0, 1, time_bins_per_downbeat)
            stretched_rows = [np.interp(dst_pos, src_pos, pitch_row) for pitch_row in bar]
            bars.append(np.vstack(stretched_rows))

    # Concatenate runs of `downbeats_in_image` bars side by side
    first_bar_of_image = range(0, len(bars) - downbeats_in_image + 1, downbeats_in_image)
    return [np.hstack(bars[j:j + downbeats_in_image]) for j in first_bar_of_image]


# Example usage on the track loaded above: one downbeat interval per image, 128 time bins each
# (to try another file: y, sr = librosa.load("song.wav", sr=None))
# (dummy downbeats for a quick test: downbeat_times = [0.0, 1.0, 2.0, 3.0, 4.0])
# NOTE: this rebinds `imgs`, which the spectrogram cells earlier in the notebook also use
imgs = make_downbeat_aligned_chromagrams(y, sr, downbeat_times, time_bins_per_downbeat=128, downbeats_in_image=1)
print(len(imgs), imgs[0].shape)  # -> N (12, 128)


### Test Raw Chromagram Output (small on purpose)

In [None]:
# Preview the third image (index 2)
arr = imgs[2]

# Float arrays (e.g. values in [0,1] or arbitrary floats) are min-max scaled to 0-255
if np.issubdtype(arr.dtype, np.floating):
    value_range = arr.max() - arr.min()
    if value_range > 0:
        arr = (255 * (arr - arr.min()) / value_range).astype(np.uint8)
    else:
        # Constant image: the usual formula would divide 0 by 0, so emit black instead
        arr = np.zeros(arr.shape, dtype=np.uint8)

pil_img = Image.fromarray(arr)
pil_img

### Visualize Chromagram

In [None]:
def plot_chromagram_image(chroma_img, sr=22050, hop_length=512, cmap="magma", y_pixels=12):
    """
    Draw one chromagram image as a labelled heat map and show it.

    Parameters
    ----------
    chroma_img : np.ndarray
        A single chromagram image, shape (12, T).
    sr : int
        Sampling rate. Accepted for interface compatibility; the body does
        not use it.
    hop_length : int
        Hop length. Accepted for interface compatibility; the body does not
        use it.
    cmap : str
        Matplotlib colormap.
    y_pixels : int
        Upper limit of the vertical axis extent (default 12); the twelve
        pitch-class ticks are spread evenly inside it.
    """
    pitch_names = ["C", "C#", "D", "D#", "E", "F",
                   "F#", "G", "G#", "A", "A#", "B"]
    n_time_bins = chroma_img.shape[1]

    fig, ax = plt.subplots(figsize=(10, 3))

    # origin='lower' puts the first row (pitch class C) at the bottom
    heatmap = ax.imshow(chroma_img,
                        aspect='auto',
                        origin='lower',
                        cmap=cmap,
                        extent=[0, n_time_bins, 0, y_pixels])
    fig.colorbar(heatmap, ax=ax, label="Chroma Energy")

    ax.set_xlabel("Time (interpolated bins)")
    ax.set_ylabel("Pitch Class")

    # One tick per pitch class, centred in its row when y_pixels == 12
    ax.set_yticks(np.linspace(0.5, y_pixels - 0.5, 12))
    ax.set_yticklabels(pitch_names)

    ax.set_title("Chromagram")
    fig.tight_layout()
    plt.show()

# Recompute the downbeat-aligned chromagrams with the function's default settings
imgs = make_downbeat_aligned_chromagrams(y, sr, downbeat_times)

# Visualize the first chromagram
plot_chromagram_image(imgs[0], cmap="inferno", y_pixels=12)

In [None]:
import numpy as np
import librosa

def make_fixed_size_spectrogram(y, sr, n_seconds=3.0, width=128, n_mels=128):
    """
    Slice a log-mel spectrogram into equal, non-overlapping windows.

    The hop length is chosen so that roughly `n_seconds` of audio span
    `width` frames, which lets every window be taken straight from the
    spectrogram without interpolation.

    Parameters
    ----------
    y : np.ndarray
        Audio signal.
    sr : int
        Sample rate.
    n_seconds : float
        Target duration (in seconds) covered by one window. Because the hop
        length is rounded to an integer, the real duration can differ slightly.
    width : int
        Number of time bins (pixels) per window.
    n_mels : int
        Number of mel frequency bins.

    Returns
    -------
    specs : list of np.ndarray
        Windows of shape (n_mels, width), in time order. Frames left over
        after the last full window are not returned.
    """
    # Hop length such that n_seconds of audio map onto `width` frames
    samples_per_window = n_seconds * sr
    hop = int(round(samples_per_window / width))

    # The FFT frame is four hops long
    fft_size = hop * 4

    mel_power = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=fft_size,
        hop_length=hop,
        n_mels=n_mels,
    )
    mel_in_db = librosa.power_to_db(mel_power, ref=np.max)

    # Start column of every complete window
    total_frames = mel_in_db.shape[1]
    window_starts = range(0, total_frames - width + 1, width)
    return [mel_in_db[:, first:first + width] for first in window_starts]


# Example usage on the track loaded above: 3-second windows, 128 frames wide, 128 mel bands
# (to try another file: y, sr = librosa.load("song.wav", sr=None))
spectros = make_fixed_size_spectrogram(y, sr, n_seconds=3.0, width=128, n_mels=128)
print(spectros[0].shape)  # (128, 128), i.e. (n_mels, width)


In [None]:
# Preview the window at index 30
arr = spectros[30]

# Float arrays (e.g. values in [0,1] or arbitrary floats) are min-max scaled to 0-255
if np.issubdtype(arr.dtype, np.floating):
    value_range = arr.max() - arr.min()
    if value_range > 0:
        arr = (255 * (arr - arr.min()) / value_range).astype(np.uint8)
    else:
        # Constant image: the usual formula would divide 0 by 0, so emit black instead
        arr = np.zeros(arr.shape, dtype=np.uint8)

pil_img = Image.fromarray(arr)
pil_img

In [None]:
import numpy as np
import librosa

def make_fixed_size_chromagrams(y, sr, n_seconds=3.0, width=128, hop_length=512):
    """
    Create fixed-size chromagram images (not beat aligned) using
    a single chromagram computation + interpolation.

    The chromagram is split into consecutive, non-overlapping windows of
    about `n_seconds` each, and every window is linearly resampled along
    time to `width` columns.

    Parameters
    ----------
    y : np.ndarray
        Audio signal.
    sr : int
        Sampling rate.
    n_seconds : float
        Duration (in seconds) of each window.
    width : int
        Number of chroma frames allocated per window (output width).
    hop_length : int
        Hop length used to compute the base chromagram.

    Returns
    -------
    images : list of np.ndarray
        Each image has shape (12, width). Frames left over after the last
        complete window are discarded. The list is empty when a window
        would span a single frame, since one frame cannot be interpolated.

    Raises
    ------
    ValueError
        If `width` is smaller than 1, or if `n_seconds`, `sr` and
        `hop_length` yield a window of less than one frame.
    """
    if width < 1:
        raise ValueError("width must be >= 1")

    # Window length in frames
    window_length_frames = int(round(n_seconds * sr / hop_length))
    if window_length_frames < 1:
        raise ValueError(
            "n_seconds * sr / hop_length must round to at least one frame per window"
        )

    # Compute base chromagram once
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length)

    images = []
    if window_length_frames < 2:
        # One-frame windows cannot be interpolated; as before, they yield no images
        return images

    # Every window has exactly window_length_frames columns, so the source and
    # target sample positions are identical for all of them
    x_old = np.linspace(0, 1, window_length_frames)
    x_new = np.linspace(0, 1, width)

    for start in range(0, chroma.shape[1] - window_length_frames + 1, window_length_frames):
        segment = chroma[:, start:start + window_length_frames]

        # Resample to fixed width
        images.append(np.vstack([np.interp(x_new, x_old, row) for row in segment]))

    return images


# Example usage on the track loaded above: 3-second windows resampled to 128 columns
# (to try another file: y, sr = librosa.load("song.wav", sr=None))
chromas = make_fixed_size_chromagrams(y, sr, n_seconds=3.0, width=128, hop_length=512)
print(len(chromas), chromas[0].shape)  # -> N (12, 128)

In [None]:
# Plot the fixed-size chromagram window at index 3
plot_chromagram_image(chromas[3], cmap="inferno", y_pixels=12)