In [None]:
import pyrubberband as pyrb
import soundfile as sf
import librosa
import madmom
# Print the installed pyrubberband version so the notebook output records which build was used.
print(pyrb.__version__)

In [None]:
# Path of the track to analyse; it is relative, so it resolves against the
# kernel's current working directory.
filename = "../Songs/bob_marley--redemption_song.mp3"
# sr=None asks librosa to keep the file's native sampling rate instead of
# resampling to its default rate (see the librosa.load documentation).
y, sr = librosa.load(filename, sr=None)

In [None]:
# Show the sampling rate bound to `sr` as this cell's output.
sr

In [None]:
# Process the audio file with madmom's RNN downbeat processor.
# NOTE: despite its name, `proc` holds the processor's *output* for this file
# (it is what gets fed to the DBN below), not the processor object itself.
proc = madmom.features.downbeats.RNNDownBeatProcessor()(filename)

# Decode with a DBN to get a sequence of [time, position-in-bar] rows.
# beats_per_bar=[3, 4] lets the tracker consider bars of 3 or 4 beats.
# fps=100 is the frame rate passed to the DBN — presumably it has to match the
# frame rate of the RNN output above; verify against the madmom documentation.
beats = madmom.features.downbeats.DBNDownBeatTrackingProcessor(beats_per_bar=[3, 4],
                                                               fps=100)(proc)

# beats is an array of shape (N, 2):
#   [:,0] = time (s)
#   [:,1] = beat position inside the bar according to madmom's documentation
#           (1 = downbeat, 2, 3, ... = the following beats) — it is NOT a 0/1 flag

# Extract all beat times, and the times of the rows whose bar position is 1 (downbeats)
beat_times = beats[:,0]
downbeat_times = beats[beats[:,1] == 1, 0]

In [None]:
import numpy as np
import librosa

def make_downbeat_aligned_images(y, sr, downbeat_times, time_bins_per_downbeat=32, downbeats_in_image=4,
                                 hop_length=512, n_mels=128):
    """
    Create log-mel spectrogram images aligned to downbeats.

    The audio is converted to a dB-scaled mel spectrogram, cut at every
    downbeat, and each downbeat-to-downbeat segment is linearly resampled
    along the time axis to a fixed number of frames. Consecutive resampled
    segments are then concatenated into fixed-width images.

    Parameters
    ----------
    y : np.ndarray
        Audio signal.
    sr : int
        Sampling rate.
    downbeat_times : list of float
        Times (in seconds) of detected downbeats.
    time_bins_per_downbeat : int
        Number of spectrogram frames allocated between two downbeats.
    downbeats_in_image : int
        How many downbeat-to-downbeat segments are concatenated per image.
    hop_length : int
        Hop length (in samples) used both to compute the spectrogram and to
        convert frame indices to times, so the two can never disagree.
    n_mels : int
        Number of mel bands, i.e. the height of each image.

    Returns
    -------
    images : list of np.ndarray
        Each image has shape (n_mels, time_bins_per_downbeat * downbeats_in_image).
        Trailing segments that do not fill a complete image are dropped. The
        list is empty when there are too few usable segments.

    Raises
    ------
    ValueError
        If time_bins_per_downbeat or downbeats_in_image is smaller than 1.

    Notes
    -----
    Segments covering fewer than two spectrogram frames are skipped, so an
    image may join segments that were not adjacent in the original audio.
    """
    # Fail early with a clear message instead of an opaque range()/linspace error
    # (or silently producing zero-width images).
    if time_bins_per_downbeat < 1:
        raise ValueError("time_bins_per_downbeat must be >= 1")
    if downbeats_in_image < 1:
        raise ValueError("downbeats_in_image must be >= 1")

    # Compute the mel spectrogram with an explicit hop length so that the
    # frame -> time mapping below uses exactly the same value.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
    S_db = librosa.power_to_db(S, ref=np.max)

    # Map downbeat times -> spectrogram frames: for each downbeat, the index of
    # the first frame whose time is >= the downbeat time.
    frame_times = librosa.frames_to_time(np.arange(S_db.shape[1]), sr=sr, hop_length=hop_length)
    downbeat_frames = np.searchsorted(frame_times, downbeat_times)

    # Target sampling positions are identical for every segment; compute once.
    x_new = np.linspace(0, 1, time_bins_per_downbeat)

    segments = []
    for start, end in zip(downbeat_frames[:-1], downbeat_frames[1:]):
        segment = S_db[:, start:end]

        # Too short to interpolate (also covers empty slices past the end of the audio).
        if segment.shape[1] < 2:
            continue

        # Resample along the time axis to exactly time_bins_per_downbeat frames,
        # one mel band (row) at a time.
        x_old = np.linspace(0, 1, segment.shape[1])
        segments.append(np.vstack([np.interp(x_new, x_old, row) for row in segment]))

    # Group consecutive segments into non-overlapping images; an incomplete
    # trailing group is discarded.
    images = [
        np.hstack(segments[i:i + downbeats_in_image])
        for i in range(0, len(segments) - downbeats_in_image + 1, downbeats_in_image)
    ]

    return images



# Example usage:
# y, sr = librosa.load("song.wav", sr=None)
# downbeat_times = [0.0, 1.0, 2.0, 3.0, 4.0]  # dummy
imgs = make_downbeat_aligned_images(y, sr, downbeat_times, time_bins_per_downbeat=32, downbeats_in_image=4)
# The list is empty when the track yields fewer than `downbeats_in_image` usable
# segments, so guard the shape lookup instead of raising IndexError on imgs[0].
print(len(imgs), imgs[0].shape if imgs else None)  # -> N (128, 128)
