In [5]:
import librosa
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def extract_chroma_timbre(audio_file, hop_length=512, chroma_dim=12):
    """Compute beat-synchronous chroma and MFCC ("timbre") vectors for a track.

    The audio is loaded with ``librosa.load`` (default settings), frame-level
    chroma and MFCC features are extracted, beats are tracked, and the frames
    belonging to each beat are averaged into a single vector per beat.

    Beat ``i`` covers the frames from its own onset up to (but excluding) the
    onset of beat ``i + 1``. The final beat covers the frames from its onset
    to the last feature frame, so every returned row is backed by real data.

    Parameters
    ----------
    audio_file : path accepted by ``librosa.load``
        Audio file to analyse.
    hop_length : int, default 512
        Hop length passed to the chroma, MFCC and beat-tracking calls so that
        all three share the same frame grid.
    chroma_dim : int, default 12
        Number of chroma bins. The same value is also used as the number of
        MFCC coefficients, so both outputs have ``chroma_dim`` columns.

    Returns
    -------
    per_beat_chroma : np.ndarray, shape (num_beats, chroma_dim)
    per_beat_timbre : np.ndarray, shape (num_beats, chroma_dim)
        Mean feature vector per detected beat. Both arrays have zero rows
        when no beats are detected. A beat whose frame range is empty is
        left as a zero row.
    """
    # Load the audio file
    y, sr = librosa.load(audio_file)

    # Frame-level chroma and timbre (MFCC) features: (chroma_dim, n_frames)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length, n_chroma=chroma_dim)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=chroma_dim)

    # Beat positions as frame indices; the tempo estimate is not needed here
    _, beat_frames = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)
    beat_frames = np.asarray(beat_frames, dtype=int)
    num_beats = len(beat_frames)

    # One row per beat
    per_beat_chroma = np.zeros((num_beats, chroma_dim))
    per_beat_timbre = np.zeros((num_beats, chroma_dim))
    if num_beats == 0:
        return per_beat_chroma, per_beat_timbre

    # Each beat ends where the next one starts; the last beat runs to the end
    # of the feature matrices. Without this closing boundary the final row
    # would never be filled and would stay all zeros.
    n_frames = min(chroma.shape[1], mfcc.shape[1])
    segment_ends = np.append(beat_frames[1:], n_frames)

    for i, (beat_start, beat_end) in enumerate(zip(beat_frames, segment_ends)):
        if beat_end <= beat_start:
            # Empty frame range: averaging it would produce NaN, keep zeros
            continue
        per_beat_chroma[i] = chroma[:, beat_start:beat_end].mean(axis=1)
        per_beat_timbre[i] = mfcc[:, beat_start:beat_end].mean(axis=1)

    return per_beat_chroma, per_beat_timbre

In [12]:
# Track to analyse (absolute path on the author's machine)
audio_file = 'D:\\vs_code\\DL\\proj\\resources\\audio_files\\.wav\\Rachel_Platten_-_Fight_Song_CeeNaija.com_.wav'

# Per-beat chroma and timbre matrices for the track
chroma_vectors, timbre_vectors = extract_chroma_timbre(audio_file=audio_file)

# Report the (num_beats, feature_dim) shape of each result
print("Chroma vectors shape:", np.shape(chroma_vectors))
print("Timbre vectors shape:", np.shape(timbre_vectors))

Chroma vectors shape: (585, 12)
Timbre vectors shape: (585, 12)
