In [15]:
import json
from pathlib import Path
import librosa
import numpy as np
import os

In [16]:
# Load tagging data (JSON is read explicitly as UTF-8 so non-ASCII file names
# are decoded the same way regardless of the platform's default encoding)
with open("favorite_sections.json", "r", encoding="utf-8") as f:
    tag_data = json.load(f)

# NOTE(review): machine-specific absolute path; consider making this configurable.
audio_directory = r"C:\Users\thoma\Downloads"

# Join with pathlib's "/" operator instead of an f-string containing "\{":
# "\{" is an invalid escape sequence (a warning today, slated to become an
# error), and "/" lets pathlib insert the correct separator itself.
audio_path = Path(audio_directory) / tag_data["file"]
sample_rate = tag_data["sample_rate"]
sections = tag_data["sections"]

print(f"Path: {audio_path}")
print(f"Processing: {audio_path.name}")
print(f"Sections to analyze: {sections}")

Path: C:\Users\thoma\Downloads\Disappear.mp3
Processing: Disappear.mp3
Sections to analyze: [{'start_sec': 0.0, 'end_sec': 14.0}]


In [17]:
# Load full audio file at the sample rate recorded in the tag file.
# The previous call passed `sr=sr`, but no `sr` is defined in the visible cells,
# so it raised NameError on a fresh kernel (Restart & Run All).
# Loading at `sample_rate` keeps the sample indices computed from the tagged
# sections (seconds * sample_rate) aligned with `y`.
y, sample_rate_actual = librosa.load(str(audio_path), sr=sample_rate)

# Confirm sample rate matches (guards against the loader returning a different rate)
assert sample_rate == sample_rate_actual, "Sample rate mismatch between tag file and loaded audio"

In [26]:
# Summarises one audio section by its timbre, harmonic content, frequency band strength, and tempo
def extract_audio_features(y_segment, sample_rate):
    """Compute summary audio features for a single audio segment.

    Parameters
    ----------
    y_segment
        Audio samples of the segment; forwarded unchanged to librosa as ``y``.
    sample_rate
        Sample rate of ``y_segment``; forwarded to librosa as ``sr``.

    Returns
    -------
    dict
        Keys, in insertion order:

        * ``"mfcc_mean"``     - axis-1 mean of the MFCC matrix (``n_mfcc=13``), as a list.
        * ``"chroma_mean"``   - axis-1 mean of the chroma STFT matrix, as a list.
        * ``"contrast_mean"`` - axis-1 mean of the spectral contrast matrix, as a list.
        * ``"tempo"``         - first item returned by ``librosa.beat.beat_track``,
          converted with ``.tolist()``.
    """
    def _mean_over_axis1(matrix):
        # Collapse axis 1 and convert to plain Python values so the result is JSON-friendly.
        return np.mean(matrix, axis=1).tolist()

    # Timbre
    mfcc_matrix = librosa.feature.mfcc(y=y_segment, sr=sample_rate, n_mfcc=13)
    # Harmonic content
    chroma_matrix = librosa.feature.chroma_stft(y=y_segment, sr=sample_rate)
    # Frequency band strength
    contrast_matrix = librosa.feature.spectral_contrast(y=y_segment, sr=sample_rate)
    # beat_track returns a 2-tuple; only its first item (the tempo) is kept.
    estimated_tempo, _unused = librosa.beat.beat_track(y=y_segment, sr=sample_rate)

    return {
        "mfcc_mean": _mean_over_axis1(mfcc_matrix),
        "chroma_mean": _mean_over_axis1(chroma_matrix),
        "contrast_mean": _mean_over_axis1(contrast_matrix),
        "tempo": estimated_tempo.tolist(),
    }

In [27]:
# Extract features from each tagged section
extracted = []

for section in sections:
    # Convert the section's start/end times (seconds) into sample indices into `y`.
    start_sample, end_sample = (
        int(section[key] * sample_rate) for key in ("start_sec", "end_sec")
    )
    segment = y[start_sample:end_sample]
    segment_features = extract_audio_features(segment, sample_rate)

    # Keep the original time bounds next to the features computed for them.
    record = {key: section[key] for key in ("start_sec", "end_sec")}
    record["features"] = segment_features
    extracted.append(record)

print(f"Extracted features for {len(extracted)} sections")
print(extracted)

Extracted features for 1 sections
[{'start_sec': 0.0, 'end_sec': 14.0, 'features': {'mfcc_mean': [-225.90866088867188, 235.04234313964844, -81.98894500732422, -13.515945434570312, 28.49778175354004, -23.154523849487305, -6.584556579589844, -4.579257965087891, -16.709625244140625, -6.49915075302124, -8.874130249023438, -9.901187896728516, -5.448530673980713], 'chroma_mean': [0.31316226720809937, 0.3492315113544464, 0.5614087581634521, 0.4568599462509155, 0.433299720287323, 0.5794953107833862, 0.45216867327690125, 0.5390928387641907, 0.30066603422164917, 0.2997592091560364, 0.5430914759635925, 0.43964463472366333], 'contrast_mean': [19.337029807377537, 9.7839523209309, 13.448022318402186, 15.433296410480425, 15.345127805522717, 23.552519680457525, 42.968395797030716], 'tempo': [109.95678191489361]}}]


In [28]:
# Persist the per-section features, together with the audio file name, as indented JSON.
output_path = Path("section_features.json")

payload = {
    "file": audio_path.name,
    "features": extracted,
}

with output_path.open("w") as f:
    f.write(json.dumps(payload, indent=2))

print(f"Features saved to {output_path}")


Features saved to section_features.json
