# AcousticBrainz High-Level Sample Data Processing for Music Recommendation System

## Goals
- [x] Iterate through all track files in the high-level sample
- [x] Extract MusicBrainz ID, metadata, audio features to a dict
- [ ] Map the dict to SQL (we'll add Django ORM later)

> The code assumes you have downloaded the AcousticBrainz DB dumps in the same directory, under `acousticbrainz-highlevel-sample-json-20220623-0/highlevel/`. They can be downloaded from here: https://acousticbrainz.org/download

In [12]:
import os
import sys
import json
import django
import time
from pprint import pprint

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "music_recommendation.settings")
django.setup()

from recommend_api.models import Track

In [13]:
def extract_data_from_json(filepath):
    """
    Returns a dict with values for corresponding audio features from the AcousticBrainz dataset.
    """
    with open(filepath, 'r') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print("Bad JSON:", filepath)
            return None

        highlevel = data.get('highlevel') or {}
        metadata = data.get('metadata') or {}
        tags = metadata.get('tags') or {}

        return {
            # metadata
            "artist":tags.get('artist', [None])[0],
            "album":tags.get('album', [None])[0],
            "title":tags.get('title', [None])[0],
            "release_date":tags.get('originaldate', [None])[0],
            "duration": metadata.get('audio_properties', {}).get('length', None),

            # high level features
            "genre":  highlevel.get('genre_dortmund', {}).get('value', None),
            "danceability":  highlevel.get('danceability', {}).get('all', {}).get('danceable', None),
            "aggressiveness": highlevel.get('mood_aggressive', {}).get('all', {}).get('aggressive', None),
            "happiness": highlevel.get('mood_happy', {}).get('all', {}).get('happy', None),
            "sadness": highlevel.get('mood_sad', {}).get('all', {}).get('sad', None),
            "relaxedness": highlevel.get('mood_relaxed', {}).get('all', {}).get('relaxed', None),
            "partyness": highlevel.get('mood_party', {}).get('all', {}).get('party', None),
            "acousticness": highlevel.get('mood_acoustic', {}).get('all', {}).get('acoustic', None),
            "electronicness": highlevel.get('mood_electronic', {}).get('all', {}).get('electronic', None),
            "instrumentalness": highlevel.get('voice_instrumental', {}).get('all', {}).get('instrumental', None),
            "tonality": highlevel.get('tonal_atonal', {}).get('all', {}).get('tonal', None),
            "brightness": highlevel.get('timbre', {}).get('all', {}).get('bright', None),
        }

In [14]:
highlevel_path = 'acousticbrainz-highlevel-sample-json-20220623/highlevel/'


# test = extract_data_from_json(os.path.join(highlevel_path, '00', '0', '000a9db8-949f-4fa2-9f40-856127df0dbc-0.json'))
# pprint(test)

json_paths = []

# walks through a branch of the directory tree, it will look at all subfolders and files recursively
for root, dirs, files in os.walk(highlevel_path):
    for name in files:
        json_paths.append(os.path.join(root, name))

Track.objects.all().delete()


SynchronousOnlyOperation: You cannot call this from an async context - use a thread or sync_to_async.

In [None]:
records = []
start = time.time()

print(f"Will load {len(json_paths)} records")

for json_path in json_paths:
    features = extract_data_from_json(json_path)

    if features is None:
        continue

    records.append(Track(**features))

end = time.time()
print(f"Finished loading records into memory in {end - start:.2f}s, now running the ORM inserts.")

start = time.time()

batch_size = 1000
for i in range(0, len(records), batch_size):
    print(str(i) + '/' + str(len(records)) + ' processed')
    Track.objects.bulk_create(records[i:i+batch_size])

end = time.time()
print(f"Inserted {len(records)} records in {end - start:.2f} seconds")

print("DONE")