# Data analysis of the AcousticBrainz dataset from the perspective of music recommendation systems

In [None]:
import os, sys
import pandas as pd
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from track_processing_helpers import process_file

# Build (or load from cache) the per-track DataFrame `df`.
#
# The first run walks the dataset directory, parses every JSON file with
# `process_file` on a thread pool, and pickles the resulting DataFrame so
# later runs can skip the costly rebuild.

# Create cache dir if needed
CACHE = Path("cache")
CACHE.mkdir(exist_ok=True)

# Single source of truth for the cached DataFrame location.
tracks_cache = CACHE / "tracks.pkl"

# Read from cache to prevent costly rebuild
if tracks_cache.exists():
    # NOTE: unpickling executes arbitrary code; only load a cache file that
    # this notebook wrote itself, never one from an untrusted source.
    df = pd.read_pickle(tracks_cache)
else:
    dataset_path = "sample/"
    json_paths = []

    # Collect every *.json file below the dataset root; report anything else.
    for root, _dirs, files in os.walk(dataset_path):
        for name in files:
            if name.lower().endswith(".json"):
                json_paths.append(os.path.join(root, name))
            else:
                print(f"Non-JSON file skipped: {name}")

    print(len(json_paths))

    # Parse files concurrently. Results are consumed in submission order, so
    # the row order follows json_paths. Falsy results are skipped.
    rows = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(process_file, path) for path in json_paths]
        for future in futures:
            result = future.result()
            if result:
                rows.append(result)

    df = pd.DataFrame(rows)

    if rows:
        df.to_pickle(tracks_cache)
    else:
        # Do not persist an empty frame: a missing or misnamed dataset
        # directory would otherwise poison the cache for every later run.
        print(f"No tracks parsed from {dataset_path!r}; cache not written")

# The cached frame keeps `file_path`; it is dropped only from the working
# copy. errors="ignore" keeps this cell from raising KeyError when the frame
# is empty and therefore has no such column.
df.drop(columns=["file_path"], errors="ignore", inplace=True)
df.head()

In [None]:
# Print a structural summary of the frame (info() writes its report to
# stdout rather than returning it).
df.info()

# Descriptive statistics of the columns; as the last expression in the cell,
# the resulting table is what the notebook displays.
df.describe()

In [None]:
# Group rows by MusicBrainz recording ID. On its own this only builds a
# GroupBy object (no aggregation is applied), so the cell output is just the
# object's repr.
# NOTE(review): presumably a first step towards checking for multiple rows
# per recording -- confirm the intent and add an aggregation (e.g. .size()).
df.groupby('musicbrainz_recordingid')