# Music Collection Analysis

This notebook provides both static and interactive visualizations of the music collection analysis.

In [1]:
import json
import os
from typing import List, Dict

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
import seaborn as sns
from tqdm import tqdm

# Shared defaults for the static (matplotlib / seaborn) figures.
# Order matters: the style sheet is applied first, then the seaborn palette,
# and finally the explicit overrides below.
plt.style.use("seaborn-v0_8-deep")
sns.set_palette("husl")
plt.rcParams.update({
    "figure.figsize": (12, 8),
    "axes.titlesize": 16,
    "axes.labelsize": 14,
    # Body text, tick labels and legend entries all share the base size.
    **dict.fromkeys(
        ("font.size", "xtick.labelsize", "ytick.labelsize", "legend.fontsize"),
        12,
    ),
})

## Data Loading and Processing

Load the JSON analysis files and prepare the data for visualization.

In [2]:
def load_analysis_files(analysis_dir: str) -> pd.DataFrame:
    """Recursively load every ``*.json`` analysis file under ``analysis_dir``.

    Each file is expected to contain a single JSON object. That object becomes
    one row of the result, with an extra ``"path"`` entry holding the file's
    path relative to ``analysis_dir``.

    Files are visited in sorted path order so that the row order of the
    returned frame is reproducible from run to run.

    Loading is best-effort: a file that cannot be opened, is not valid UTF-8
    JSON, or whose top-level value is not an object is reported on stdout and
    skipped instead of aborting the whole load.

    Args:
        analysis_dir: Root directory to search.

    Returns:
        A DataFrame with one row per successfully loaded file. It is empty
        (no rows, no columns) when nothing could be loaded.
    """
    json_files = sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(analysis_dir)
        for name in names
        if name.endswith(".json")
    )

    data = []
    for json_file in tqdm(json_files, desc="Loading files"):
        try:
            # Be explicit about the encoding instead of inheriting the
            # platform default, which is not UTF-8 everywhere.
            with open(json_file, encoding="utf-8") as f:
                entry = json.load(f)
            if not isinstance(entry, dict):
                # A list/str/number cannot take the "path" key below and
                # would not form a sensible row anyway.
                raise ValueError(
                    f"expected a JSON object, got {type(entry).__name__}"
                )
            entry["path"] = os.path.relpath(json_file, analysis_dir)
        except (OSError, ValueError) as e:
            # OSError: unreadable file. ValueError: malformed JSON, bad
            # encoding (UnicodeDecodeError) or the non-object case above.
            print(f"Skipping {json_file}: {e}")
        else:
            data.append(entry)

    return pd.DataFrame(data)

def process_genres(genres: Dict) -> List[Dict]:
    """Split "parent<sep>style" genre labels into structured records.

    A label is split at the first occurrence of ``"---"``; if that is absent,
    at the first em dash (``"—"``). A label containing neither separator is
    kept whole as the parent and gets the style ``"unknown-style"``.
    Surrounding whitespace is stripped from both pieces.

    Args:
        genres: Mapping of genre label to its probability.

    Returns:
        One ``{"parent", "style", "probability"}`` dict per input item, in the
        mapping's iteration order.
    """
    separators = ("---", "—")  # checked in this order; first match wins
    records = []
    for label, probability in genres.items():
        separator = next((s for s in separators if s in label), None)
        if separator is None:
            head, tail = label, "unknown-style"
        else:
            head, _, tail = label.partition(separator)
        records.append({
            "parent": head.strip(),
            "style": tail.strip(),
            "probability": probability,
        })
    return records

# Read every analysis file below the results directory into one frame
analysis_dir = "results"
df = load_analysis_files(analysis_dir)


def _key_label(estimate: Dict) -> str:
    """Format one key estimate as "<key> <Scale>", capitalising the scale."""
    return f"{estimate['key']} {estimate['scale'].capitalize()}"


# Derived columns: parsed genre records, plus one readable key label for each
# key-estimation profile (stored as key_<profile>)
df["genres_processed"] = df["music_styles"].apply(process_genres)
key_profiles = ["temperley", "krumhansl", "edma"]
for profile in key_profiles:
    df[f"key_{profile}"] = df["key"].apply(
        lambda estimates: _key_label(estimates[profile])
    )

Loading files: 100%|██████████| 2100/2100 [00:00<00:00, 2680.67it/s]


## Genre Analysis

Analyze the distribution of music genres in the collection.

In [3]:
# Process genre data.
# Each track's probabilities are rescaled to sum to 1, so every track
# contributes one unit of weight spread over its genre entries.
parent_data = []
for track in df.genres_processed:
    total = sum(g["probability"] for g in track)
    if total == 0:
        # Empty or all-zero entries carry no usable weight; skipping them
        # avoids a division by zero.
        continue
    for genre in track:
        parent_data.append({
            "parent": genre["parent"],
            "weight": genre["probability"] / total
        })

# Explicit columns and dtype keep the groupby valid even when no track
# contributed any rows.
parent_df = pd.DataFrame(parent_data, columns=["parent", "weight"]).astype(
    {"weight": float}
)
top_parents = parent_df.groupby("parent")["weight"].sum().nlargest(20)

# `top_parents` holds summed weights. Divide by the overall weight so the
# plotted values are genuine proportions, matching the axis title.
total_weight = parent_df["weight"].sum()
top_parent_share = top_parents / total_weight if total_weight else top_parents

# Interactive plot
fig = go.Figure(go.Bar(
    x=top_parent_share.values,
    y=top_parent_share.index,
    orientation='h',
    text=top_parent_share.values.round(3),
    textposition='auto',
))

fig.update_layout(
    title="Top 20 Parent Genre Distribution",
    xaxis_title="Weighted Proportion",
    yaxis_title="Genre Category",
    # Horizontal bars are laid out bottom-up; reverse the axis so the
    # largest genre is listed first.
    yaxis_autorange="reversed",
    height=800
)

fig.show()

## Tempo and Danceability Analysis

Analyze the tempo and danceability distributions.

In [4]:
# Interactive plot: tempo histogram (left) next to a danceability violin with
# an embedded box plot (right).
fig = make_subplots(rows=1, cols=2, 
                    subplot_titles=("Tempo Distribution", "Danceability Distribution"))

fig.add_trace(
    go.Histogram(x=df["tempo"], nbinsx=50, name="Tempo"),
    row=1, col=1
)

fig.add_trace(
    go.Violin(y=df["danceability"], name="Danceability", box_visible=True),
    row=1, col=2
)

# Label the axes of each panel so the figure can be read on its own.
fig.update_xaxes(title_text="Tempo", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Danceability", row=1, col=2)

fig.update_layout(
    height=600,
    showlegend=False,  # one trace per panel; the subplot titles identify them
    title_text="Tempo and Danceability Analysis"
)

fig.show()

## Musical Key Analysis

Compare key distributions across different estimation methods.

In [5]:
# Process key data: count how often each key label occurs under every
# estimation profile and stack the counts into one long-format frame.
key_data = []
# Reuse the profile list that created the key_<profile> columns so the two
# cannot drift apart.
for profile in key_profiles:
    counts = df[f"key_{profile}"].value_counts().reset_index()
    # Rename positionally so this does not depend on the column names that
    # value_counts()/reset_index() happen to assign.
    counts.columns = ["Key", "Count"]
    counts["Profile"] = profile.capitalize()
    key_data.append(counts)

# ignore_index gives key_df a unique row index instead of repeating each
# per-profile 0..n-1 index.
key_df = pd.concat(key_data, ignore_index=True)

# Interactive plot: one group of bars per key, one bar per profile
fig = px.bar(key_df, x="Key", y="Count", color="Profile",
             barmode="group",
             title="Key Distribution Comparison",
             color_discrete_sequence=px.colors.qualitative.Set2)

fig.update_layout(
    xaxis_tickangle=-45,
    height=700
)

fig.show()

## Loudness Analysis

Analyze the integrated loudness distribution.

In [6]:
# Interactive plot: loudness histogram with two dashed reference levels
loudness_hist = go.Histogram(x=df["loudness"], nbinsx=30, name="Loudness")
fig = go.Figure(data=[loudness_hist])

# (LUFS level, line colour, annotation) for each reference marker
reference_levels = (
    (-14, "red", "Music (-14 LUFS)"),
    (-23, "green", "Broadcast (-23 LUFS)"),
)
for level, colour, caption in reference_levels:
    fig.add_vline(
        x=level,
        line_dash="dash",
        line_color=colour,
        annotation_text=caption,
    )

fig.update_layout({
    "title": "Integrated Loudness Distribution",
    "xaxis_title": "LUFS",
    "yaxis_title": "Count",
})

fig.show()

## Emotion Analysis

Visualize the emotional characteristics of the music collection.

In [7]:
# Normalize the valence and arousal values from [1,9] to [-1,1]:
# 5 is the midpoint and 4 the half-width, so 1 -> -1, 5 -> 0, 9 -> 1.
df["valence_normalized"] = (df["valence"] - 5) / 4.0
df["arousal_normalized"] = (df["arousal"] - 5) / 4.0

# Plot the normalized values as a 2-D density
fig = go.Figure(go.Histogram2d(
    x=df["valence_normalized"],
    y=df["arousal_normalized"],
    colorscale="Viridis",
))

# Four offset 1px black shadows form an outline around the white label text
# (intended to keep it readable over any colour of the heatmap).
outline_css = (
    "text-shadow: -1px -1px 0px black, 1px -1px 0px black, "
    "-1px 1px 0px black, 1px 1px 0px black;"
)

# One label per valence/arousal quadrant: (x, y, text)
quadrant_labels = [
    (0.65, 0.65, "Exciting<br>Positive"),
    (-0.65, 0.65, "Stressful<br>Negative"),
    (0.65, -0.65, "Content<br>Calm"),
    (-0.65, -0.65, "Depressing<br>Sad"),
]
annotations = [
    dict(
        x=x,
        y=y,
        text=f"<span style='{outline_css}'>{label}</span>",
        font=dict(color="white"),
        # Plotly draws an arrow and offsets the text by default; these are
        # plain in-place labels.
        showarrow=False,
    )
    for x, y, label in quadrant_labels
]

fig.update_layout(
    title="Musical Emotion Landscape (Normalized)",
    xaxis_title="Valence (Normalized from -1 to 1)",
    yaxis_title="Arousal (Normalized from -1 to 1)",
    annotations=annotations,
    height=800,
    width=800
)

fig.show()

## Vocal Analysis

Analyze the distribution of vocal vs instrumental content.

In [8]:
# Average the per-track "voice" value over the whole collection
voice_scores = df["voice_instrumental"].apply(lambda scores: scores["voice"])
vocal_mean = voice_scores.mean()

# Interactive plot: donut split between the mean and its complement, with
# the mean repeated as text in the hole
donut = go.Pie(
    labels=["Vocal", "Instrumental"],
    values=[vocal_mean, 1 - vocal_mean],
    hole=0.7,
    textinfo="percent+label",
    marker=dict(colors=["#FF6B6B", "#4ECDC4"]),
)
centre_label = dict(
    text=f"{vocal_mean:.1%}<br>Vocal<br>Tracks",
    x=0.5,
    y=0.5,
    font=dict(size=20),
    showarrow=False,
)

fig = go.Figure(donut)
fig.update_layout(
    title="Vocal Presence in Collection",
    annotations=[centre_label],
)

fig.show()