# Music Collection Analysis

This notebook provides interactive visualizations of the music collection analysis.

In [22]:
import colorsys
import json
import os
from typing import Dict, List

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from tqdm import tqdm


# Static plot settings
plt.style.use("seaborn-v0_8-deep")
sns.set_palette("husl")
plt.rcParams.update(
    {
        "figure.figsize": (12, 8),
        "font.size": 12,
        "axes.titlesize": 16,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 12,
        "legend.fontsize": 12,
    }
)

## Data Loading and Processing

Load the JSON analysis files and prepare the data for visualization.

In [23]:
def load_analysis_files(analysis_dir: str) -> pd.DataFrame:
    """Load and process analysis files"""
    json_files = []
    for root, _, files in os.walk(analysis_dir):
        for f in files:
            if f.endswith(".json"):
                json_files.append(os.path.join(root, f))

    data = []
    for json_file in tqdm(json_files, desc="Loading files"):
        try:
            with open(json_file) as f:
                entry = json.load(f)
                entry["path"] = os.path.relpath(json_file, analysis_dir)
                data.append(entry)
        except Exception as e:
            print(f"Skipping {json_file}: {str(e)}")

    return pd.DataFrame(data)


def process_genres(genres: Dict) -> List[Dict]:
    """Process genre entries with proper separator handling"""
    processed = []
    for genre_str, prob in genres.items():
        if "---" in genre_str:
            parts = genre_str.split("---", 1)
        elif "—" in genre_str:
            parts = genre_str.split("—", 1)
        else:
            parts = [genre_str, "unknown-style"]

        parent = parts[0].strip()
        style = parts[1].strip() if len(parts) > 1 else "unknown-style"
        processed.append({"parent": parent, "style": style, "probability": prob})
    return processed


# Load data
analysis_dir = "results"
df = load_analysis_files(analysis_dir)

# Preprocess data
df["genres_processed"] = df["music_styles"].apply(process_genres)
key_profiles = ["temperley", "krumhansl", "edma"]
for profile in key_profiles:
    df[f"key_{profile}"] = df["key"].apply(
        lambda x: f"{x[profile]['key']} {x[profile]['scale'].capitalize()}"
    )

Loading files: 100%|██████████| 2100/2100 [00:00<00:00, 2754.59it/s]


## Genre & Style Analysis

Analyze the distribution of music genres and sub-genres in the collection.

In [24]:
# Process genre data
parent_data = []
for track in df.genres_processed:
    total = sum(g["probability"] for g in track)
    for genre in track:
        parent_data.append(
            {
                "parent": genre["parent"],
                "style": genre["style"],
                "weight": genre["probability"] / total,
            }
        )

parent_df = pd.DataFrame(parent_data)
parent_totals = parent_df.groupby(["parent", "style"])["weight"].sum().reset_index()
parent_summary = parent_totals.groupby("parent")["weight"].sum().reset_index()

# Sort parent genres by weight in descending order
parent_summary = parent_summary.sort_values(by="weight", ascending=False)

# Sort styles within each parent genre by weight in descending order
parent_totals = parent_totals.sort_values(
    by=["parent", "weight"], ascending=[True, False]
)

# Define a dictionary to store the sorted order of styles for each parent genre
style_order = {}
for parent in parent_summary["parent"]:
    # Get the styles for the current parent genre and sort them by weight in descending order
    styles = (
        parent_totals[parent_totals["parent"] == parent]
        .sort_values(by="weight", ascending=False)["style"]
        .tolist()
    )
    # Add the sorted styles to the style_order dictionary
    style_order[parent] = styles

# Interactive plot
fig = px.bar(
    parent_totals,
    x="parent",
    y="weight",
    color="style",
    hover_data=["style", "weight"],
    title="Genre & Style Distribution",
    labels={"weight": "Weight", "parent": "Genre"},
    category_orders={
        "parent": parent_summary["parent"].tolist(),
        "style": [
            style
            for parent in parent_summary["parent"]
            for style in style_order[parent]
        ],
    },
)

fig.update_layout(xaxis_tickangle=-45, height=700, showlegend=False)

fig.show()


Export TSV file with style distribution data

In [36]:
output_dir = "reports"
os.makedirs(output_dir, exist_ok=True)

# Count each style only once per song
unique_styles = []
for track in df.genres_processed:
    unique_styles.extend(set(genre["style"] for genre in track))

style_counts = pd.Series(unique_styles).value_counts().reset_index()
style_counts.columns = ["Style", "Count"]

style_counts.to_csv(
    os.path.join(output_dir, "all_styles_distribution.tsv"),
    sep="\t",
    index=False,
)

threshold = 0.05  # only count style if probability is at least 5%
filtered_styles = []
for track in df.genres_processed:
    # Use a set to avoid counting the same style more than once per track.
    styles = {genre["style"] for genre in track if genre["probability"] >= threshold}
    filtered_styles.extend(styles)

style_counts = pd.Series(filtered_styles).value_counts().reset_index()
style_counts.columns = ["Style", "Count"]

style_counts.to_csv(
    os.path.join(output_dir, "threshold_styles_distribution.tsv"),
    sep="\t",
    index=False,
)

## Tempo and Danceability Analysis

Analyze the tempo and danceability distributions.

In [18]:
# Calculate statistics for tempo
tempo_mean = df["tempo"].mean()
tempo_median = df["tempo"].median()
tempo_std = df["tempo"].std()
tempo_variance = df["tempo"].var()

# Calculate statistics for danceability
danceability_mean = df["danceability"].mean()
danceability_median = df["danceability"].median()
danceability_std = df["danceability"].std()
danceability_variance = df["danceability"].var()

# Interactive plot with statistics annotations
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        f"Tempo Distribution (Mean: {tempo_mean:.2f}, Median: {tempo_median:.2f}, Std: {tempo_std:.2f})",
        f"Danceability Distribution (Mean: {danceability_mean:.2f}, Median: {danceability_median:.2f}, Std: {danceability_std:.2f})",
    ),
)

fig.add_trace(go.Histogram(x=df["tempo"], nbinsx=50, name="Tempo"), row=1, col=1)

fig.add_trace(
    go.Histogram(x=df["danceability"], nbinsx=30, name="Danceability"), row=1, col=2
)

fig.update_layout(
    height=600,
    showlegend=False,
    title_text="Tempo and Danceability Analysis",
    xaxis1_title="Tempo (BPM)",
    yaxis1_title="Count",
    xaxis2_title="Danceability",
    yaxis2_title="Count",
)

fig.show()


## Musical Key Analysis

Compare key distributions across different estimation methods.

In [19]:
profiles = ["temperley", "krumhansl", "edma"]

for profile in profiles:
    df[f"key_{profile}"] = df["key"].apply(
        lambda x: x.get(profile, {}).get("key") if isinstance(x, dict) else None
    )
    df[f"scale_{profile}"] = df["key"].apply(
        lambda x: x.get(profile, {}).get("scale") if isinstance(x, dict) else None
    )

# Transform the data into long form:
key_data = []
for profile in profiles:
    col_key = f"key_{profile}"
    col_scale = f"scale_{profile}"
    if col_key in df.columns and col_scale in df.columns:
        temp = df[[col_key, col_scale]].dropna().copy()
        temp = temp.rename(columns={col_key: "Key", col_scale: "Scale"})
        temp["Profile"] = profile.capitalize()
        key_data.append(temp)

if key_data:
    key_df = pd.concat(key_data, ignore_index=True)
else:
    key_df = pd.DataFrame(columns=["Key", "Scale", "Profile"])

# Group the data to count occurrences per combination:
key_df = key_df.groupby(["Key", "Scale", "Profile"]).size().reset_index(name="Count")

color_map = {"major": "#1f77b4", "minor": "#ff7f0e"}
pattern_map = {"major": "", "minor": "/"}

# Create a facet bar chart:
fig = px.bar(
    key_df,
    x="Key",
    y="Count",
    color="Scale",
    facet_col="Profile",
    barmode="group",
    title="Key Distribution by Algorithm (Major vs Minor)",
    color_discrete_map=color_map,
    category_orders={"Key": sorted(key_df["Key"].unique())},
)

# Add pattern shapes manually using update_traces
for trace in fig.data:
    if "major" in trace.name.lower():
        trace.marker.pattern.shape = pattern_map["major"]
    elif "minor" in trace.name.lower():
        trace.marker.pattern.shape = pattern_map["minor"]

fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(
    xaxis_tickangle=-45,
    height=600,
    margin=dict(l=20, r=20, t=40, b=20),
    xaxis_title="Musical Key",
    yaxis_title="Number of Tracks",
)

fig.show()

In [20]:
key_data = []
for profile in ["temperley", "krumhansl", "edma"]:
    # Select only rows where both key and scale are available.
    temp = df[[f"key_{profile}", f"scale_{profile}"]].dropna()
    # Rename columns for consistency.
    temp = temp.rename(columns={f"key_{profile}": "Key", f"scale_{profile}": "Scale"})
    # Create a new column that combines the algorithm (capitalized) with scale.
    temp["ProfileScale"] = profile.capitalize() + "_" + temp["Scale"].str.lower()
    # Also add a column for the Profile name.
    temp["Profile"] = profile.capitalize()
    key_data.append(temp)

# Combine data from all profiles.
key_df = pd.concat(key_data)

# Now aggregate counts per Key and per ProfileScale.
key_df = (
    key_df.groupby(["Key", "ProfileScale", "Profile", "Scale"])
    .size()
    .reset_index(name="Count")
)

# Calculate percentages
total_counts = key_df.groupby("ProfileScale")["Count"].transform("sum")
key_df["Percentage"] = (key_df["Count"] / total_counts) * 100

# --- Define base colors for each profile ---
base_colors = {"Temperley": "#66c2a5", "Krumhansl": "#fc8d62", "Edma": "#8da0cb"}


def adjust_color_brightness(hex_color: str, factor: float) -> str:
    """
    Adjust the brightness of a hex color by multiplying its lightness by the given factor.
    factor < 1 darkens the color; factor > 1 lightens the color.
    """
    hex_color = hex_color.lstrip("#")
    r, g, b = tuple(int(hex_color[i : i + 2], 16) for i in (0, 2, 4))
    # Convert RGB to HLS (Hue, Lightness, Saturation)
    h, l, s = colorsys.rgb_to_hls(r / 255, g / 255, b / 255)
    # Adjust the lightness
    l = max(0, min(1, l * factor))
    r_new, g_new, b_new = colorsys.hls_to_rgb(h, l, s)
    return "#{:02x}{:02x}{:02x}".format(
        int(r_new * 255), int(g_new * 255), int(b_new * 255)
    )


color_map = {
    "Temperley_major": base_colors["Temperley"],
    "Temperley_minor": adjust_color_brightness(base_colors["Temperley"], 0.8),
    "Krumhansl_major": base_colors["Krumhansl"],
    "Krumhansl_minor": adjust_color_brightness(base_colors["Krumhansl"], 0.8),
    "Edma_major": base_colors["Edma"],
    "Edma_minor": adjust_color_brightness(base_colors["Edma"], 0.8),
}

pattern_map = {"Temperley": "", "Krumhansl": "/", "Edma": "\\"}

# Create the grouped bar chart.
fig = px.bar(
    key_df,
    x="Key",
    y="Count",
    color="ProfileScale",
    barmode="group",
    title="Key Distribution Comparison Across Profiles",
    color_discrete_map=color_map,
    pattern_shape="Profile",
    pattern_shape_map=pattern_map,
    hover_data=["Percentage"],
)

fig.update_layout(
    xaxis_tickangle=-45,
    height=700,
    xaxis_title="Musical Key",
    yaxis_title="Number of Tracks",
)

fig.update_traces(
    hovertemplate="<b>%{x}</b><br>Number of Tracks: %{y}<br>Percentage: %{customdata[0]:.2f}%<extra></extra>"
)

fig.update_layout(bargroupgap=0.2)

fig.show()

In [21]:
# Calculate agreement between all three profiles
df["agreement"] = (
    (df["key_temperley"] == df["key_krumhansl"])
    & (df["key_krumhansl"] == df["key_edma"])
).astype(int)

agreement_percentage = df["agreement"].mean() * 100
print(f"Percentage of tracks with agreement across all profiles: {agreement_percentage:.2f}%")


Percentage of tracks with agreement across all profiles: 53.19%


## Loudness Analysis

Analyze the integrated loudness distribution.

In [9]:
fig = go.Figure()

# Histogram of integrated loudness values
fig.add_trace(
    go.Histogram(
        x=df["loudness"],
        nbinsx=30,
        name="Integrated Loudness",
        marker_color="rgba(100, 149, 237, 0.7)",  # Cornflower blue
    )
)

# Shaded target range: Broadcast (-23 LUFS) to Music Master (-14 LUFS)
fig.add_vrect(
    x0=-23,
    x1=-14,
    fillcolor="LightGreen",
    opacity=0.2,
    layer="below",
    line_width=0,
    annotation_text="Target Range\n(-23 to -14 LUFS)",
    annotation_position="bottom left",
)

# Add vertical dashed lines for key targets

# Apple Music target: -16 LUFS
fig.add_vline(
    x=-16,
    line_dash="dash",
    line_color="purple",
    annotation_text="Apple Music (-16 LUFS)",
    annotation_position="bottom right",
)

# Spotify, SoundCloud, YouTube target: -14 LUFS
fig.add_vline(
    x=-14,
    line_dash="dash",
    line_color="red",
    annotation_text="Streaming Target (-14 LUFS)",
    annotation_position="top left",
)

# Typical rock/pop master: around -8 LUFS
fig.add_vline(
    x=-8,
    line_dash="dot",
    line_color="orange",
    annotation_text="Rock/Pop (~-8 LUFS)",
    annotation_position="top left",
)

# Pop track (where dynamics collapse): around -7 LUFS
(
    fig.add_vline(
        x=-7,
        line_dash="dot",
        line_color="darkorange",
        annotation_text="Pop Master (-7 LUFS)",
        annotation_position="bottom left",
    ),
)
# Aggressive EDM master: around -5 LUFS
fig.add_vline(
    x=-5,
    line_dash="dot",
    line_color="blue",
    annotation_text="Aggressive EDM (-5 LUFS)",
    annotation_position="top right",
)

fig.update_layout(
    title="Integrated Loudness Distribution",
    xaxis_title="Integrated Loudness (LUFS)",
    yaxis_title="Count",
    template="plotly_white",
)

fig.show()

In [10]:
def get_primary_genre(genres_list):
    """
    Extracts the primary genre from a list of genre dictionaries by choosing the one
    with the highest probability.
    """
    if not genres_list:
        return None
    # Return the parent genre of the entry with the maximum probability
    return max(genres_list, key=lambda x: x["probability"])["parent"]

# Create a new column 'primary_genre' in df
df["primary_genre"] = df["genres_processed"].apply(get_primary_genre)

# Create a box plot of integrated loudness (LUFS) by primary genre
fig = px.box(
    df,
    x="primary_genre",
    y="loudness",
    points=False,
    title="Integrated Loudness Distribution by Primary Genre",
    labels={"primary_genre": "Genre", "loudness": "Integrated Loudness (LUFS)"},
    template="plotly_white"
)

# Expected target integrated LUFS values based on the statement:
expected_targets = {
    "rock": -8.5,   # Rock tracks typically around -8 or -9 LUFS
    "pop": -7.0,    # Pop tracks around -7 LUFS
    "electronic": -5.0     # Aggressive EDM masters around -5 LUFS
}

# Define marker symbols for each genre
genre_markers = {
    "rock": "x",
    "pop": "circle",
    "electronic": "diamond",
}

# Overlay markers for the expected values where applicable.
for genre in df["primary_genre"].dropna().unique():
    genre_lower = genre.lower()
    if genre_lower in expected_targets:
        target_lufs = expected_targets[genre_lower]
        marker_symbol = genre_markers.get(genre_lower, "circle")  # Default to circle if genre not found
        fig.add_trace(go.Scatter(
            x=[genre],
            y=[target_lufs],
            mode="markers",
            marker=dict(size=12, color="green", symbol=marker_symbol, opacity=0.7),  # Use genre-specific marker
            name=f"Expected {genre} ({target_lufs} LUFS)"  # Include LUFS in legend
        ))

fig.update_layout(
    xaxis_tickangle=-45,
    yaxis_title="Integrated Loudness (LUFS)"
)

fig.show()

## Emotion Analysis

Visualize the emotional characteristics of the music collection.

In [85]:
# Normalize the valence and arousal values from [1,9] to [-1,1]
df["valence_normalized"] = (df["valence"] - 5) / 4.0
df["arousal_normalized"] = (df["arousal"] - 5) / 4.0

# Plot the normalized values
fig = go.Figure(
    go.Histogram2d(
        x=df["valence_normalized"],
        y=df["arousal_normalized"],
        colorscale="Viridis",
        xbins=dict(start=-1, end=1, size=0.1),  # Define x-axis bins
        ybins=dict(start=-1, end=1, size=0.1),  # Define y-axis bins
        colorbar=dict(title="Count"),  # Add colorbar title
    )
)
annotations = [
    dict(
        x=0.65,
        y=0.65,
        text="<span style='text-shadow: -1px -1px 0px black, 1px -1px 0px black, -1px 1px 0px black, 1px 1px 0px black;'>Exciting<br>Positive</span>",
        font=dict(color="white"),
    ),
    dict(
        x=-0.65,
        y=0.65,
        text="<span style='text-shadow: -1px -1px 0px black, 1px -1px 0px black, -1px 1px 0px black, 1px 1px 0px black;'>Stressful<br>Negative</span>",
        font=dict(color="white"),
    ),
    dict(
        x=0.65,
        y=-0.65,
        text="<span style='text-shadow: -1px -1px 0px black, 1px -1px 0px black, -1px 1px 0px black, 1px 1px 0px black;'>Content<br>Calm</span>",
        font=dict(color="white"),
    ),
    dict(
        x=-0.65,
        y=-0.65,
        text="<span style='text-shadow: -1px -1px 0px black, 1px -1px 0px black, -1px 1px 0px black, 1px 1px 0px black;'>Depressing<br>Sad</span>",
        font=dict(color="white"),
    ),
]

fig.update_layout(
    title="Musical Emotion Landscape (Normalized)",
    xaxis_title="Valence (Normalized from -1 to 1)",
    yaxis_title="Arousal (Normalized from -1 to 1)",
    xaxis=dict(range=[-1, 1]),
    yaxis=dict(range=[-1, 1]),
    annotations=annotations,
    height=700,
    width=700,
)

fig.show()

## Vocal Analysis

Analyze the distribution of vocal vs instrumental content.

In [11]:
# Calculate vocal mean
vocal_mean = df["voice_instrumental"].apply(lambda x: x["voice"]).mean()

# Interactive plot
fig = go.Figure(
    go.Pie(
        values=[vocal_mean, 1 - vocal_mean],
        labels=["Vocal", "Instrumental"],
        hole=0.7,
        textinfo="percent+label",
        marker_colors=["#FF6B6B", "#4ECDC4"],
    )
)

fig.update_layout(
    title="Vocal Presence in Collection",
    annotations=[
        dict(
            text=f"{vocal_mean:.1%}<br>Vocal<br>Tracks",
            x=0.5,
            y=0.5,
            font_size=20,
            showarrow=False,
        )
    ],
)

fig.show()