In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Global setup
plt.rcParams.update({
    "font.family": "DejaVu Sans",
    "font.size": 13,
    "axes.labelsize": 13,
    "axes.titlesize": 15,
    "legend.fontsize": 11,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "axes.linewidth": 1.2,
    "savefig.dpi": 300,
    "figure.dpi": 120
})
sns.set_style("whitegrid")
sns.set_palette("Spectral")

output_dir = "/Users/vancefeng/Desktop/ords/AML/spotify_analysis_project/feature_popularity_analysis/figs"
os.makedirs(output_dir, exist_ok=True)

### 1 - Model performance comparison

In [2]:
baseline_results = pd.DataFrame({
    "Model": [
        "Random Forest", "Decision Tree", "Extra Trees",
        "HistGradient Boosting", "Gradient Boosting", "AdaBoost"
    ],
    "Accuracy": [0.808577, 0.807785, 0.807743, 0.689433, 0.675046, 0.602788],
    "F1": [0.846206, 0.844764, 0.844695, 0.786108, 0.779681, 0.749847]
})

df_melted = baseline_results.melt(id_vars="Model", var_name="Metric", value_name="Score")

sns.set_theme(style="ticks", context="talk")
plt.figure(figsize=(12, 7))


palette = sns.color_palette("Set2", 2) 

ax = sns.barplot(
    data=df_melted, x="Model", y="Score", hue="Metric",
    palette=palette,
    edgecolor="0.2",
    linewidth=1.5, 
    alpha=0.9 
)

patterns = ['///', '...']
if len(ax.containers) == len(patterns):
    for i, container in enumerate(ax.containers):
        pattern = patterns[i]
        for patch in container:
            patch.set_hatch(pattern)
else:
    print("Warning: The number of patterns does not match the number of containers.")

sns.despine()

plt.title("Model Performance Comparison (Balanced Dataset)", weight="bold", pad=20, fontsize=20)
plt.ylim(0, 0.95)
plt.xticks(rotation=30, ha='right')
plt.ylabel("Score")
plt.xlabel("")


plt.legend(
    frameon=False,        
    loc="upper left",
    bbox_to_anchor=(1.01, 1.0), 
    title='Metric'          
)


for container in ax.containers:
    ax.bar_label(
        container, fmt="%.3f", label_type="edge",
        fontsize=11, padding=3, color="black", weight="normal" # "normal" 权重可能更清晰
    )

plt.tight_layout()
plt.savefig(os.path.join(output_dir, f"1_model_performance.png"), dpi=300)
plt.close()

print(f"The figure has been saved to: {os.path.join(output_dir, f'1_model_performance.png')}")

The figure has been saved to: /Users/vancefeng/Desktop/ords/AML/spotify_analysis_project/feature_popularity_analysis/figs_v1/1_model_performance.png


### 2 - Per-class precision / recall / F1

In [3]:
class_report = pd.DataFrame({
    "Class": ["Less Popular", "Popular"],
    "Precision": [0.79, 0.82],
    "Recall": [0.70, 0.88],
    "F1-score": [0.75, 0.85]
})

cr_melt = class_report.melt(id_vars="Class", var_name="Metric", value_name="Score")

plt.figure(figsize=(6.5, 5))
ax = sns.barplot(
    data=cr_melt, x="Class", y="Score", hue="Metric",
    edgecolor="0.3", alpha=0.9
)
sns.despine()
plt.title("Per-Class Metrics (Random Forest)", weight="bold", pad=12)
plt.ylim(0.6, 0.9)
plt.ylabel("Score")

plt.legend(
    frameon=True,
    loc="lower right",       
    bbox_to_anchor=(1.0, 0.02),
    borderpad=0.6
)

for c in ax.containers:
    ax.bar_label(c, fmt="%.2f", label_type="edge", fontsize=10, padding=3)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, f"2_class_report.png"), dpi=300)
plt.close()

### 3 - Confusion matrix heatmap

In [4]:
cm = np.array([[26358, 11076],
               [6819, 49231]])
labels = ["Less Popular", "Popular"]
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

plt.figure(figsize=(5.5, 4.8))
ax = sns.heatmap(
    cm_df, annot=True, fmt="d", cmap="coolwarm", cbar=False,
    linewidths=0.8, annot_kws={"size": 12, "weight": "bold"}
)
plt.title("Confusion Matrix (Random Forest)", weight="bold", pad=12)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
sns.despine()
plt.tight_layout()
plt.savefig(os.path.join(output_dir, f"3_confusion_matrix.png"), dpi=300)
plt.close()

### 4 - Feature importance (Top 15)

In [5]:
feature_importances = pd.DataFrame({
    "Feature": [
        "num_C4(Acoustic)", "num_C5(Big-Room Dance)", "num_C2(Instrumental Beats)", "num_C7(Latin Pop)",
        "num_C0(Balanced Mix)", "num_C1(Energetic Pop)", "num_C3(Hip-hop)", "num_C6(Mellow Blend)",
        "cat_season_4", "cat_season_1", "cat_season_3", "cat_season_2",
        "cat_is_weekend_1", "cat_is_weekend_0", "cat_closest_cluster_C5"
    ],
    "Importance": [
        0.090127, 0.086705, 0.086376, 0.085419, 0.084888, 0.084815, 0.084336, 0.083569,
        0.048809, 0.047077, 0.046797, 0.045853, 0.010970, 0.010955, 0.009095
    ]
})

plt.figure(figsize=(11, 6.5))
ax = sns.barplot(
    data=feature_importances, y="Feature", x="Importance",
    palette="viridis_r", edgecolor="0.2"
)
sns.despine()
plt.title("Top Feature Importances (Random Forest)", weight="bold", pad=12)
plt.xlabel("Relative Importance")
plt.ylabel("")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, f"4_feature_importances.png"), dpi=300)
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(


### 5 - Cluster-level weighted importance (C0–C7)

In [6]:
cluster_importances = feature_importances[
    feature_importances["Feature"].str.contains("weighted_C")
].copy()
cluster_importances["Cluster"] = cluster_importances["Feature"].str.extract(r'C(\d)')
cluster_importances = cluster_importances.sort_values("Cluster")

plt.figure(figsize=(7, 4.5))
ax = sns.barplot(
    data=cluster_importances, x="Cluster", y="Importance",
    palette="magma", edgecolor="0.3", alpha=0.9
)
sns.despine()
plt.title("Cluster Weighted Feature Importance (C0–C7)", weight="bold", pad=12)
plt.ylabel("Importance")
plt.xlabel("Cluster ID")
for c in ax.containers:
    ax.bar_label(c, fmt="%.3f", label_type="edge", fontsize=9, padding=3)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, f"5_cluster_importance.png"), dpi=300)
plt.close()