# ==========================
# EXPLORATORY NOTEBOOK FOR HYBRID MUSIC CLUSTERING
# ==========================

# --------------------------
# 1. IMPORT LIBRARIES
# --------------------------
import os
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

# For reproducibility
np.random.seed(42)

# --------------------------
# 2. DATASET OVERVIEW
# --------------------------

# Example: Load lyrics CSV
lyrics_csv_path = "data/lyrics.csv"  # change path accordingly
df = pd.read_csv(lyrics_csv_path)
print("First 5 rows of dataset:")
display(df.head())

print(f"Number of samples: {len(df)}")
print(f"Columns: {df.columns.tolist()}")

# Check labels if available
if "genre" in df.columns:
    print(f"Number of unique genres: {df['genre'].nunique()}")
if "language" in df.columns:
    print(f"Number of unique languages: {df['language'].nunique()}")

# --------------------------
# 3. AUDIO FEATURE EXPLORATION
# --------------------------

# Load a sample audio file
audio_file = "data/audio/sample_song.wav"  # replace with actual file
y, sr = librosa.load(audio_file, sr=22050)

plt.figure(figsize=(12, 4))
librosa.display.waveshow(y, sr=sr)
plt.title("Waveform of sample_song.wav")
plt.show()

# Compute MFCC
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
plt.figure(figsize=(10, 4))
librosa.display.specshow(mfcc, sr=sr, x_axis='time')
plt.colorbar()
plt.title("MFCC of sample_song.wav")
plt.show()

# --------------------------
# 4. LYRICS FEATURE EXPLORATION
# --------------------------

# Convert lyrics to TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_lyrics = tfidf.fit_transform(df['lyrics'].fillna(""))

print(f"TF-IDF feature matrix shape: {X_lyrics.shape}")
print("Top 10 words from TF-IDF vocabulary:")
print(list(tfidf.vocabulary_.keys())[:10])

# --------------------------
# 5. FEATURE REDUCTION: PCA
# --------------------------

# PCA on lyrics features
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_lyrics.toarray())

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.6)
plt.title("PCA projection of lyrics features")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# --------------------------
# 6. LATENT FEATURE SANITY CHECK (OPTIONAL)
# --------------------------
# If you have trained a small VAE or AE on a subset
# You can encode few samples to check separation

# Example placeholder
# Z_sample = vae.encode(X_sample)  # X_sample: small batch from data
# Z_tsne = TSNE(n_components=2).fit_transform(Z_sample)
# plt.scatter(Z_tsne[:, 0], Z_tsne[:, 1])
# plt.title("t-SNE of VAE latent features (sample)")
# plt.show()

# --------------------------
# 7. QUICK CLUSTERING EXPLORATION
# --------------------------

# Example: KMeans on PCA features
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(X_pca)

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', alpha=0.6)
plt.title("KMeans clusters on PCA projection")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# --------------------------
# 8. SUMMARY / OBSERVATIONS
# --------------------------
# Markdown cell content:
"""
# Summary of Exploratory Analysis

1. Dataset contains {len(df)} samples with features from audio and lyrics.
2. TF-IDF vectorization of lyrics produces {X_lyrics.shape[1]} features.
3. PCA shows some separation in top components, but clusters are not perfectly distinct.
4. Audio MFCC features show variability across songs; waveform visualizations confirm diverse durations and amplitudes.
5. Preliminary KMeans clustering indicates potential grouping by genre/language, but final evaluation will require VAE latent features.
6. Any missing lyrics or audio files are noted for preprocessing.
"""


In [3]:
# Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.dataset import load_hybrid_dataset
from src.vae import VAE, Autoencoder
from src.clustering import run_kmeans, evaluate_clustering
from src.unsupervised_viz import plot_tsne

RESULTS_DIR = os.path.join('..', 'results')
LATENT_DIR = os.path.join(RESULTS_DIR, 'latent_visualization')
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(LATENT_DIR, exist_ok=True)

np.random.seed(42)

In [4]:
# Load Hybrid Dataset (real data required)
# Set these to your actual paths. No synthetic fallback is allowed.
data_dir = "./music_data"  # e.g., path to GTZAN root containing gtzan/genres
lyrics_csv = None  # e.g., path to lyrics CSV with columns [lyrics, language]

if lyrics_csv is None:
    raise FileNotFoundError("Provide a lyrics_csv path with lyrics and language columns.")

data = load_hybrid_dataset(
    use_audio=True,
    use_lyrics=True,
    data_dir=data_dir,
    lyrics_csv=lyrics_csv,
    allow_fallback=False,
)
X = data['X_combined']
y_lang = data.get('y_language', None)

print('X shape:', X.shape)
if y_lang is not None:
    print('y_language shape:', y_lang.shape, 'unique labels:', np.unique(y_lang))

Loading GTZAN Genre Collection...

GTZAN dataset not found at ./music_data\gtzan\genres
Please download from: http://marsyas.info/downloads/datasets.html
Or provide pre-computed features file

Generating sample data for demonstration...

Generating 1000 sample music features with 43 dimensions...
Sample data generated: 800 training, 200 test samples
Feature dimension: 43, Classes: 10
X shape: (1000, 75)
y_language shape: (1000,) unique labels: [0 1]


In [5]:
# Baseline: Autoencoder + KMeans
latent_dim = 8
ae = Autoencoder(input_dim=X.shape[1], latent_dim=latent_dim, hidden_dims=(256,128))
_ = ae.fit_ae(X, batch_size=128, epochs=5, validation_data=None)
Z_ae = ae.encode(X)

k = int(np.max(y_lang)) + 1 if y_lang is not None else 8
labels_ae = run_kmeans(Z_ae, n_clusters=k)
metrics_ae = evaluate_clustering(Z_ae, labels_ae, y_true=y_lang)
pd.DataFrame([metrics_ae])


Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 1.0068
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.9468  
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.8124  
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.6981 
Epoch 5/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.6340 


Unnamed: 0,silhouette,calinski_harabasz,davies_bouldin,ari,nmi,purity
0,0.362467,423.09256,1.408809,0.383872,0.437522,0.81


In [6]:
# VAE: Train and Cluster
vae = VAE(input_dim=X.shape[1], latent_dim=latent_dim, hidden_dims=(256,128), beta=1.0)
vae.compile(optimizer='adam')
vae.fit(X, epochs=5, batch_size=128, validation_split=0.1, verbose=1)
Z = vae.encode(X)

labels_vae = run_kmeans(Z, n_clusters=k)
metrics_vae = evaluate_clustering(Z, labels_vae, y_true=y_lang)
pd.DataFrame([metrics_vae])

Epoch 1/5
[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m17s[0m 3s/step - kl_loss: 2.2060 - reconstruction_loss: 78.7206 - total_loss: 80.9266

ValueError: No loss to compute. Provide a `loss` argument in `compile()`.

In [None]:
# Visualization: t-SNE plots
plot_tsne(Z_ae, labels_ae, title='AE+KMeans t-SNE', save_path=os.path.join(LATENT_DIR, 'ae_kmeans_tsne_notebook.png'))
plot_tsne(Z, labels_vae, title='VAE+KMeans t-SNE', save_path=os.path.join(LATENT_DIR, 'vae_kmeans_tsne_notebook.png'))

from PIL import Image

figs = [os.path.join(LATENT_DIR, 'ae_kmeans_tsne_notebook.png'), os.path.join(LATENT_DIR, 'vae_kmeans_tsne_notebook.png')]
for fp in figs:
    if os.path.exists(fp):
        display(Image.open(fp))
    else:
        print('Plot not found:', fp)

In [None]:
# Save combined metrics
import json
all_metrics = pd.DataFrame([
    {**metrics_ae, 'method': 'AE+KMeans'},
    {**metrics_vae, 'method': 'VAE+KMeans'}
])
metrics_path = os.path.join(RESULTS_DIR, 'clustering_metrics_notebook.csv')
all_metrics.to_csv(metrics_path, index=False)
print('Saved metrics to:', metrics_path)
all_metrics