In [1]:
# ✅ COLAB GPU HARD PIPELINE (fully corrected for torchcodec AudioDecoder)
# Fixes:
# - AudioDecoder has NO sampling_rate attr -> use SR you cast to
# - audio.get_all_samples().data returns torch.Tensor (often shape (C, N))
# - Correct mono conversion for (C, N) and conversion to numpy float32
# - DataLoader: start with num_workers=0 for stability with torchcodec

# =========================
# 0) Install dependencies
# =========================
!pip -q install datasets torchcodec librosa soundfile scikit-learn pandas numpy

# =========================
# 1) Load dataset (AudioDecoder)
# =========================
from datasets import load_dataset, Audio

SR = 22050
ds = load_dataset("jamendolyrics/jamendolyrics", split="test")
ds = ds.cast_column("audio", Audio(sampling_rate=SR))   # ✅ no mono arg

print("Songs:", len(ds))
ex0 = ds[0]
print("Example keys:", ex0.keys())
print("Example language:", ex0.get("language"), "genre:", ex0.get("genre"))
print("Audio type:", type(ex0["audio"]))
print("Lines example:", ex0["lines"][:1])

# Wave sanity check (torch.Tensor)
audio0 = ex0["audio"]
y0 = audio0.get_all_samples().data
print("Raw wave tensor shape:", y0.shape, "dtype:", y0.dtype)

# Proper mono check for (C, N)
if hasattr(y0, "dim") and y0.dim() == 2:
    y0_mono = y0.mean(dim=0)
    print("Mono wave tensor shape:", y0_mono.shape)

# =========================
# 2) Build segment table from line timestamps
# =========================
import numpy as np
import pandas as pd

MIN_DUR = 1.5
MAX_DUR = 6.0
CAP_LINES_PER_SONG = 12
MAX_TOTAL_SEGS = 1200   # safe; increase later (e.g., 3000)

def detect_line_keys(lines):
    ex = lines[0]
    start_keys = ["start_time", "start", "t_start"]
    end_keys   = ["end_time", "end", "t_end"]
    text_keys  = ["lyrics_line", "line", "text"]
    def find(cands):
        for k in cands:
            if k in ex:
                return k
        return None
    return find(start_keys), find(end_keys), find(text_keys)

start_k = end_k = text_k = None
for ex in ds:
    lines = ex.get("lines", [])
    if isinstance(lines, list) and len(lines) and isinstance(lines[0], dict):
        start_k, end_k, text_k = detect_line_keys(lines)
        break

print("Detected line keys:", start_k, end_k, text_k)
assert start_k and end_k and text_k, "Could not detect line keys."

rows = []
for song_idx, ex in enumerate(ds):
    lang = ex.get("language", "unknown")
    genre = ex.get("genre", "unknown")
    lines = ex.get("lines", [])
    if not lines:
        continue

    kept = 0
    for line_idx, ln in enumerate(lines):
        if kept >= CAP_LINES_PER_SONG:
            break
        try:
            st = float(ln[start_k]); en = float(ln[end_k]); txt = str(ln[text_k]).strip()
        except Exception:
            continue

        dur = en - st
        if dur < MIN_DUR or not txt:
            continue
        if dur > MAX_DUR:
            en = st + MAX_DUR

        rows.append({
            "song_idx": song_idx,
            "line_idx": line_idx,
            "start": st,
            "end": en,
            "language": lang,
            "genre": genre,
            "lyrics": txt
        })
        kept += 1

        if len(rows) >= MAX_TOTAL_SEGS:
            break
    if len(rows) >= MAX_TOTAL_SEGS:
        break

seg_meta = pd.DataFrame(rows)
print("Segments:", len(seg_meta))
print("Language counts:\n", seg_meta["language"].value_counts())
print("Genre counts:\n", seg_meta["genre"].value_counts().head(10))

# =========================
# 3) On-the-fly Log-mel Dataset (✅ fixed tensor->numpy + mono)
# =========================
import librosa
import torch
from torch.utils.data import Dataset, DataLoader

N_MELS = 128
N_FFT = 2048
HOP = 512
T_FIXED = 256

class SegmentLogMelDataset(Dataset):
    def __init__(self, ds, seg_meta, sr=22050):
        self.ds = ds
        self.meta = seg_meta.reset_index(drop=True)
        self.sr = sr

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        r = self.meta.loc[idx]
        ex = self.ds[int(r.song_idx)]

        audio = ex["audio"]
        y = audio.get_all_samples().data  # torch.Tensor, often (C, N)

        # ✅ Force mono robustly
        if hasattr(y, "dim") and y.dim() == 2:
            # If shape (channels, samples)
            if y.shape[0] <= 8 and y.shape[1] > y.shape[0]:
                y = y.mean(dim=0)   # -> (samples,)
            else:
                y = y.mean(dim=1)   # -> (samples,)

        # ✅ Convert to numpy float32 for librosa
        y = y.detach().cpu().numpy().astype(np.float32)

        s0 = int(float(r.start) * self.sr)
        s1 = int(float(r.end) * self.sr)
        s0 = max(0, min(s0, len(y)))
        s1 = max(0, min(s1, len(y)))
        seg = y[s0:s1]

        mel = librosa.feature.melspectrogram(
            y=seg, sr=self.sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP
        )
        x = librosa.power_to_db(mel, ref=np.max).astype(np.float32)  # (128, T)

        if x.shape[1] < T_FIXED:
            x = np.pad(x, ((0, 0), (0, T_FIXED - x.shape[1])))
        else:
            x = x[:, :T_FIXED]

        x = (x - x.mean()) / (x.std() + 1e-6)
        return torch.from_numpy(x).unsqueeze(0)  # (1,128,256)

dataset = SegmentLogMelDataset(ds, seg_meta, sr=SR)

# ✅ torchcodec stability: start with num_workers=0
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0, pin_memory=True)

xb = next(iter(loader))
print("Batch shape:", xb.shape)  # expect [32,1,128,256]

# =========================
# 4) Beta-ConvVAE train (GPU + mixed precision)
# =========================
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

class ConvVAE(nn.Module):
    def __init__(self, latent_dim=32):
        super().__init__()
        self.enc = nn.Sequential(
            nn.Conv2d(1, 16, 4, 2, 1), nn.ReLU(),
            nn.Conv2d(16, 32, 4, 2, 1), nn.ReLU(),
            nn.Conv2d(32, 64, 4, 2, 1), nn.ReLU(),
            nn.Conv2d(64, 128, 4, 2, 1), nn.ReLU()
        )
        self.flat_dim = 128 * 8 * 16
        self.fc_mu = nn.Linear(self.flat_dim, latent_dim)
        self.fc_lv = nn.Linear(self.flat_dim, latent_dim)

        self.fc_dec = nn.Linear(latent_dim, self.flat_dim)
        self.dec = nn.Sequential(
            nn.ConvTranspose2d(128, 64, 4, 2, 1), nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 4, 2, 1), nn.ReLU(),
            nn.ConvTranspose2d(32, 16, 4, 2, 1), nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 4, 2, 1)
        )

    def reparam(self, mu, lv):
        std = torch.exp(0.5 * lv)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        h = self.enc(x).view(x.size(0), -1)
        mu, lv = self.fc_mu(h), self.fc_lv(h)
        z = self.reparam(mu, lv)
        h2 = self.fc_dec(z).view(x.size(0), 128, 8, 16)
        recon = self.dec(h2)
        return recon, mu, lv

def beta_vae_loss(x, recon, mu, lv, beta=2.0):
    recon_loss = nn.functional.mse_loss(recon, x, reduction="mean")
    kl = -0.5 * torch.mean(1 + lv - mu.pow(2) - lv.exp())
    return recon_loss + beta * kl

LATENT = 32
BETA = 2.0
EPOCHS = 20
LR = 1e-3

model = ConvVAE(latent_dim=LATENT).to(device)
opt = torch.optim.Adam(model.parameters(), lr=LR)

scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))

for ep in range(1, EPOCHS + 1):
    model.train()
    total = 0.0
    for xb in loader:
        xb = xb.to(device, non_blocking=True)
        opt.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):
            recon, mu, lv = model(xb)
            loss = beta_vae_loss(xb, recon, mu, lv, beta=BETA)

        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()
        total += loss.item()

    if ep == 1 or ep % 5 == 0:
        print(f"beta={BETA} epoch {ep:02d} loss {total/len(loader):.4f}")

# =========================
# 5) Extract audio embeddings (mu)
# =========================
model.eval()
Z_audio = []
with torch.no_grad():
    for xb in DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, pin_memory=True):
        xb = xb.to(device, non_blocking=True)
        _, mu, _ = model(xb)
        Z_audio.append(mu.cpu().numpy())
Z_audio = np.concatenate(Z_audio, axis=0)
print("Z_audio:", Z_audio.shape)

# =========================
# 6) Lyrics embeddings (TF-IDF -> PCA 64)
# =========================
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

texts = seg_meta["lyrics"].astype(str).tolist()

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
E = tfidf.fit_transform(texts).toarray().astype(np.float32)

pca = PCA(n_components=64, random_state=42)
E64 = pca.fit_transform(E).astype(np.float32)
print("E64:", E64.shape)

# =========================
# 7) Fuse + labels
# =========================
from sklearn.preprocessing import StandardScaler

Za = StandardScaler().fit_transform(Z_audio)
El = StandardScaler().fit_transform(E64)

ALPHA = 2.0
Z_fused = np.concatenate([Za, ALPHA * El], axis=1)
print("Z_fused:", Z_fused.shape)

y_lang  = seg_meta["language"].astype("category").cat.codes.values
y_genre = seg_meta["genre"].astype("category").cat.codes.values

k_lang  = len(np.unique(y_lang))
k_genre = len(np.unique(y_genre))
print("k_lang:", k_lang, "k_genre:", k_genre)

# =========================
# 8) Clustering + Hard metrics (Silhouette, NMI, ARI, Purity)
# =========================
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score, normalized_mutual_info_score

def purity_score(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    total = 0
    for c in np.unique(y_pred):
        idx = np.where(y_pred == c)[0]
        total += np.max(np.bincount(y_true[idx]))
    return total / len(y_true)

def eval_all(Z, labels, y_true):
    if len(set(labels)) >= 2:
        sil = silhouette_score(Z, labels)
        db  = davies_bouldin_score(Z, labels)
    else:
        sil, db = None, None
    nmi = normalized_mutual_info_score(y_true, labels)
    ari = adjusted_rand_score(y_true, labels)
    pur = purity_score(y_true, labels)
    return sil, db, nmi, ari, pur

def run_suite(Z, y_true, k):
    rows = []

    lab = KMeans(n_clusters=k, random_state=42, n_init="auto").fit_predict(Z)
    rows.append(("kmeans", f"k={k}", *eval_all(Z, lab, y_true)))

    lab = AgglomerativeClustering(n_clusters=k).fit_predict(Z)
    rows.append(("agglo", f"k={k}", *eval_all(Z, lab, y_true)))

    lab = DBSCAN(eps=1.2, min_samples=10).fit_predict(Z)
    mask = lab != -1
    if mask.sum() > 50 and len(set(lab[mask])) > 1:
        sil = silhouette_score(Z[mask], lab[mask])
        db  = davies_bouldin_score(Z[mask], lab[mask])
    else:
        sil, db = None, None
    nmi = normalized_mutual_info_score(y_true, lab)
    ari = adjusted_rand_score(y_true, lab)
    pur = purity_score(y_true, lab)
    rows.append(("dbscan", "eps=1.2,min_samples=10", sil, db, nmi, ari, pur))

    return pd.DataFrame(rows, columns=["algorithm","params","silhouette","davies_bouldin","NMI","ARI","purity"])

print("\n=== Evaluate vs LANGUAGE ===")
print(run_suite(Z_fused, y_lang, k_lang))

print("\n=== Evaluate vs GENRE ===")
print(run_suite(Z_fused, y_genre, k_genre))


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m1.6/2.1 MB[0m [31m47.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/80 [00:00<?, ?files/s]

metadata.jsonl: 0.00B [00:00, ?B/s]

subsets/de/mp3/Bitte_beweg_dich_nicht_-_(…):   0%|          | 0.00/6.57M [00:00<?, ?B/s]

subsets/de/mp3/Freifliegen_-_durch.dick.(…):   0%|          | 0.00/6.14M [00:00<?, ?B/s]

subsets/de/mp3/Intro_[Pulsschlag]_-_Zeug(…):   0%|          | 0.00/4.58M [00:00<?, ?B/s]

subsets/de/mp3/Die_Revolution_gehört_Dir(…):   0%|          | 0.00/4.25M [00:00<?, ?B/s]

subsets/de/mp3/Fußabdrücke_-_Andreas_Jac(…):   0%|          | 0.00/5.83M [00:00<?, ?B/s]

subsets/de/mp3/Burn_Out_Man_-_Abendblau.(…):   0%|          | 0.00/5.74M [00:00<?, ?B/s]

subsets/de/mp3/Ich_kann_dich_nicht_verge(…):   0%|          | 0.00/6.40M [00:00<?, ?B/s]

subsets/de/mp3/1_Freak_-_Automatisch_Gek(…):   0%|          | 0.00/4.43M [00:00<?, ?B/s]

subsets/de/mp3/Keine_Lust_-_Jonny_M.mp3:   0%|          | 0.00/5.66M [00:00<?, ?B/s]

subsets/de/mp3/Da_wurdest_du_geboren_-_A(…):   0%|          | 0.00/5.01M [00:00<?, ?B/s]

subsets/de/mp3/Cafe_Jenseitz_Bebelstraße(…):   0%|          | 0.00/6.19M [00:00<?, ?B/s]

subsets/de/mp3/Der_Musiker_-_d-music.mp3:   0%|          | 0.00/6.65M [00:00<?, ?B/s]

subsets/de/mp3/Musik_-_Heiko.mp3:   0%|          | 0.00/5.83M [00:00<?, ?B/s]

subsets/de/mp3/Der_Baum_-_Dienstag_is_Da(…):   0%|          | 0.00/4.23M [00:00<?, ?B/s]

subsets/de/mp3/Drei_Nüsse_-_patrouille.m(…):   0%|          | 0.00/6.39M [00:00<?, ?B/s]

subsets/de/mp3/SHANEY_23_feat._G1NA_G._x(…):   0%|          | 0.00/4.79M [00:00<?, ?B/s]

subsets/de/mp3/Schnodderdodder_-_Abendbl(…):   0%|          | 0.00/4.64M [00:00<?, ?B/s]

subsets/de/mp3/Sehnsucht-Unplugged_-_REH(…):   0%|          | 0.00/6.19M [00:00<?, ?B/s]

subsets/de/mp3/Veränderung_-_doromusis.m(…):   0%|          | 0.00/3.87M [00:00<?, ?B/s]

subsets/en/mp3/Color_Out_-_Falling_Star.(…):   0%|          | 0.00/5.63M [00:00<?, ?B/s]

subsets/en/mp3/Avercage_-_Embers.mp3:   0%|          | 0.00/5.75M [00:00<?, ?B/s]

subsets/de/mp3/dich_gehen_zu_sehn._-_Win(…):   0%|          | 0.00/4.54M [00:00<?, ?B/s]

subsets/en/mp3/Explosive_Ear_Candy_-_Lik(…):   0%|          | 0.00/4.63M [00:00<?, ?B/s]

subsets/en/mp3/Cortez_-_Feel__Stripped_.(…):   0%|          | 0.00/5.95M [00:00<?, ?B/s]

subsets/en/mp3/HILA_-_Give_Me_the_Same.m(…):   0%|          | 0.00/5.52M [00:00<?, ?B/s]

subsets/en/mp3/JASON_MILLER_-_CROWD_PLEA(…):   0%|          | 0.00/4.21M [00:00<?, ?B/s]

subsets/en/mp3/Kinematic_-_Peyote.mp3:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

subsets/en/mp3/Lower_Loveday_-_Is_It_Rig(…):   0%|          | 0.00/4.52M [00:00<?, ?B/s]

subsets/en/mp3/Moon_I_Mean_-_Wrong_Conce(…):   0%|          | 0.00/5.27M [00:00<?, ?B/s]

subsets/en/mp3/Pure_Mids_-_The_Leader.mp(…):   0%|          | 0.00/5.87M [00:00<?, ?B/s]

subsets/en/mp3/Quentin_Hannappe_-_Keep_O(…):   0%|          | 0.00/4.34M [00:00<?, ?B/s]

subsets/en/mp3/Ridgway_-_Fire_Inside.mp3:   0%|          | 0.00/7.19M [00:00<?, ?B/s]

subsets/en/mp3/Rxbyn_-_Bad_Side.mp3:   0%|          | 0.00/5.67M [00:00<?, ?B/s]

subsets/en/mp3/Slingshot_Miracle_-_Whist(…):   0%|          | 0.00/5.76M [00:00<?, ?B/s]

subsets/en/mp3/LUNABLIND_-_Vision__Radio(…):   0%|          | 0.00/5.05M [00:00<?, ?B/s]

subsets/en/mp3/Songwriterz_-_Back_In_Tim(…):   0%|          | 0.00/4.78M [00:00<?, ?B/s]

subsets/en/mp3/The.madpix.project_-_One_(…):   0%|          | 0.00/4.26M [00:00<?, ?B/s]

subsets/en/mp3/Wordsmith_-_The_Statement(…):   0%|          | 0.00/4.47M [00:00<?, ?B/s]

subsets/en/mp3/The_Rinn_-_Voices__2017_V(…):   0%|          | 0.00/5.94M [00:00<?, ?B/s]

subsets/en/mp3/Tom_Orlando_-_The_One__fe(…):   0%|          | 0.00/5.68M [00:00<?, ?B/s]

subsets/es/mp3/10._Disparan_-_criatura.m(…):   0%|          | 0.00/4.46M [00:00<?, ?B/s]

subsets/es/mp3/Baila_-_Alfonso_Lugo.mp3:   0%|          | 0.00/5.09M [00:00<?, ?B/s]

subsets/es/mp3/Besando_Sapos_-_Dream_Tab(…):   0%|          | 0.00/5.71M [00:00<?, ?B/s]

subsets/es/mp3/CLUB_DESTINO__-_Pensando_(…):   0%|          | 0.00/5.53M [00:00<?, ?B/s]

subsets/es/mp3/Caralibro_-_Vagos_Permane(…):   0%|          | 0.00/7.71M [00:00<?, ?B/s]

subsets/es/mp3/Diosa_de_la_noche_-_Brune(…):   0%|          | 0.00/5.19M [00:00<?, ?B/s]

subsets/es/mp3/Esencia_-_NandoMalo_.mp3:   0%|          | 0.00/6.23M [00:00<?, ?B/s]

subsets/es/mp3/Guayeteo_-_JhoyKing.mp3:   0%|          | 0.00/3.82M [00:00<?, ?B/s]

subsets/es/mp3/Fantasma_-_Los_Rombos.mp3:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

subsets/es/mp3/Háblame_-_(Talk_to_me)_-_(…):   0%|          | 0.00/4.95M [00:00<?, ?B/s]

subsets/es/mp3/Intentando_Destacar_-_Sun(…):   0%|          | 0.00/6.06M [00:00<?, ?B/s]

subsets/es/mp3/La_rumba_del_coronavirus_(…):   0%|          | 0.00/4.61M [00:00<?, ?B/s]

subsets/es/mp3/Quiero_y_Puedo_-_Nacidos_(…):   0%|          | 0.00/6.30M [00:00<?, ?B/s]

subsets/es/mp3/Sin_miedo_-_Living_Camboy(…):   0%|          | 0.00/5.71M [00:00<?, ?B/s]

subsets/es/mp3/Palabras_-_Javier_Gomez_B(…):   0%|          | 0.00/5.70M [00:00<?, ?B/s]

subsets/es/mp3/Te_Recuerdo_-_Wilson_Way.(…):   0%|          | 0.00/4.29M [00:00<?, ?B/s]

subsets/es/mp3/Vente_-_Fafarulo_Calabaza(…):   0%|          | 0.00/7.36M [00:00<?, ?B/s]

subsets/es/mp3/Yuanan_-_Miedo_-_Yuanan.m(…):   0%|          | 0.00/4.04M [00:00<?, ?B/s]

subsets/es/mp3/te_amo_-_fabios_la_nueva_(…):   0%|          | 0.00/3.95M [00:00<?, ?B/s]

subsets/fr/mp3/CHRISTMAS_AVEC_TOI_-_imfr(…):   0%|          | 0.00/6.29M [00:00<?, ?B/s]

subsets/fr/mp3/Capotes_à_un_Franc_-_elma(…):   0%|          | 0.00/4.30M [00:00<?, ?B/s]

subsets/fr/mp3/Culture_&_Co._-_cool.cave(…):   0%|          | 0.00/5.80M [00:00<?, ?B/s]

subsets/es/mp3/¡Óyeme_tiburón!_-_Corrien(…):   0%|          | 0.00/4.60M [00:00<?, ?B/s]

subsets/fr/mp3/DON_VALDES_-_Tu_sais_Man.(…):   0%|          | 0.00/5.06M [00:00<?, ?B/s]

subsets/fr/mp3/Confession_-_Quesabe.mp3:   0%|          | 0.00/3.63M [00:00<?, ?B/s]

subsets/fr/mp3/En_liberté_-_tom.leyak.mp(…):   0%|          | 0.00/4.78M [00:00<?, ?B/s]

subsets/fr/mp3/Le_musée_d'air_contempora(…):   0%|          | 0.00/4.50M [00:00<?, ?B/s]

subsets/fr/mp3/Le_royaume_des_glous_glou(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

subsets/fr/mp3/Les_files_d'attente_-_Law(…):   0%|          | 0.00/4.77M [00:00<?, ?B/s]

subsets/fr/mp3/Libre_by_CybeR_AttaCK_-_C(…):   0%|          | 0.00/5.15M [00:00<?, ?B/s]

subsets/fr/mp3/Mes_Larmes_-_kobzx2z.mp3:   0%|          | 0.00/3.59M [00:00<?, ?B/s]

subsets/fr/mp3/POIGNEE_DE_MAIN_-_Cabbac.(…):   0%|          | 0.00/4.92M [00:00<?, ?B/s]

subsets/fr/mp3/Mère_nature_-_Law'.mp3:   0%|          | 0.00/5.97M [00:00<?, ?B/s]

subsets/fr/mp3/Pas_que_tes_pas_-_AZUL.mp(…):   0%|          | 0.00/5.18M [00:00<?, ?B/s]

subsets/fr/mp3/Pluie_d'entre_deux_saison(…):   0%|          | 0.00/4.46M [00:00<?, ?B/s]

subsets/fr/mp3/Séculaire_feat._Nÿme_-_sa(…):   0%|          | 0.00/3.73M [00:00<?, ?B/s]

subsets/fr/mp3/Une_Vie_De_Roi_-_david.mp(…):   0%|          | 0.00/6.32M [00:00<?, ?B/s]

subsets/fr/mp3/de_bonne_humeur_-_Le_Nez_(…):   0%|          | 0.00/4.09M [00:00<?, ?B/s]

subsets/fr/mp3/l'abandon_-_flo.mp3:   0%|          | 0.00/4.82M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/79 [00:00<?, ? examples/s]

Songs: 79
Example keys: dict_keys(['name', 'url', 'artist', 'title', 'genre', 'license_type', 'language', 'lyric_overlap', 'polyphonic', 'non_lexical', 'text', 'lines', 'words', 'audio'])
Example language: en genre: Pop
Audio type: <class 'datasets.features._torchcodec.AudioDecoder'>
Lines example: [{'start': 18.6199798584, 'end': 19.8730163574, 'text': 'lay awake at night'}]
Raw wave tensor shape: torch.Size([2, 5151431]) dtype: torch.float32
Mono wave tensor shape: torch.Size([5151431])
Detected line keys: start end text
Segments: 945
Language counts:
 language
en    240
es    240
de    237
fr    228
Name: count, dtype: int64
Genre counts:
 genre
Pop           264
Rock          192
Hip-Hop       105
Indie          96
Reggae         84
Electronic     48
Folk           48
Metal          24
Country        24
RNB            24
Name: count, dtype: int64




Batch shape: torch.Size([32, 1, 128, 256])
Device: cpu


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))
  with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):


beta=2.0 epoch 01 loss 0.9177
beta=2.0 epoch 05 loss 0.4530
beta=2.0 epoch 10 loss 0.4023
beta=2.0 epoch 15 loss 0.3900
beta=2.0 epoch 20 loss 0.3781
Z_audio: (945, 32)
E64: (945, 64)
Z_fused: (945, 96)
k_lang: 4 k_genre: 12

=== Evaluate vs LANGUAGE ===
  algorithm                  params  silhouette  davies_bouldin       NMI  \
0    kmeans                     k=4    0.029940        3.811107  0.086267   
1     agglo                     k=4    0.121638        2.279963  0.023070   
2    dbscan  eps=1.2,min_samples=10         NaN             NaN  0.000000   

        ARI    purity  
0  0.010754  0.342857  
1  0.000175  0.273016  
2  0.000000  0.253968  

=== Evaluate vs GENRE ===
  algorithm                  params  silhouette  davies_bouldin       NMI  \
0    kmeans                    k=12    0.028659        3.069083  0.078157   
1     agglo                    k=12    0.107163        1.006885  0.096279   
2    dbscan  eps=1.2,min_samples=10         NaN             NaN  0.000000   

    