In [23]:
from pathlib import Path
import json
import pandas as pd

DATA_ROOT = Path(r"C:\Users\Atif\Documents\medium\jamendolyrics_full")  # <-- change to your downloaded folder
META_JSONL = DATA_ROOT / "metadata.jsonl"

assert META_JSONL.exists(), f"metadata.jsonl not found at: {META_JSONL}"

rows = []
with META_JSONL.open("r", encoding="utf-8") as f:
    for line in f:
        rows.append(json.loads(line))

df = pd.DataFrame(rows)
print("Songs:", len(df))
print("Columns:", list(df.columns)[:20], "...")
df.head(2)


Songs: 79
Columns: ['name', 'url', 'artist', 'title', 'genre', 'license_type', 'language', 'lyric_overlap', 'polyphonic', 'non_lexical', 'text', 'lines', 'words', 'file_name'] ...


Unnamed: 0,name,url,artist,title,genre,license_type,language,lyric_overlap,polyphonic,non_lexical,text,lines,words,file_name
0,HILA_-_Give_Me_the_Same,https://www.jamendo.com/track/1559261/give-me-...,HILA,Give Me The Same,Pop,BY-ND,en,False,False,False,lay awake at night\nwondering how could i\nlet...,"[{'start': 18.6199798584, 'end': 19.8730163574...","[{'start': 18.6199798584, 'end': 18.8935203552...",subsets/en/mp3/HILA_-_Give_Me_the_Same.mp3
1,Quentin_Hannappe_-_Keep_On,https://www.jamendo.com/track/1552064/keep-on,Quentin Hannappe,Keep On,Pop,BY-NC-ND,en,False,False,False,keep on working on your dreams and don't run a...,"[{'start': 10.1839199066, 'end': 14.5356912613...","[{'start': 10.1839199066, 'end': 10.4235830307...",subsets/en/mp3/Quentin_Hannappe_-_Keep_On.mp3


In [24]:
# Check what fields exist
print("Has 'text'?", "text" in df.columns)
print("Has 'lines'?", "lines" in df.columns)
print("Has 'words'?", "words" in df.columns)
print("Language counts:\n", df["language"].value_counts() if "language" in df.columns else "No language column")

# Look at a sample "lines" entry
if "lines" in df.columns:
    print("\nExample lines[0]:")
    print(df.loc[0, "lines"][:2])



Has 'text'? True
Has 'lines'? True
Has 'words'? True
Language counts:
 language
en    20
de    20
es    20
fr    19
Name: count, dtype: int64

Example lines[0]:
[{'start': 18.6199798584, 'end': 19.8730163574, 'text': 'lay awake at night'}, {'start': 20.8442592621, 'end': 22.1282539368, 'text': 'wondering how could i'}]


In [25]:
import numpy as np
import librosa
import soundfile as sf

SR = 22050
MIN_DUR = 1.5     # seconds: skip tiny lines
MAX_DUR = 6.0     # seconds: cap long lines
MAX_SONGS = None  # set like 50 to speed up first run

out_seg_dir = DATA_ROOT / "medium_segments_wav"
out_seg_dir.mkdir(parents=True, exist_ok=True)

def pick_keys(example_line: dict):
    # Try common key names seen in versions of the dataset
    start_keys = ["start_time", "start", "t_start"]
    end_keys   = ["end_time", "end", "t_end"]
    text_keys  = ["lyrics_line", "line", "text"]
    def find_key(cands):
        for k in cands:
            if k in example_line:
                return k
        return None
    return find_key(start_keys), find_key(end_keys), find_key(text_keys)

# Detect keys from first non-empty lines entry
start_k = end_k = text_k = None
for x in df["lines"]:
    if isinstance(x, list) and len(x) > 0 and isinstance(x[0], dict):
        start_k, end_k, text_k = pick_keys(x[0])
        break

print("Detected keys:", start_k, end_k, text_k)
assert start_k and end_k and text_k, "Could not detect line start/end/text keys. Paste df.loc[0,'lines'][:2] to fix."

seg_rows = []
bad = 0

song_iter = df.itertuples(index=False)
if MAX_SONGS:
    song_iter = list(song_iter)[:MAX_SONGS]

for i, ex in enumerate(song_iter):
    exd = ex._asdict()

    lang = exd.get("language", "unknown")
    lines = exd.get("lines", [])

    # audio path field is commonly "file_name" (relative to metadata.jsonl)
    # Your README says: subsets/*/mp3/*.mp3
    rel_audio = exd.get("file_name") or exd.get("audio") or exd.get("path")
    if rel_audio is None:
        bad += 1
        continue

    audio_path = (META_JSONL.parent / rel_audio).resolve()
    if not audio_path.exists():
        bad += 1
        continue

    # Lyrics text for embedding per-segment will come from each line; but keep full song text if needed
    try:
        y, _ = librosa.load(str(audio_path), sr=SR, mono=True)  # requires ffmpeg backend for mp3
    except Exception as e:
        print("Audio load failed:", audio_path.name, "->", e)
        bad += 1
        continue

    if not isinstance(lines, list) or len(lines) == 0:
        continue

    for j, ln in enumerate(lines):
        try:
            start = float(ln[start_k])
            end   = float(ln[end_k])
            text  = str(ln[text_k]).strip()
        except Exception:
            continue

        dur = end - start
        if dur < MIN_DUR:
            continue
        if dur > MAX_DUR:
            end = start + MAX_DUR

        s0, s1 = int(start * SR), int(end * SR)
        seg = y[s0:s1]
        if len(seg) < int(MIN_DUR * SR):
            continue
        if not text:
            continue

        seg_name = f"song{i:03d}_line{j:03d}_{lang}.wav"
        seg_path = out_seg_dir / seg_name
        sf.write(str(seg_path), seg, SR, subtype="PCM_16")

        seg_rows.append({"seg_path": str(seg_path), "lyrics": text, "language": lang})

seg_meta = pd.DataFrame(seg_rows)
seg_meta.to_csv(DATA_ROOT / "segment_meta.csv", index=False)

print("Segments created:", len(seg_meta))
print("Bad songs skipped:", bad)
seg_meta.head()


Detected keys: start end text
Segments created: 2924
Bad songs skipped: 0


Unnamed: 0,seg_path,lyrics,language
0,C:\Users\Atif\Documents\medium\jamendolyrics_f...,i thought we could get better,en
1,C:\Users\Atif\Documents\medium\jamendolyrics_f...,stayed committed like a soldier,en
2,C:\Users\Atif\Documents\medium\jamendolyrics_f...,now i know that you don't care at all,en
3,C:\Users\Atif\Documents\medium\jamendolyrics_f...,gave all of my soul and my heart to someone th...,en
4,C:\Users\Atif\Documents\medium\jamendolyrics_f...,i was a fool to believe that you would finally...,en


In [26]:
import numpy as np
import librosa
import pandas as pd

seg_meta = pd.read_csv(DATA_ROOT / "segment_meta.csv")

N_MELS = 128
N_FFT = 2048
HOP = 512
T_FIXED = 256   # time frames (pad/crop)

def logmel_fixed(path):
    y, _ = librosa.load(path, sr=SR, mono=True)
    mel = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP)
    x = librosa.power_to_db(mel, ref=np.max).astype(np.float32)  # (128, T)

    if x.shape[1] < T_FIXED:
        x = np.pad(x, ((0,0),(0, T_FIXED - x.shape[1])))
    else:
        x = x[:, :T_FIXED]

    x = (x - x.mean()) / (x.std() + 1e-6)
    return x[None, :, :]  # (1, 128, 256)

# Option: start smaller for speed
MAX_SEGS = 3000  # increase later if you want
seg_meta_small = seg_meta.sample(n=min(MAX_SEGS, len(seg_meta)), random_state=42).reset_index(drop=True)

X_audio = np.stack([logmel_fixed(p) for p in seg_meta_small["seg_path"]], axis=0)
np.save(DATA_ROOT / "X_audio.npy", X_audio)
seg_meta_small.to_csv(DATA_ROOT / "segment_meta_used.csv", index=False)

print("X_audio shape:", X_audio.shape)


X_audio shape: (2924, 1, 128, 256)


In [27]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

seg_meta_used = pd.read_csv(DATA_ROOT / "segment_meta_used.csv")
texts = seg_meta_used["lyrics"].astype(str).tolist()

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
E = tfidf.fit_transform(texts).toarray().astype(np.float32)

pca = PCA(n_components=64, random_state=42)
E64 = pca.fit_transform(E).astype(np.float32)

np.save(DATA_ROOT / "E64.npy", E64)
print("E64 shape:", E64.shape)


E64 shape: (2924, 64)


In [28]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

X_audio = np.load(DATA_ROOT / "X_audio.npy")  # (N,1,128,256)
E64 = np.load(DATA_ROOT / "E64.npy")          # (N,64)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

class ConvVAE(nn.Module):
    def __init__(self, latent_dim=32):
        super().__init__()
        self.enc = nn.Sequential(
            nn.Conv2d(1, 16, 4, 2, 1), nn.ReLU(),   # (16,64,128)
            nn.Conv2d(16, 32, 4, 2, 1), nn.ReLU(),  # (32,32,64)
            nn.Conv2d(32, 64, 4, 2, 1), nn.ReLU(),  # (64,16,32)
            nn.Conv2d(64, 128, 4, 2, 1), nn.ReLU()  # (128,8,16)
        )
        self.flat_dim = 128 * 8 * 16
        self.fc_mu = nn.Linear(self.flat_dim, latent_dim)
        self.fc_lv = nn.Linear(self.flat_dim, latent_dim)

        self.fc_dec = nn.Linear(latent_dim, self.flat_dim)
        self.dec = nn.Sequential(
            nn.ConvTranspose2d(128, 64, 4, 2, 1), nn.ReLU(),   # (64,16,32)
            nn.ConvTranspose2d(64, 32, 4, 2, 1), nn.ReLU(),    # (32,32,64)
            nn.ConvTranspose2d(32, 16, 4, 2, 1), nn.ReLU(),    # (16,64,128)
            nn.ConvTranspose2d(16, 1, 4, 2, 1)                 # (1,128,256)
        )

    def reparam(self, mu, lv):
        std = torch.exp(0.5 * lv)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        h = self.enc(x).view(x.size(0), -1)
        mu, lv = self.fc_mu(h), self.fc_lv(h)
        z = self.reparam(mu, lv)
        h2 = self.fc_dec(z).view(x.size(0), 128, 8, 16)
        recon = self.dec(h2)
        return recon, mu, lv

def loss_fn(x, recon, mu, lv):
    recon_loss = nn.functional.mse_loss(recon, x, reduction="mean")
    kl = -0.5 * torch.mean(1 + lv - mu.pow(2) - lv.exp())
    return recon_loss + kl

LATENT = 32
BATCH = 32
EPOCHS = 25
LR = 1e-3

dl = DataLoader(TensorDataset(torch.from_numpy(X_audio).float()),
                batch_size=BATCH, shuffle=True)

model = ConvVAE(latent_dim=LATENT).to(device)
opt = torch.optim.Adam(model.parameters(), lr=LR)

for ep in range(1, EPOCHS+1):
    model.train()
    tot = 0.0
    for (xb,) in dl:
        xb = xb.to(device)
        recon, mu, lv = model(xb)
        loss = loss_fn(xb, recon, mu, lv)
        opt.zero_grad()
        loss.backward()
        opt.step()
        tot += loss.item()
    if ep % 5 == 0 or ep == 1:
        print(f"epoch {ep:02d} loss {tot/len(dl):.4f}")

# Extract latent embeddings (mu)
model.eval()
with torch.no_grad():
    xb = torch.from_numpy(X_audio).float().to(device)
    _, mu, _ = model(xb)
    Z_audio = mu.cpu().numpy()

np.save(DATA_ROOT / "Z_audio.npy", Z_audio)
print("Z_audio:", Z_audio.shape)


device: cpu
epoch 01 loss 0.6642
epoch 05 loss 0.3044
epoch 10 loss 0.2707
epoch 15 loss 0.2577
epoch 20 loss 0.2550
epoch 25 loss 0.2517
Z_audio: (2924, 32)


In [29]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score

seg_meta_used = pd.read_csv(DATA_ROOT / "segment_meta_used.csv")
Z_audio = np.load(DATA_ROOT / "Z_audio.npy")
E64 = np.load(DATA_ROOT / "E64.npy")

Z_hybrid = np.concatenate([Z_audio, E64], axis=1)
np.save(DATA_ROOT / "Z_hybrid.npy", Z_hybrid)

# Labels for ARI (language)
y = seg_meta_used["language"].astype("category").cat.codes.values
k = len(np.unique(y))  # number of languages present in your subset

def eval_all(Z, labels):
    # Handle degenerate cases
    if len(set(labels)) < 2:
        return None, None
    return silhouette_score(Z, labels), davies_bouldin_score(Z, labels)

results = []

for rep_name, Z in [("audio_only", Z_audio), ("hybrid", Z_hybrid)]:
    # KMeans
    labels = KMeans(n_clusters=k, random_state=42, n_init="auto").fit_predict(Z)
    sil, db = eval_all(Z, labels)
    ari = adjusted_rand_score(y, labels)
    results.append([rep_name, "kmeans", sil, db, ari])

    # Agglomerative
    labels = AgglomerativeClustering(n_clusters=k).fit_predict(Z)
    sil, db = eval_all(Z, labels)
    ari = adjusted_rand_score(y, labels)
    results.append([rep_name, "agglo", sil, db, ari])

    # DBSCAN (tune eps if needed)
    labels = DBSCAN(eps=1.5, min_samples=10).fit_predict(Z)
    mask = labels != -1
    if mask.sum() > 20 and len(set(labels[mask])) > 1:
        sil = silhouette_score(Z[mask], labels[mask])
        db = davies_bouldin_score(Z[mask], labels[mask])
    else:
        sil, db = None, None
    ari = adjusted_rand_score(y, labels)
    results.append([rep_name, "dbscan", sil, db, ari])

metrics = pd.DataFrame(results, columns=["representation","algorithm","silhouette","davies_bouldin","ARI_language"])
metrics.to_csv(DATA_ROOT / "medium_metrics.csv", index=False)
metrics


Unnamed: 0,representation,algorithm,silhouette,davies_bouldin,ARI_language
0,audio_only,kmeans,0.488536,0.722215,0.002366
1,audio_only,agglo,0.493051,0.713936,0.002001
2,audio_only,dbscan,,,0.0
3,hybrid,kmeans,0.314143,1.062638,0.002308
4,hybrid,agglo,0.307607,1.076851,0.003368
5,hybrid,dbscan,,,0.0


In [30]:

import numpy as np
import pandas as pd
from pathlib import Path
import re

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score

# -----------------------
# Paths (adjust if needed)
# -----------------------
DATA_ROOT = Path(r"C:\Users\Atif\Documents\medium\jamendolyrics_full")  # <-- set to your downloaded dataset folder
Z_AUDIO_PATH = DATA_ROOT / "Z_audio.npy"
E64_PATH     = DATA_ROOT / "E64.npy"
META_PATH    = DATA_ROOT / "segment_meta_used.csv"

assert Z_AUDIO_PATH.exists(), f"Missing: {Z_AUDIO_PATH}"
assert E64_PATH.exists(), f"Missing: {E64_PATH}"
assert META_PATH.exists(), f"Missing: {META_PATH}"

# -----------------------
# Load data
# -----------------------
Z_audio = np.load(Z_AUDIO_PATH)         # (N, latent_dim)
E64     = np.load(E64_PATH)             # (N, 64)
meta    = pd.read_csv(META_PATH)        # must align row-by-row with arrays

assert len(meta) == Z_audio.shape[0] == E64.shape[0], "meta / Z_audio / E64 size mismatch!"

# Labels for ARI (language)
y_lang = meta["language"].astype("category").cat.codes.values
k = len(np.unique(y_lang))
print("N:", len(meta), "| languages:", k)

# -----------------------
# OPTIONAL FIX: cap segments per song (prevents clustering by song identity)
# If your seg filenames are like: ...song012_line003_en.wav
# Turn this on if ARI_language stays ~0 and you suspect clusters are grouping per song.
# -----------------------
CAP_PER_SONG = None  # e.g. 8 or 10; set None to disable

if CAP_PER_SONG is not None:
    # extract song_id from seg_path
    def get_song_id(p):
        m = re.search(r"song(\d+)", str(p))
        return m.group(1) if m else "unknown"
    meta["song_id"] = meta["seg_path"].apply(get_song_id)

    # sample at most CAP_PER_SONG rows per song
    meta = (meta.groupby("song_id", group_keys=False)
                .apply(lambda g: g.sample(n=min(len(g), CAP_PER_SONG), random_state=42))
                .reset_index(drop=True))

    # filter arrays to match new meta order
    idx = meta.index.values
    Z_audio = Z_audio[idx]
    E64 = E64[idx]
    y_lang = meta["language"].astype("category").cat.codes.values
    k = len(np.unique(y_lang))
    print("After CAP_PER_SONG:", len(meta), "| languages:", k)

# -----------------------
# Standardize each modality separately (KEY FIX)
# -----------------------
Za = StandardScaler().fit_transform(Z_audio)
El = StandardScaler().fit_transform(E64)

# -----------------------
# Helpers
# -----------------------
def safe_silhouette_db(Z, labels):
    """Return (silhouette, davies_bouldin) or (None, None) if not computable."""
    uniq = set(labels)
    if len(uniq) < 2:
        return None, None
    # silhouette needs >=2 clusters and no empty clusters; DB needs >=2 clusters
    try:
        sil = silhouette_score(Z, labels)
    except Exception:
        sil = None
    try:
        db = davies_bouldin_score(Z, labels)
    except Exception:
        db = None
    return sil, db

def eval_dbscan(Z, eps, min_samples):
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(Z)
    noise_ratio = float(np.mean(labels == -1))
    # compute metrics only on non-noise points
    mask = labels != -1
    n_used = int(mask.sum())
    # clusters among non-noise points
    non_noise_clusters = set(labels[mask]) if n_used > 0 else set()
    n_clusters = len(non_noise_clusters)

    if n_used > 20 and n_clusters >= 2:
        sil = silhouette_score(Z[mask], labels[mask])
        db  = davies_bouldin_score(Z[mask], labels[mask])
    else:
        sil, db = None, None

    ari = adjusted_rand_score(y_lang, labels)  # ARI can be computed with noise labels too
    return labels, n_clusters, noise_ratio, n_used, sil, db, ari

# -----------------------
# Run experiments
# -----------------------
results = []

# Representations: audio-only and hybrid with different lyric weights
alpha_list = [0.0, 0.25, 0.5, 1.0, 2.0, 4.0]   # 0.0 = audio-only
dbscan_eps_list = [0.6, 0.8, 1.0, 1.2, 1.5, 2.0]
dbscan_min_samples_list = [5, 10, 20]

for alpha in alpha_list:
    if alpha == 0.0:
        Z = Za
        rep_name = "audio_only"
    else:
        Z = np.concatenate([Za, alpha * El], axis=1)
        rep_name = f"hybrid_alpha{alpha}"

    # --- KMeans ---
    labels = KMeans(n_clusters=k, random_state=42, n_init="auto").fit_predict(Z)
    sil, db = safe_silhouette_db(Z, labels)
    ari = adjusted_rand_score(y_lang, labels)
    results.append({
        "representation": rep_name,
        "alpha_lyrics": alpha,
        "algorithm": "kmeans",
        "params": f"k={k}",
        "n_clusters": len(set(labels)),
        "noise_ratio": 0.0,
        "n_used_for_metrics": len(labels),
        "silhouette": sil,
        "davies_bouldin": db,
        "ARI_language": ari
    })

    # --- Agglomerative ---
    labels = AgglomerativeClustering(n_clusters=k).fit_predict(Z)
    sil, db = safe_silhouette_db(Z, labels)
    ari = adjusted_rand_score(y_lang, labels)
    results.append({
        "representation": rep_name,
        "alpha_lyrics": alpha,
        "algorithm": "agglo",
        "params": f"k={k}",
        "n_clusters": len(set(labels)),
        "noise_ratio": 0.0,
        "n_used_for_metrics": len(labels),
        "silhouette": sil,
        "davies_bouldin": db,
        "ARI_language": ari
    })

    # --- DBSCAN sweep (best row will be chosen later) ---
    for eps in dbscan_eps_list:
        for ms in dbscan_min_samples_list:
            labels, n_clusters, noise_ratio, n_used, sil, db, ari = eval_dbscan(Z, eps=eps, min_samples=ms)
            results.append({
                "representation": rep_name,
                "alpha_lyrics": alpha,
                "algorithm": "dbscan",
                "params": f"eps={eps},min_samples={ms}",
                "n_clusters": n_clusters,
                "noise_ratio": noise_ratio,
                "n_used_for_metrics": n_used,
                "silhouette": sil,
                "davies_bouldin": db,
                "ARI_language": ari
            })

metrics = pd.DataFrame(results)

# Save full table
out_csv = DATA_ROOT / "medium_metrics_fixed_full.csv"
metrics.to_csv(out_csv, index=False)
print("Saved full metrics to:", out_csv)

# Show the best configurations (by ARI_language first, then silhouette)
print("\nTop 15 rows by ARI_language then silhouette:")
display(
    metrics.sort_values(["ARI_language", "silhouette"], ascending=[False, False]).head(15)
)

# Also show best-per-algorithm for quick reporting
best_kmeans = metrics[metrics["algorithm"]=="kmeans"].sort_values("ARI_language", ascending=False).head(5)
best_agglo  = metrics[metrics["algorithm"]=="agglo"].sort_values("ARI_language", ascending=False).head(5)
best_dbscan = metrics[metrics["algorithm"]=="dbscan"].sort_values("ARI_language", ascending=False).head(5)

print("\nBest KMeans:")
display(best_kmeans)

print("\nBest Agglo:")
display(best_agglo)

print("\nBest DBSCAN:")
display(best_dbscan)

# OPTIONAL: Save a smaller "report-ready" CSV with the best row for each (representation, algorithm)
best_rows = (metrics.sort_values(["ARI_language", "silhouette"], ascending=[False, False])
                    .groupby(["representation","algorithm"], as_index=False)
                    .head(1))
out_csv2 = DATA_ROOT / "medium_metrics_fixed_best.csv"
best_rows.to_csv(out_csv2, index=False)
print("Saved best-per-(rep,algo) to:", out_csv2)

N: 2924 | languages: 4
Saved full metrics to: C:\Users\Atif\Documents\medium\jamendolyrics_full\medium_metrics_fixed_full.csv

Top 15 rows by ARI_language then silhouette:


Unnamed: 0,representation,alpha_lyrics,algorithm,params,n_clusters,noise_ratio,n_used_for_metrics,silhouette,davies_bouldin,ARI_language
80,hybrid_alpha2.0,2.0,kmeans,k=4,4,0.0,2924,0.03232,3.468271,0.028714
100,hybrid_alpha4.0,4.0,kmeans,k=4,4,0.0,2924,-0.052201,2.627395,0.008992
4,audio_only,0.0,dbscan,"eps=0.6,min_samples=20",4,0.264022,2152,0.355419,0.873377,0.003882
37,hybrid_alpha0.25,0.25,dbscan,"eps=2.0,min_samples=5",45,0.219904,2281,-0.148752,0.969375,0.003484
31,hybrid_alpha0.25,0.25,dbscan,"eps=1.2,min_samples=5",42,0.678181,941,0.078941,0.846278,0.003282
36,hybrid_alpha0.25,0.25,dbscan,"eps=1.5,min_samples=20",3,0.650821,1021,0.250339,1.05751,0.003172
38,hybrid_alpha0.25,0.25,dbscan,"eps=2.0,min_samples=10",12,0.320793,1986,-0.112616,1.056036,0.002913
40,hybrid_alpha0.5,0.5,kmeans,k=4,4,0.0,2924,0.167494,2.170556,0.002856
60,hybrid_alpha1.0,1.0,kmeans,k=4,4,0.0,2924,0.061606,2.772309,0.002442
32,hybrid_alpha0.25,0.25,dbscan,"eps=1.2,min_samples=10",11,0.780096,643,0.156688,1.07323,0.001987



Best KMeans:


Unnamed: 0,representation,alpha_lyrics,algorithm,params,n_clusters,noise_ratio,n_used_for_metrics,silhouette,davies_bouldin,ARI_language
80,hybrid_alpha2.0,2.0,kmeans,k=4,4,0.0,2924,0.03232,3.468271,0.028714
100,hybrid_alpha4.0,4.0,kmeans,k=4,4,0.0,2924,-0.052201,2.627395,0.008992
40,hybrid_alpha0.5,0.5,kmeans,k=4,4,0.0,2924,0.167494,2.170556,0.002856
60,hybrid_alpha1.0,1.0,kmeans,k=4,4,0.0,2924,0.061606,2.772309,0.002442
20,hybrid_alpha0.25,0.25,kmeans,k=4,4,0.0,2924,0.390761,1.031907,-6.2e-05



Best Agglo:


Unnamed: 0,representation,alpha_lyrics,algorithm,params,n_clusters,noise_ratio,n_used_for_metrics,silhouette,davies_bouldin,ARI_language
1,audio_only,0.0,agglo,k=4,4,0.0,2924,0.439324,0.893452,0.000211
21,hybrid_alpha0.25,0.25,agglo,k=4,4,0.0,2924,0.371191,0.992547,0.000187
81,hybrid_alpha2.0,2.0,agglo,k=4,4,0.0,2924,0.204561,1.307274,9.5e-05
61,hybrid_alpha1.0,1.0,agglo,k=4,4,0.0,2924,0.119826,1.925157,5e-05
41,hybrid_alpha0.5,0.5,agglo,k=4,4,0.0,2924,0.266422,1.298777,-1.1e-05



Best DBSCAN:


Unnamed: 0,representation,alpha_lyrics,algorithm,params,n_clusters,noise_ratio,n_used_for_metrics,silhouette,davies_bouldin,ARI_language
4,audio_only,0.0,dbscan,"eps=0.6,min_samples=20",4,0.264022,2152,0.355419,0.873377,0.003882
37,hybrid_alpha0.25,0.25,dbscan,"eps=2.0,min_samples=5",45,0.219904,2281,-0.148752,0.969375,0.003484
31,hybrid_alpha0.25,0.25,dbscan,"eps=1.2,min_samples=5",42,0.678181,941,0.078941,0.846278,0.003282
36,hybrid_alpha0.25,0.25,dbscan,"eps=1.5,min_samples=20",3,0.650821,1021,0.250339,1.05751,0.003172
38,hybrid_alpha0.25,0.25,dbscan,"eps=2.0,min_samples=10",12,0.320793,1986,-0.112616,1.056036,0.002913


Saved best-per-(rep,algo) to: C:\Users\Atif\Documents\medium\jamendolyrics_full\medium_metrics_fixed_best.csv
