In [1]:
import os
import json
import ast
import re
import time
import requests
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

TRACKS_PATH = "tracks.csv"
ARTISTS_PATH = "artists.csv"

AUDIO_FEATURE_COLS = [
    "danceability", "energy", "loudness", "speechiness", "acousticness",
    "instrumentalness", "liveness", "valence", "tempo"
]

MIN_N_FOR_WEIGHTS = 200
LYRIC_LEN_THRESHOLD = 50

# Supervised weights will be used when AUC is at least this threshold
SUPERVISED_MIN_AUC = 0.55

# Default mixing weights for hybrid ranking
DEFAULT_ALPHA_AUDIO = 0.65
DEFAULT_ALPHA_LYRICS = 0.35

In [3]:
def parse_list(x):
    """Parse list-like strings safely. Handles NaN, list, and stringified lists."""
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []
    s = str(x).strip()
    try:
        val = ast.literal_eval(s)
        if isinstance(val, list):
            return val
    except Exception:
        pass
    # fallback: treat as single token
    return [s]

def clean_genre_list(genres):
    """Normalize genres: lowercase, strip, unique, remove empties."""
    out = []
    if not isinstance(genres, list):
        return out
    for g in genres:
        if pd.isna(g):
            continue
        s = str(g).strip().lower()
        if s:
            out.append(s)
    # stable unique
    seen = set()
    uniq = []
    for g in out:
        if g not in seen:
            uniq.append(g)
            seen.add(g)
    return uniq

def flatten_unique(list_of_lists):
    out = []
    for item in list_of_lists:
        if isinstance(item, list):
            out.extend(item)
    return clean_genre_list(out)

def safe_year(release_date):
    if pd.isna(release_date):
        return np.nan
    s = str(release_date).strip()
    m = re.match(r"^(\d{4})", s)
    return int(m.group(1)) if m else np.nan

def ensure_list(x):
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []
    return [str(x)]

### Load data and build a “song master table”

In [4]:
tracks = pd.read_csv(TRACKS_PATH)
artists = pd.read_csv(ARTISTS_PATH)

tracks = tracks.rename(columns={"id": "track_id", "name": "track_name"})
artists = artists.rename(columns={"id": "artist_id", "name": "artist_name"})

tracks["artist_ids"] = tracks["id_artists"].apply(parse_list)
artists["genres"] = artists["genres"].apply(parse_list).apply(clean_genre_list)

tracks["release_year"] = tracks["release_date"].apply(safe_year)

# Expand tracks by artist_id and bring artist name + genres
t_long = tracks.explode("artist_ids").rename(columns={"artist_ids": "artist_id"})

ta = t_long.merge(
    artists[["artist_id", "artist_name", "genres"]],
    on="artist_id",
    how="left"
)

songs_master = (
    ta.groupby("track_id", as_index=False)
      .agg(
          track_name=("track_name", "first"),
          popularity=("popularity", "first"),
          release_date=("release_date", "first"),
          release_year=("release_year", "first"),
          duration_ms=("duration_ms", "first"),
          explicit=("explicit", "first"),

          danceability=("danceability", "first"),
          energy=("energy", "first"),
          key=("key", "first"),
          loudness=("loudness", "first"),
          mode=("mode", "first"),
          speechiness=("speechiness", "first"),
          acousticness=("acousticness", "first"),
          instrumentalness=("instrumentalness", "first"),
          liveness=("liveness", "first"),
          valence=("valence", "first"),
          tempo=("tempo", "first"),
          time_signature=("time_signature", "first"),

          artist_ids=("artist_id", lambda s: list(pd.unique(s.dropna()))),
          artist_names=("artist_name", lambda s: list(pd.unique(s.dropna()))),
          artist_genres=("genres", lambda s: flatten_unique(list(s))),
      )
)

songs_master["artist_genres"] = songs_master["artist_genres"].apply(clean_genre_list)
songs_master["artist_names"] = songs_master["artist_names"].apply(ensure_list)

songs_clean = songs_master[songs_master["artist_genres"].map(len) > 0].copy()

print("songs_master shape:", songs_master.shape)
print("songs_clean shape:", songs_clean.shape)

songs_master shape: (586672, 22)
songs_clean shape: (531191, 22)


### Select diverse top genres

In [5]:
genre_counts = (
    songs_clean
      .explode("artist_genres")
      .groupby("artist_genres")["track_id"]
      .nunique()
      .sort_values(ascending=False)
)

GENRE_FAMILIES = [
    ("rock", re.compile(r"\b(rock|metal|punk|grunge|indie rock|alt rock)\b", re.I)),
    ("pop", re.compile(r"\b(pop|dance pop|europop|synthpop|electropop|adult standards|mellow gold)\b", re.I)),
    ("latin", re.compile(r"\b(latin|reggaeton|salsa|bachata|cumbia|bossa nova|ranchera)\b", re.I)),
    ("classical", re.compile(r"\b(classical|opera|orchestra|baroque|romantic)\b", re.I)),
    ("jazz", re.compile(r"\b(jazz|swing|big band|bebop|cool jazz|vocal jazz)\b", re.I)),
    ("hiphop", re.compile(r"\b(hip hop|rap|trap)\b", re.I)),
    ("country", re.compile(r"\b(country|americana|bluegrass)\b", re.I)),
    ("rnb_soul", re.compile(r"\b(r&b|soul|motown|funk)\b", re.I)),
    ("electronic", re.compile(r"\b(edm|house|techno|trance|dubstep|electro|disco)\b", re.I)),
    ("folk", re.compile(r"\b(folk|singer-songwriter)\b", re.I)),
    ("world", re.compile(r"\b(mandopop|cantopop|c-pop|j-pop|k-pop|bollywood|filmi|arabesk)\b", re.I)),
]

def assign_family(g: str) -> str:
    for fam, pat in GENRE_FAMILIES:
        if pat.search(g):
            return fam
    return "other"

genre_df = genre_counts.reset_index()
genre_df.columns = ["genre", "song_count"]
genre_df["family"] = genre_df["genre"].apply(assign_family)

genre_df_valid = genre_df[genre_df["family"] != "other"].copy()
genre_df_valid = genre_df_valid.sort_values("song_count", ascending=False)
top_per_family = genre_df_valid.drop_duplicates(subset=["family"])

selected_genres = top_per_family.head(10)["genre"].tolist()
print("Selected genres:", selected_genres)

Selected genres: ['rock', 'adult standards', 'filmi', 'classical', 'latin', 'vocal jazz', 'soul', 'folk', 'rap', 'edm']


In [6]:
BASE_DF = songs_clean.dropna(subset=AUDIO_FEATURE_COLS).copy()

scaler_audio = StandardScaler()
BASE_DF_scaled = BASE_DF.copy()
BASE_DF_scaled[AUDIO_FEATURE_COLS] = scaler_audio.fit_transform(BASE_DF[AUDIO_FEATURE_COLS].values)

# Lyrics column (optional; filled later)
if "lyrics" not in BASE_DF_scaled.columns:
    BASE_DF_scaled["lyrics"] = None

id_to_rowidx = pd.Series(BASE_DF_scaled.index.values, index=BASE_DF_scaled["track_id"]).to_dict()

genre_support = (
    BASE_DF_scaled.explode("artist_genres")["artist_genres"]
    .value_counts()
    .to_dict()
)

print("BASE_DF_scaled shape:", BASE_DF_scaled.shape)

BASE_DF_scaled shape: (531191, 23)


In [7]:
def compute_genre_feature_weights_variance(df_scaled, genres, feature_cols, min_n=200, eps=1e-12):
    weights = {}
    for g in genres:
        mask = df_scaled["artist_genres"].apply(lambda lst: isinstance(lst, list) and (g in lst))
        g_df = df_scaled.loc[mask, feature_cols]
        if len(g_df) < min_n:
            continue
        w = g_df.var().values.astype(float)
        w = np.maximum(w, eps)
        w = w / w.sum()
        weights[g] = pd.Series(w, index=feature_cols)
    return weights

genre_weights_variance = compute_genre_feature_weights_variance(
    BASE_DF_scaled, selected_genres, AUDIO_FEATURE_COLS, min_n=MIN_N_FOR_WEIGHTS
)
print("Genres with variance-weights:", sorted(list(genre_weights_variance.keys())))

Genres with variance-weights: ['adult standards', 'classical', 'edm', 'filmi', 'folk', 'latin', 'rap', 'rock', 'soul', 'vocal jazz']


### SUPERVISED “TOP PREDICTORS” + GENRE WEIGHTS
Proxy label: same-artist pair (within genre) vs different-artist pair

In [8]:
def build_pairwise_dataset_for_genre(
    df_scaled,
    genre: str,
    feature_cols: List[str],
    n_pos_per_query: int = 2,
    n_neg_per_query: int = 6,
    seed: int = 42,
    min_artist_tracks: int = 2,
    max_queries: int = 500
) -> Tuple[np.ndarray, np.ndarray]:
    rng = np.random.default_rng(seed)

    pool = df_scaled[df_scaled["artist_genres"].apply(lambda lst: isinstance(lst, list) and (genre in lst))].copy()
    if len(pool) < 200:
        raise ValueError(f"Genre '{genre}' pool too small: {len(pool)}")

    # artist -> track_ids in this genre
    artist_to_tracks: Dict[str, List[str]] = {}
    for _, r in pool[["track_id", "artist_ids"]].iterrows():
        aids = r["artist_ids"] if isinstance(r["artist_ids"], list) else []
        for aid in aids:
            artist_to_tracks.setdefault(aid, []).append(r["track_id"])

    valid_artists = [aid for aid, tids in artist_to_tracks.items() if len(tids) >= min_artist_tracks]
    if len(valid_artists) == 0:
        raise ValueError(f"No artists with >= {min_artist_tracks} tracks in genre '{genre}'")

    pool_index = pool.set_index("track_id")
    all_track_ids = pool["track_id"].tolist()

    # Build query candidates from valid artists
    candidate_queries = []
    for aid in valid_artists:
        candidate_queries.extend(artist_to_tracks[aid])

    unique_queries = list(pd.unique(candidate_queries))
    if len(unique_queries) == 0:
        raise ValueError("No query candidates found after filtering.")

    query_ids = rng.choice(unique_queries, size=min(max_queries, len(unique_queries)), replace=False)

    X, y = [], []

    for qid in query_ids:
        q_row = pool_index.loc[qid]
        q_vec = q_row[feature_cols].values.astype(float)

        q_artists = q_row["artist_ids"]
        q_artists = q_artists if isinstance(q_artists, list) else []

        # positives: same artist, different track
        pos_pool = []
        for aid in q_artists:
            pos_pool.extend([tid for tid in artist_to_tracks.get(aid, []) if tid != qid])
        pos_pool = list(dict.fromkeys(pos_pool))

        if len(pos_pool) > 0:
            pos_sample = rng.choice(pos_pool, size=min(n_pos_per_query, len(pos_pool)), replace=False)
            for tid in pos_sample:
                v = pool_index.loc[tid, feature_cols].values.astype(float)
                X.append(np.abs(q_vec - v))
                y.append(1)

        # negatives: different artist(s)
        negs = []
        attempts = 0
        while len(negs) < n_neg_per_query and attempts < 50 * n_neg_per_query:
            tid = rng.choice(all_track_ids)
            if tid == qid:
                attempts += 1
                continue
            cand_artists = pool_index.loc[tid, "artist_ids"]
            cand_artists = cand_artists if isinstance(cand_artists, list) else []
            if set(cand_artists).intersection(set(q_artists)):
                attempts += 1
                continue
            negs.append(tid)
            attempts += 1

        for tid in negs:
            v = pool_index.loc[tid, feature_cols].values.astype(float)
            X.append(np.abs(q_vec - v))
            y.append(0)

    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=int)
    if len(X) < 200:
        raise ValueError(f"Not enough pairwise samples built for '{genre}': {len(X)}")
    return X, y

In [9]:
def get_similar_songs(
    song_id: str,
    genre: str,
    top_k: int = 10,
    include_query: bool = False,
    exclude_same_artist: bool = True,
    same_era: bool = False,
    era_year_window: int = 7,
    min_similarity: float | None = None,
    min_popularity: int | None = None,
    require_release_date: bool = False,
    return_columns=None,
):
    if return_columns is None:
        return_columns = ["track_id", "track_name", "artist_names", "popularity", "release_date", "release_year"]

    if genre not in genre_weights:
        raise ValueError(f"No weights for genre='{genre}'. (Maybe too few songs in that genre.)")

    if song_id not in id_to_rowidx:
        raise KeyError("song_id not found in dataset")

    qrow = BASE_DF_scaled.loc[id_to_rowidx[song_id]]

    # Candidate pool: only songs in the same genre tag
    candidates = BASE_DF_scaled[BASE_DF_scaled["artist_genres"].apply(lambda lst: genre in lst)].copy()
    if len(candidates) == 0:
        return pd.DataFrame(columns=return_columns + ["similarity_score"])

    # Optional: require a real release_date (some datasets have missing)
    if require_release_date:
        candidates = candidates[candidates["release_date"].notna()]

    # Optional: same era filter (only works if query has a release year)
    if same_era and not pd.isna(qrow["release_year"]):
        y = int(qrow["release_year"])
        candidates = candidates[candidates["release_year"].between(y - era_year_window, y + era_year_window, inclusive="both")]

    # Optional: minimum popularity
    if min_popularity is not None:
        candidates = candidates[candidates["popularity"].fillna(-1) >= min_popularity]

    # Exclude same artist to avoid obvious duplicates in recommendations
    if exclude_same_artist:
        q_artists = set(qrow["artist_ids"])
        candidates = candidates[~candidates["artist_ids"].apply(lambda ids: bool(set(ids).intersection(q_artists)))]

    # Compute similarity
    w = genre_weights[genre].reindex(AUDIO_FEATURE_COLS).values.astype(float)
    q_vec = qrow[AUDIO_FEATURE_COLS].values.astype(float)
    mat = candidates[AUDIO_FEATURE_COLS].values.astype(float)

    candidates["similarity_score"] = weighted_cosine_similarity(q_vec, mat, w)

    # Optional: minimum similarity threshold
    if min_similarity is not None:
        candidates = candidates[candidates["similarity_score"] >= min_similarity]

    # Remove query row unless asked to include it
    if not include_query:
        candidates = candidates[candidates["track_id"] != song_id]

    # Rank
    candidates = candidates.sort_values("similarity_score", ascending=False).head(top_k)

    cols = [c for c in return_columns if c in candidates.columns] + ["similarity_score"]
    return candidates[cols].reset_index(drop=True)

In [9]:
def learn_predictors_for_genre(
    df_scaled,
    genre: str,
    feature_cols: List[str],
    seed: int = 42
) -> Tuple[pd.Series, pd.DataFrame, float]:
    X, y = build_pairwise_dataset_for_genre(df_scaled, genre, feature_cols, seed=seed)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed, stratify=y
    )

    clf = LogisticRegression(
        penalty="l1",
        solver="saga",
        max_iter=5000,
        class_weight="balanced",
        random_state=seed
    )
    clf.fit(X_train, y_train)

    proba = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, proba)

    coefs = clf.coef_.ravel()
    importance = np.abs(coefs)

    w = importance / (importance.sum() + 1e-12)
    weights = pd.Series(w, index=feature_cols).sort_values(ascending=False)

    predictors = pd.DataFrame({
        "genre": genre,
        "feature": feature_cols,
        "abs_coef": np.abs(coefs),
        "coef": coefs,
        "weight": w
    }).sort_values("abs_coef", ascending=False)

    return weights, predictors, float(auc)

In [10]:
def learn_predictors_all_genres(
    df_scaled,
    genres: List[str],
    feature_cols: List[str],
    min_auc: float = 0.55,
    seed: int = 42
) -> Tuple[Dict[str, pd.Series], pd.DataFrame, pd.DataFrame]:
    genre_weights_supervised: Dict[str, pd.Series] = {}
    predictors_tables = []
    auc_rows = []

    for g in genres:
        try:
            w, pred, auc = learn_predictors_for_genre(df_scaled, g, feature_cols, seed=seed)
            auc_rows.append({"genre": g, "pairwise_auc": float(auc)})
            predictors_tables.append(pred)
            if auc >= min_auc:
                genre_weights_supervised[g] = w
        except Exception as e:
            auc_rows.append({"genre": g, "pairwise_auc": np.nan, "error": str(e)})

    auc_df = pd.DataFrame(auc_rows).sort_values("pairwise_auc", ascending=False)
    predictors_df = pd.concat(predictors_tables, ignore_index=True) if predictors_tables else pd.DataFrame()
    return genre_weights_supervised, predictors_df, auc_df

genre_weights_supervised, predictors_df, genre_auc_df = learn_predictors_all_genres(
    BASE_DF_scaled, selected_genres, AUDIO_FEATURE_COLS, min_auc=SUPERVISED_MIN_AUC, seed=RANDOM_SEED
)

print("Supervised weights learned for genres:", sorted(list(genre_weights_supervised.keys())))
print(genre_auc_df.head(10))

  unique_queries = list(pd.unique(candidate_queries))
  unique_queries = list(pd.unique(candidate_queries))
  unique_queries = list(pd.unique(candidate_queries))
  unique_queries = list(pd.unique(candidate_queries))
  unique_queries = list(pd.unique(candidate_queries))
  unique_queries = list(pd.unique(candidate_queries))
  unique_queries = list(pd.unique(candidate_queries))
  unique_queries = list(pd.unique(candidate_queries))
  unique_queries = list(pd.unique(candidate_queries))
  unique_queries = list(pd.unique(candidate_queries))


Supervised weights learned for genres: ['adult standards', 'classical', 'edm', 'filmi', 'folk', 'latin', 'rap', 'rock', 'soul', 'vocal jazz']
             genre  pairwise_auc
4            latin      0.694709
9              edm      0.691341
1  adult standards      0.675099
2            filmi      0.669280
6             soul      0.668528
0             rock      0.660443
7             folk      0.635189
5       vocal jazz      0.634261
8              rap      0.599636
3        classical      0.587365


In [11]:
genre_weights: Dict[str, pd.Series] = {}
for g in selected_genres:
    if g in genre_weights_supervised:
        genre_weights[g] = genre_weights_supervised[g].reindex(AUDIO_FEATURE_COLS).fillna(0.0)
    elif g in genre_weights_variance:
        genre_weights[g] = genre_weights_variance[g].reindex(AUDIO_FEATURE_COLS).fillna(0.0)

print("Final genre_weights available:", sorted(list(genre_weights.keys())))


def top_predictors_by_genre(genres: List[str], top_n: int = 5) -> Dict[str, pd.Series]:
    """
    Returns top predictors (song metrics) per genre.
    If supervised predictors exist, uses supervised weights (most defensible).
    Else falls back to variance weights.
    """
    out = {}
    for g in genres:
        if g in genre_weights_supervised:
            out[g] = genre_weights_supervised[g].sort_values(ascending=False).head(top_n)
        elif g in genre_weights_variance:
            out[g] = genre_weights_variance[g].sort_values(ascending=False).head(top_n)
    return out

Final genre_weights available: ['adult standards', 'classical', 'edm', 'filmi', 'folk', 'latin', 'rap', 'rock', 'soul', 'vocal jazz']


In [12]:
def weighted_cosine_similarity(query_vec: np.ndarray, mat: np.ndarray, w: np.ndarray) -> np.ndarray:
    q = query_vec * w
    M = mat * w
    q_norm = np.linalg.norm(q)
    M_norm = np.linalg.norm(M, axis=1)
    denom = q_norm * M_norm
    denom = np.where(denom == 0, 1e-12, denom)
    return (M @ q) / denom

def weighted_l1_similarity(query_vec: np.ndarray, mat: np.ndarray, w: np.ndarray) -> np.ndarray:
    """
    Similarity based on weighted L1 distance.
    Returns higher = more similar.
    """
    d = np.sum(np.abs(mat - query_vec) * w, axis=1)
    return np.exp(-d)  # (0,1]

In [13]:
def get_similar_songs(
    song_id: str,
    genre: str,
    top_k: int = 10,
    include_query: bool = False,
    exclude_same_artist: bool = True,
    same_era: bool = False,
    era_year_window: int = 7,
    min_similarity: float | None = None,
    min_popularity: int | None = None,
    require_release_date: bool = False,
    weights_dict: Optional[Dict[str, pd.Series]] = None,
    similarity_mode: str = "cosine",  # "cosine" or "l1"
    return_columns=None,
):
    """
    Main recommender:
      - Filters candidate pool by genre.
      - Computes similarity score and returns sorted list.

    weights_dict:
      - If None, uses global genre_weights.
      - You can pass genre_weights_supervised or genre_weights_variance explicitly.
    """
    if return_columns is None:
        return_columns = ["track_id", "track_name", "artist_names", "popularity", "release_date", "release_year"]

    if song_id not in id_to_rowidx:
        raise KeyError(f"song_id not found in BASE_DF_scaled: {song_id}")

    if weights_dict is None:
        weights_dict = genre_weights

    if genre not in weights_dict:
        raise ValueError(f"No weights available for genre='{genre}'")

    q_row = BASE_DF_scaled.loc[id_to_rowidx[song_id]]
    q_vec = q_row[AUDIO_FEATURE_COLS].values.astype(float)

    pool_mask = BASE_DF_scaled["artist_genres"].apply(lambda lst: isinstance(lst, list) and (genre in lst))
    candidates = BASE_DF_scaled.loc[pool_mask].copy()
    if len(candidates) == 0:
        raise ValueError(f"No candidate songs found for genre='{genre}'")

    cand_mat = candidates[AUDIO_FEATURE_COLS].values.astype(float)
    w = weights_dict[genre].reindex(AUDIO_FEATURE_COLS).fillna(0.0).values.astype(float)

    if similarity_mode == "cosine":
        sims = weighted_cosine_similarity(q_vec, cand_mat, w)
    elif similarity_mode == "l1":
        sims = weighted_l1_similarity(q_vec, cand_mat, w)
    else:
        raise ValueError("similarity_mode must be 'cosine' or 'l1'")

    candidates["similarity_score"] = sims

    if exclude_same_artist:
        q_artists = set(q_row["artist_ids"]) if isinstance(q_row["artist_ids"], list) else set()
        candidates = candidates[
            ~candidates["artist_ids"].apply(
                lambda lst: bool(q_artists.intersection(set(lst))) if isinstance(lst, list) else False
            )
        ]

    if not include_query:
        candidates = candidates[candidates["track_id"] != song_id]

    if require_release_date:
        candidates = candidates[candidates["release_year"].notna()]

    if same_era and pd.notna(q_row["release_year"]):
        y = int(q_row["release_year"])
        candidates = candidates[
            candidates["release_year"].between(y - era_year_window, y + era_year_window, inclusive="both")
        ]

    if min_popularity is not None:
        candidates = candidates[candidates["popularity"].fillna(-1) >= min_popularity]

    if min_similarity is not None:
        candidates = candidates[candidates["similarity_score"] >= min_similarity]

    candidates = candidates.sort_values("similarity_score", ascending=False).head(top_k)

    cols = [c for c in return_columns if c in candidates.columns] + ["similarity_score"]
    return candidates[cols].reset_index(drop=True)

In [14]:
def pick_best_selected_genre_for_song(song_id: str, selected_genres_list: List[str]) -> Optional[str]:
    if song_id not in id_to_rowidx:
        return None
    row = BASE_DF_scaled.loc[id_to_rowidx[song_id]]
    tags = set(row["artist_genres"]) if isinstance(row["artist_genres"], list) else set()
    candidates = [g for g in selected_genres_list if g in tags and g in genre_weights]
    if not candidates:
        return None
    return max(candidates, key=lambda g: genre_support.get(g, 0))

def recommend_similar(song_id: str, top_k: int = 10, **kwargs):
    g = pick_best_selected_genre_for_song(song_id, selected_genres)
    if g is None:
        raise ValueError("Song does not belong to any selected genre (or weights missing).")
    return g, get_similar_songs(song_id=song_id, genre=g, top_k=top_k, **kwargs)

In [15]:
def normalize_title(title: str) -> str:
    t = str(title)
    # remove parenthetical phrases
    t = re.sub(r"\s*\(.*?\)\s*", " ", t)
    # remove " - ..." suffix (Remastered, Live, etc.)
    t = re.sub(r"\s*-\s*.*$", " ", t)
    # remove feat segments
    t = re.sub(r"\b(feat|ft)\b.*$", " ", t, flags=re.I)
    t = re.sub(r"\s+", " ", t).strip()
    return t

def normalize_artist(artist: str) -> str:
    a = str(artist)
    a = re.sub(r"\s+", " ", a).strip()
    return a

def resolve_track_id(track_name: str, artist_name: str | None = None, df: Optional[pd.DataFrame] = None, max_candidates: int = 25) -> str:
    """
    Resolver: finds best track_id by normalized title match and optional artist match.
    """
    if df is None:
        df = BASE_DF_scaled

    t = normalize_title(track_name).lower()
    a = normalize_artist(artist_name).lower() if artist_name else None

    tmp = df.copy()
    tmp["_t"] = tmp["track_name"].apply(lambda x: normalize_title(x).lower())
    tmp["_a"] = tmp["artist_names"].apply(
        lambda lst: " ".join([normalize_artist(x).lower() for x in (lst if isinstance(lst, list) else [])])
    )

    cand = tmp[tmp["_t"].str.contains(re.escape(t), na=False)]
    if len(cand) == 0:
        # fallback: substring other way
        cand = tmp[tmp["_t"].apply(lambda x: t in x if isinstance(x, str) else False)]

    if len(cand) == 0:
        raise ValueError("No title match found. Try a shorter title substring.")

    if a:
        cand2 = cand[cand["_a"].str.contains(re.escape(a), na=False)]
        if len(cand2) > 0:
            cand = cand2

    cand = cand.head(max_candidates).copy()

    # score: prefer closer title length match + higher popularity
    cand["_score"] = -cand["_t"].apply(lambda x: abs(len(x) - len(t))) + 0.01 * cand["popularity"].fillna(0)
    best = cand.sort_values("_score", ascending=False).iloc[0]
    return best["track_id"]

In [16]:
# Audio only Demo
def demo_genre_recommendations(
    genre: str,
    n_queries: int = 3,
    top_k: int = 5,
    seed: int = 42,
    weights_dict: Optional[Dict[str, pd.Series]] = None,
    similarity_mode: str = "cosine"
):
    rng = np.random.default_rng(seed)

    pool_mask = BASE_DF_scaled["artist_genres"].apply(lambda lst: isinstance(lst, list) and (genre in lst))
    pool = BASE_DF_scaled.loc[pool_mask].copy()
    if len(pool) < 50:
        raise ValueError(f"Not enough songs for genre='{genre}' (n={len(pool)})")

    # choose queries
    sample = pool.sample(n=min(n_queries, len(pool)), random_state=seed)
    results = {}

    qsum = sample[["track_id", "track_name", "artist_names", "popularity", "release_date"]].reset_index(drop=True)

    for qid in sample["track_id"].tolist():
        recs = get_similar_songs(
            song_id=qid,
            genre=genre,
            top_k=top_k,
            exclude_same_artist=True,
            weights_dict=weights_dict,
            similarity_mode=similarity_mode
        )
        results[qid] = recs

    return qsum, results

In [17]:
# EFFECTIVENESS EVALUATION : RECALL@K / MRR VS RANDOM
def eval_recall_mrr_same_artist(
    df_scaled: pd.DataFrame,
    genres: List[str],
    top_k: int = 10,
    n_queries_per_genre: int = 50,
    seed: int = 42,
    weights_dict: Optional[Dict[str, pd.Series]] = None,
    similarity_mode: str = "cosine"
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    rng = np.random.default_rng(seed)
    rows = []

    if weights_dict is None:
        weights_dict = genre_weights

    for g in genres:
        pool = df_scaled[df_scaled["artist_genres"].apply(lambda lst: isinstance(lst, list) and (g in lst))].copy()
        if len(pool) < 200:
            continue

        # queries must have at least one other track by same artist in this genre
        pool["artist_key"] = pool["artist_ids"].apply(lambda x: tuple(sorted(x)) if isinstance(x, list) else tuple())
        counts = pool["artist_key"].value_counts()
        valid_artist_keys = set(counts[counts >= 2].index)
        q_pool = pool[pool["artist_key"].isin(valid_artist_keys)].copy()
        if len(q_pool) < 50:
            continue

        q_ids = q_pool.sample(n=min(n_queries_per_genre, len(q_pool)), random_state=seed)["track_id"].tolist()

        for qid in q_ids:
            q_row = df_scaled.loc[id_to_rowidx[qid]]
            q_artists = set(q_row["artist_ids"]) if isinstance(q_row["artist_ids"], list) else set()

            # allow same artist in results for evaluation
            recs = get_similar_songs(
                song_id=qid,
                genre=g,
                top_k=top_k,
                exclude_same_artist=False,
                weights_dict=weights_dict,
                similarity_mode=similarity_mode
            )

            hits = []
            for tid in recs["track_id"].tolist():
                cand_artists = df_scaled.loc[id_to_rowidx[tid], "artist_ids"]
                cand_artists = set(cand_artists) if isinstance(cand_artists, list) else set()
                hits.append(bool(q_artists.intersection(cand_artists)))

            recall_at_k = 1.0 if any(hits) else 0.0
            mrr = 1.0 / (hits.index(True) + 1) if any(hits) else 0.0

            # random baseline
            pool_ids = [t for t in pool["track_id"].tolist() if t != qid]
            if len(pool_ids) < top_k:
                continue
            rand_ids = rng.choice(pool_ids, size=top_k, replace=False)
            rand_hits = []
            for tid in rand_ids:
                cand_artists = df_scaled.loc[id_to_rowidx[tid], "artist_ids"]
                cand_artists = set(cand_artists) if isinstance(cand_artists, list) else set()
                rand_hits.append(bool(q_artists.intersection(cand_artists)))
            rand_recall = 1.0 if any(rand_hits) else 0.0

            rows.append({
                "genre": g,
                "query_id": qid,
                "recall@k": recall_at_k,
                "mrr": mrr,
                "random_recall@k": rand_recall
            })

    out = pd.DataFrame(rows)
    if len(out) == 0:
        return out, pd.DataFrame()

    summary = (
        out.groupby("genre")
           .agg(
               queries=("query_id", "count"),
               recall_at_k=("recall@k", "mean"),
               mrr=("mrr", "mean"),
               random_recall_at_k=("random_recall@k", "mean"),
           )
           .assign(lift=lambda d: d["recall_at_k"] / (d["random_recall_at_k"] + 1e-12))
           .sort_values("lift", ascending=False)
    )
    return out, summary

In [18]:
LRCLIB_CACHE_PATH = "lyrics_cache_lrclib.json"

def load_cache(path):
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    return {}

def save_cache(cache, path):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(cache, f, ensure_ascii=False)

lrclib_cache = load_cache(LRCLIB_CACHE_PATH)
print("LRCLIB cache entries:", len(lrclib_cache))

def _norm_token(s: str) -> str:
    s = str(s).lower().strip()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^\w\s]", "", s)
    return s

def pick_best_lrclib_result(results, artist, title):
    a = _norm_token(artist)
    t = _norm_token(title)

    best = None
    best_score = -1.0
    for r in results:
        ra = _norm_token(r.get("artistName", ""))
        rt = _norm_token(r.get("trackName", ""))

        score = 0.0
        score += 3.0 if rt == t else 0.0
        score += 2.0 if ra == a else 0.0

        # token overlaps (weak)
        t_tokens = set(t.split())
        a_tokens = set(a.split())
        score += 0.10 * len(t_tokens.intersection(set(rt.split())))
        score += 0.10 * len(a_tokens.intersection(set(ra.split())))

        if score > best_score:
            best_score = score
            best = r

    return best

def fetch_lyrics_lrclib(artist_name, track_name, timeout=10):
    artist = normalize_artist(artist_name)
    title = normalize_title(track_name)
    key = f"{artist.lower()}|||{title.lower()}"

    if key in lrclib_cache:
        return lrclib_cache[key]

    try:
        url = "https://lrclib.net/api/search"
        params = {"track_name": title, "artist_name": artist}
        r = requests.get(url, params=params, timeout=timeout)
        if r.status_code != 200:
            lrclib_cache[key] = None
            return None

        data = r.json()
        if not isinstance(data, list) or len(data) == 0:
            lrclib_cache[key] = None
            return None

        best = pick_best_lrclib_result(data, artist, title)
        if not best:
            lrclib_cache[key] = None
            return None

        lyr = best.get("plainLyrics") or best.get("syncedLyrics")
        if not lyr:
            lrclib_cache[key] = None
            return None

        lyr = re.sub(r"\s+", " ", str(lyr)).strip()
        lrclib_cache[key] = lyr if len(lyr) >= LYRIC_LEN_THRESHOLD else None
        return lrclib_cache[key]

    except Exception:
        lrclib_cache[key] = None
        return None

def attach_lyrics_for_genre_lrclib_inplace(
    df_scaled: pd.DataFrame,
    genre: str,
    max_songs: int = 500,
    sleep_s: float = 0.03
) -> pd.DataFrame:
    """
    Adds lyrics into df_scaled['lyrics'] for songs in a genre (in batches).
    Uses cache to avoid refetching.
    """
    if "lyrics" not in df_scaled.columns:
        df_scaled["lyrics"] = None

    pool_mask = df_scaled["artist_genres"].apply(lambda lst: isinstance(lst, list) and (genre in lst))
    pool_idx = df_scaled.index[pool_mask]

    need_idx = df_scaled.loc[pool_idx][df_scaled.loc[pool_idx, "lyrics"].isna()].index
    if len(need_idx) == 0:
        print(f"All songs in genre='{genre}' already have lyrics or cached misses.")
        return df_scaled

    # sample to limit API usage
    need_idx = list(need_idx[:max_songs])

    fetched = 0
    for ix in need_idx:
        row = df_scaled.loc[ix]
        # best-effort: take first artist name
        artist = row["artist_names"][0] if isinstance(row["artist_names"], list) and len(row["artist_names"]) > 0 else ""
        title = row["track_name"]

        lyr = fetch_lyrics_lrclib(artist, title)
        df_scaled.at[ix, "lyrics"] = lyr
        fetched += 1

        if fetched % 50 == 0:
            save_cache(lrclib_cache, LRCLIB_CACHE_PATH)
        time.sleep(sleep_s)

    save_cache(lrclib_cache, LRCLIB_CACHE_PATH)
    return df_scaled

LRCLIB cache entries: 4671


In [19]:
def minmax(x):
    x = np.asarray(x, dtype=float)
    mn, mx = np.min(x), np.max(x)
    return (x - mn) / (mx - mn + 1e-12)

def audio_similarity_in_pool(
    df_scaled_pool: pd.DataFrame,
    genre: str,
    query_id: str,
    weights_dict: Optional[Dict[str, pd.Series]] = None,
    similarity_mode: str = "cosine"
) -> pd.DataFrame:
    """
    Compute audio similarity inside a pre-filtered pool dataframe (already genre-filtered).
    """
    if weights_dict is None:
        weights_dict = genre_weights

    if genre not in weights_dict:
        raise ValueError(f"weights missing genre='{genre}'")

    q = df_scaled_pool[df_scaled_pool["track_id"] == query_id]
    if len(q) == 0:
        raise KeyError(f"query_id not found in provided pool: {query_id}")
    q_row = q.iloc[0]

    q_vec = q_row[AUDIO_FEATURE_COLS].values.astype(float)
    M = df_scaled_pool[AUDIO_FEATURE_COLS].values.astype(float)

    w = weights_dict[genre].reindex(AUDIO_FEATURE_COLS).fillna(0.0).values.astype(float)

    if similarity_mode == "cosine":
        sims = weighted_cosine_similarity(q_vec, M, w)
    elif similarity_mode == "l1":
        sims = weighted_l1_similarity(q_vec, M, w)
    else:
        raise ValueError("similarity_mode must be 'cosine' or 'l1'")

    pool = df_scaled_pool.copy()
    pool["audio_sim"] = sims
    return pool

def lyrics_similarity_in_pool(df_pool_with_audio: pd.DataFrame, query_id: str) -> pd.DataFrame:
    """
    TF-IDF cosine similarity over lyrics within pool.
    Fixes index mismatch by resetting the DataFrame index before building TF-IDF matrix.
    Robust to empty/low-coverage lyrics.
    """
    # IMPORTANT: reset index so TF-IDF row positions match DataFrame row positions
    pool = df_pool_with_audio.copy().reset_index(drop=True)

    # Ensure query exists
    q_positions = pool.index[pool["track_id"] == query_id].tolist()
    if not q_positions:
        raise KeyError("query_id not found in pool")
    q_pos = q_positions[0]

    # Prepare text
    pool["lyrics"] = pool["lyrics"].fillna("").astype(str)
    texts = pool["lyrics"].tolist()

    # If almost everything is empty, return zeros
    nonempty = sum(1 for t in texts if t.strip())
    if nonempty < 2:
        pool["lyrics_sim"] = 0.0
        return pool

    vectorizer = TfidfVectorizer(
        max_features=40000,
        ngram_range=(1, 2),
        min_df=2,
        stop_words="english"
    )

    try:
        X = vectorizer.fit_transform(texts)
    except ValueError:
        # e.g., "empty vocabulary" if all texts are stopwords or too short after filtering
        pool["lyrics_sim"] = 0.0
        return pool

    # Compute cosine similarity via dot product (TF-IDF vectors are L2-normalized by default)
    q_vec = X[q_pos]
    sims = (X @ q_vec.T).toarray().ravel()

    pool["lyrics_sim"] = sims
    return pool

def hybrid_rank(
    df_scored: pd.DataFrame,
    query_id: str,
    alpha_audio: float = DEFAULT_ALPHA_AUDIO,
    alpha_lyrics: float = DEFAULT_ALPHA_LYRICS,
    normalize: bool = True,
    top_k: int = 10
) -> pd.DataFrame:
    """
    Combine audio_sim and lyrics_sim into a final hybrid score.
    """
    pool = df_scored.copy()
    if "audio_sim" not in pool.columns:
        raise ValueError("audio_sim missing")
    if "lyrics_sim" not in pool.columns:
        # allow audio-only
        pool["lyrics_sim"] = 0.0

    a = pool["audio_sim"].values.astype(float)
    l = pool["lyrics_sim"].values.astype(float)

    if normalize:
        a = minmax(a)
        l = minmax(l)

    pool["hybrid_score"] = alpha_audio * a + alpha_lyrics * l
    pool = pool[pool["track_id"] != query_id]
    pool = pool.sort_values("hybrid_score", ascending=False).head(top_k)

    cols = ["track_id", "track_name", "artist_names", "release_date", "audio_sim", "lyrics_sim", "hybrid_score"]
    cols = [c for c in cols if c in pool.columns]
    return pool[cols].reset_index(drop=True)

In [55]:
# AUDIO VS HYBRID EVALUATION (LYRICS SUBSET) — DOES LYRICS IMPROVE?
def eval_audio_vs_hybrid_on_lyrics_subset(
    df_scaled: pd.DataFrame,
    genre: str,
    top_k: int = 10,
    n_queries: int = 30,
    alpha_audio: float = DEFAULT_ALPHA_AUDIO,
    alpha_lyrics: float = DEFAULT_ALPHA_LYRICS,
    lyric_len_threshold: int = LYRIC_LEN_THRESHOLD,
    seed: int = 42,
    weights_dict: Optional[Dict[str, pd.Series]] = None,
    similarity_mode: str = "cosine"
) -> Tuple[pd.DataFrame, Dict]:
    pool = df_scaled[df_scaled["artist_genres"].apply(lambda lst: isinstance(lst, list) and (genre in lst))].copy()
    pool = pool[pool["lyrics"].fillna("").astype(str).str.len() >= lyric_len_threshold].copy()

    if len(pool) < max(80, n_queries):
        raise ValueError(f"Not enough lyric-bearing songs in genre='{genre}': {len(pool)}")

    # queries must have at least one same-artist neighbor in lyrics pool
    pool["artist_key"] = pool["artist_ids"].apply(lambda x: tuple(sorted(x)) if isinstance(x, list) else tuple())
    counts = pool["artist_key"].value_counts()
    valid_keys = set(counts[counts >= 2].index)
    q_pool = pool[pool["artist_key"].isin(valid_keys)].copy()
    q_pool = q_pool[q_pool["lyrics"].fillna("").astype(str).str.len() >= lyric_len_threshold].copy()
    if len(q_pool) < n_queries:
        q_pool = q_pool.sample(n=min(n_queries, len(q_pool)), random_state=seed)

    query_ids = q_pool.sample(n=min(n_queries, len(q_pool)), random_state=seed)["track_id"].tolist()

    rows = []
    for qid in query_ids:
        q_artists = set(df_scaled.loc[id_to_rowidx[qid], "artist_ids"])

        audio_pool = audio_similarity_in_pool(pool, genre, qid, weights_dict=weights_dict, similarity_mode=similarity_mode)
        audio_ranked = (
            audio_pool[audio_pool["track_id"] != qid]
            .sort_values("audio_sim", ascending=False)
            .head(top_k)
        )
        audio_ids = audio_ranked["track_id"].tolist()

        scored = lyrics_similarity_in_pool(audio_pool, qid)
        hybrid_ranked = hybrid_rank(scored, qid, alpha_audio=alpha_audio, alpha_lyrics=alpha_lyrics, normalize=True, top_k=top_k)
        hybrid_ids = hybrid_ranked["track_id"].tolist()

        def recall_mrr(ids):
            hits = []
            for tid in ids:
                cand_artists = set(df_scaled.loc[id_to_rowidx[tid], "artist_ids"])
                hits.append(bool(q_artists.intersection(cand_artists)))
            recall = 1.0 if any(hits) else 0.0
            mrr = 1.0 / (hits.index(True) + 1) if any(hits) else 0.0
            return recall, mrr

        a_recall, a_mrr = recall_mrr(audio_ids)
        h_recall, h_mrr = recall_mrr(hybrid_ids)

        overlap = len(set(audio_ids).intersection(set(hybrid_ids))) / float(top_k)

        rows.append({
            "query_id": qid,
            "audio_recall@k": a_recall,
            "audio_mrr": a_mrr,
            "hybrid_recall@k": h_recall,
            "hybrid_mrr": h_mrr,
            "topk_overlap": overlap,
        })

    df = pd.DataFrame(rows)
    summary = {
        "genre": genre,
        "queries": int(len(df)),
        "audio_recall@k": float(df["audio_recall@k"].mean()),
        "hybrid_recall@k": float(df["hybrid_recall@k"].mean()),
        "audio_mrr": float(df["audio_mrr"].mean()),
        "hybrid_mrr": float(df["hybrid_mrr"].mean()),
        "avg_topk_overlap": float(df["topk_overlap"].mean()),
        "delta_recall@k": float(df["hybrid_recall@k"].mean() - df["audio_recall@k"].mean()),
        "delta_mrr": float(df["hybrid_mrr"].mean() - df["audio_mrr"].mean()),
        "alpha_audio": alpha_audio,
        "alpha_lyrics": alpha_lyrics,
        "lyric_len_threshold": lyric_len_threshold,
        "similarity_mode": similarity_mode
    }
    return df, summary

In [21]:
# QUICK RUN EXAMPLES (OPTIONAL)

# Top predictors per selected genre (defensible supervised where possible)
tp = top_predictors_by_genre(selected_genres, top_n=5)
for g, s in tp.items():
    print("\nTop predictors for genre:", g)
    print(s)


Top predictors for genre: rock
loudness        0.277673
acousticness    0.226293
valence         0.205148
danceability    0.136286
energy          0.062345
dtype: float64

Top predictors for genre: adult standards
acousticness        0.481759
instrumentalness    0.156555
loudness            0.149267
danceability        0.113094
valence             0.044132
dtype: float64

Top predictors for genre: filmi
acousticness        0.609006
energy              0.081924
instrumentalness    0.079616
loudness            0.059664
danceability        0.057305
dtype: float64

Top predictors for genre: classical
instrumentalness    0.227005
acousticness        0.201586
valence             0.186564
liveness            0.109310
loudness            0.094371
dtype: float64

Top predictors for genre: latin
speechiness     0.262334
acousticness    0.174805
danceability    0.158289
loudness        0.154900
valence         0.114492
dtype: float64

Top predictors for genre: vocal jazz
danceability        0.24

In [22]:
# Audio-only demo
if len(selected_genres) > 0:
    qsum, res = demo_genre_recommendations(
        genre=selected_genres[0],
        n_queries=2,
        top_k=5,
        seed=RANDOM_SEED,
        # If you’re using supervised weights, L1 often aligns well:
        weights_dict=genre_weights,
        similarity_mode=("l1" if selected_genres[0] in genre_weights_supervised else "cosine")
    )
    print("\nQUERY SONGS")
    print(qsum)
    for qid, df in res.items():
        print("\nQuery:", qid)
        print(df.head(5))


QUERY SONGS
                 track_id                              track_name  \
0  2xd1YFCq6W9kYNfjvCnGyT  You Never Can Tell - 1986 Stereo Remix   
1  0wqM8Flc5YaWwZoFBMz3zk                         Band On The Run   

    artist_names  popularity release_date  
0  [Chuck Berry]          40   1986-01-01  
1        [Wings]          36   1978-11-22  

Query: 2xd1YFCq6W9kYNfjvCnGyT
                 track_id                         track_name  \
0  1auS5fpZRGBOS13XsYvYVY                Let's Work Together   
1  0XMEFaM6o1Kzkm4ps2LnQU  Machine Gun Kelly - 2019 Remaster   
2  2QXYYCc7Z2wkSaqrnwxfvL                    Call The Police   
3  3GjzIa2sjSx02xyoEDa4kg                     I'm Not Sayin'   
4  2uZZRqxoRyOAwm5ErphYp0                      Join The Gang   

         artist_names  popularity release_date  release_year  similarity_score  
0       [Canned Heat]          36   1987-01-01          1987          0.849908  
1      [James Taylor]          26   1971-04-01          1971         

In [23]:
# System effectiveness (Recall@K / MRR vs random)
eval_rows, eval_summary = eval_recall_mrr_same_artist(
    BASE_DF_scaled,
    selected_genres,
    top_k=10,
    n_queries_per_genre=30,
    seed=RANDOM_SEED,
    weights_dict=genre_weights,
    similarity_mode="l1"  # good default when using supervised weights
)
print("\nEVALUATION SUMMARY (AUDIO):")
print(eval_summary.head(10))


EVALUATION SUMMARY (AUDIO):
                 queries  recall_at_k       mrr  random_recall_at_k       lift
genre                                                                         
soul                  30     0.366667  0.192989            0.033333  11.000000
rock                  30     0.366667  0.250556            0.066667   5.500000
edm                   30     0.866667  0.757262            0.166667   5.200000
adult standards       30     0.300000  0.188095            0.066667   4.500000
rap                   30     0.566667  0.445556            0.133333   4.250000
vocal jazz            30     0.633333  0.338135            0.166667   3.800000
latin                 30     0.400000  0.316667            0.133333   3.000000
folk                  30     0.566667  0.332778            0.200000   2.833333
classical             30     0.466667  0.331111            0.333333   1.400000
filmi                 30     0.600000  0.474537            0.433333   1.384615


In [24]:
# Lyrics enrichment + hybrid evaluation example (e.g., rap)

genre_to_test = "rap"
for _ in range(3):  # batches
    BASE_DF_scaled = attach_lyrics_for_genre_lrclib_inplace(BASE_DF_scaled, genre_to_test, max_songs=400, sleep_s=0.03)
    pool = BASE_DF_scaled[BASE_DF_scaled["artist_genres"].apply(lambda lst: isinstance(lst, list) and (genre_to_test in lst))]
    cov = float((pool["lyrics"].fillna("").astype(str).str.len() >= LYRIC_LEN_THRESHOLD).mean())
    print(f"Coverage in '{genre_to_test}': {cov:.2%}")

lyr_rows, lyr_summary = eval_audio_vs_hybrid_on_lyrics_subset(
    BASE_DF_scaled,
    genre=genre_to_test,
    top_k=10,
    n_queries=30,
    alpha_audio=0.65,
    alpha_lyrics=0.35,
    lyric_len_threshold=LYRIC_LEN_THRESHOLD,
    seed=RANDOM_SEED,
    weights_dict=genre_weights,
    similarity_mode="l1"
)
print("\nAUDIO VS HYBRID SUMMARY:")
print(lyr_summary)

Coverage in 'rap': 6.11%
Coverage in 'rap': 12.04%
Coverage in 'rap': 17.88%

AUDIO VS HYBRID SUMMARY:
{'genre': 'rap', 'queries': 30, 'audio_recall@k': 0.36666666666666664, 'hybrid_recall@k': 0.43333333333333335, 'audio_mrr': 0.25785714285714284, 'hybrid_mrr': 0.3516666666666667, 'avg_topk_overlap': 0.8966666666666665, 'delta_recall@k': 0.06666666666666671, 'delta_mrr': 0.09380952380952384, 'alpha_audio': 0.65, 'alpha_lyrics': 0.35, 'lyric_len_threshold': 50, 'similarity_mode': 'l1'}


In [25]:
def _format_top_predictors(weights: pd.Series, top_n: int = 5, decimals: int = 3) -> str:
    """
    Convert a weights series (feature -> weight) to a compact string:
      "loudness:0.278, acousticness:0.226, ..."
    """
    s = weights.sort_values(ascending=False).head(top_n)
    parts = [f"{k}:{v:.{decimals}f}" for k, v in s.items()]
    return ", ".join(parts)

def make_results_table(
    genres,
    top_n_predictors: int = 5,
    n_eval_queries_per_genre: int = 30,
    top_k: int = 10,
    seed: int = 42,
    weights_dict=None,
    similarity_mode: str = "l1",
    decimals: int = 3
) -> pd.DataFrame:
    """
    Builds a single compact per-genre results table with:
      - top predictors
      - supervised pairwise AUC (if available)
      - audio recall@k, audio mrr, random recall@k, lift

    Notes:
      - AUC is taken from genre_auc_df (pairwise supervised training)
      - If a genre has no supervised weights, AUC may be NaN.
    """
    # Pull audio evaluation summary
    _, audio_summary = eval_recall_mrr_same_artist(
        BASE_DF_scaled,
        genres,
        top_k=top_k,
        n_queries_per_genre=n_eval_queries_per_genre,
        seed=seed,
        weights_dict=weights_dict,
        similarity_mode=similarity_mode
    )

    # AUC lookup from training report
    auc_map = {}
    if "genre_auc_df" in globals() and isinstance(genre_auc_df, pd.DataFrame) and len(genre_auc_df) > 0:
        for _, r in genre_auc_df.iterrows():
            auc_map[str(r["genre"])] = r.get("pairwise_auc", np.nan)

    rows = []
    for g in genres:
        # predictors string
        if "genre_weights_supervised" in globals() and g in genre_weights_supervised:
            w = genre_weights_supervised[g]
        elif "genre_weights" in globals() and g in genre_weights:
            w = genre_weights[g]
        else:
            w = None

        predictors_str = _format_top_predictors(w, top_n=top_n_predictors, decimals=decimals) if w is not None else ""

        # audio metrics
        if (isinstance(audio_summary, pd.DataFrame) and len(audio_summary) > 0 and g in audio_summary.index):
            r = audio_summary.loc[g]
            audio_recall = float(r["recall_at_k"])
            audio_mrr = float(r["mrr"])
            rand_recall = float(r["random_recall_at_k"])
            lift = float(r["lift"])
            queries = int(r["queries"])
        else:
            audio_recall = np.nan
            audio_mrr = np.nan
            rand_recall = np.nan
            lift = np.nan
            queries = 0

        rows.append({
            "genre": g,
            "pairwise_auc": auc_map.get(g, np.nan),
            "top_predictors": predictors_str,
            "audio_queries": queries,
            f"audio_recall@{top_k}": audio_recall,
            "audio_mrr": audio_mrr,
            f"random_recall@{top_k}": rand_recall,
            "lift": lift,
        })

    out = pd.DataFrame(rows)

    # sort: prioritize lift, then AUC (descending)
    out = out.sort_values(["lift", "pairwise_auc"], ascending=[False, False], na_position="last")

    # optional rounding for display
    num_cols = [c for c in out.columns if c not in ("genre", "top_predictors")]
    out[num_cols] = out[num_cols].apply(pd.to_numeric, errors="coerce")
    out[num_cols] = out[num_cols].round(decimals)

    return out

In [26]:
results_tbl = make_results_table(
    selected_genres,
    top_n_predictors=5,
    n_eval_queries_per_genre=30,
    top_k=10,
    seed=RANDOM_SEED,
    weights_dict=genre_weights,
    similarity_mode="l1"
)
results_tbl

Unnamed: 0,genre,pairwise_auc,top_predictors,audio_queries,audio_recall@10,audio_mrr,random_recall@10,lift
6,soul,0.669,"acousticness:0.407, energy:0.165, loudness:0.1...",30,0.367,0.193,0.033,11.0
0,rock,0.66,"loudness:0.278, acousticness:0.226, valence:0....",30,0.367,0.251,0.067,5.5
9,edm,0.691,"loudness:0.264, danceability:0.198, tempo:0.17...",30,0.867,0.757,0.167,5.2
1,adult standards,0.675,"acousticness:0.482, instrumentalness:0.157, lo...",30,0.3,0.188,0.067,4.5
8,rap,0.6,"loudness:0.312, speechiness:0.162, valence:0.1...",30,0.567,0.446,0.133,4.25
5,vocal jazz,0.634,"danceability:0.248, acousticness:0.207, instru...",30,0.633,0.338,0.167,3.8
4,latin,0.695,"speechiness:0.262, acousticness:0.175, danceab...",30,0.4,0.317,0.133,3.0
7,folk,0.635,"acousticness:0.304, loudness:0.271, valence:0....",30,0.567,0.333,0.2,2.833
3,classical,0.587,"instrumentalness:0.227, acousticness:0.202, va...",30,0.467,0.331,0.333,1.4
2,filmi,0.669,"acousticness:0.609, energy:0.082, instrumental...",30,0.6,0.475,0.433,1.385


In [56]:
from IPython.display import display

def alpha_sweep(
    genre,
    alphas_audio=None,
    top_k=10,
    n_queries=30,
    lyric_len_threshold=50,
    seed=42,
    weights_dict=None,
    similarity_mode="l1",
    ensure_lyrics=True,
    fetch_batches=0,
):
    """
    Runs Audio vs Hybrid evaluation over multiple alpha values.

    Returns:
      sweep_df: one row per alpha (sorted by delta_mrr then delta_recall@k)
      report: report-friendly dict (includes lyrics coverage + best alpha)
    """
    if alphas_audio is None:
        alphas_audio = [0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5]

    # Optionally fetch more lyrics (ONLY if the helper exists)
    if ensure_lyrics and fetch_batches > 0:
        fn = globals().get("attach_lyrics_for_genre", None)
        if fn is None:
            raise NameError(
                "attach_lyrics_for_genre is not defined. "
                "Run the cell that defines it, or set ensure_lyrics=False / fetch_batches=0."
            )
        for _ in range(fetch_batches):
            fn(BASE_DF_scaled, genre, max_songs=400, sleep_s=0.03)

    # Compute lyrics coverage directly (robust)
    pool_all = BASE_DF_scaled[BASE_DF_scaled["artist_genres"].apply(lambda lst: genre in lst)]
    if len(pool_all) == 0:
        coverage = 0.0
    else:
        coverage = float((pool_all["lyrics"].fillna("").astype(str).str.len() >= lyric_len_threshold).mean())

    # Run evaluation per alpha
    rows = []
    for a in alphas_audio:
        a = float(a)
        a_lyrics = 1.0 - a

        _, summary = eval_audio_vs_hybrid_on_lyrics_subset(
            BASE_DF_scaled,
            genre=genre,
            top_k=top_k,
            n_queries=n_queries,
            alpha_audio=a,
            alpha_lyrics=a_lyrics,                 # IMPORTANT: must be passed
            lyric_len_threshold=lyric_len_threshold,
            seed=seed,
            weights_dict=weights_dict,
            similarity_mode=similarity_mode
        )

        summary = dict(summary)
        summary["genre"] = genre
        summary["alpha_audio"] = a
        summary["alpha_lyrics"] = a_lyrics
        summary.setdefault("lyrics_coverage", coverage)
        summary.setdefault("lyric_len_threshold", lyric_len_threshold)
        rows.append(summary)

    sweep_df = pd.DataFrame(rows)

    sweep_sorted = (
        sweep_df.sort_values(["delta_mrr", "delta_recall@k"], ascending=False)
                .reset_index(drop=True)
    )

    best = sweep_sorted.iloc[0].to_dict() if len(sweep_sorted) else None

    report = {
        "genre": genre,
        "lyrics_coverage": coverage,
        "lyric_len_threshold": lyric_len_threshold,
        "n_queries": n_queries,
        "top_k": top_k,
        "similarity_mode": similarity_mode,
        "best_by_delta_mrr": best,
        "coverage_warning": coverage < 0.10,
        "coverage_note": f"Coverage is {coverage:.2%}."
    }

    return sweep_sorted, report

### Outputs

In [49]:
# Genre selection table (genre, family, count)
genre_selection_table = (
    genre_df_valid[genre_df_valid["genre"].isin(selected_genres)]
    .loc[:, ["genre", "family", "song_count"]]
    .sort_values(["family", "song_count"], ascending=[True, False])
    .reset_index(drop=True)
)

print("GENRE SELECTION TABLE:")
genre_selection_table

GENRE SELECTION TABLE:


Unnamed: 0,genre,family,song_count
0,classical,classical,18995
1,edm,electronic,5329
2,folk,folk,9032
3,rap,hiphop,6371
4,vocal jazz,jazz,14327
5,latin,latin,15168
6,adult standards,pop,26688
7,soul,rnb_soul,12865
8,rock,rock,32026
9,filmi,world,19557


##### Metrics
- Recall@10: “Did we find at least one same-artist song in Top-10?”
   Score is 1 if yes, 0 if no, then averaged across queries.
- MRR (Mean Reciprocal Rank): “How early did the first same-artist match appear?”
   Higher is better because it means the match is near the top.
- Random Recall@10: same test, but Top-10 is random (baseline).
- Lift: how much better you are than random.

In [50]:
# Audio evaluation summary
print("AUDIO EVALUATION SUMMARY (Recall@K / MRR / Random / Lift):")
eval_summary

AUDIO EVALUATION SUMMARY (Recall@K / MRR / Random / Lift):


Unnamed: 0_level_0,queries,recall_at_k,mrr,random_recall_at_k,lift
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
soul,30,0.366667,0.192989,0.033333,11.0
rock,30,0.366667,0.250556,0.066667,5.5
edm,30,0.866667,0.757262,0.166667,5.2
adult standards,30,0.3,0.188095,0.066667,4.5
rap,30,0.566667,0.445556,0.133333,4.25
vocal jazz,30,0.633333,0.338135,0.166667,3.8
latin,30,0.4,0.316667,0.133333,3.0
folk,30,0.566667,0.332778,0.2,2.833333
classical,30,0.466667,0.331111,0.333333,1.4
filmi,30,0.6,0.474537,0.433333,1.384615


In [51]:
# AUC Table
print("PAIRWISE AUC TABLE (same-artist vs different-artist proxy):")
genre_auc_df

PAIRWISE AUC TABLE (same-artist vs different-artist proxy):


Unnamed: 0,genre,pairwise_auc
4,latin,0.694709
9,edm,0.691341
1,adult standards,0.675099
2,filmi,0.66928
6,soul,0.668528
0,rock,0.660443
7,folk,0.635189
5,vocal jazz,0.634261
8,rap,0.599636
3,classical,0.587365


In [52]:
# Master results table (AUC + predictors + audio metrics)
master_results_table = make_results_table(selected_genres)
print("MASTER RESULTS TABLE:")
master_results_table

MASTER RESULTS TABLE:


Unnamed: 0,genre,pairwise_auc,top_predictors,audio_queries,audio_recall@10,audio_mrr,random_recall@10,lift
6,soul,0.669,"acousticness:0.407, energy:0.165, loudness:0.1...",30,0.367,0.193,0.033,11.0
0,rock,0.66,"loudness:0.278, acousticness:0.226, valence:0....",30,0.367,0.251,0.067,5.5
9,edm,0.691,"loudness:0.264, danceability:0.198, tempo:0.17...",30,0.867,0.757,0.167,5.2
1,adult standards,0.675,"acousticness:0.482, instrumentalness:0.157, lo...",30,0.3,0.188,0.067,4.5
8,rap,0.6,"loudness:0.312, speechiness:0.162, valence:0.1...",30,0.567,0.446,0.133,4.25
5,vocal jazz,0.634,"danceability:0.248, acousticness:0.207, instru...",30,0.633,0.338,0.167,3.8
4,latin,0.695,"speechiness:0.262, acousticness:0.175, danceab...",30,0.4,0.317,0.133,3.0
7,folk,0.635,"acousticness:0.304, loudness:0.271, valence:0....",30,0.567,0.333,0.2,2.833
3,classical,0.587,"instrumentalness:0.227, acousticness:0.202, va...",30,0.467,0.331,0.333,1.4
2,filmi,0.669,"acousticness:0.609, energy:0.082, instrumental...",30,0.6,0.475,0.433,1.385


In [53]:
# Lyrics coverage table per selected genre
# Coverage = % songs in each genre with lyric length >= threshold
def _lyrics_coverage_for_genre(df_scaled, genre, lyric_len_threshold):
    pool = df_scaled[df_scaled["artist_genres"].apply(lambda lst: genre in lst)]
    if len(pool) == 0:
        return 0.0, 0, 0
    has_lyrics = pool["lyrics"].fillna("").astype(str).str.len() >= lyric_len_threshold
    return float(has_lyrics.mean()), int(has_lyrics.sum()), int(len(pool))

coverage_rows = []
for g in selected_genres:
    cov, n_has, n_total = _lyrics_coverage_for_genre(BASE_DF_scaled, g, LYRIC_LEN_THRESHOLD)
    coverage_rows.append({
        "genre": g,
        "family": assign_family(g),
        "songs_in_genre": n_total,
        "songs_with_lyrics": n_has,
        "lyrics_coverage": cov
    })

lyrics_coverage_table = (
    pd.DataFrame(coverage_rows)
      .sort_values(["lyrics_coverage", "songs_in_genre"], ascending=False)
      .reset_index(drop=True)
)

print("LYRICS COVERAGE TABLE:")
lyrics_coverage_table

LYRICS COVERAGE TABLE:


Unnamed: 0,genre,family,songs_in_genre,songs_with_lyrics,lyrics_coverage
0,rap,hiphop,6371,1865,0.292733
1,edm,electronic,5329,33,0.006193
2,latin,latin,15168,24,0.001582
3,soul,rnb_soul,12865,7,0.000544
4,rock,rock,32026,4,0.000125
5,adult standards,pop,26688,2,7.5e-05
6,filmi,world,19557,0,0.0
7,classical,classical,18995,0,0.0
8,vocal jazz,jazz,14327,0,0.0
9,folk,folk,9032,0,0.0


In [57]:
# Alpha sweep results for the 2–3 best coverage genres
# This will:
# 1) choose the top 3 genres by lyrics coverage (excluding zero)
# 2) run alpha_sweep() for each
#
# Notes:
# - alpha_sweep() supports `ensure_lyrics` and `fetch_batches`.
# - If you want the notebook to fetch more lyrics automatically, set FETCH_BATCHES > 0.

FETCH_BATCHES = 0       # set to 1 or 2 to fetch more lyrics before running sweep
MIN_COVERAGE = 0.05     # avoid extremely tiny lyric subsets (noisy results)
TOP_N_GENRES_FOR_SWEEP = 3

candidates = lyrics_coverage_table[lyrics_coverage_table["lyrics_coverage"] > 0].copy()
candidates = candidates.sort_values(["lyrics_coverage", "songs_in_genre"], ascending=False)

best_genres = candidates[candidates["lyrics_coverage"] >= MIN_COVERAGE]["genre"].head(TOP_N_GENRES_FOR_SWEEP).tolist()
if len(best_genres) == 0:
    # fallback: still pick top 2 by coverage, even if below MIN_COVERAGE
    best_genres = candidates["genre"].head(2).tolist()

print("Genres selected for alpha sweep:", best_genres)

all_sweeps = []
all_reports = []

for g in best_genres:
    try:
        sweep_tbl, sweep_report = alpha_sweep(
            genre=g,
            alphas_audio=[0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5],
            top_k=10,
            n_queries=30,
            lyric_len_threshold=LYRIC_LEN_THRESHOLD,
            seed=RANDOM_SEED,
            weights_dict=genre_weights,
            similarity_mode="l1",
            ensure_lyrics=True,
            fetch_batches=FETCH_BATCHES
        )

        sweep_tbl = sweep_tbl.copy()

        # Fix: if 'genre' already exists, overwrite it; otherwise add it.
        if "genre" in sweep_tbl.columns:
            sweep_tbl["genre"] = g
        else:
            sweep_tbl.insert(0, "genre", g)

        all_sweeps.append(sweep_tbl)
        all_reports.append(sweep_report)

    except Exception as e:
        all_reports.append({"genre": g, "error": str(e)})

alpha_sweep_table = pd.concat(all_sweeps, ignore_index=True) if all_sweeps else pd.DataFrame()
alpha_sweep_reports = all_reports

print("ALPHA SWEEP TABLE (combined):")
display(alpha_sweep_table)

print("ALPHA SWEEP REPORTS (per genre):")
display(alpha_sweep_reports)


Genres selected for alpha sweep: ['rap']
ALPHA SWEEP TABLE (combined):


Unnamed: 0,genre,queries,audio_recall@k,hybrid_recall@k,audio_mrr,hybrid_mrr,avg_topk_overlap,delta_recall@k,delta_mrr,alpha_audio,alpha_lyrics,lyric_len_threshold,similarity_mode,lyrics_coverage
0,rap,30,0.6,0.633333,0.479815,0.526111,0.95,0.033333,0.046296,0.8,0.2,50,l1,0.292733
1,rap,30,0.6,0.633333,0.479815,0.526111,0.926667,0.033333,0.046296,0.75,0.25,50,l1,0.292733
2,rap,30,0.6,0.633333,0.479815,0.526111,0.903333,0.033333,0.046296,0.7,0.3,50,l1,0.292733
3,rap,30,0.6,0.633333,0.479815,0.526111,0.87,0.033333,0.046296,0.65,0.35,50,l1,0.292733
4,rap,30,0.6,0.633333,0.479815,0.526111,0.85,0.033333,0.046296,0.6,0.4,50,l1,0.292733
5,rap,30,0.6,0.633333,0.479815,0.526111,0.83,0.033333,0.046296,0.55,0.45,50,l1,0.292733
6,rap,30,0.6,0.633333,0.479815,0.526111,0.776667,0.033333,0.046296,0.5,0.5,50,l1,0.292733
7,rap,30,0.6,0.666667,0.479815,0.490556,0.976667,0.066667,0.010741,0.85,0.15,50,l1,0.292733
8,rap,30,0.6,0.633333,0.479815,0.466481,0.986667,0.033333,-0.013333,0.9,0.1,50,l1,0.292733


ALPHA SWEEP REPORTS (per genre):


[{'genre': 'rap',
  'lyrics_coverage': 0.29273269502432897,
  'lyric_len_threshold': 50,
  'n_queries': 30,
  'top_k': 10,
  'similarity_mode': 'l1',
  'best_by_delta_mrr': {'genre': 'rap',
   'queries': 30,
   'audio_recall@k': 0.6,
   'hybrid_recall@k': 0.6333333333333333,
   'audio_mrr': 0.47981481481481486,
   'hybrid_mrr': 0.5261111111111111,
   'avg_topk_overlap': 0.9499999999999997,
   'delta_recall@k': 0.033333333333333326,
   'delta_mrr': 0.046296296296296224,
   'alpha_audio': 0.8,
   'alpha_lyrics': 0.19999999999999996,
   'lyric_len_threshold': 50,
   'similarity_mode': 'l1',
   'lyrics_coverage': 0.29273269502432897},
  'coverage_note': 'Coverage is 29.27%.'}]

In [59]:
def get_topk_ids_hybrid(df_scaled, genre, query_id, top_k, alpha_audio, lyric_len_threshold=50, seed=42):
    """
    Returns the top_k track_ids for the hybrid score for a given query song within a genre.
    Uses the SAME logic as your hybrid evaluation: audio_sim + lyrics_sim, normalized, combined.
    """
    # Build lyrics pool (only songs with enough lyrics)
    pool = df_scaled[df_scaled["artist_genres"].apply(lambda lst: genre in lst)].copy()
    pool = pool[pool["lyrics"].fillna("").astype(str).str.len() >= lyric_len_threshold].reset_index(drop=True)

    if len(pool) < (top_k + 5):
        raise ValueError("Not enough lyric songs in pool. Fetch more lyrics or lower threshold.")

    # Query must exist in this lyrics pool
    if query_id not in set(pool["track_id"]):
        raise ValueError("Query song is not in the lyrics-available pool for this genre.")

    # TF-IDF for lyrics similarity (built inside pool, simplest prototype approach)
    texts = pool["lyrics"].fillna("").astype(str).tolist()
    vec = TfidfVectorizer(max_features=40000, ngram_range=(1,2), min_df=2, stop_words="english")
    X = vec.fit_transform(texts)

    q_pos = int(pool.index[pool["track_id"] == query_id][0])
    q_row = pool.loc[q_pos]

    # Audio similarity
    w = genre_weights[genre].reindex(AUDIO_FEATURE_COLS).values.astype(float)
    q_vec = q_row[AUDIO_FEATURE_COLS].values.astype(float)
    M = pool[AUDIO_FEATURE_COLS].values.astype(float)
    pool["audio_sim"] = weighted_cosine_similarity(q_vec, M, w)

    # Lyrics similarity
    q_tfidf = X[q_pos]
    pool["lyrics_sim"] = (X @ q_tfidf.T).toarray().ravel()

    # Normalize and combine
    def _minmax(v):
        v = np.asarray(v, dtype=float)
        return (v - v.min()) / (v.max() - v.min() + 1e-12)

    a = _minmax(pool["audio_sim"].values)
    l = _minmax(pool["lyrics_sim"].values)
    pool["hybrid_score"] = float(alpha_audio) * a + (1.0 - float(alpha_audio)) * l

    # Rank (exclude query itself)
    ranked = pool[pool["track_id"] != query_id].sort_values("hybrid_score", ascending=False).head(top_k)
    return ranked["track_id"].tolist()


# --- Choose a rap query that is guaranteed to have lyrics ---
g = "rap"
lyrics_pool = BASE_DF_scaled[
    BASE_DF_scaled["artist_genres"].apply(lambda lst: g in lst)
].copy()
lyrics_pool = lyrics_pool[lyrics_pool["lyrics"].fillna("").astype(str).str.len() >= LYRIC_LEN_THRESHOLD]

qid = lyrics_pool["track_id"].iloc[0]  # first valid query with lyrics
print("Query track_id:", qid)

top10_a08 = get_topk_ids_hybrid(BASE_DF_scaled, g, qid, top_k=10, alpha_audio=0.8, lyric_len_threshold=LYRIC_LEN_THRESHOLD)
top10_a06 = get_topk_ids_hybrid(BASE_DF_scaled, g, qid, top_k=10, alpha_audio=0.6, lyric_len_threshold=LYRIC_LEN_THRESHOLD)

print("\nTop-10 overlap count:", len(set(top10_a08) & set(top10_a06)))
print("Same set?:", set(top10_a08) == set(top10_a06))

print("\nTop-10 (alpha=0.8):", top10_a08)
print("\nTop-10 (alpha=0.6):", top10_a06)


Query track_id: 0047CfWoMPpXkZlizPQAb2

Top-10 overlap count: 7
Same set?: False

Top-10 (alpha=0.8): ['0ALV8l7nqbjwSZ2w9hS2CI', '1Zt4rVKL5cazgN54WibvLt', '1pYKzfOlpHWtVUiEIu4Vn7', '1rfofaqEpACxVEHIZBJe6W', '1mKNyZCNDueEASgBgZuLdA', '25eXZAa3awZeQ8elAgPsAa', '0jWHEPqCnNAHtpgcKvJCvZ', '19TOAlTFq0NDHvUPQR0tkr', '0fgZUSa7D7aVvv3GfO0A1n', '0lMbuWUpfTWhEmOKxppEau']

Top-10 (alpha=0.6): ['0ALV8l7nqbjwSZ2w9hS2CI', '1Zt4rVKL5cazgN54WibvLt', '1pYKzfOlpHWtVUiEIu4Vn7', '1rfofaqEpACxVEHIZBJe6W', '0yLsJKNJHmSnmSGl3ZodYn', '1mKNyZCNDueEASgBgZuLdA', '01A7PEPSnmtixFPfB2UTal', '25eXZAa3awZeQ8elAgPsAa', '0jWHEPqCnNAHtpgcKvJCvZ', '1rICTKr67UUnoDVvOj4A8c']


##### When the audio–lyrics mix (alpha) is adjusted, the recommended songs changed, but only partially. For a sample rap query, the Top-10 list had 7 out of 10 songs in common between alpha=0.8 and alpha=0.6. This shows that audio features provide a stable core of recommendations, while lyrics mainly influence a smaller portion of borderline results. Even when some songs change, the overall evaluation metrics (Recall@10 and MRR) may remain similar because they only measure whether at least one ‘correct’ match appears in the Top-10 and how early the first match appears.