In [67]:
from __future__ import annotations
import os
import math
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Callable, Iterable, Optional
from dataclasses import dataclass
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


# Default location of the tracks CSV read by load_dataset().
DATA_PATH = "data.csv"

# Audio-feature columns the notebook looks for; load_dataset() keeps only the
# ones that are actually present in the file.
CANDIDATE_FEATURES = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
]

# Accepted (lower-cased) spellings for each logical column.
# Within each list the first name found in the file is the one used.
CANDIDATE_ID_COLS = [
    "id",
    "track_id",
    "uri",
]
CANDIDATE_NAME_COLS = [
    "name",
    "track_name",
    "title",
]
CANDIDATE_ARTIST_COLS = [
    "artists",
    "artist",
    "artist_name",
    "artist_names",
]
CANDIDATE_GENRE_COLS = [
    "genres",
    "genre",
]
CANDIDATE_POP_COLS = [
    "popularity",
    "popularity_score",
]
CANDIDATE_YEAR_COLS = [
    "year",
    "release_year",
]
# Date-like columns a year can be derived from when no year column exists.
CANDIDATE_DATE_COLS = [
    "release_date",
    "date",
    "album_release_date",
]


def _first_existing(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    for c in candidates:
        if c in df.columns:
            return c
    return None

def _normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [str(c).strip().lower() for c in df.columns]
    return df

def _ensure_year_column(df: pd.DataFrame, year_col: Optional[str]) -> Tuple[pd.DataFrame, Optional[str]]:
    if year_col and year_col in df.columns:
        return df, year_col
    date_col = _first_existing(df, CANDIDATE_DATE_COLS)
    if date_col:
        df = df.copy()
        def _to_year(x):
            if pd.isna(x): return np.nan
            s = str(x)
            for sep in ["-", "/", "."]:
                parts = s.split(sep)
                if len(parts) >= 1 and parts[0].isdigit() and len(parts[0]) == 4:
                    return float(parts[0])
            digits = "".join(ch for ch in s if ch.isdigit())
            if len(digits) >= 4:
                return float(digits[:4])
            return np.nan
        df["year"] = df[date_col].map(_to_year)
        return df, "year"
    return df, None

def _safe_get(row: pd.Series, col: Optional[str], default="") -> str:
    if col and col in row and pd.notna(row[col]):
        return str(row[col])
    return default

def _coerce_listish(x) -> List[str]:
    if isinstance(x, list):
        return [str(i).strip() for i in x if str(i).strip()]
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []
    if s.startswith("[") and s.endswith("]"):
        s2 = s.strip("[]")
        parts = [p.strip(" '\"") for p in s2.split(",")]
        return [p for p in parts if p]
    for sep in [";", ",", "|", "/"]:
        if sep in s:
            return [p.strip() for p in s.split(sep) if p.strip()]
    return [s]


In [68]:
def load_dataset(path: str = DATA_PATH) -> Tuple[pd.DataFrame, Dict[str, Optional[str]], List[str]]:
    """Read the tracks CSV and work out which column plays which role.

    Steps: normalize column names, detect the id/name/artist/genre/popularity/
    year columns from the ``CANDIDATE_*`` lists, derive a year column from a
    date column when needed, coerce popularity to numbers, drop rows without a
    name or a usable popularity, and add ``genres_list`` / ``genres_primary``.

    Parameters
    ----------
    path : location of the CSV file.

    Returns
    -------
    (df, cols, features)
        df       : cleaned frame with a fresh 0..n-1 index.
        cols     : role -> column name for "id", "name", "artist", "genre",
                   "popularity", "year"; every entry except "name" and
                   "popularity" may be ``None`` when the file lacks it.
        features : the entries of ``CANDIDATE_FEATURES`` present in the file.

    Raises
    ------
    FileNotFoundError : ``path`` does not exist.
    ValueError        : no name column or no popularity column was found.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset not found at: {path}")
    df = _normalize_column_names(pd.read_csv(path))

    id_col = _first_existing(df, CANDIDATE_ID_COLS)
    name_col = _first_existing(df, CANDIDATE_NAME_COLS)
    artist_col = _first_existing(df, CANDIDATE_ARTIST_COLS)
    genre_col = _first_existing(df, CANDIDATE_GENRE_COLS)
    pop_col = _first_existing(df, CANDIDATE_POP_COLS)
    year_col = _first_existing(df, CANDIDATE_YEAR_COLS)

    # Report the missing ROLE(s) by name instead of printing "Missing: None".
    missing = [role for role, col in (("name", name_col), ("popularity", pop_col)) if col is None]
    if missing:
        raise ValueError("Dataset must include at least 'name' and 'popularity' (or similar). "
                         f"Missing: {', '.join(missing)}")

    df, year_col = _ensure_year_column(df, year_col)

    features = [f for f in CANDIDATE_FEATURES if f in df.columns]

    # Popularity is sorted on and cast to float downstream; coerce it once here
    # so stray non-numeric cells become NaN and are dropped with the other
    # incomplete rows instead of breaking (or lexicographically sorting) later.
    df = df.assign(**{pop_col: pd.to_numeric(df[pop_col], errors="coerce")})
    df = df.dropna(subset=[name_col, pop_col]).copy()
    df[name_col] = df[name_col].astype(str)

    if genre_col and genre_col in df.columns:
        df["genres_list"] = df[genre_col].map(_coerce_listish)
        df["genres_primary"] = df["genres_list"].map(lambda lst: lst[0] if lst else "")
    else:
        # Keep the derived columns present so downstream code can rely on them.
        df["genres_list"] = [[] for _ in range(len(df))]
        df["genres_primary"] = ""

    cols = {
        "id": id_col,
        "name": name_col,
        "artist": artist_col,
        "genre": genre_col,
        "popularity": pop_col,
        "year": year_col
    }

    return df.reset_index(drop=True), cols, features


In [69]:
@dataclass
class FeatureIndex:
    """Bundle of the fitted scaler, scaled features and lookup tables used by the recommenders.

    Produced by ``build_feature_index``. Rows of ``feature_matrix`` and ``sim``
    are positional and follow the row order of the DataFrame the index was
    built from.
    """
    # StandardScaler fitted on the numeric feature columns.
    scaler: StandardScaler
    # Scaled feature values, one row per track and one column per entry of `features`.
    feature_matrix: np.ndarray
    # Names of the feature columns, in the column order of `feature_matrix`.
    features: List[str]
    # Lower-cased track name -> row position.
    name_to_idx: Dict[str, int]
    # Pairwise cosine-similarity matrix of `feature_matrix`; None if not computed.
    sim: Optional[np.ndarray] = None

def build_feature_index(df: pd.DataFrame, cols: Dict[str, str], features: List[str]) -> FeatureIndex:
    """Scale the audio features and precompute pairwise cosine similarity.

    Parameters
    ----------
    df       : cleaned tracks frame; row order defines all positional indices.
    cols     : role -> column name mapping; only ``cols["name"]`` is used here.
    features : feature column names to include.

    Returns
    -------
    FeatureIndex holding the fitted scaler, the scaled matrix, the feature
    names, a lower-cased-name -> row-position map and the full similarity matrix.

    Raises
    ------
    ValueError : ``features`` is empty.

    Notes
    -----
    * The similarity matrix is dense (rows x rows), so memory grows
      quadratically with the number of tracks.
    * When several rows share a (lower-cased) name, the map keeps the LAST one.
    """
    if not features:
        raise ValueError("No audio features found in dataset. "
                         f"Tried: {CANDIDATE_FEATURES}")

    # Non-numeric cells become NaN; infinities are treated as missing as well,
    # because the scaler rejects them.
    X = df[features].apply(pd.to_numeric, errors="coerce")
    X = X.replace([np.inf, -np.inf], np.nan)

    # Impute with the column median. A column with no numeric value at all has
    # a NaN median, so a second pass fills it with 0.0; otherwise the NaNs
    # would survive and cosine_similarity would raise.
    X = X.fillna(X.median(numeric_only=True)).fillna(0.0)

    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)

    sim = cosine_similarity(Xs)

    name_col = cols["name"]
    name_to_idx = {str(n).lower(): i for i, n in enumerate(df[name_col])}

    return FeatureIndex(scaler, Xs, features, name_to_idx, sim=sim)


def popularity_recommender(df: pd.DataFrame, cols: Dict[str, str], n: int = 10) -> pd.DataFrame:
    """Return the ``n`` most popular tracks, most popular first.

    The result has a fresh 0..k-1 index and the columns name, artist and
    popularity. The artist column is left out when the dataset has none
    (``cols["artist"]`` is ``None``) instead of raising ``KeyError``.
    """
    pop_col = cols["popularity"]
    # Only the artist column is optional; name and popularity are required.
    shown = [c for c in (cols["name"], cols.get("artist"), pop_col) if c]
    ranked = df.sort_values(by=pop_col, ascending=False).head(n)
    return ranked[shown].reset_index(drop=True)

def _nearest_by_similarity(idx: int, sim: np.ndarray, topk: int) -> List[int]:
    scores = list(enumerate(sim[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    out = [i for i, s in scores if i != idx][:topk]
    return out

def content_recommender(song_name: str, df: pd.DataFrame, cols: Dict[str, str],
                        fidx: FeatureIndex, n: int = 10) -> pd.DataFrame | str:
    """Recommend the ``n`` tracks whose audio features are closest to ``song_name``.

    The seed is looked up by its lower-cased name in ``fidx.name_to_idx`` and
    neighbours are ranked by the precomputed similarity in ``fidx.sim``.

    Returns
    -------
    A frame (fresh 0..k-1 index) with name, artist and popularity, or a
    message string when the song is unknown. The artist column is omitted when
    the dataset has none (``cols["artist"]`` is ``None``) instead of raising
    ``KeyError``.
    """
    # str() mirrors how the index keys were built, so non-string seeds work too.
    key = str(song_name).lower()
    if key not in fidx.name_to_idx:
        return f"Song '{song_name}' not found."
    idx = fidx.name_to_idx[key]
    nbrs = _nearest_by_similarity(idx, fidx.sim, n)
    shown = [c for c in (cols["name"], cols.get("artist"), cols["popularity"]) if c]
    return df.iloc[nbrs][shown].reset_index(drop=True)


In [70]:
def hybrid_recommender(song_name: str, df: pd.DataFrame, cols: Dict[str, str],
                       fidx: FeatureIndex, n: int = 10, alpha: float = 0.7) -> pd.DataFrame | str:
    """Recommend tracks by blending audio similarity with popularity.

    Each track gets ``alpha * similarity + (1 - alpha) * normalized_popularity``
    where popularity is min-max scaled over ``df``; the ``n`` best-scoring
    tracks other than the seed are returned, best first.

    Parameters
    ----------
    song_name : seed track, matched by lower-cased name.
    alpha     : weight of the similarity term (1.0 = similarity only).

    Returns
    -------
    A frame (fresh 0..k-1 index) with name, artist and popularity, or a
    message string when the song is unknown. The artist column is omitted when
    the dataset has none (``cols["artist"]`` is ``None``).

    Notes
    -----
    The two terms are not on the same scale: the similarity may be negative
    while the normalized popularity lies in [0, 1].
    """
    key = str(song_name).lower()
    if key not in fidx.name_to_idx:
        return f"Song '{song_name}' not found."
    idx = fidx.name_to_idx[key]

    pop = df[cols["popularity"]].to_numpy().astype(float)
    # The small epsilon avoids a division by zero when all popularities are equal.
    pop_norm = (pop - pop.min()) / (pop.max() - pop.min() + 1e-9)

    # Vectorized form of the per-track blend; the stable descending sort keeps
    # ties in ascending row order.
    hybrid = alpha * np.asarray(fidx.sim[idx], dtype=float) + (1 - alpha) * pop_norm
    order = np.argsort(-hybrid, kind="stable")
    rec_indices = order[order != idx][:n].tolist()

    shown = [c for c in (cols["name"], cols.get("artist"), cols["popularity"]) if c]
    return df.iloc[rec_indices][shown].reset_index(drop=True)


In [71]:
def genre_recommender(song_name: str, df: pd.DataFrame, cols: Dict[str, str],
                      fidx: FeatureIndex, n: int = 10) -> pd.DataFrame | str:
    """Recommend similar tracks that share the seed's primary genre.

    Only the ``max(n*5, 50)`` nearest neighbours of the seed are considered,
    so fewer than ``n`` rows (or none) can come back even if more same-genre
    tracks exist further away.

    Returns
    -------
    A frame (fresh 0..k-1 index) with name, artist, ``genres_primary`` and
    popularity, or a message string when the song is unknown, no genre
    information is available, or no neighbour shares the genre. The artist
    column is omitted when the dataset has none (``cols["artist"]`` is ``None``).
    """
    key = str(song_name).lower()
    if key not in fidx.name_to_idx:
        return f"Song '{song_name}' not found."
    idx = fidx.name_to_idx[key]

    # A frame without the derived genre column has no genre info either.
    if "genres_primary" not in df.columns:
        return "No genre info available to constrain recommendations."
    # Pull the column once; building a row Series per candidate is far slower.
    primary = df["genres_primary"].to_numpy()
    target_genre = primary[idx]
    if not target_genre:
        return "No genre info available to constrain recommendations."

    candidates = _nearest_by_similarity(idx, fidx.sim, max(n*5, 50))
    filt = [j for j in candidates if primary[j] == target_genre][:n]
    if not filt:
        return f"No similar songs found in genre '{target_genre}'."

    shown = [c for c in (cols["name"], cols.get("artist"), "genres_primary", cols["popularity"]) if c]
    return df.iloc[filt][shown].reset_index(drop=True)


In [72]:
def year_recommender(song_name: str, df: pd.DataFrame, cols: Dict[str, str],
                     fidx: FeatureIndex, year: int | float, n: int = 10) -> pd.DataFrame | str:
    """Recommend similar tracks released in a given year.

    Only the ``max(n*10, 100)`` nearest neighbours of the seed are considered,
    so fewer than ``n`` rows (or none) can come back even if more tracks from
    that year exist further away.

    Parameters
    ----------
    song_name : seed track, matched by lower-cased name.
    year      : target year; truncated to an integer before comparing.

    Returns
    -------
    A frame (fresh 0..k-1 index) with name, artist, year and popularity, or a
    message string when the song is unknown, the dataset has no year column,
    or no neighbour matches. The artist column is omitted when the dataset has
    none (``cols["artist"]`` is ``None``).

    Raises
    ------
    ValueError : ``year`` cannot be converted to an integer (e.g. NaN).
    """
    key = str(song_name).lower()
    if key not in fidx.name_to_idx:
        return f"Song '{song_name}' not found."
    year_col = cols.get("year")
    if not year_col or year_col not in df.columns:
        return "No year information available in dataset."
    idx = fidx.name_to_idx[key]

    target_year = int(year)  # converted once, not once per candidate
    # Coerce the whole column up front so text years ("1921", "1921.0") compare
    # correctly and unparseable cells become NaN instead of raising in int().
    years = pd.to_numeric(df[year_col], errors="coerce").astype(float).to_numpy()

    candidates = _nearest_by_similarity(idx, fidx.sim, max(n*10, 100))
    filt = [j for j in candidates if np.isfinite(years[j]) and int(years[j]) == target_year][:n]
    if not filt:
        return f"No similar songs found in year {year}."

    shown = [c for c in (cols["name"], cols.get("artist"), year_col, cols["popularity"]) if c]
    return df.iloc[filt][shown].reset_index(drop=True)


In [73]:
def explain_recommendation(query_song: str, candidate_row: pd.Series,
                           df: pd.DataFrame, cols: Dict[str, str], fidx: FeatureIndex,
                           top_features: int = 3) -> Dict[str, object]:
    """Explain why ``candidate_row`` was recommended for ``query_song``.

    Compares the scaled feature vectors of the two tracks and reports their
    cosine similarity together with the ``top_features`` features on which
    they differ least.

    Parameters
    ----------
    query_song    : seed track, matched by lower-cased name.
    candidate_row : row describing the recommended track. It may be a row of
                    ``df`` or a row of a recommender result (whose index was
                    reset); see the resolution rule below.
    df            : frame the index was built from (0..n-1 index expected).
    top_features  : how many closest features to list.

    Returns
    -------
    Dict with "query", "candidate", "artist", "popularity",
    "cosine_similarity", "key_features" and "why"; or ``{"error": ...}`` when
    the query song or the candidate cannot be located.

    Candidate resolution
    --------------------
    The row label is trusted as a dataset position only if the track at that
    position has the candidate's name. Otherwise the candidate is located by
    name (narrowed by artist when available) and the first match is used, so
    a row with a reset index no longer explains an unrelated track.
    """
    key = str(query_song).lower()
    if key not in fidx.name_to_idx:
        return {"error": f"Song '{query_song}' not found."}
    q_idx = fidx.name_to_idx[key]

    name_col = cols["name"]
    artist_col = cols.get("artist")
    cand_name = candidate_row[name_col]

    label = candidate_row.name
    label_is_position = (
        isinstance(label, (int, np.integer))
        and not isinstance(label, bool)
        and 0 <= label < len(df)
        and df[name_col].iloc[int(label)] == cand_name
    )
    if label_is_position:
        c_idx = int(label)
    else:
        hit_mask = (df[name_col] == cand_name).to_numpy()
        if artist_col and artist_col in df.columns and artist_col in candidate_row.index:
            narrowed = hit_mask & (df[artist_col] == candidate_row[artist_col]).to_numpy()
            if narrowed.any():
                hit_mask = narrowed
        hits = np.flatnonzero(hit_mask)
        if hits.size == 0:
            return {"error": f"Candidate '{cand_name}' not found in dataset."}
        c_idx = int(hits[0])

    q_vec = fidx.feature_matrix[q_idx, :]
    c_vec = fidx.feature_matrix[c_idx, :]

    # Epsilon guards against a zero-length vector.
    cos_sim = float(np.dot(q_vec, c_vec) / (np.linalg.norm(q_vec) * np.linalg.norm(c_vec) + 1e-9))
    diffs = np.abs(q_vec - c_vec)
    # Smallest absolute difference first = the features the tracks agree on most.
    ranked = sorted(zip(fidx.features, diffs), key=lambda t: t[1])
    top_feat = [f for f, _ in ranked[:top_features]]

    rationale = [f"similar {f}" for f in top_feat]
    if rationale:
        why = f"Because it has {', '.join(rationale)} and matches your track's audio profile."
    else:
        # No feature requested/available: avoid the dangling "it has  and".
        why = "Because it matches your track's audio profile."

    popularity = candidate_row[cols["popularity"]]
    return {
        "query": df[name_col].iloc[q_idx],
        "candidate": cand_name,
        "artist": _safe_get(candidate_row, artist_col, ""),
        "popularity": float(popularity) if pd.notna(popularity) else None,
        "cosine_similarity": round(cos_sim, 4),
        "key_features": top_feat,
        "why": why
    }

In [74]:

# Load the CSV at DATA_PATH; load_dataset also resolves the column roles (`cols`)
# and the audio features present in the file (`features`). Later cells reuse all three.
print("Loading dataset...")
df, cols, features = load_dataset(DATA_PATH)
print(f"Rows: {len(df)}, Features found: {features}")



Loading dataset...
Rows: 2450, Features found: ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']


In [75]:
# Scale the features and precompute the pairwise similarity matrix once;
# every seed-based recommender below reads from `fidx`.
fidx = build_feature_index(df, cols, features)

# Baseline that ignores any seed: the 5 tracks with the highest popularity.
print("\nTop Popular Tracks:")
print(popularity_recommender(df, cols, n=5))



Top Popular Tracks:
                                             name  \
0  All of Me (with Eddie Heywood & His Orchestra)   
1                               Monster FaladorÃ©   
2                                     Tea for Two   
3                                  Mack the Knife   
4                                      Summertime   

                               artists  popularity  
0  ['Billie Holiday', 'Eddie Heywood']          64  
1                       ['Joe Quartz']          55  
2                        ['Art Tatum']          53  
3                  ['Louis Armstrong']          52  
4                   ['Billie Holiday']          52  


In [76]:

# Use the name of the most popular track as the seed for the demos below.
# NOTE(review): the recommenders resolve the seed by lower-cased name, so if
# several rows share this name they may start from a different row — confirm.
seed = df.sort_values(by=cols["popularity"], ascending=False).iloc[0][cols["name"]]
print(f"\nUsing seed track for demo: {seed!r}")



Using seed track for demo: 'All of Me (with Eddie Heywood & His Orchestra)'


In [77]:

# Nearest neighbours of the seed by similarity of the scaled audio features only.
print("\nContent-based recommendations:")
print(content_recommender(seed, df, cols, fidx, n=5))



Content-based recommendations:
                                                name  \
0                                     Drinking Blues   
1                   That's All I Ask of You - Take 1   
2  Gloomy Sunday (with Teddy Wilson & His Orchest...   
3  Until the Real Thing Comes Along (with Teddy W...   
4  Love Me Tonight (feat. Frank Trumbauer with Le...   

                                             artists  popularity  
0                                  ['Lucille Bogan']          10  
1                                 ['Billie Holiday']          10  
2                 ['Billie Holiday', 'Teddy Wilson']          12  
3                 ['Billie Holiday', 'Teddy Wilson']          26  
4  ['Bing Crosby', 'Frank Trumbauer', 'Lennie Hay...           7  


In [78]:

# Blend of similarity and normalized popularity; alpha=0.7 gives similarity a
# weight of 0.7 and popularity the remaining 0.3.
print("\nHybrid recommendations:")
print(hybrid_recommender(seed, df, cols, fidx, n=5, alpha=0.7))



Hybrid recommendations:
                                                name  \
0  Gloomy Sunday (with Teddy Wilson & His Orchest...   
1  If You Were Mine (with Teddy Wilson & His Orch...   
2          Nobody Knows You When You're Down and Out   
3  Georgia On My Mind (with Eddie Heywood & His O...   
4                                     The Man I Love   

                               artists  popularity  
0   ['Billie Holiday', 'Teddy Wilson']          52  
1   ['Billie Holiday', 'Teddy Wilson']          42  
2                     ['Bessie Smith']          42  
3  ['Billie Holiday', 'Eddie Heywood']          32  
4                   ['Billie Holiday']          47  


In [79]:

# Neighbours restricted to the seed's primary genre. The call returns a message
# string instead of a frame when the seed has no primary genre.
print("\nGenre-constrained recommendations:")
print(genre_recommender(seed, df, cols, fidx, n=5))



Genre-constrained recommendations:
No genre info available to constrain recommendations.


In [80]:

# Year-filtered demo, skipped when the dataset has no year column.
# NOTE: the target year is read from the FIRST row of df, not from the seed
# track, so the result is "neighbours of the seed released in that row's year".
if cols["year"]:
    year_val = df.iloc[0][cols["year"]]
    if pd.notna(year_val):
        print(f"\nYear-constrained recommendations (year={int(year_val)}):")
        print(year_recommender(seed, df, cols, fidx, year=int(year_val), n=5))




Year-constrained recommendations (year=1921):
                                               name  \
0                                A Ballynure Ballad   
1                      That's How You Spell Ireland   
2                                My Wild Irish Rose   
3    I Met Her In The Garden Where The Praties Grow   
4  A Midsummer Night's Dream, Op. 61: Wedding March   

                                     artists  year  popularity  
0                      ['Christopher Lynch']  1921           0  
1                          ['Morton Downey']  1921           0  
2                          ['Morton Downey']  1921           0  
3                      ['Christopher Lynch']  1921           0  
4  ['Felix Mendelssohn', 'Arturo Toscanini']  1921           0  


In [81]:
# Explain the top content-based pick. Recommender results carry a fresh 0..n-1
# index, so the track is looked up again in df by name to obtain a row whose
# label is its position in df (explain_recommendation reads that label).
# NOTE(review): the lookup takes the first row with that name, which may be a
# different track when names repeat — confirm.
c_res = content_recommender(seed, df, cols, fidx, n=5)
if not isinstance(c_res, str) and len(c_res) > 0:
    first_rec_name = c_res.iloc[0][cols["name"]]
    first_row = df[df[cols["name"]] == first_rec_name].iloc[0]
    exp = explain_recommendation(seed, first_row, df, cols, fidx, top_features=3)
    print("\nExplanation for first content recommendation:")
    print(exp)



Explanation for first content recommendation:
{'query': 'All of Me (with Eddie Heywood & His Orchestra)', 'candidate': 'Drinking Blues', 'artist': "['Lucille Bogan']", 'popularity': 10.0, 'cosine_similarity': 0.9442, 'key_features': ['instrumentalness', 'speechiness', 'energy'], 'why': "Because it has similar instrumentalness, similar speechiness, similar energy and matches your track's audio profile."}
