## Audio-based Song Similarity

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier

In [6]:
#loading datasets
tracks = pd.read_csv("/Users/sbathina/Desktop/archive/tracks.csv")
artists = pd.read_csv("/Users/sbathina/Desktop/archive/artists.csv")

In [7]:
#Extract first artist_id from id_artists
def get_first_artist_id(x):
    if isinstance(x, str):
        try:
            ids = eval(x)
            return ids[0] if ids else None
        except Exception:
            return None
    return None

tracks["artist_id"] = tracks["id_artists"].apply(get_first_artist_id)

In [8]:
# Keep needed artist columns and rename
artists_small = (
    artists[["id", "name", "genres", "followers", "popularity"]]
    .rename(columns={
        "id": "artist_id",
        "name": "artist_name",
        "popularity": "artist_popularity"
    })
)

In [9]:
# Join tracks + artists on artist_id
df = tracks.merge(artists_small, on="artist_id", how="left")

### Define a single main genre and split data by genre

In [10]:
# Get one main genre (first in list) per artist
def get_main_genre(g):
    if isinstance(g, str) and len(g) > 2:
        try:
            glist = eval(g)
            return glist[0] if glist else None
        except Exception:
            return None
    return None

df["main_genre"] = df["genres"].apply(get_main_genre)

# Keeping only rows with audio features + main_genre
AUDIO_FEATURES = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

df_feat = df.dropna(subset=AUDIO_FEATURES + ["main_genre"]).copy()

# split/filter by genre
by_genre = {
    g: df_feat[df_feat["main_genre"] == g]
    for g in df_feat["main_genre"].unique()
}

In [11]:
#Build X, y for genre prediction
X_raw = df_feat[AUDIO_FEATURES].values
y_genre = df_feat["main_genre"].values

scaler = StandardScaler()
X = scaler.fit_transform(X_raw)

#RandomForest for feature importance
rf = RandomForestClassifier(
    n_estimators=50,      # smaller to avoid killing kernel
    max_depth=10,
    max_features="sqrt",
    random_state=0,
    n_jobs=-1
)
rf.fit(X, y_genre)

feature_importances = (
    pd.Series(rf.feature_importances_, index=AUDIO_FEATURES)
      .sort_values(ascending=False)
)

feature_importances

speechiness         0.286423
acousticness        0.185280
loudness            0.107459
danceability        0.101736
energy              0.093551
valence             0.085280
instrumentalness    0.080665
tempo               0.029923
liveness            0.029684
dtype: float64

In [12]:
SIM_FEATURES = ["speechiness", "acousticness", "loudness", "danceability", "energy"]

In [13]:
from sklearn.neighbors import NearestNeighbors

#Build feature matrix using SIM_FEATURES
df_feat_sim = df_feat.dropna(subset=SIM_FEATURES).copy()

X_raw_sim = df_feat_sim[SIM_FEATURES].values
scaler_sim = StandardScaler()
X_sim = scaler_sim.fit_transform(X_raw_sim)

#Fit Nearest Neighbors model using cosine distance
nn_model_sim = NearestNeighbors(metric="cosine", algorithm="brute")
nn_model_sim.fit(X_sim)

In [14]:
def get_similar_songs_sim(input_track_id, k=10, same_genre_only=False):
    # find the query song row
    mask = df_feat_sim["id"] == input_track_id
    if not mask.any():
        raise ValueError("Track id not found in df_feat_sim")
    row = df_feat_sim[mask].iloc[0]
    idx = df_feat_sim.index.get_loc(row.name)

    #Query k+1 neighbors in SIM_FEATURES space
    x_query = X_sim[idx].reshape(1, -1)
    distances, indices = nn_model_sim.kneighbors(x_query, n_neighbors=k+1)

    distances = distances.flatten()
    indices = indices.flatten()

    #first neighbor is itself (distance 0), so we can drop it
    distances = distances[1:]
    indices = indices[1:]

    #cosine similarity = 1 - cosine distance
    similarities = 1 - distances

    neighbors = df_feat_sim.iloc[indices].copy()
    neighbors["distance"] = distances
    neighbors["similarity"] = similarities

    #restrict to same genre
    if same_genre_only:
        g0 = row["main_genre"]
        neighbors = neighbors[neighbors["main_genre"] == g0]

    #Return a clean table
    return neighbors[[
        "id", "name", "artists", "artist_name", "main_genre",
        "distance", "similarity"
    ]].sort_values("similarity", ascending=False)

In [15]:
example_id = df_feat_sim.iloc[0]["id"]
get_similar_songs_sim(example_id, k=10)

Unnamed: 0,id,name,artists,artist_name,main_genre,distance,similarity
18716,4hR1I8PGsnERdIDW8izZeZ,Winter In St. Louis,"['MGM Studio Orchestra', 'Georgie Stoll']",MGM Studio Orchestra,classic soundtrack,0.000114,0.999886
334543,0i6NDefE8ZAdYgJLV5DP9Z,Long Mae Ping / ล่องแม่ปิง,['Hucky Eichelmann'],Hucky Eichelmann,thai instrumental,0.000379,0.999621
24446,0bmMvv3wq0hwJdH4xXoxAU,"Violin Sonata No. 8 in G Major, Op. 30 No. 3: ...","['Ludwig van Beethoven', 'Zino Francescatti', ...",Ludwig van Beethoven,classical,0.000387,0.999613
3428,49b8VmiTERLWRaDffo5alJ,Eche Más Caña Patrón - Remasterizado,['Ignacio Corsini'],Ignacio Corsini,tango,0.000517,0.999483
191090,33sjOTo0KW2w7D9jaUA2pg,"Tomorrow Is a Long Time - Live at Town Hall, N...",['Bob Dylan'],Bob Dylan,album rock,0.000581,0.999419
570584,4k8fb4RVb4XrFN0x8ksjUh,"Sonata No. 11 in A Major, K. 331; III. (Rondo)...","['Wolfgang Amadeus Mozart', 'Lili Kraus']",Wolfgang Amadeus Mozart,classical,0.000627,0.999373
202199,4W2PENKiXUCJTGuiZaWH2u,Calcutta Blues,['Dave Brubeck'],Dave Brubeck,american modern classical,0.000639,0.999361
104627,3Ax8bGl9QHS42TXUrADXdx,"Horn Concerto No. 3 in E-Flat Major, K. 447: I...","['Wolfgang Amadeus Mozart', 'William Purvis', ...",Wolfgang Amadeus Mozart,classical,0.000655,0.999345
306085,41WbD1aKFOZX80YizSdzJo,Ströva omkring,['Jan Johansson'],Jan Johansson,swedish jazz,0.000693,0.999307
240599,1nzeu7AgQOy0nGlzbWWLa6,"Capriccio in B-Flat Major, BWV 992 ""On the Dep...","['Johann Sebastian Bach', 'Wilhelm Kempff']",Johann Sebastian Bach,baroque,0.00074,0.99926


## Evaluation

In [16]:
#checking if the similarity function is working or not
def artist_coherence_at_k_sim(k=10, n_samples=500):
    sampled = df_feat_sim.sample(
        n=min(n_samples, len(df_feat_sim)), random_state=0
    )
    scores = []

    for _, row in sampled.iterrows():
        song_id = row["id"]
        artist = row["artists"]

        neighbors = get_similar_songs_sim(song_id, k=k, same_genre_only=False)
        same_artist = neighbors["artists"] == artist
        scores.append(same_artist.mean())  # fraction of neighbors same artist

    return float(np.mean(scores))

In [17]:
def artist_coherence_random(k=10, n_samples=500, random_state=0):
    rng = np.random.default_rng(random_state)
    sampled = df_feat_sim.sample(
        n=min(n_samples, len(df_feat_sim)), random_state=random_state
    )
    scores = []

    n = len(df_feat_sim)
    all_idx = np.arange(n)

    for _, row in sampled.iterrows():
        artist = row["artists"]
        rng.shuffle(all_idx)
        rand_idx = all_idx[:k]
        neighbors = df_feat_sim.iloc[rand_idx]
        same_artist = neighbors["artists"] == artist
        scores.append(same_artist.mean())

    return float(np.mean(scores))

In [18]:
df_genre = df_feat_sim.dropna(subset=["main_genre"]).copy()
def genre_coherence_at_k_sim(k=10, n_samples=500):
    sampled = df_genre.sample(
        n=min(n_samples, len(df_genre)), random_state=0
    )
    scores = []

    for _, row in sampled.iterrows():
        song_id = row["id"]
        genre = row["main_genre"]

        neighbors = get_similar_songs_sim(song_id, k=k, same_genre_only=False)
        neighbors = neighbors.dropna(subset=["main_genre"])
        same_genre = neighbors["main_genre"] == genre
        scores.append(same_genre.mean())

    return float(np.mean(scores))

In [19]:
artist_coh_sim = artist_coherence_at_k_sim(10, 500)
genre_coh_sim = genre_coherence_at_k_sim(10, 500)
artist_coh_random = artist_coherence_random(10, 500)
artist_coh_sim, genre_coh_sim, artist_coh_random

(0.0254, 0.05840000000000001, 0.0004)