In [113]:
import numpy as np
import pandas as pd
import ast
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Dropout, BatchNormalization, GlobalAveragePooling1D, concatenate
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# show every column when a DataFrame is displayed (None disables the column limit)
pd.set_option('display.max_columns', None)

# load the track table; the relative path is resolved against the current working directory
data = pd.read_csv("../datasets/sam_df_clean.csv")

# parse genre string as a Python list
def parse_genre(s):
    """Parse one ``track_genre`` cell into a list of genres.

    Parameters
    ----------
    s : object
        Usually the string form of a Python list, e.g. ``"['acoustic']"``.
        A list or tuple is accepted as-is, so re-running the cell on an
        already-parsed column does not corrupt it.

    Returns
    -------
    list
        The parsed list; a one-element list holding ``str(value)`` when the
        input is not a list literal (bare name, scalar literal, NaN, ...).
    """
    # Already parsed: literal_eval would reject a list object and the fallback
    # would wrap its repr, producing ["['acoustic']"].
    if isinstance(s, (list, tuple)):
        return list(s)

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # fallback: wrap as a single-element list
        return [str(s)]

    # literal_eval can also yield a scalar (e.g. "'pop'" -> 'pop'); always return a list
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [str(parsed)]
# parse every track_genre cell with parse_genre
data['track_genre'] = data['track_genre'].apply(parse_genre)


# quick inspection: dimensions, column names, then the first rows
print(data.shape)
print(data.columns.tolist())
data.head()

(83679, 20)
['track_id', 'artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre']


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,[acoustic]
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,[acoustic]
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,[acoustic]
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,[acoustic]
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,[acoustic]


# Genre Feature Engineering:

Start by collecting, for each track, the list of unique genres in `track_genre`, so we can check whether any track has more than one genre.

In [114]:
def _unique_genres(values):
    """Return the sorted, de-duplicated genre names found in one track's rows.

    Each value is either a list/tuple of genre names or something whose string
    form is split on ``;``. Names are stripped of surrounding whitespace.
    Sorting makes the result independent of set iteration order, so the lists
    are identical across kernel restarts.
    """
    return sorted({
        genre.strip()
        for value in values
        for genre in (value if isinstance(value, (list, tuple)) else str(value).split(";"))
    })


# concise aggregation: collect unique genres per track_id
genre_dataset = (
    data.groupby("track_id")["track_genre"]
        .agg(_unique_genres)
        .reset_index()
)

genre_dataset.head()

Unnamed: 0,track_id,track_genre
0,0000vdREvCVMxbQTkS888c,[german]
1,000Iz0K615UepwSJ5z2RE5,[minimal-techno]
2,000RDCYioLteXcutOjeweY,[hip-hop]
3,000qpdoc97IMTBvF8gwcpy,[minimal-techno]
4,0017XiMkqbTfF2AUOzlhj6,[comedy]


In [115]:
# keep only the tracks whose aggregated genre list holds more than one entry
genre_counts = genre_dataset["track_genre"].apply(len)
multi_genre_df = genre_dataset[genre_counts > 1]

if not multi_genre_df.empty:
    # show the first multi-genre track as a plain dict
    multi_genre_song = multi_genre_df.iloc[0]
    print(multi_genre_song.to_dict())
else:
    print("No multi-genre songs found.")


No multi-genre songs found.


Now changing the format of the genre values, from list to string

In [116]:
def _first_genre(value):
    """Collapse one ``track_genre`` cell to a single genre name.

    - list literal string (``"['forro']"``) or list: first element;
    - plain genre name (``"forro"``): returned unchanged, which makes this
      cell safe to re-run after the column has already been flattened;
    - empty list, empty string or anything else: ``"uncategorized"``.
    """
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            # not a Python literal, i.e. a bare genre name such as "hip-hop"
            return value if value else "uncategorized"
    if isinstance(value, list) and value:
        return value[0]
    if isinstance(value, str) and value:
        return value
    return "uncategorized"


# Flatten the per-row genre list to a single string (e.g. "forro").
# `ast` comes from the import cell at the top of the notebook.
data["track_genre"] = data["track_genre"].apply(_first_genre)

# Now, get unique genres
unique_genres = data["track_genre"].unique()
print(unique_genres)


<StringArray>
[   'acoustic',    'afrobeat',    'alt-rock', 'alternative',     'ambient',
       'anime', 'black-metal',   'bluegrass',       'blues',      'brazil',
 ...
     'spanish',       'study',     'swedish',   'synth-pop',       'tango',
      'techno',      'trance',    'trip-hop',     'turkish', 'world-music']
Length: 113, dtype: str


Now create a new aggregated genre category to support the genre indices:

- each high-level genre category groups a list of specific genre names under one top-level label

In [117]:
# Top-level genre categories: each key is a category label and each value is the
# list of specific track_genre names grouped under it. add_genre_cat inverts this
# into a genre -> category lookup; genres missing here end up as 'uncategorized'.
genre_categories = {
    'pop-mainstream': [
        'pop', 'pop-film', 'power-pop', 'k-pop', 'j-pop', 'mandopop', 
        'cantopop', 'indie-pop', 'synth-pop', 'j-idol'
    ],
    'rock': [
        'rock', 'alt-rock', 'grunge', 'punk', 'punk-rock', 'indie', 
        'psych-rock', 'garage', 'rock-n-roll', 'rockabilly', 'hard-rock'
    ],
    'electronic': [
        'house', 'techno', 'trance', 'dubstep', 'edm', 'electro', 'electronic',
        'drum-and-bass', 'deep-house', 'progressive-house', 'chicago-house',
        'detroit-techno', 'hardstyle', 'minimal-techno', 'idm'
    ],
    'hiphop-rnb': [
        'hip-hop', 'r-n-b'
    ],
    'metal': [
        'metal', 'heavy-metal', 'death-metal', 'black-metal', 'metalcore',
        'grindcore', 'hardcore'
    ],
    'country-folk': [
        'country', 'folk', 'honky-tonk', 'singer-songwriter', 'songwriter'
    ],
    'jazz-blues': [
        'jazz', 'blues', 'soul'
    ],
    'world-regional': [
        'latin', 'latino', 'afrobeat', 'brazil', 'forro', 'salsa', 'samba',
        'sertanejo', 'pagode', 'mpb', 'french', 'spanish', 'german', 'swedish',
        'indian', 'iranian', 'malay', 'turkish', 'j-dance', 'j-rock'
    ],
    'dance-club': [
        'dance', 'dancehall', 'disco', 'club', 'reggaeton', 'reggae', 'dub'
    ],
    'classical': [
        'classical', 'opera', 'new-age'
    ],
    # catch-all bucket for mood, instrument and audience labels
    'niche-mood': [
        'acoustic', 'ambient', 'anime', 'bluegrass', 'breakbeat', 'british',
        'children', 'chill', 'comedy', 'disney', 'emo', 'funk', 'gospel', 
        'goth', 'guitar', 'groove', 'happy', 'industrial', 'kids', 'party',
        'piano', 'romance', 'sad', 'show-tunes', 'ska', 'sleep', 'study', 'world-music'
    ]
}


Create function to add new top category to df

Call the function with the data df

In [118]:
def add_genre_cat(X: pd.DataFrame, categories=None) -> pd.DataFrame:
    """Return a copy of ``X`` with a ``genre_subcategory`` column added.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain a ``track_genre`` column with one genre name per row.
    categories : dict, optional
        Mapping ``category label -> list of genre names``. Defaults to the
        notebook-level ``genre_categories`` dictionary, so existing calls
        ``add_genre_cat(df)`` behave as before.

    Returns
    -------
    pd.DataFrame
        Copy of ``X``; genres absent from ``categories`` are labelled
        ``'uncategorized'``. The input frame is not modified.

    Side effects
    ------------
    Prints the value counts of the new column.
    """
    if categories is None:
        categories = genre_categories

    # invert {category: [genres]} into {genre: category}; if a genre is listed
    # under several categories the last one wins
    genre_to_cat = {
        genre: cat
        for cat, genres in categories.items()
        for genre in genres
    }

    X = X.copy()
    X['genre_subcategory'] = X['track_genre'].map(genre_to_cat).fillna('uncategorized')
    print(X['genre_subcategory'].value_counts())

    return X

In [119]:
# Add the genre categories using the add_genre_cat function.
# NOTE: this rebinds multi_genre_df, which an earlier cell used for the
# multi-genre filter result; from here on it is `data` plus genre_subcategory.
multi_genre_df = add_genre_cat(data)

genre_subcategory
niche-mood        24134
world-regional    15528
electronic        10467
pop-mainstream     7585
rock               6308
metal              5516
dance-club         4340
country-folk       2781
classical          2549
uncategorized      2125
jazz-blues         1231
hiphop-rnb         1115
Name: count, dtype: int64


In [120]:
# Flag the rows whose genre was not mapped to any top-level category
uncategorized_mask = multi_genre_df["genre_subcategory"] == "uncategorized"
uncategorized_count = uncategorized_mask.sum()

print(f"Number of uncategorized genres: {uncategorized_count}")

# Print the flagged rows for inspection
uncategorized_rows = multi_genre_df[uncategorized_mask]
print(uncategorized_rows)

Number of uncategorized genres: 2125
                     track_id                        artists  \
2779   09gysnJpfQ3ublBmJDfcEC       Amaarae;Kali Uchis;Moliy   
2780   4fouWK6XVHhzl78KzQ1UjL                          GAYLE   
2781   07MDkzWARZaLEdKxo6yArG  THE ANXIETY;WILLOW;Tyler Cole   
2782   60a0Rd6pjrkxjPbaKzXjfq                    Linkin Park   
2783   2nLtzopw4rPReszdYBJU6h                    Linkin Park   
...                       ...                            ...   
81909  3TpcGANz2N705Bq4zc982H                   Roots Manuva   
81910  57qWtXga1hMwSfkhLDJCKQ        Everything But The Girl   
81911  0xcDUsknTawAv5VZKQ62aZ                     Wax Tailor   
81912  7LVOum3l1HrbitK9AoUPd7                     Cibo Matto   
81913  2Mt8qdFRaaIgzsf2bxwH4Y  Lovage;Nathaniel Merriweather   

                                              album_name  \
2779   SAD GIRLZ LUV MONEY Remix (feat. Kali Uchis an...   
2780                                             abcdefu   
2781          

In [121]:
# Distinct track_genre values that landed in the "uncategorized" bucket
is_uncategorized = multi_genre_df["genre_subcategory"] == "uncategorized"
uncategorized_tracks_unique = multi_genre_df.loc[is_uncategorized, "track_genre"].unique()

# One-column frame holding those genre names
uncategorized_tracks_df = pd.DataFrame({"track_genre": uncategorized_tracks_unique})

# Write it to CSV (relative path, so it lands in the current working directory)
output_file_path = "uncategorized_tracks.csv"  # Adjust the name/path as needed
uncategorized_tracks_df.to_csv(output_file_path, index=False)

print(f"Uncategorized tracks have been saved to {output_file_path}")


Uncategorized tracks have been saved to uncategorized_tracks.csv


Checking the CSV, we see that 'alternative', 'tango' and 'trip-hop' were not categorised, so we add them to the dictionary below.

In [122]:
# Revised top-level genre categories. Same content as genre_categories plus the
# three genres that were previously uncategorized:
#   'trip-hop' -> 'hiphop-rnb', 'tango' -> 'world-regional', 'alternative' -> 'niche-mood'
genre_categories2 = {
    'pop-mainstream': [
        'pop', 'pop-film', 'power-pop', 'k-pop', 'j-pop', 'mandopop', 
        'cantopop', 'indie-pop', 'synth-pop', 'j-idol'
    ],
    'rock': [
        'rock', 'alt-rock', 'grunge', 'punk', 'punk-rock', 'indie', 
        'psych-rock', 'garage', 'rock-n-roll', 'rockabilly', 'hard-rock'
    ],
    'electronic': [
        'house', 'techno', 'trance', 'dubstep', 'edm', 'electro', 'electronic',
        'drum-and-bass', 'deep-house', 'progressive-house', 'chicago-house',
        'detroit-techno', 'hardstyle', 'minimal-techno', 'idm'
    ],
    # added: 'trip-hop'
    'hiphop-rnb': [
        'hip-hop', 'r-n-b','trip-hop'
    ],
    'metal': [
        'metal', 'heavy-metal', 'death-metal', 'black-metal', 'metalcore',
        'grindcore', 'hardcore'
    ],
    'country-folk': [
        'country', 'folk', 'honky-tonk', 'singer-songwriter', 'songwriter'
    ],
    'jazz-blues': [
        'jazz', 'blues', 'soul'
    ],
    # added: 'tango'
    'world-regional': [
        'latin', 'latino', 'afrobeat', 'brazil', 'forro', 'salsa', 'samba',
        'sertanejo', 'pagode', 'mpb', 'french', 'spanish', 'german', 'swedish',
        'indian', 'iranian', 'malay', 'turkish', 'j-dance', 'j-rock', 'tango'
    ],
    'dance-club': [
        'dance', 'dancehall', 'disco', 'club', 'reggaeton', 'reggae', 'dub'
    ],
    'classical': [
        'classical', 'opera', 'new-age'
    ],
    # added: 'alternative'
    'niche-mood': [
        'acoustic', 'alternative', 'ambient', 'anime', 'bluegrass', 'breakbeat', 'british',
        'children', 'chill', 'comedy', 'disney', 'emo', 'funk', 'gospel', 
        'goth', 'guitar', 'groove', 'happy', 'industrial', 'kids', 'party',
        'piano', 'romance', 'sad', 'show-tunes', 'ska', 'sleep', 'study', 'world-music'
    ]
}


In [123]:
def add_genre_cat(X: pd.DataFrame, categories=None) -> pd.DataFrame:
    """Return a copy of ``X`` with a ``genre_subcategory`` column added.

    This redefinition replaces the earlier ``add_genre_cat`` in the kernel.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain a ``track_genre`` column with one genre name per row.
    categories : dict, optional
        Mapping ``category label -> list of genre names``. Defaults to the
        notebook-level ``genre_categories2`` dictionary, so existing calls
        ``add_genre_cat(df)`` behave as before. Pass another mapping to
        reuse the function without redefining it.

    Returns
    -------
    pd.DataFrame
        Copy of ``X``; genres absent from ``categories`` are labelled
        ``'uncategorized'``. The input frame is not modified.

    Side effects
    ------------
    Prints the value counts of the new column.
    """
    if categories is None:
        categories = genre_categories2

    # invert {category: [genres]} into {genre: category}; if a genre is listed
    # under several categories the last one wins
    genre_to_cat = {
        genre: cat
        for cat, genres in categories.items()
        for genre in genres
    }

    X = X.copy()
    X['genre_subcategory'] = X['track_genre'].map(genre_to_cat).fillna('uncategorized')
    print(X['genre_subcategory'].value_counts())

    return X

In [124]:
# re-run the categorisation now that add_genre_cat reads genre_categories2
multi_genre_df2 = add_genre_cat(data)

genre_subcategory
niche-mood        24383
world-regional    16522
electronic        10467
pop-mainstream     7585
rock               6308
metal              5516
dance-club         4340
country-folk       2781
classical          2549
hiphop-rnb         1997
jazz-blues         1231
Name: count, dtype: int64


There are no uncategorized in the genre sub-category now

In [125]:
# display the categorised frame (the notebook shows a truncated view: first and last rows)
multi_genre_df2

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,genre_subcategory
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.4610,1,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,4,acoustic,niche-mood
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.420,0.1660,1,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,4,acoustic,niche-mood
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,4,acoustic,niche-mood
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,3,acoustic,niche-mood
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.4430,2,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,4,acoustic,niche-mood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83674,2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,21,384999,False,0.172,0.2350,5,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,5,world-music,niche-mood
83675,1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,22,385000,False,0.174,0.1170,0,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,4,world-music,niche-mood
83676,6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,Best Of,Miss Perfumado,22,271466,False,0.629,0.3290,0,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,4,world-music,niche-mood
83677,2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,Change Your World,Friends,41,283893,False,0.587,0.5060,7,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,4,world-music,niche-mood


**Code below**:

- Builds two separate vocabularies of unique values, one for `track_genre` and one for `genre_subcategory`.
- Adds the `track_genre_id` and `genre_subcat_id` columns, each holding the integer index of that row's genre / sub-category in its vocabulary.

In [126]:

# one vocabulary per field, in order of first appearance (the two are never merged)
unique_track_genres = pd.Series(multi_genre_df2['track_genre'].unique())
unique_subcats = pd.Series(multi_genre_df2['genre_subcategory'].unique())

# value -> position lookups
track_genre_to_index = dict(zip(unique_track_genres, range(len(unique_track_genres))))
subcat_to_index = dict(zip(unique_subcats, range(len(unique_subcats))))

# attach one integer id column per field
multi_genre_df2['track_genre_id'] = multi_genre_df2['track_genre'].map(track_genre_to_index)
multi_genre_df2['genre_subcat_id'] = multi_genre_df2['genre_subcategory'].map(subcat_to_index)

# diagnostics: vocabulary sizes and a peek at the new columns
num_track_genres, num_subcats = len(track_genre_to_index), len(subcat_to_index)
print("num unique track_genre:", num_track_genres)
print("num unique genre_subcategory:", num_subcats)
id_preview_columns = ['track_genre', 'track_genre_id', 'genre_subcategory', 'genre_subcat_id']
print(multi_genre_df2[id_preview_columns].head())


num unique track_genre: 113
num unique genre_subcategory: 11
  track_genre  track_genre_id genre_subcategory  genre_subcat_id
0    acoustic               0        niche-mood                0
1    acoustic               0        niche-mood                0
2    acoustic               0        niche-mood                0
3    acoustic               0        niche-mood                0
4    acoustic               0        niche-mood                0


**Creates a dictionary mapping each song ID (`track_id`) to its row index.**

In [None]:
# Song id -> positional row index in multi_genre_df2 (needed for recommendations later).
# If an id occurs more than once, the last position is kept.
song_ids = multi_genre_df2['track_id'].values
id2index = dict(zip(song_ids, range(len(song_ids))))

**Next scale and prepare numeric/audio features for the numeric branch of the model.**
- Why:
  - Ensure features have comparable ranges so no single feature dominates learning.
  - Keep preprocessing explicit and reproducible.
  - Provide a float32 matrix ready to feed the neural network.
- Steps:
  - Select relevant numeric/song attributes.
  - Standardize (zero mean, unit variance) using StandardScaler.
  - Inspect a few rows to confirm transformation.

In [144]:
# numeric attributes that make up the sound profile fed to the numeric branch of the model
numeric_features = [
    'popularity', 'danceability', 'duration_ms', 'energy', 'key', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo',
]

# standardise each column to zero mean / unit variance
sound_profile_frame = multi_genre_df2[numeric_features]
scaler = StandardScaler().fit(sound_profile_frame)
X_numeric_sound_profile_input = scaler.transform(sound_profile_frame)

# first rows, to confirm the scaling was applied
X_numeric_sound_profile_input[:3]

array([[ 1.94952593,  0.65611278, -0.00418544, -0.67536733, -1.20421411,
         0.34589524,  0.47188738, -0.87493611, -0.55115742,  0.70276957,
         0.94840511, -1.13937629],
       [ 1.03073852, -0.78991886, -0.70635476, -1.81827102, -1.20421411,
        -1.64272524, -0.10521842,  1.75081707, -0.5511436 , -0.59861208,
        -0.75329035, -1.48629541],
       [ 1.13282601, -0.68824476, -0.17605475, -1.07054081, -1.48533872,
        -0.22060281, -0.28345499, -0.3514343 , -0.55116048, -0.51759221,
        -1.31165917, -1.52478653]])

**Prepare integer index arrays and vocabulary sizes for separate genre and subcategory embeddings:**
- Why:
  - Use separate embeddings for track-level genres and subcategories to preserve within-field likeness.
  - Compute vocabulary sizes (input_dim) for each Embedding layer from max index + 1.
  - Ensure index arrays are int32 and inspect diagnostics before modeling.
- Steps:
  - Extract integer index arrays for each field.
  - Compute vocab sizes as max_index + 1.
  - Run basic diagnostics and ensure correct dtypes.

In [138]:

# integer id arrays, one entry per song (shape (n_samples,)); already int32,
# so no further cast is needed
X_track = multi_genre_df2['track_genre_id'].astype('int32').values
X_subcat = multi_genre_df2['genre_subcat_id'].astype('int32').values

# (track_genre_id, genre_subcat_id) pairs side by side, shape (n_samples, 2).
# Built here so the diagnostics below do not depend on a variable left over
# from an earlier kernel session.
X_genre_pair = np.column_stack([X_track, X_subcat])

# vocab sizes for separate embeddings: Embedding input_dim must be max index + 1
num_track_genres = int(X_track.max()) + 1
num_subcats = int(X_subcat.max()) + 1

# diagnostics
print("X_genre_pair.shape:", X_genre_pair.shape)      # (n_samples, 2)
print("example pair (index 26):", X_genre_pair[26])
print("num_track_genres (vocab size):", num_track_genres)
print("num_subcats (vocab size):", num_subcats)


X_genre_pair.shape: (83679, 2)
example pair (index 26): [0 0]
num_track_genres (vocab size): 113
num_subcats (vocab size): 11


**Build an NN that:**

- (1) scales numeric sound features and passes them through a neural network pipeline 
- (2) learns two separate embedding layers — one for track_genre tokens and one for genre_subcategory tokens 
- (3) flattens/combines those embeddings with the numeric branch (sound profile), and 
- (4) outputs a 64‑dimensional song embedding

In [149]:
# data arrays fed to the three model inputs
X_track = multi_genre_df2['track_genre_id'].astype('int32').values      # shape (n,)
X_subcat = multi_genre_df2['genre_subcat_id'].astype('int32').values    # shape (n,)
X_numeric = X_numeric_sound_profile_input.astype('float32')             # scaled numeric feature matrix

# NOTE(review): embedding_dim is not referenced by the layers below, which
# hard-code output_dim=24 and output_dim=8 — confirm whether it is still needed.
embedding_dim = 32

# model inputs: one integer index per song for each categorical field,
# plus one row of numeric features
track_input = Input(shape=(1,), dtype='int32', name='track_input')
subcat_input = Input(shape=(1,), dtype='int32', name='subcat_input')
numeric_input = Input(shape=(X_numeric.shape[1],), name='numeric_input')

# numeric branch (song profile) NN pipeline
x_numeric = BatchNormalization()(numeric_input)
x_numeric = Dense(64, activation='relu')(x_numeric)
x_numeric = Dropout(0.2)(x_numeric)

# embeddings (separate): num_track_genres / num_subcats come from an earlier cell
track_emb = Embedding(input_dim=num_track_genres, output_dim=24, name='track_embedding')(track_input)
track_emb = Flatten()(track_emb)        # (batch, 24)

subcat_emb = Embedding(input_dim=num_subcats, output_dim=8, name='subcat_embedding')(subcat_input)
subcat_emb = Flatten()(subcat_emb)      # (batch, 8)

# combine the three branches and project to the 64-dimensional song embedding
x = concatenate([track_emb, subcat_emb, x_numeric])
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
song_embedding = Dense(64, activation=None, name='song_embedding')(x)

model = Model(inputs=[track_input, subcat_input, numeric_input], outputs=song_embedding)
# NOTE(review): no model.fit(...) call is visible in this part of the notebook; unless
# training happens elsewhere, model.predict will use the initial (untrained) weights.
model.compile(optimizer='adam', loss='mse')  # adjust loss for your task
model.summary()


# All songs get an embedding

- Prepare inputs: ensure genre indices are int32 and numeric features are float32.
- Predict song embeddings from the model using batch inference.
- L2-normalize embeddings (unit vectors) for cosine-similarity search.

In [141]:

# model inputs: integer genre indices (int32) and the numeric matrix (float32)
X_track_in, X_subcat_in = X_track.astype('int32'), X_subcat.astype('int32')
X_numeric_in = X_numeric_sound_profile_input.astype('float32')

# one embedding row per song, computed in batches
all_embeddings = model.predict([X_track_in, X_subcat_in, X_numeric_in], batch_size=1024)

# L2-normalise every row for cosine similarity; zero-norm rows are divided by 1
norms = np.linalg.norm(all_embeddings, axis=1, keepdims=True)
norms = np.where(norms == 0, norms.dtype.type(1.0), norms)
all_embeddings = all_embeddings / norms


[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


**Dictionary mapping each song ID to its embedding vector:**

- look up a song’s vector when computing similarities or generating recommendations
- if you know your favorite song’s ID, you can get its embedding via id2embedding[fav_song_id] and then compare it to all other songs

In [150]:
# song id -> normalised embedding row (song_ids and all_embeddings are paired by position)
id2embedding = dict(zip(song_ids, all_embeddings))

**The recommender function:**

- input a single song ID, or a list of several
- when you input multiple songs, their embeddings are averaged into a single embedding, and similar songs are then picked based on that average

In [151]:
def recommend_songs(favorite_song_ids, top_k=5):
    """Recommend the songs most similar to one or more favourite songs.

    Reads the notebook-level ``id2embedding`` (song id -> embedding),
    ``id2index`` (song id -> row position) and ``multi_genre_df2``.

    Parameters
    ----------
    favorite_song_ids : iterable
        Song ids (``track_id`` values). Ids without an embedding are ignored.
    top_k : int, default 5
        Number of results to return.

    Returns
    -------
    list of tuple
        ``(song_id, similarity, track_name)`` sorted by decreasing similarity,
        or ``[]`` (after printing a message) when none of the ids is known.
        The favourite songs themselves are not excluded from the results.
    """
    # Get embeddings for the favorite songs
    fav_embs = [id2embedding[sid] for sid in favorite_song_ids if sid in id2embedding]
    if len(fav_embs) == 0:
        print("No favorite songs found in embeddings.")
        return []

    # Average embeddings if multiple songs, then re-normalise. Skip the division
    # when the average is the zero vector to avoid NaNs.
    avg_emb = np.mean(fav_embs, axis=0)
    avg_norm = np.linalg.norm(avg_emb)
    if avg_norm > 0:
        avg_emb = avg_emb / avg_norm

    # Dot product with every song embedding
    scored = [(sid, np.dot(avg_emb, emb)) for sid, emb in id2embedding.items()]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Track names come from the frame id2index was built from; only the top-k
    # names are looked up.
    track_names = multi_genre_df2['track_name']
    return [
        (sid, score, track_names.iloc[id2index[sid]])
        for sid, score in scored[:top_k]
    ]