In [None]:
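# Used dataset : https://www.kaggle.com/datasets/joebeachcapital/30000-spotify-songs/data
# NOTE(review): the file loaded below is ./dataset/millionSongs.csv and the stacked embeddings
# have 1,204,025 rows, which does not match a 30,000-song dataset - confirm the actual source.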
import pandas as pd
import numpy as np
import pickle
import torch
import ast

from sentence_transformers import SentenceTransformer

# Report GPU availability. get_device_name(0) raises when no CUDA device is
# present, so the device name is only queried once CUDA is confirmed usable.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # expected: True on a GPU machine
if cuda_available:
    print(torch.cuda.get_device_name(0))

model = SentenceTransformer("all-MiniLM-L6-v2")  # free, fast, 384-dim

df = pd.read_csv("./dataset/millionSongs.csv")

# Parse each "artists" cell from its text form into a Python object with
# literal_eval; build_semantic_text later joins it as a sequence of names.
df["artists"] = df["artists"].apply(ast.literal_eval)

def build_semantic_text(row):
    """Render one song record as a single English description string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            read below; ``row["artists"]`` must be an iterable of strings.

    Returns:
        str: The song's name, artists, album, release info and audio
        features concatenated into one piece of text.
    """
    artist_names = ", ".join(row["artists"])
    # Each entry is one fragment of the final text; they are concatenated
    # as-is, so the trailing spaces inside the fragments matter.
    fragments = [
        f"Song: {row['name']} by {artist_names}. ",
        f"Album: {row['album']}. Released in {row['year']} on {row['release_date']}. ",
        f"This track is characterized by danceability {row['danceability']}, energy {row['energy']}, ",
        f"valence {row['valence']}, acousticness {row['acousticness']}, instrumentalness {row['instrumentalness']}, ",
        f"speechiness {row['speechiness']}, liveness {row['liveness']}, loudness {row['loudness']}, ",
        f"and tempo {row['tempo']} BPM. ",
        f"It has a key of {row['key']}, mode {row['mode']}, and a time signature of {row['time_signature']}. ",
        f"Duration: {row['duration_ms']} milliseconds.",
    ]
    return "".join(fragments)

# Build one descriptive sentence block per song.
df["semantic_text"] = df.apply(build_semantic_text, axis=1)

texts = df["semantic_text"].tolist()

# Encode all texts in batches (instead of one model.encode call per row).
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the cell does not fail on machines without CUDA.
encode_device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = model.encode(texts, batch_size=128, device=encode_device, show_progress_bar=True)

# Store each embedding as a plain Python list in its own DataFrame cell.
df["embedding"] = embeddings.tolist()

# Persist the frame (including embeddings) so encoding need not be repeated.
with open("./million_embeddings.pkl", "wb") as f:
    pickle.dump(df, f)


In [None]:
import torch

from sentence_transformers import SentenceTransformer

# Report GPU availability. get_device_name(0) raises when no CUDA device is
# present, so the device name is only queried once CUDA is confirmed usable.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # expected: True on a GPU machine
if cuda_available:
    print(torch.cuda.get_device_name(0))

model = SentenceTransformer("all-MiniLM-L6-v2")  # free, fast, 384-dim

In [23]:
# Column of per-song embeddings (one sequence of floats per row).
embeddings = df["embedding"]

# np.array(embeddings) gave a 1D array here, so the rows are stacked
# explicitly into a 2-D float32 matrix of shape (num_songs, 384).
embedding_rows = embeddings.to_numpy()
embeddings_np = np.stack(embedding_rows).astype(np.float32)
print(embeddings_np.shape)

(1204025, 384)


In [24]:
import faiss

# L2-normalize every row of embeddings_np. The return value is not used, so
# the array is expected to be updated in place; with unit-length rows the
# inner products computed by the IndexFlatIP index built in the next cell
# correspond to cosine similarities.
faiss.normalize_L2(embeddings_np)

In [44]:
d = embeddings_np.shape[1]  # embedding dimension (384)
index = faiss.IndexFlatIP(d)  # inner product index
index.add(embeddings_np)      # add all embeddings to index

print("Number of vectors in index:", index.ntotal)
q = "rap battle"
q_emb = model.encode(q).astype("float32").reshape(1, -1)

# Normalize the query the same way as the indexed vectors so the inner
# product scores are comparable.
faiss.normalize_L2(q_emb)

# Keep the results in their own names: unpacking into `index` would replace
# the FAISS index with a plain array and break any later index.search() call.
scores, neighbor_ids = index.search(q_emb, 10)

for idx, scr in zip(neighbor_ids[0], scores[0]):
    if idx < 0:
        # FAISS fills missing neighbours with -1; .iloc[-1] would silently
        # print the last song instead.
        continue
    print(f"Song: {df['name'].iloc[idx]} : {scr}")

Number of vectors in index: 1204025
Song: Rap Battle : 0.6036165356636047
Song: Victory (feat. The Notorious B.I.G. & Busta Rhymes) : 0.5630593299865723
Song: Rappers Battle : 0.5559302568435669
Song: BRING IT ON 〜Battle of Rap〜 : 0.5446075201034546
Song: George Washington vs William Wallace : 0.544097363948822
Song: The Battle : 0.5433796048164368
Song: Victory (feat. The Notorious B.I.G. & Busta Rhymes) - 2014 Remaster : 0.5424882173538208
Song: When Rappers Attack : 0.5419893264770508
Song: Battle Rhymes For Battle Times : 0.5373817682266235
Song: Rap Supremacy (Remix) : 0.5367496609687805


In [None]:
# Brute-force NumPy search: this works, but it is very slow for datasets with millions of items
import numpy as np
import pandas as pd
import pickle

def cosine_similarity_np(A, B):
    """Return the cosine similarity between two array-likes.

    Args:
        A: First vector (anything ``np.array`` accepts).
        B: Second vector (anything ``np.array`` accepts).

    Returns:
        ``np.dot(A, B)`` divided by the product of the two norms. No guard
        is applied for zero-norm inputs.
    """
    vec_a, vec_b = np.array(A), np.array(B)
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / norm_product

q = "pop"
q_emb = model.encode(q).reshape(1, -1)  # shape: (1, 384)

# Cosine similarity of the query against every song: dot products divided
# by the norm of each song vector and by the norm of the query vector.
similarities = (embeddings_np @ q_emb.T).flatten()  # dot product
similarities /= np.linalg.norm(embeddings_np, axis=1)
similarities /= np.linalg.norm(q_emb)

# Positions of the 10 highest similarities, best first.
top_idx = np.argsort(-similarities)[:10]

# top_idx holds row positions in embeddings_np, so names are looked up
# positionally with .iloc (matching the FAISS cell) rather than by index
# label, which would only coincide for a default RangeIndex.
for i in top_idx:
    print(f"{df['name'].iloc[i]}: {similarities[i]:.4f}")


In [None]:
# --- Slowest approach below ---
    
# similarities = [cosine_similarity_np(q_emb, song_emb) for song_emb in embeddings]
    
# songs = df["name"]

# results = list(zip(songs, similarities))

# # Sort by similarity descending
# results = sorted(results, key=lambda x: x[1], reverse=True)

# # Top 10 similar songs
# top_10 = results[:10]

# for song, score in top_10:
#     print(f"{song}: {score:.4f}")

In [None]:
import gc

# Drop the large objects from the notebook namespace so their memory can be
# reclaimed. pop(..., None) is used instead of `del` so the cell can be
# re-run (or run after skipping an earlier cell) without raising NameError.
for _name in ("df", "texts", "embeddings"):
    globals().pop(_name, None)

gc.collect()