In [11]:
# LLM Integration for Music Recommendation System (SentenceTransformers)
# =============================================================================

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time
from tqdm import tqdm

# Load the preprocessed track table. The code further down reads its
# 'track_genre' and 'artists' columns.
tracks = pd.read_csv('../data/tracks_processed.csv')

# SentenceTransformer model shared by every embedding call below.
# NOTE(review): presumably fetched/cached from the model hub on first use — confirm
# against the sentence-transformers documentation.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def _fallback_embedding(model, default_dim=384):
    """Return a zero vector sized to the model's embedding dimension.

    The dimension is taken from ``model.get_sentence_embedding_dimension()``
    when the model offers it and reports a value; otherwise ``default_dim``
    (384, the size documented for all-MiniLM-L6-v2) is used.
    """
    dim_getter = getattr(model, "get_sentence_embedding_dimension", None)
    dim = dim_getter() if callable(dim_getter) else None
    return np.zeros(int(dim) if dim else default_dim)


def safe_embed(text, model, max_retries=3, retry_delay=5):
    """
    Obtain a text embedding from a SentenceTransformer-style model with error handling.

    Args:
        text: Value to embed. Missing values (as judged by ``pd.isna``) are
            never sent to the model.
        model: Object exposing ``encode(text)``. If it also exposes
            ``get_sentence_embedding_dimension()``, that value sizes the
            fallback zero vector; otherwise 384 is assumed.
        max_retries: Maximum number of ``encode`` attempts.
        retry_delay: Seconds to wait between failed attempts (default 5).

    Returns:
        The embedding as a numpy array, or a zero vector if ``text`` is
        missing or every attempt fails.
    """
    if pd.isna(text):
        return _fallback_embedding(model)

    for attempt in range(1, max_retries + 1):
        try:
            return np.array(model.encode(text))
        except Exception as e:
            # Best-effort by design: any encode failure is reported and retried,
            # never propagated to the caller.
            if attempt < max_retries:
                print(f"Error during embedding: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                # No further attempt follows, so waiting here would be wasted time.
                print(f"Error during embedding: {e}. Giving up after {max_retries} attempts.")
    return _fallback_embedding(model)

# Embedding lookups keyed by the raw column value, so each distinct
# genre/artist is embedded only once.
genre_embedding_cache = {}
artist_embedding_cache = {}

# Genres: the distinct values are split into 10 chunks so tqdm reports ten
# progress steps; each value is still embedded individually by safe_embed.
unique_genres = tracks['track_genre'].unique()
genre_chunks = np.array_split(unique_genres, 10)
for genre_chunk in tqdm(genre_chunks, desc="Processing genre batches", total=10):
    for genre_name in genre_chunk:
        if genre_name in genre_embedding_cache:
            continue
        genre_embedding_cache[genre_name] = safe_embed(genre_name, embedding_model)

# Artists: same procedure as for genres, applied to the 'artists' column.
unique_artists = tracks['artists'].unique()
artist_chunks = np.array_split(unique_artists, 10)
for artist_chunk in tqdm(artist_chunks, desc="Processing artist batches", total=10):
    for artist_name in artist_chunk:
        if artist_name in artist_embedding_cache:
            continue
        artist_embedding_cache[artist_name] = safe_embed(artist_name, embedding_model)

# Attach each track's embedding by looking its genre/artist up in the caches.
# The numpy arrays are turned into plain lists for CSV serialization.
genre_vectors = tracks['track_genre'].map(genre_embedding_cache)
tracks['genre_embedding'] = genre_vectors.apply(lambda vec: vec.tolist())
artist_vectors = tracks['artists'].map(artist_embedding_cache)
tracks['artist_embedding'] = artist_vectors.apply(lambda vec: vec.tolist())

# Write the table, now carrying both embedding columns, to a new CSV file.
# A failed write is reported on stdout rather than raised.
output_path = '../data/tracks_llm_enhanced.csv'
try:
    tracks.to_csv(output_path, index=False)
except Exception as save_error:
    print(f"Failed to save enhanced data: {save_error}. Check embedding dimensions and data format.")
else:
    print(f"Embedding integration complete. Enhanced data saved to {output_path}.")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Exception ignored in: <function tqdm.__del__ at 0x00000277E74C6660>
Traceback (most recent call last):
  File "c:\Users\samue\AppData\Local\Programs\Python\Python313\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\samue\AppData\Local\Programs\Python\Python313\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'
Processing genre batches: 100%|██████████| 10/10 [00:02<00:00,  3.72it/s]
Processing artist batches: 100%|██████████| 10/10 [06:10<00:00, 37.07s/it]


Embedding integration complete. Enhanced data saved to ../data/tracks_llm_enhanced.csv.
