In [None]:
# Base directory (relative to the notebook) where the dataset CSV is read
# from and the embedding .npy files are written to.
root_dir = ".."

In [None]:
# Dependencies shared by the rest of the notebook. Two setup calls with side
# effects (Hugging Face login, tqdm/pandas registration) are interleaved with
# the imports, so the statement order below matters.
import os
import pandas as pd
import numpy as np
import random
# Colab's secret store; this import is only expected to resolve inside Google Colab.
from google.colab import userdata
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the "HF_TOKEN" Colab secret
# instead of hard-coding a token in the notebook.
login(userdata.get("HF_TOKEN"))
from tqdm import tqdm
# Register tqdm's progress-bar integration (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()
import faiss

# Dataset

In [None]:
# Load the raw lyrics table, discard the "link" column, and append a 1-based
# "song_id" derived from each row's position in the index.
songs_df = (
    pd.read_csv(f"{root_dir}/spotify_millsongdata.csv")
    .drop(columns=["link"])
    .assign(song_id=lambda frame: frame.index + 1)
)

# Models

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Sentence-embedding model used for both the stored songs and the queries.
# Weights are loaded in half precision (float16).
embedding_model = SentenceTransformer(
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
    model_kwargs=dict(torch_dtype=torch.float16),
)

# Let the encoder consume inputs of up to 8192 tokens.
embedding_model.max_seq_length = 8192

In [None]:
from transformers import pipeline

# Text-generation pipeline used to write the song summaries. It runs on the
# GPU with bfloat16 weights and batches 16 prompts per forward pass by default.
summarizer = pipeline(
    task="text-generation",
    model="google/gemma-2-2b-it",
    device="cuda",
    batch_size=16,
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

# Pre-computing the Embeddings

In [None]:
def batch_summarize(lyrics_list, batch_size):
    """Summarize each song's lyrics with the module-level ``summarizer`` pipeline.

    Every lyric is wrapped in a single-turn chat prompt asking for a ~200-word
    summary. Prompts are sent to the pipeline in chunks of ``batch_size`` and
    the assistant reply of each conversation is collected.

    Args:
        lyrics_list: Sized iterable of lyric strings (list, NumPy array, ...).
        batch_size: Number of prompts handed to the pipeline per call. The
            same value is forwarded to the pipeline so the batch it actually
            runs matches the chunk size, rather than silently falling back to
            the batch size the pipeline was constructed with.

    Returns:
        list[str]: One whitespace-stripped summary per input lyric, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    instruction = (
        "You are an expert song summarizer. You will be given the full lyrics "
        "to a song. Your task is to write a concise, cohesive summary that "
        "captures the central emotion, overarching theme, key elements, and "
        "narrative arc of the song in 200 words.\n\n"
    )
    # One single-message conversation per song.
    prompts = [
        [{"role": "user", "content": f"{instruction}{lyrics}"}]
        for lyrics in lyrics_list
    ]

    all_summaries = []

    for start in tqdm(range(0, len(prompts), batch_size)):
        batch_prompts = prompts[start : start + batch_size]

        outputs = summarizer(
            batch_prompts,
            max_new_tokens=256,
            batch_size=batch_size,
        )

        for output in outputs:
            # For chat input the pipeline returns the whole conversation; the
            # final message is the newly generated assistant turn.
            conversation = output[0]["generated_text"]
            all_summaries.append(conversation[-1]["content"].strip())

    return all_summaries

# Summarize every song's lyrics, 16 prompts per pipeline call, and store the
# results on the frame as a new "summary" column.
lyrics_list = songs_df["text"].values
summaries = batch_summarize(lyrics_list=lyrics_list, batch_size=16)
songs_df["summary"] = summaries

In [None]:
# Pull the lyrics and their generated summaries out of the frame as arrays,
# ready to be passed to the embedding model.
song_lyrics, song_summary = (
    songs_df[column].values for column in ("text", "summary")
)

In [None]:
def _encode_and_save(texts, output_path):
    """Embed ``texts`` with ``embedding_model``, save the matrix to ``output_path``, and return it."""
    vectors = embedding_model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True,
    )
    np.save(output_path, vectors)
    return vectors


# Lyrics first, then summaries; each matrix is written to disk as soon as it
# has been computed.
lyrics_embeddings = _encode_and_save(
    song_lyrics, f"{root_dir}/60k_song_lyrics_embeddings.npy"
)
summary_embeddings = _encode_and_save(
    song_summary, f"{root_dir}/60k_song_summary_embeddings.npy"
)

# Vector Search

In [None]:
def _inner_product_index(vectors):
    """Build a flat inner-product FAISS index holding ``vectors`` cast to float32."""
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors.astype(np.float32))
    return index


# Reload the saved embedding matrices and index each one separately.
lyrics_embeddings = np.load(f"{root_dir}/60k_song_lyrics_embeddings.npy")
lyrics_index = _inner_product_index(lyrics_embeddings)

summary_embeddings = np.load(f"{root_dir}/60k_song_summary_embeddings.npy")
summary_index = _inner_product_index(summary_embeddings)

# LyRec

In [None]:
class LyRec:
    """Lyrics-based song recommender backed by two vector indexes.

    Row ``i`` of both indexes is treated as the song whose ``song_id`` column
    equals ``i + 1`` in ``songs_df``.

    Args:
        songs_df: DataFrame with one row per song, including a ``song_id`` column.
        lyrics_index: Index over lyric embeddings; must provide ``search(queries, k)``.
        summary_index: Index over summary embeddings; must provide
            ``search(queries, k)`` and ``reconstruct(i)``.
        embedding_model: Object whose ``encode(text)`` returns a 1-D embedding.
    """

    # Task descriptions placed in the "Instruct:" part of the query prompt.
    _LYRICS_INSTRUCTION = "Given the lyrics, retrieve relevant songs"
    _DESCRIPTION_INSTRUCTION = "Given a description, retrieve relevant songs"

    def __init__(self, songs_df, lyrics_index, summary_index, embedding_model):
        self.songs_df = songs_df
        self.lyrics_index = lyrics_index
        self.summary_index = summary_index
        self.embedding_model = embedding_model

    def _encode_query(self, instruction, query):
        """Embed an instruction-prefixed query as a ``(1, dim)`` float32 array."""
        # NOTE(review): the embedding model's published template appears to be
        # "Instruct: ...\nQuery: ..." with no space before "Query:" - confirm
        # whether the extra space here is intentional before changing it.
        prompt = f"Instruct: {instruction}\n Query: {query}"
        return self.embedding_model.encode(prompt).reshape(1, -1).astype(np.float32)

    def get_records_from_id(self, song_ids):
        """Return song records for the given index positions.

        Args:
            song_ids: Iterable of 0-based index positions as returned by a
                search. Negative values (the padding used when a search finds
                fewer hits than requested) are skipped.

        Returns:
            list[dict]: Matching rows of ``songs_df`` as dicts, in the order of
            ``song_ids``. Positions with no matching ``song_id`` are skipped.
        """
        positions = [int(_id) for _id in song_ids]
        wanted = [pos + 1 for pos in positions if pos >= 0]
        if not wanted:
            return []

        # One scan of the column for all ids instead of one scan per id.
        matches = self.songs_df[self.songs_df["song_id"].isin(wanted)]
        by_song_id = {}
        for record in matches.to_dict(orient="records"):
            by_song_id.setdefault(record["song_id"], []).append(record)

        songs = []
        for song_id in wanted:
            songs.extend(by_song_id.get(song_id, []))
        return songs

    def get_songs_with_similar_lyrics(self, query_lyrics, k=10):
        """Return up to ``k`` songs whose lyrics are closest to ``query_lyrics``."""
        query_embedding = self._encode_query(self._LYRICS_INSTRUCTION, query_lyrics)
        scores, song_ids = self.lyrics_index.search(query_embedding, k)
        return self.get_records_from_id(song_ids[0])

    def get_songs_with_similar_description(self, query_description, k=10):
        """Return up to ``k`` songs whose summaries are closest to ``query_description``."""
        query_embedding = self._encode_query(self._DESCRIPTION_INSTRUCTION, query_description)
        scores, song_ids = self.summary_index.search(query_embedding, k)
        return self.get_records_from_id(song_ids[0])

    def get_songs_with_similar_lyrics_and_description(
        self, query_lyrics, query_description, k=10, num_candidates=500
    ):
        """Two-stage search: shortlist by lyrics, then re-rank by description.

        Args:
            query_lyrics: Lyrics text used to build the candidate pool.
            query_description: Free-text description used to rank the pool.
            k: Maximum number of songs to return.
            num_candidates: Size of the lyrics-based candidate pool. Widened to
                ``k`` when smaller, so a large ``k`` is not silently capped.

        Returns:
            list[dict]: Up to ``k`` song records, best description match first.
            Empty when ``k`` is not positive or the lyrics search finds nothing.
        """
        if k <= 0:
            return []
        pool_size = max(int(num_candidates), int(k))

        lyrics_query = self._encode_query(self._LYRICS_INSTRUCTION, query_lyrics)
        scores, song_ids = self.lyrics_index.search(lyrics_query, pool_size)

        # When the index holds fewer than `pool_size` vectors the id array is
        # padded with -1; those must not reach `reconstruct`.
        candidate_ids = [int(idx) for idx in song_ids[0] if idx >= 0]
        if not candidate_ids:
            return []

        summary_candidates = np.array(
            [self.summary_index.reconstruct(idx) for idx in candidate_ids],
            dtype=np.float32,
        )

        description_query = self._encode_query(
            self._DESCRIPTION_INSTRUCTION, query_description
        )

        # Exact inner-product ranking over the shortlist, highest score first.
        # Slicing past the end is harmless, so k > len(candidate_ids) simply
        # yields every candidate instead of bogus entries.
        similarity = summary_candidates @ description_query[0]
        ranking = np.argsort(-similarity, kind="stable")[:k]
        final_song_ids = [candidate_ids[pos] for pos in ranking]

        return self.get_records_from_id(final_song_ids)

In [None]:
# Assemble the recommender from the objects prepared in the earlier cells and
# request five songs matching both a lyrics sample and a free-text description.
recommender = LyRec(
    songs_df=songs_df,
    lyrics_index=lyrics_index,
    summary_index=summary_index,
    embedding_model=embedding_model,
)
recommender.get_songs_with_similar_lyrics_and_description(
    query_lyrics="Lyrics of a song",
    query_description="Describe the type of song you want to listen to",
    k=5,
)