Dataset

In [None]:
import pandas as pd

# CSV holding the song metadata; it is read here and overwritten below with one extra column.
metadata_file_path="/content/drive/MyDrive/vector-databases-course/music-recommendation-system/dataset/reduced_80_fer2013_music_dataset_with_youtube_URLS.csv"


def describe_song(row):
    """Return a one-sentence description of a song row.

    The sentence names the song and lists its numeric audio attributes
    (danceability, energy, loudness, speechiness, acousticness,
    instrumentalness, liveness, valence and tempo) in that order.
    """
    return (
        f"The song {row['song_name']} has a danceability of {row['danceability']}, "
        f"energy of {row['energy']}, "
        f"loudness of {row['loudness']}, "
        f"speechiness of {row['speechiness']}, "
        f"acousticness of {row['acousticness']}, "
        f"instrumentalness of {row['instrumentalness']}, "
        f"liveness of {row['liveness']}, "
        f"valence of {row['valence']}, "
        f"tempo of {row['tempo']}"
    )


# Read the metadata into a DataFrame
metadata_df = pd.read_csv(metadata_file_path)

# Build the description row by row (axis=1) and keep it in a new column
metadata_df['textual_description'] = metadata_df.apply(describe_song, axis=1)

# Write the DataFrame, including the new column, back to the same CSV file
metadata_df.to_csv(metadata_file_path, index=False)
print("The 'textual_description' column has been added to the CSV file.")

We define `SPECIAL_TOKENS` for the numeric attributes of each song so that BERT can treat them as special attributes rather than regular text. We then replace the attribute names in the textual descriptions with the corresponding special tokens. This helps the model differentiate between attribute names and other text, improving its understanding and processing.

In [None]:
import re

# Define special tokens for numeric attributes (token -> attribute name as it appears in the text)
SPECIAL_TOKENS = {
    '[DANCEABILITY]': 'danceability',
    '[SPEECHINESS]': 'speechiness',
    '[ENERGY]': 'energy',
    '[LOUDNESS]': 'loudness',
    '[ACOUSTICNESS]': 'acousticness',
    '[INSTRUMENTALNESS]': 'instrumentalness',
    '[LIVENESS]': 'liveness',
    '[VALENCE]': 'valence',
    '[TEMPO]': 'tempo'
}

# Reverse lookup (attribute name -> token) and a single pattern matching any
# attribute name as a whole word. Both are derived from SPECIAL_TOKENS once,
# when this cell runs; re-run the cell after editing SPECIAL_TOKENS.
_ATTRIBUTE_TO_TOKEN = {attribute: token for token, attribute in SPECIAL_TOKENS.items()}
_ATTRIBUTE_PATTERN = re.compile(
    r"\b("
    + "|".join(re.escape(name) for name in sorted(_ATTRIBUTE_TO_TOKEN, key=len, reverse=True))
    + r")\b"
)


# Tokenize using custom special tokens
def tokenize_with_special_tokens(text):
    """Replace every attribute name in ``text`` with its special token.

    Only whole-word occurrences are replaced, so a word that merely contains
    an attribute name (e.g. "temporary" contains "tempo") is left untouched.
    The substitution is done in a single pass, so text inserted by one
    replacement is never rewritten by another.

    Args:
        text: The textual description of a song.

    Returns:
        The description with attribute names swapped for the matching keys
        of ``SPECIAL_TOKENS``.
    """
    return _ATTRIBUTE_PATTERN.sub(lambda match: _ATTRIBUTE_TO_TOKEN[match.group(1)], text)

# Run every description through the token replacement and keep the result in a new column (input text for BERT)
plain_descriptions = metadata_df['textual_description']
metadata_df['textual_description_with_Special_Tokens'] = plain_descriptions.apply(
    tokenize_with_special_tokens
)

Original textual description: "The song has a danceability of 0.8, energy of 0.6, loudness of -5.0, speechiness of 0.1, acousticness of 0.3, instrumentalness of 0.0, liveness of 0.2, valence of 0.7, tempo of 120.0."

Textual description with special tokens: "The song has a [DANCEABILITY] of 0.8, [ENERGY] of 0.6, [LOUDNESS] of -5.0, [SPEECHINESS] of 0.1, [ACOUSTICNESS] of 0.3, [INSTRUMENTALNESS] of 0.0, [LIVENESS] of 0.2, [VALENCE] of 0.7, [TEMPO] of 120.0."

Initialize the tokenizer


In [None]:
from transformers import BertTokenizer
# Load the pretrained tokenizer for the BERT checkpoint named below
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Encode all descriptions in one batch, with padding and truncation enabled,
# and get the result back as PyTorch tensors
descriptions = metadata_df['textual_description_with_Special_Tokens'].tolist()
encoded_inputs = tokenizer(
    descriptions,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
# NOTE(review): the bracketed markers (e.g. "[TEMPO]") are not registered with the
# tokenizer in this cell, so they are presumably split into ordinary word pieces.
# Confirm whether tokenizer.add_special_tokens(...) plus resizing the model's
# token embeddings was intended.

Generate Embeddings

In [None]:
from transformers import BertModel
import torch

# Load the pretrained BERT encoder for the same checkpoint name as the tokenizer
bert_model = BertModel.from_pretrained(bert_model_name)

# Run the encoded batch through BERT with gradient tracking switched off
with torch.no_grad():
    outputs = bert_model(**encoded_inputs)

# Take the last layer's hidden state at position 0 (the CLS token) of every
# sequence as its sentence-level embedding, then convert to a NumPy array
last_hidden_states = outputs.last_hidden_state
cls_hidden_states = last_hidden_states[:, 0, :]
metadata_embeddings = cls_hidden_states.numpy()

Audio embeddings with OpenL3 embedding model

In [None]:
import openl3
# Load the OpenL3 audio embedding model once; it is passed to every embedding call below
model = openl3.models.load_audio_embedding_model(
    input_repr="mel256",
    content_type="music",
    embedding_size=512,
)

In [None]:
import os
import numpy as np
import soundfile as sf

audio_files_folder="/usr/local/musicRecommendationSystemDir/audio-files"

# File name without extension -> aggregated embedding of that file.
# Without this mapping each iteration would overwrite the previous result
# and only the last file's embedding would survive the loop.
audio_embeddings = {}

# Sorted so the files are processed in the same order on every run
for audio_file in sorted(os.listdir(audio_files_folder)):
  audio_file_path = os.path.join(audio_files_folder, audio_file)
  if not os.path.isfile(audio_file_path):
    continue  # skip sub-directories; only regular files are read
  audio, sr = sf.read(audio_file_path)
  embeddings_list, ts = openl3.get_audio_embedding(audio, sr, model=model)
  # Aggregate embeddings: average over the first axis to get one vector per file
  aggregated_embedding = np.mean(np.vstack(embeddings_list), axis=0)
  audio_embeddings[os.path.splitext(audio_file)[0]] = aggregated_embedding

audio: The audio data as a NumPy array.

sr: The sample rate of the audio file.

ts: A NumPy array containing the timestamps corresponding to each embedding.

Storing Audio and Metadata Embeddings to Vector Database

Create a Chroma client:
 
We import `chromadb` and initialize a persistent Chroma client. This setup ensures that the database is saved to, and loaded from, our local machine.

In [None]:
import chromadb
# Persistent client backed by the directory below
database_path = "/usr/local/musicRecommendationSystemDir/database"
client = chromadb.PersistentClient(path=database_path)

Create collections for storing metadata and audio embeddings

In [None]:
metadata_collection_name = "metadata_embeddings_collection"

# get_or_create_collection (rather than create_collection) keeps this cell
# re-runnable: the client is persistent, so the collection may already exist
# from a previous run. "hnsw:space": "cosine" selects cosine distance.
metadata_collection = client.get_or_create_collection(
    name=metadata_collection_name,
    metadata={"hnsw:space": "cosine"},
)
metadata_collection

In [None]:
audio_collection_name = "audio_embeddings_collection"

# get_or_create_collection (rather than create_collection) keeps this cell
# re-runnable: the client is persistent, so the collection may already exist
# from a previous run. The handle is kept in `audio_collection` because the
# audio query cells further down use that name.
audio_collection = client.get_or_create_collection(
    name=audio_collection_name,
    metadata={"hnsw:space": "cosine"},
)
audio_collection

Adding metadata embeddings to the collection

In [None]:
from songmetadata import generateMetadataEmbeddings, getSongDetails
import pandas as pd
import os

# Preparing data we want to add to the collection (four parallel lists, one entry per song)
ids_list = []
doc_list = []
embeddings_list = []
metadata_list = []

metadata_CSV_file_path="/usr/local/musicRecommendationSystemDir/metadata_songs.csv"
# Load the CSV declared on the line above, not `metadata_file_path` from the
# dataset-preparation cell, which points at a different file.
metadata_df = pd.read_csv(metadata_CSV_file_path)

song_IDs_list = metadata_df['song_ID']
# NOTE(review): presumably returns one embedding per row of metadata_df, in row order — confirm
metadata_embeddings = generateMetadataEmbeddings(metadata_df)

for i, songID in enumerate(song_IDs_list):
    ids_list.append(songID)

    # Stored as a plain Python list
    embeddings_list.append(metadata_embeddings[i].tolist())

    # The textual description is stored as the document for this song
    song_name, genre, description = getSongDetails(songID)
    doc_list.append(description)

    metadata_list.append({
        "song_name": song_name,
        "genre": genre,
        "URI": "/usr/local/musicRecommendationSystemDir/audio-files/" + songID + ".wav",
    })

# Adding to the collection song IDs, textual descriptions (docs), embeddings for each doc, and metadata, including song name, genre, and URI
metadata_collection  = client.get_collection(name=metadata_collection_name)
metadata_collection.add(
    documents=doc_list,
    embeddings=embeddings_list,
    metadatas=metadata_list,
    ids=ids_list
)

Querying the Database to Generate Recommendations

1 - Recommendations based on metadata embeddings

In [None]:
# Fetch the stored metadata embedding of the song we want recommendations for
current_song_id = "LikeaPrayer"
current_song_metadata_embedding = metadata_collection.get(
    ids=[current_song_id],
    include=["embeddings"],
)

In [None]:
# Query the metadata collection with the current song's embedding.
# Six results are requested; metadata and distances are returned with each hit.
metadata_query_vector = current_song_metadata_embedding['embeddings'][0]
recommendation_based_on_metadata_embedding = metadata_collection.query(
    query_embeddings=metadata_query_vector,
    n_results=6,
    include=["metadatas", "distances"],
)

In [None]:
# Result IDs and metadata for the first entry of the query response
metadata_ids = recommendation_based_on_metadata_embedding["ids"][0]
metadata_metadatas = recommendation_based_on_metadata_embedding["metadatas"][0]

# Pair every ID with its metadata and leave out the current song itself
metadata_hits = zip(metadata_ids, metadata_metadatas)
filtered_metadata_recommendations = [
    {"id": id_, "metadata": metadata}
    for id_, metadata in metadata_hits
    if not id_ == "LikeaPrayer"
]

# Keep at most the first five recommendations
recommended_songs_metadata = filtered_metadata_recommendations[:5]

2 - Recommendations based on audio embeddings

In [None]:
# Get a handle to the audio collection first: no earlier cell assigns
# `audio_collection`, so using it directly raises NameError on a fresh kernel.
audio_collection = client.get_collection(name=audio_collection_name)

# Fetch the stored audio embedding of the song we want recommendations for
current_song_audio_embedding = audio_collection.get(
    ids=["LikeaPrayer"],
    include=["embeddings"]
    )

In [None]:
# Query the audio collection with the current song's stored embedding(s).
# Six results are requested; metadata and distances are returned with each hit.
audio_query_vectors = current_song_audio_embedding['embeddings']
recommendation_based_on_audio_embedding = audio_collection.query(
    query_embeddings=audio_query_vectors,
    n_results=6,
    include=["metadatas", "distances"],
)

In [None]:
# Result IDs and metadata for the first entry of the audio query response
audio_ids = recommendation_based_on_audio_embedding["ids"][0]
audio_metadatas = recommendation_based_on_audio_embedding["metadatas"][0]

# Filter out the current song ID from the audio recommendations.
# This must use the audio results and its own variable name: filtering the
# metadata results here would overwrite the metadata-based list and leave
# `filtered_audio_recommendations` undefined.
filtered_audio_recommendations = [
    {"id": id_, "metadata": metadata}
    for id_, metadata in zip(audio_ids, audio_metadatas)
    if id_ != "LikeaPrayer"
]

# Keep at most the first five recommendations
recommended_songs_audio = filtered_audio_recommendations[:5]

3 - Combination of audio + metadata embeddings

In [None]:
# Fields taken from each query response; index 0 selects the first entry of each field
result_fields = ("ids", "metadatas", "distances")

# IDs, metadata and distances returned by the metadata-embedding query
metadata_ids, metadata_metadatas, metadata_distances = [
    recommendation_based_on_metadata_embedding[field][0] for field in result_fields
]

# IDs, metadata and distances returned by the audio-embedding query
audio_ids, audio_metadatas, audio_distances = [
    recommendation_based_on_audio_embedding[field][0] for field in result_fields
]

In [None]:
combined_recommendations = {}

# One (ids, metadatas, distances) triple per search being combined
result_sets = [
    (metadata_ids, metadata_metadatas, metadata_distances),
    (audio_ids, audio_metadatas, audio_distances),
]

# Aggregate distances: add up, per song, the distance reported by each search
for result_ids, result_metadatas, result_distances in result_sets:
    for id_, metadata, distance in zip(result_ids, result_metadatas, result_distances):
        if id_ == "LikeaPrayer":
            continue  # the current song is not a recommendation for itself
        entry = combined_recommendations.setdefault(
            id_, {"metadata": metadata, "aggregate_distance": 0}
        )
        entry["aggregate_distance"] += distance

# A song returned by only one search would otherwise contribute 0 for the other
# search and outrank songs that both searches agree on. Since it was not among
# that search's results, charge it the largest distance that search returned.
for result_ids, _, result_distances in result_sets:
    if len(result_distances) == 0:
        continue  # nothing was returned, so there is no distance to use as a penalty
    missing_penalty = max(result_distances)
    returned_ids = set(result_ids)
    for id_, entry in combined_recommendations.items():
        if id_ not in returned_ids:
            entry["aggregate_distance"] += missing_penalty

In [None]:
def _aggregate_distance(item):
    """Sort key: the aggregated distance stored in an (id, info) pair."""
    return item[1]["aggregate_distance"]


# Order the (id, info) pairs from smallest to largest aggregated distance
sorted_recommendations = sorted(combined_recommendations.items(), key=_aggregate_distance)

# The first five entries are the top five recommendations
top_5_recommendations = sorted_recommendations[:5]