In [108]:
# Imports
import pandas as pd
import numpy as np
import gower
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Setting the seed to get reproducible results

In [109]:
# Seed NumPy's legacy global random generator so any np.random draws are repeatable.
# NOTE(review): no code visible in this notebook excerpt draws from np.random —
# confirm the seed is still needed (or that a later cell relies on it).
np.random.seed(123)

Data pre-processing: reading the data from file, merging the three header rows into a single header line, and dropping irrelevant columns.

In [None]:
# Load and process the dataset.
# The first three rows of tracks.csv together form the column header, so they are
# read separately and each column's non-empty cells are joined with "." into one label.
headers = pd.read_csv("./data/tracks.csv", header=None, sep=",", nrows=3)
combined_headers = headers.apply(
    lambda column: '.'.join(column.dropna().astype(str)),
    axis=0,
)

# Read everything after the header rows as data and attach the flattened labels
df = pd.read_csv("./data/tracks.csv", header=None, sep=",", skiprows=3)
df.columns = combined_headers

# Drop unnecessary columns
columns_to_drop = [
    # album-level fields
    "album.comments", "album.id", "album.information", "album.engineer",
    # artist-level fields
    "artist.bio", "artist.comments", "artist.latitude", "artist.longitude",
    "artist.related_projects", "artist.website", "artist.wikipedia_page",
    # dataset split markers
    "set.split", "set.subset",
    # track-level fields
    "track.information", "track.lyricist", "track.interest",
    "track.license", "track.number", "track.bit_rate",
]
df = df.drop(columns=columns_to_drop)

print(f"Dataset loaded and processed. Shape: {df.shape}")

First step: calculating Gower's distance to select a subset of candidate tracks

In [55]:
# Print a summary of the processed frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106574 entries, 0 to 106573
Data columns (total 34 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   track_id                  106574 non-null  int64 
 1   album.date_created        103045 non-null  object
 2   album.date_released       70294 non-null   object
 3   album.favorites           106574 non-null  int64 
 4   album.listens             106574 non-null  int64 
 5   album.producer            18060 non-null   object
 6   album.tags                106574 non-null  object
 7   album.title               105549 non-null  object
 8   album.tracks              106574 non-null  int64 
 9   album.type                100066 non-null  object
 10  artist.active_year_begin  22711 non-null   object
 11  artist.active_year_end    5375 non-null    object
 12  artist.associated_labels  14271 non-null   object
 13  artist.date_created       105718 non-null  object
 14  arti

In [111]:
# Parsing query
def get_user_query():
    """Prompt for a comma-separated track query and return it as a one-row DataFrame.

    The user types up to five fields in the order Title, Artist, Album Title,
    Genre, Year. Each field is stripped of surrounding whitespace; empty or
    missing fields are stored as None and fields beyond the fifth are ignored.

    Returns:
        A single-row pandas DataFrame with the columns "track.title",
        "artist.name", "album.title", "track.genre_top" and
        "track.date_created", or None (after printing an error) when no
        title was given.
    """
    print("Input your query in the following CSV format: Title, Artist, Album Title, Genre, Year.")
    print("Only 'Title' is mandatory. Leave fields empty if not needed.")

    raw_query = input("Enter your query: ").strip()

    # Dataset columns, in the same order as the fields the user is asked for
    field_names = ("track.title", "artist.name", "album.title", "track.genre_top", "track.date_created")

    # Start with every field unset, then fill in whatever the user supplied;
    # zip() stops at the shorter sequence, so surplus values are dropped.
    query_dict = dict.fromkeys(field_names)
    for field, raw_value in zip(field_names, raw_query.split(",")):
        cleaned = raw_value.strip()
        query_dict[field] = cleaned or None

    # The title is the only mandatory field
    if not query_dict["track.title"]:
        print("Error: 'Title' is mandatory. Please provide a valid query.")
        return None

    return pd.DataFrame([query_dict])
    
# Dropping the None columns in the query and reducing dataset size
def filter_relevant_columns(df, query_df):
    """Restrict the dataset and the query to the fields the query actually sets.

    Args:
        df: DataFrame of tracks; must contain every column kept from the query.
        query_df: DataFrame whose first row holds the query values, with None
            marking a field that was left empty.

    Returns:
        A tuple (df_reduced, query_reduced) holding only the columns whose
        value in the first query row is not None, in query column order.
    """
    kept_columns = []
    for column in query_df.columns:
        if query_df[column].iloc[0] is None:
            continue  # field left empty in the query
        kept_columns.append(column)

    return df[kept_columns], query_df[kept_columns]

# Calculating Gower's distances
def calculate_gower_distances(df, query_df):
    """Return the Gower distance matrix between df and query_df as a flat array.

    Args:
        df: DataFrame of candidate rows.
        query_df: DataFrame holding the query row(s), with the same columns.

    Returns:
        The result of gower.gower_matrix(df, query_df) flattened to one
        dimension (one value per row of df when query_df has a single row).
    """
    distance_matrix = gower.gower_matrix(df, query_df)
    flat_distances = distance_matrix.flatten()
    return flat_distances

Second step: applying S-BERT to refine the results

In [112]:
# Using S-BERT on the subset
def refine_with_sbert(df, query, filtered_indices, top_n=10, model=None):
    """Re-rank a pre-filtered subset of tracks by S-BERT similarity to a text query.

    For every candidate track a short text is built from its title, top genre,
    full genre list and artist name. That text and the query are embedded with
    a sentence-transformer model, and candidates are ranked by cosine
    similarity to the query.

    Args:
        df: DataFrame of tracks containing the columns "track.title",
            "track.genre_top", "track.genres_all" and "artist.name".
        query: Free-text description of what the user is looking for.
        filtered_indices: Positional (iloc) indices into df of the candidate
            tracks to re-rank.
        top_n: Maximum number of recommendations to return (default 10).
        model: Optional object exposing encode(list_of_texts). When None, a
            SentenceTransformer('all-MiniLM-L6-v2') is loaded; pass an already
            loaded model to avoid re-loading it on every call.

    Returns:
        The rows of df for the best-matching candidates, most similar first.
        An empty frame with df's columns when filtered_indices is empty.
    """
    filtered_indices = np.asarray(filtered_indices, dtype=int)
    if filtered_indices.size == 0:
        # Nothing to rank; encoding an empty list and comparing would fail.
        return df.iloc[[]]

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    df_subset = df.iloc[filtered_indices]

    # Fill missing values in EVERY column before concatenating: string + NaN is
    # NaN, so a single missing field (e.g. the title) would otherwise wipe out
    # the whole text for that track.
    text_columns = ['track.title', 'track.genre_top', 'track.genres_all', 'artist.name']
    text_parts = [df_subset[column].fillna("").astype(str) for column in text_columns]
    textual_data = text_parts[0]
    for part in text_parts[1:]:
        textual_data = textual_data + " " + part

    # Compute embeddings and cosine similarity
    text_embeddings = model.encode(textual_data.tolist())
    query_embedding = model.encode([query])
    similarity_scores = cosine_similarity(query_embedding, text_embeddings).flatten()

    # Positions (within the subset) of the top_n highest scores, best first,
    # mapped back to positions in the full frame.
    top_indices = np.argsort(similarity_scores)[::-1][:top_n]
    final_indices = filtered_indices[top_indices]
    return df.iloc[final_indices]

Main Execution

In [None]:
# Ask the user for the structured (CSV-style) part of the query
query_df = get_user_query()

if query_df is not None and not query_df['track.title'].isnull().any():
    # Compare only on the fields the user actually filled in
    df_reduced, query_reduced = filter_relevant_columns(df, query_df)

    # Stage 1: coarse filter — keep the 1000 tracks closest to the query
    # according to Gower's distance
    distances = calculate_gower_distances(df_reduced, query_reduced)
    closest_indices = np.argsort(distances)[:1000]
    print(f"Top {len(closest_indices)} tracks filtered using Gower's distance.")

    # Free-text description used for the semantic re-ranking
    user_nlp_query = input("Now, describe in detail what you're looking for (e.g., 'I want upbeat pop songs similar to tropical house.'): ")

    # Stage 2: re-rank the Gower subset with S-BERT
    recommendations = refine_with_sbert(df, user_nlp_query, closest_indices)

    # Echo both inputs, then show the final recommendations
    print("\nInitial input:")
    display(query_df)

    print("Description:", user_nlp_query)

    print("\nTop Recommendations:")
    display(recommendations[['track.title', 'artist.name', 'album.title', 'track.genre_top']])
else:
    # No usable title was supplied, so there is nothing to search for
    print("Error: Title is mandatory. Please provide a valid query.")