In [22]:
# Imports
import pandas as pd
import numpy as np
import gower
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Setting the seed to get reproducible results

In [23]:
# Fix NumPy's global random state so that reruns draw the same random numbers
RANDOM_SEED = 123
np.random.seed(RANDOM_SEED)

Data preparation. Reading the data in from file, merging the three header rows into a single row of column names, and dropping the columns that are not needed. (No X/Y or train/test split is performed in this step.)

In [24]:
# Load and process the dataset.
# The CSV carries its column names across three header rows, so the headers
# and the data rows are read separately and stitched together afterwards.
tracks_csv_path = "./data/tracks.csv"

# Read only the three header rows and join each column's non-null header
# cells with "." to form one name per column.
headers = pd.read_csv(tracks_csv_path, sep=",", header=None, nrows=3)
combined_headers = headers.apply(
    lambda header_cells: '.'.join(header_cells.dropna().astype(str)),
    axis=0,
)

# Read the data rows that follow the header rows and attach the joined names
df = pd.read_csv(tracks_csv_path, sep=",", header=None, skiprows=3)
df.columns = combined_headers

# Drop unnecessary columns
columns_to_drop = [
    "album.comments", "album.id", "album.information", "album.engineer",
    "artist.bio", "artist.comments", "artist.latitude", "artist.longitude",
    "artist.related_projects", "artist.website", "artist.wikipedia_page",
    "set.split", "set.subset", "track.information", "track.lyricist",
    "track.interest", "track.license", "track.number", "track.bit_rate",
]
df = df.drop(columns=columns_to_drop)

print(f"Dataset loaded and processed. Shape: {df.shape}")

Dataset loaded and processed. Shape: (106574, 34)


In [25]:
# First step : Gower's Distance Filtering
def get_user_query():
    """Prompt for a track query and return it as a one-row DataFrame.

    The user types a single CSV line with up to five fields, in the order
    Title, Artist, Album Title, Genre, Year. Fields may be wrapped in double
    quotes so that a value can itself contain a comma, e.g.
    ``"Hello, World", Some Artist``. Blank fields become ``None``; fields
    beyond the fifth are discarded.

    Returns:
        pandas.DataFrame | None: a single-row frame with the columns
        ``track.title``, ``artist.name``, ``album.title``,
        ``track.genre_top`` and ``track.date_created``, or ``None`` (after
        printing an error) when no title was given.
    """
    import csv  # local import: only needed to parse the typed query line

    print("Input your query in the following CSV format: Title, Artist, Album Title, Genre, Year.")
    print("Only 'Title' is mandatory. Leave fields empty if not needed.")

    query_input = input("Enter your query: ").strip()

    # Expected columns, in the same order as the fields of the prompt.
    # NOTE(review): the "Year" field is stored as typed under
    # 'track.date_created'; if that dataset column holds full timestamps, an
    # exact comparison against a bare year would never match - TODO confirm.
    columns = ["track.title", "artist.name", "album.title", "track.genre_top", "track.date_created"]

    # csv.reader honours double-quoted fields, so commas inside quotes do not
    # split a value; skipinitialspace lets a quote follow ", " and still count
    # as an opening quote. An empty line yields no row, hence the [] default.
    parsed_fields = next(csv.reader([query_input], skipinitialspace=True), [])

    # Blank fields become None, then pad/truncate to exactly len(columns)
    query_values = [field.strip() or None for field in parsed_fields]
    query_values = (query_values + [None] * len(columns))[:len(columns)]

    # Map the values to their corresponding columns
    query_dict = dict(zip(columns, query_values))

    # Ensure that at least the 'track.title' is provided
    if not query_dict["track.title"]:
        print("Error: 'Title' is mandatory. Please provide a valid query.")
        return None

    # Create the query DataFrame
    return pd.DataFrame([query_dict])
    
# Dropping the None columns in the query and reducing dataset
def filter_relevant_columns(df, query_df):
    """Restrict the dataset and the query to the fields the query filled in.

    A query column counts as filled in when the value in its first row is
    not missing; ``None``, ``NaN`` and ``pd.NA`` are all treated as missing.

    Args:
        df: dataset frame; must contain every filled-in query column.
        query_df: query frame; only its first row is inspected.

    Returns:
        tuple: ``(df_reduced, query_reduced)``, both limited to the filled-in
        columns, in the column order of ``query_df``.

    Raises:
        KeyError: if a filled-in query column does not exist in ``df``.
    """
    # pd.notna (rather than "is not None") so NaN/NA placeholders are also
    # recognised as empty fields instead of being compared as real values.
    relevant_columns = [
        col for col in query_df.columns if pd.notna(query_df[col].iloc[0])
    ]

    # Fail with a message naming the offending columns instead of relying on
    # the generic KeyError raised by the column selection below.
    missing_columns = [col for col in relevant_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Query columns not present in the dataset: {missing_columns}")

    df_reduced = df[relevant_columns]
    query_reduced = query_df[relevant_columns]
    return df_reduced, query_reduced

# Calculating Gower's distances
def calculate_gower_distances(df, query_df):
    """Return the Gower distances between the dataset and the query as a 1-D array.

    Calls ``gower.gower_matrix(df, query_df)`` and flattens the resulting
    matrix. Presumably the matrix has one row per ``df`` row and one column
    per ``query_df`` row, so a one-row query gives one distance per dataset
    row - confirm against the gower package documentation.
    """
    return gower.gower_matrix(df, query_df).flatten()

In [26]:
# Second step : S-BERT refinement
def refine_with_sbert(df, query, filtered_indices, top_n=10, model=None):
    """Re-rank pre-filtered tracks by sentence-embedding similarity to the query.

    Each candidate is described by its title and top genre; the query is
    described by all of its non-empty fields joined with spaces. Both are
    embedded with the same model and candidates are ranked by cosine
    similarity to the query.

    Args:
        df: full dataset; must contain ``track.title`` and ``track.genre_top``.
        query: one-row query frame; only its first row is used.
        filtered_indices: positional (``iloc``) row indices of the candidates.
        top_n: number of recommendations to return (default 10).
        model: optional pre-loaded encoder exposing ``encode(list_of_str)``.
            When omitted, ``SentenceTransformer('all-MiniLM-L6-v2')`` is
            loaded on every call; pass a model to avoid the reload.

    Returns:
        pandas.DataFrame: up to ``top_n`` rows of ``df``, most similar first.
        Candidates with equal similarity keep their ``filtered_indices``
        order. Empty when there are no candidates.
    """
    # Nothing to rank: skip the model load and the similarity computation
    if len(filtered_indices) == 0:
        return df.iloc[[]]

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Prepare text data for S-BERT. Missing values are filled per column
    # BEFORE concatenating: "title" + " " + NaN is NaN, which would otherwise
    # blank out the title of every track that has no genre.
    df_subset = df.iloc[filtered_indices]
    titles = df_subset['track.title'].fillna("").astype(str)
    genres = df_subset['track.genre_top'].fillna("").astype(str)
    textual_data = (titles + " " + genres).str.strip()

    # Skip missing query fields; NaN is truthy, so a plain truthiness test
    # would leak the literal text "nan" into the query sentence.
    query_values = [query[col].iloc[0] for col in query.columns]
    query_text = " ".join(
        str(value) for value in query_values
        if pd.notna(value) and str(value).strip()
    )

    # Compute embeddings and cosine similarity
    text_embeddings = model.encode(textual_data.tolist())
    query_embedding = model.encode([query_text])
    similarity_scores = cosine_similarity(query_embedding, text_embeddings).flatten()

    # Highest similarity first; the stable sort keeps tied candidates in
    # their incoming order instead of reversing them.
    top_indices = np.argsort(-similarity_scores, kind="stable")[:top_n]
    final_indices = [filtered_indices[i] for i in top_indices]
    return df.iloc[final_indices]

Main Execution

In [27]:

# Number of nearest tracks (by Gower distance) handed on to the S-BERT stage
GOWER_CANDIDATES = 500

query_df = get_user_query()

# Reset first so a failed query cannot leave a result from an earlier run
# of this cell lying around in the kernel.
recommendations = None

if query_df is None:
    # get_user_query() has already printed the "Title is mandatory" error
    # before returning None, so it is not repeated here.
    pass
elif query_df['track.title'].isnull().any():
    print("Error: Title is mandatory. Please provide a valid query.")
else:
    df_reduced, query_reduced = filter_relevant_columns(df, query_df)

    # Stage 1: Filter with Gower
    distances = calculate_gower_distances(df_reduced, query_reduced)
    # A stable sort resolves equal distances by row order, so the same
    # candidates are selected on every run.
    closest_indices = np.argsort(distances, kind="stable")[:GOWER_CANDIDATES]
    print(f"Top {len(closest_indices)} tracks filtered using Gower's distance.")

    # Stage 2: Refine with S-BERT
    recommendations = refine_with_sbert(df, query_df, closest_indices)
    print("Top Recommendations:")
    display(recommendations[['track.title', 'artist.name', 'album.title', 'track.genre_top']])

Input your query in the following CSV format: Title, Artist, Album Title, Genre, Year.
Only 'Title' is mandatory. Leave fields empty if not needed.
Top 150 tracks filtered using Gower's distance.
Top Recommendations:


Unnamed: 0,track.title,artist.name,album.title,track.genre_top
62385,Night,Muhmood,Tamara and Demon,Instrumental
102540,Night,Kirill Makushin,Infection!,Experimental
53195,Night,abc100,dirty dishes from a hostile cosmos,Electronic
106550,Fallen Stars,Alex Mason,Return,Instrumental
77,Bella Crane,Ed Askew,What I Know,Folk
149,Brian's Garage,Blah Blah Blah,30th Anniversary Blah Blah Blah,Rock
15,Queen Of The Wires,Alec K. Redfearn & the Eyesores,The Blind Spot,Folk
75,My Dream,Ed Askew,Blue Piano,Folk
10,Father's Day,Abominog,mp3,Rock
121,The Hotel,Blah Blah Blah,Stripey Collection,Rock
