In [30]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import gower
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Setting the seed to get reproducible results

In [31]:
# Seed NumPy's global random number generator so that later steps drawing
# from it give the same results on every run.
np.random.seed(123)

Data preparation: read the data from file, merge the three header rows into a single row of column names, split the data into X and Y sets, and then into train and test sets.

In [32]:
# The CSV spreads its column names over the first three rows; read those rows
# on their own so they can be merged into one flat header.
headers = pd.read_csv("./data/tracks.csv", nrows = 3, header = None, sep=",")
# Join the non-missing header levels of every column with "." (e.g. "album.comments").
combined_headers = headers.apply(lambda x: '.'.join(x.dropna().astype(str)), axis=0)
# Read the actual records, skipping the three header rows handled above.
df = pd.read_csv("./data/tracks.csv", skiprows = 3, header = None, sep=",")
df.columns = combined_headers
print(headers)

# Y is the first column; X is every remaining column.
X = df.iloc[: ,1:]
Y = df.iloc[:, :1]

# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# unpacking the targets as (y_test, y_train) would swap the two target sets.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

         0         1             2              3         4          5   \
0       NaN     album         album          album     album      album   
1       NaN  comments  date_created  date_released  engineer  favorites   
2  track_id       NaN           NaN            NaN       NaN        NaN   

      6            7        8         9   ...           43        44  \
0  album        album    album     album  ...        track     track   
1     id  information  listens  producer  ...  information  interest   
2    NaN          NaN      NaN       NaN  ...          NaN       NaN   

              45       46       47        48      49         50     51     52  
0          track    track    track     track   track      track  track  track  
1  language_code  license  listens  lyricist  number  publisher   tags  title  
2            NaN      NaN      NaN       NaN     NaN        NaN    NaN    NaN  

[3 rows x 53 columns]


In [33]:
# Retrieve query from the user
def _parse_query_value(token):
    """Convert one raw query token into a float when it is a finite number.

    Surrounding whitespace is ignored, and signed or exponent-notation numbers
    ("-1.5", "1e3") are recognised. Anything else, including the words
    "nan" and "inf", is returned as stripped text so that textual attributes
    are never silently turned into special float values.
    """
    text = token.strip()
    try:
        number = float(text)
    except ValueError:
        return text
    return number if np.isfinite(number) else text


def get_user_query(df):
    """Prompt for a comma-separated query and return it as a one-row DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns define the expected attributes of the query; the
        query must supply exactly one value per column, in column order.

    Returns
    -------
    pandas.DataFrame or None
        A single-row frame with the same columns as ``df``, or ``None`` (after
        printing an error message) when the number of entered values does not
        match the number of columns.
    """
    print("Enter the attributes of the query as comma-separated values:")
    query_input = input()

    # Split input and convert to match DataFrame structure
    query_data = [_parse_query_value(token) for token in query_input.split(",")]

    # Report a mismatch explicitly instead of relying on the DataFrame
    # constructor to raise for us.
    expected = len(df.columns)
    if len(query_data) != expected:
        print(
            f"Error processing query: {expected} columns expected, "
            f"{len(query_data)} values given"
        )
        return None

    return pd.DataFrame([query_data], columns=df.columns)

Query retrieval, and running the Gower distance algorithm on the data 

In [34]:

# Ask the user for a query; when it cannot be parsed, fall back to the
# first row of X so the rest of the notebook still has a query to work with.
query = get_user_query(X)
query = X.iloc[0:1] if query is None else query

# Gower distance between every row of X and the query.
distances = gower.gower_matrix(X, query)
print(distances)

Enter the attributes of the query as comma-separated values:
Error processing query: 52 columns passed, passed data had 3 columns
[[0.17307693]
 [0.23097612]
 [0.21187736]
 ...
 [       nan]
 [       nan]
 [       nan]]


Rank the rows of the DataFrame by their proximity to the query. The first (closest) entry is excluded from `closest_indices`, because it is the index of the query record itself.


In [35]:
# Treat undefined (NaN) distances as infinitely far away and collapse the
# result to a one-dimensional vector.
distances = np.nan_to_num(distances, nan=np.inf).reshape(-1)

# Row positions ordered from nearest to farthest; position 0 of the ordering
# is skipped and the following 10000 are kept as candidates.
ranked_positions = np.argsort(distances)
closest_indices = ranked_positions[1:10001]
print(closest_indices)

print(df.iloc[closest_indices])

[    2     1     9 ...  3428 36395  8343]
       track_id  album.comments   album.date_created  album.date_released  \
2             5               0  2008-11-26 01:44:45  2009-01-05 00:00:00   
1             3               0  2008-11-26 01:44:45  2009-01-05 00:00:00   
9           134               0  2008-11-26 01:44:45  2009-01-05 00:00:00   
5729      10666               0  2008-11-26 01:44:45  2009-01-05 00:00:00   
5846      10815               0  2008-11-26 01:44:45  2009-01-05 00:00:00   
...         ...             ...                  ...                  ...   
7562      13173               0  2009-05-18 19:56:50  2002-01-01 00:00:00   
15533     24907               0  2010-02-08 16:39:46                  NaN   
3428       6522               1  2009-02-19 10:34:10  2008-10-28 00:00:00   
36395     54735               0  2011-10-12 23:43:16  2011-10-01 00:00:00   
8343      14179               0  2009-06-01 08:45:14                  NaN   

      album.engineer  album.favor

In [36]:
# Pre-trained light weight model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Combining textual columns for S-BERT.
# Missing values are replaced per column *before* concatenating: adding a
# string to NaN yields NaN, so filling afterwards would throw away the title
# of every track that merely lacks a description (and vice versa).
TEXT_COLUMNS = ['track.title', 'track.information']
textual_data = (
    df[TEXT_COLUMNS[0]].fillna("").astype(str)
    + " "
    + df[TEXT_COLUMNS[1]].fillna("").astype(str)
).str.strip()

# Build the query text the same way, so a missing value contributes nothing
# instead of the literal word "nan".
query_parts = [query[column].iloc[0] for column in TEXT_COLUMNS]
query_text = " ".join("" if pd.isna(part) else str(part) for part in query_parts).strip()

# Computing embeddings
text_embeddings = model.encode(textual_data.tolist())
query_embedding = model.encode([query_text])

In [37]:
# Restrict the text embeddings to the candidates selected by Gower distance
filtered_embeddings = text_embeddings[closest_indices]

# Cosine similarity between the query and each candidate, as a flat vector
similarity_scores = cosine_similarity(query_embedding, filtered_embeddings).flatten()

# Positions of the 10 most similar candidates, best first
descending_order = np.argsort(similarity_scores)[::-1]
top_indices = descending_order[:10]

# Translate positions within the candidate list back to row positions in df
final_indices = [closest_indices[position] for position in top_indices]

In [None]:
# Look up the re-ranked rows and show only the columns of interest
display_columns = ['track.title', 'track.information', 'artist.name']
recommendations = df.iloc[final_indices]
print(recommendations.loc[:, display_columns])

KeyError: "None of [Index(['track_title', 'track_information', 'artist_name'], dtype='object')] are in the [columns]"