In [48]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import gower
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Setting the seed to get reproducible results

In [49]:
np.random.seed(123)

Data preparation. Reading the data in from file, merging the headers into one line, splitting the data into X and Y sets, and into test and train sets

In [50]:
headers = pd.read_csv("./data/tracks.csv", nrows = 3, header = None, sep=",")
combined_headers = headers.apply(lambda x: '.'.join(x.dropna().astype(str)), axis=0)
df = pd.read_csv("./data/tracks.csv", skiprows = 3, header = None, sep=",")
df.columns = combined_headers
print(headers)

df.drop(columns=["album.comments", "album.id", "album.information", "album.engineer", "artist.bio", "artist.comments", "artist.latitude", "artist.longitude", "artist.related_projects", "artist.website", "artist.wikipedia_page", "set.split", "set.subset", "track.information", "track.lyricist", "track.interest", "track.license", "track.number", "track.bit_rate"], inplace=True)

X = df.iloc[: ,1:]
Y = df.iloc[:, :1]

X_train,X_test,y_test,y_train = train_test_split(X,Y,test_size= 0.2)

         0         1             2              3         4          5   \
0       NaN     album         album          album     album      album   
1       NaN  comments  date_created  date_released  engineer  favorites   
2  track_id       NaN           NaN            NaN       NaN        NaN   

      6            7        8         9   ...           43        44  \
0  album        album    album     album  ...        track     track   
1     id  information  listens  producer  ...  information  interest   
2    NaN          NaN      NaN       NaN  ...          NaN       NaN   

              45       46       47        48      49         50     51     52  
0          track    track    track     track   track      track  track  track  
1  language_code  license  listens  lyricist  number  publisher   tags  title  
2            NaN      NaN      NaN       NaN     NaN        NaN    NaN    NaN  

[3 rows x 53 columns]


In [51]:
# Retrieve query from the user
def get_user_query(df):
    print("Enter the attributes of the query as comma-separated values:")
    query_input = input()
    try:
        # Split input and convert to match DataFrame structure
        query_data = [float(x) if x.replace('.', '', 1).isdigit() else x for x in query_input.split(",")]
        query_df = pd.DataFrame([query_data], columns=df.columns)
        return query_df
    except Exception as e:
        print(f"Error processing query: {e}")
        return None

Query retrieval, and running the Gower distance algorithm on the data 

In [52]:

query = get_user_query(X) 
if query is None:
    query = X.iloc[0:1]
    
distances = gower.gower_matrix(X, query)

print(distances)

Enter the attributes of the query as comma-separated values:
Error processing query: 33 columns passed, passed data had 1 columns
[[0.15151516]
 [0.21229906]
 [0.21227367]
 ...
 [0.66944236]
 [0.6691228 ]
 [0.7257222 ]]


Ranking rows in the DataFrame according to their proximity to the query. Excluding the 1st record from the closest_indices list, since it is an id of the record from the query


In [53]:
distances = np.nan_to_num(distances, nan=np.inf)
distances = distances.flatten()


closest_indices = np.argsort(distances)[1:10001]
print(closest_indices)

print(df.iloc[closest_indices])

[    9     2     1 ... 12297 12348 12769]
       track_id   album.date_created  album.date_released  album.favorites  \
9           134  2008-11-26 01:44:45  2009-01-05 00:00:00                4   
2             5  2008-11-26 01:44:45  2009-01-05 00:00:00                4   
1             3  2008-11-26 01:44:45  2009-01-05 00:00:00                4   
5846      10815  2008-11-26 01:44:45  2009-01-05 00:00:00                4   
5729      10666  2008-11-26 01:44:45  2009-01-05 00:00:00                4   
...         ...                  ...                  ...              ...   
12193     19965  2009-10-22 08:34:13  2005-08-02 00:00:00                0   
12910     20996  2009-11-12 18:47:07                  NaN                1   
12297     20081  2009-10-27 03:36:05  2009-10-18 00:00:00                0   
12348     20136  2009-10-28 08:02:51                  NaN                0   
12769     20831  2009-11-12 05:23:00                  NaN                0   

       album.listens 

In [54]:
# Pre-trained light weight model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Combining textual columns for S-BERT
textual_data = (df['track.title'] + " " + df['track.information']).fillna("")
query_text = (str(query['track.title'].iloc[0]) + " " + str(query['track.information'].iloc[0])).strip()

# Computing embeddings
text_embeddings = model.encode(textual_data.tolist())
query_embedding = model.encode([query_text])

KeyError: 'track.information'

In [11]:
# Filter dataset using closest_indices from Gower
filtered_embeddings = text_embeddings[closest_indices]

# Compute cosine similarity
similarity_scores = cosine_similarity(query_embedding, filtered_embeddings).flatten()

# Get the top N results
top_indices = np.argsort(similarity_scores)[::-1][:10]
final_indices = [closest_indices[i] for i in top_indices]

In [12]:
# Retrieve recommendations
recommendations = df.iloc[final_indices]
print(recommendations[['track.title', 'track.information', 'artist.name']])

                                            track.title  \
17926                                     Eating Babies   
2191       Turning Dance and Fast Csardas from Bonchida   
28829                  Steak and Acid/Claudette colbert   
1891                   How Much Pudding Can Tomy Handle   
4426                              Neptune Sunset Casino   
3232   Shitlife (Retrigger Rotuque mix) ft. Don Augusto   
6783     Melting Your Brains (featuring the DRUM BUDDY)   
1862                                           No Coins   
1793                                    Puddin' and Pie   
1820                                    Penny and Jenny   

                                       track.information  \
17926  <p>Eating Babies is from the album Death Face,...   
2191   <p>Effusive thanks to: Veronica Liu (board op)...   
28829  <p>From 'Music for meditation relaxation and t...   
1891   <p><span style="margin: 0pt 5px; float: left;"...   
4426                   <P>Based on Thai Molam styl