In [None]:
!pip install kagglehub --upgrade

In [None]:
import kagglehub

kagglehub.dataset_download("undefinenull/million-song-dataset-spotify-lastfm")

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Disable pandas' display truncation: show every row and every column.
pd.set_option('display.max_rows', None, 'display.max_columns', None)

In [None]:
# Ask kagglehub where the dataset lives instead of hard-coding the cache
# directory and version number of one particular machine. The dataset was
# already fetched above, so this should just resolve the local copy
# (NOTE(review): confirm kagglehub's caching behaviour for your version).
data_path = Path(kagglehub.dataset_download("undefinenull/million-song-dataset-spotify-lastfm"))


songs_data_path = data_path / 'Music Info.csv'
users_data_path = data_path / 'User Listening History.csv'

In [None]:
# Song metadata: read only the columns used to identify and preview a track.
song_columns = ["track_id","name","artist","spotify_preview_url"]

songs_df = pd.read_csv(songs_data_path, usecols=song_columns)

songs_df.head()

# Dask
Dask is used here to handle the large dataset.

In [None]:
!pip install dask[dataframe]

In [None]:
# loading the dask library
import dask.dataframe as dd

# Loading and reading the dataset
df = dd.read_csv(users_data_path)

# Check the first few rows to verify the data
df.head()

In [None]:
df

Here the values are not printed because Dask stores the data in chunks. To save memory, it keeps only the metadata of the values and not the actual values. If we want to see the values, we can use the `compute()` function.

In [None]:
# compute() evaluates the whole Dask dataframe and returns it as a pandas
# DataFrame; head() is then applied to that in-memory pandas object.
df.compute().head()

If we want to see which partitions were made, we can use the `visualize()` function.

In [None]:
df.visualize()

In [None]:
df.visualize(tasks=True)

As we can see here, a total of 9 chunks have been created by Dask. The default chunk size is `64 MB`.

In [None]:
# to get the total number of partitions made :
df.npartitions

In [None]:
# number of unique tracks in the data
unique_tracks = df.track_id.nunique() # at this point this is a lazy Dask scalar object, not a number
unique_tracks = unique_tracks.compute() # evaluate it; the name now holds the computed result
unique_tracks

In [None]:
# Visualize the task graph behind the unique-track count.
# `unique_tracks` was already replaced by its computed value, which has no
# visualize() method, so the lazy expression is rebuilt here.
df.track_id.nunique().visualize()

In [None]:
# `unique_tracks` holds the computed value by now, so rebuild the lazy
# expression to draw its task-level graph.
df.track_id.nunique().visualize(tasks=True)

In [None]:
# number of unique users in the data
unique_users = df.user_id.nunique()
unique_users

In [None]:
unique_users.visualize()

In [None]:
unique_users = unique_users.compute()
unique_users

In [None]:
# list of unique track ids
unique_track_id = df.track_id.unique()
unique_track_id.visualize()

In [None]:
unique_track_id = unique_track_id.compute()
unique_track_id

In [None]:
# Keep only the songs that occur in the listening history and renumber the
# rows from zero.
filtered_songs = (
    songs_df
    .loc[songs_df["track_id"].isin(unique_track_id)]
    .reset_index(drop=True)
)

In [None]:
filtered_songs.head()

# Merging Process

In [None]:
import dask.dataframe as dd
import numpy as np
from scipy.sparse import csr_matrix

# Step 1: Read the listening history lazily with Dask.
df = dd.read_csv(users_data_path)

# Step 2: Store the play counts as 64-bit floats.
df = df.astype({'playcount': np.float64})

# Turn both id columns into category dtype so that every distinct id gets an
# integer code.
df = df.categorize(columns=['user_id', 'track_id'])


# Step 3: Use the category codes as numeric row/column indices.
# They are needed to address cells of the sparse matrix built later.
user_mapping = df['user_id'].cat.codes
track_mapping = df['track_id'].cat.codes

df = df.assign(user_idx=user_mapping, track_idx=track_mapping)

In [None]:
df.visualize(tasks=True)

In [None]:
# Step 4: Compute the interaction matrix
# Dask doesn't support pivot tables directly, so we aggregate manually
interaction_array = df.groupby(['track_idx', 'user_idx'])['playcount'].sum().reset_index()

interaction_array.visualize(tasks=True)

In [None]:
"""
As we haven't computed it so we will not be able to see any data/value.
"""
interaction_array

In [None]:
# Compute the interaction array.
# The result is a pandas DataFrame, no longer a Dask DataFrame.
interaction_array = interaction_array.compute()

In [None]:
type(interaction_array)

In [None]:
interaction_array.head()

In [None]:
# Step 5: Create a sparse matrix
# Pull out the three columns that are passed to csr_matrix below.

row_indices = interaction_array['track_idx']  # row coordinate: one row per track
col_indices = interaction_array['user_idx']  # column coordinate: one column per user
values = interaction_array['playcount']  # summed play count stored in each cell

In [None]:
row_indices.nunique()

In [None]:
# Build a sparse matrix
# Take the dimensions from the category tables that generated track_idx and
# user_idx, so the shape always matches the codes and does not depend on
# whether the earlier nunique() cells have been computed yet.
n_tracks = len(df['track_id'].cat.categories)
n_users = len(df['user_id'].cat.categories)

sparse_matrix = csr_matrix((values, (row_indices, col_indices)), shape=(n_tracks, n_users))

print("Sparse matrix shape:", sparse_matrix.shape)
print("Non-zero elements:", sparse_matrix.nnz)

In [None]:
sparse_matrix

In [None]:
(sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1]))*100

In [None]:
sparse_matrix[0]

# Calculating Cosine

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
np.where(df['track_id'].cat.categories == "TROINZB128F932F740")

In [None]:
# Look the matrix row up from the category table instead of hard-coding a
# number copied from the previous cell's output.
ind = df['track_id'].cat.categories.get_loc("TROINZB128F932F740")

In [None]:
input_array = sparse_matrix[ind]

input_array

In [None]:
similarity_scores = cosine_similarity(input_array, sparse_matrix)

In [None]:
# similarity_scores has shape (1, n_tracks); flatten it first so the slice
# picks the six highest scores instead of slicing the single row axis.
np.sort(similarity_scores.ravel())[-6:][::-1]

In [None]:
np.argsort(similarity_scores.ravel())[-6:][::-1]

In [None]:
recommendations = df['track_id'].cat.categories[np.argsort(similarity_scores.ravel())[-6:][::-1]]

In [None]:
recommendations

In [None]:
filtered_songs[filtered_songs["name"] == "Crazy in Love"]

In [None]:
filtered_songs[filtered_songs["track_id"].isin(recommendations)]

In [None]:
def collaborative_recommendation(song_name,user_data,songs_data,interaction_matrix,k=5):
    """
    Recommend songs whose row in the interaction matrix is most similar to
    the row of the given song (cosine similarity).

    Parameters
    ----------
    song_name : str
        Matched exactly against ``songs_data["name"]``.
    user_data : DataFrame
        Frame whose ``track_id`` column is categorical. Position ``i`` in its
        categories is used as row ``i`` of ``interaction_matrix``.
    songs_data : pandas.DataFrame
        Song metadata with at least ``track_id`` and ``name`` columns.
    interaction_matrix : sparse matrix
        One row per track category.
    k : int, default 5
        The ``k + 1`` highest-scoring tracks are selected.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``songs_data`` ordered by decreasing similarity,
        with the ``track_id`` column removed. The query song is expected to
        be among them, since its similarity with itself is the maximum.

    Raises
    ------
    ValueError
        If ``song_name`` does not match exactly one row of ``songs_data``, or
        if its track id is not among the categories of ``user_data``.
    """
    # locate the query song; an ambiguous or missing name cannot be resolved
    song_row = songs_data.loc[songs_data["name"] == song_name]
    if len(song_row) != 1:
        raise ValueError(
            f"Expected exactly one song named {song_name!r} in songs_data, "
            f"found {len(song_row)}."
        )
    input_track_id = song_row["track_id"].iloc[0]

    # translate the track id into its matrix row, using the caller's data
    # rather than a notebook-level global
    track_ids = user_data["track_id"].cat.categories
    matches = np.flatnonzero(track_ids == input_track_id)
    if matches.size == 0:
        raise ValueError(f"Track {input_track_id!r} does not appear in user_data.")
    ind = int(matches[0])

    # similarity of the query row against every row of the matrix
    input_array = interaction_matrix[ind]
    similarity_scores = cosine_similarity(input_array, interaction_matrix).ravel()

    # rank once and reuse the ordering for both the ids and their scores
    top_positions = np.argsort(similarity_scores)[-k-1:][::-1]
    recommendation_track_ids = track_ids[top_positions]
    top_scores = similarity_scores[top_positions]

    # attach the scores to the song metadata so the result can be ordered
    temp_df = pd.DataFrame({"track_id":recommendation_track_ids.tolist(),
                            "score":top_scores})
    top_k_songs = (
                    songs_data
                    .loc[songs_data["track_id"].isin(recommendation_track_ids)]
                    .merge(temp_df,on="track_id")
                    .sort_values(by="score",ascending=False)
                    .drop(columns=["track_id","score"])
                    .reset_index(drop=True)
                    )
    return top_k_songs

In [None]:
collaborative_recommendation(song_name="Crazy in Love",
                             user_data=df,
                             songs_data=filtered_songs,
                             interaction_matrix=sparse_matrix)