In [1]:
!pip install kagglehub --upgrade



In [2]:
import kagglehub

# Downloading the dataset from kaggle
kagglehub.dataset_download("undefinenull/million-song-dataset-spotify-lastfm")

Downloading from https://www.kaggle.com/api/v1/datasets/download/undefinenull/million-song-dataset-spotify-lastfm?dataset_version_number=1...


100%|██████████| 639M/639M [00:08<00:00, 81.6MB/s]

Extracting files...





'/root/.cache/kagglehub/datasets/undefinenull/million-song-dataset-spotify-lastfm/versions/1'

In [3]:
# Importing required libraries
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Importing songs dataset and users dataset
data_path = Path("/root/.cache/kagglehub/datasets/undefinenull/million-song-dataset-spotify-lastfm/versions/1")
songs_data_path = data_path / 'Music Info.csv'
users_data_path = data_path / 'User Listening History.csv'

In [5]:
# Load the songs data
df_songs = pd.read_csv(songs_data_path)
df_songs.head()

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,...,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,...,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,...,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


# Getting the dataset ready

In [6]:
# Shape of the data
df_songs.shape

(50683, 21)

In [7]:
# Data info
df_songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50683 entries, 0 to 50682
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   track_id             50683 non-null  object 
 1   name                 50683 non-null  object 
 2   artist               50683 non-null  object 
 3   spotify_preview_url  50683 non-null  object 
 4   spotify_id           50683 non-null  object 
 5   tags                 49556 non-null  object 
 6   genre                22348 non-null  object 
 7   year                 50683 non-null  int64  
 8   duration_ms          50683 non-null  int64  
 9   danceability         50683 non-null  float64
 10  energy               50683 non-null  float64
 11  key                  50683 non-null  int64  
 12  loudness             50683 non-null  float64
 13  mode                 50683 non-null  int64  
 14  speechiness          50683 non-null  float64
 15  acousticness         50683 non-null 

In [8]:
# Missing values
df_songs.isna().sum()

Unnamed: 0,0
track_id,0
name,0
artist,0
spotify_preview_url,0
spotify_id,0
tags,1127
genre,28335
year,0
duration_ms,0
danceability,0


In [9]:
# Ratio of missing values in data
(
    df_songs
    .isna()
    .mean()
    .sort_values(ascending=False)
    .head(2)
    .mul(100)
)

Unnamed: 0,0
genre,55.90632
tags,2.223625


In [11]:
# Duplicates in the data based on spotify_id
print(df_songs.duplicated(subset="spotify_id").sum())

# Drop duplicates
df_songs.drop_duplicates(subset=["spotify_id","year","duration_ms"],inplace=True)

9


In [12]:
# Checking for duplicates
print(
    df_songs
    .duplicated(subset=["spotify_id","year","duration_ms"])
    .sum()
)

0


In [15]:
# Since we removed the duplicated from the dataset, inconsistency created between indices - Resetting Index
df_songs.reset_index(drop=True,inplace=True)

In [16]:
# Removing columns which are not required for content based filtering (for vectorization)
cols_to_remove = ["track_id","name","spotify_preview_url","spotify_id","genre"]
df_content_filtering = df_songs.drop(columns=cols_to_remove)
df_content_filtering.head()

Unnamed: 0,artist,tags,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,The Killers,"rock, alternative, indie, alternative_rock, in...",2004,222200,0.355,0.918,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,Oasis,"rock, alternative, indie, pop, alternative_roc...",2006,258613,0.409,0.892,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,Nirvana,"rock, alternative, alternative_rock, 90s, grunge",1991,218920,0.508,0.826,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,Franz Ferdinand,"rock, alternative, indie, alternative_rock, in...",2004,237026,0.279,0.664,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,Radiohead,"rock, alternative, indie, alternative_rock, in...",2008,238640,0.515,0.43,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


*Why to drop `genera` column ?*<br>
- *There are approx 55% missing values*<br>
- *All the geners are also mentioned in the tags column*

In [17]:
# Checking for missing values
df_content_filtering.isna().sum()

Unnamed: 0,0
artist,0
tags,1126
year,0
duration_ms,0
danceability,0
energy,0
key,0
loudness,0
mode,0
speechiness,0


In [18]:
# Fill the missing values with "no_tags"
df_content_filtering.fillna({"tags":"no_tags"},inplace=True)

# Checking for missing values
df_content_filtering.isna().sum()

Unnamed: 0,0
artist,0
tags,0
year,0
duration_ms,0
danceability,0
energy,0
key,0
loudness,0
mode,0
speechiness,0


In [19]:
# Converting artist names as lower case
df_content_filtering["artist"] = df_content_filtering["artist"].str.lower()

In [21]:
# Number of unique artists
(
    df_songs
    .loc[:,'artist']
    .nunique()
)

8317

In [22]:
# Number of unique year values
(
    df_songs
    .loc[:,'year']
    .nunique()
)

75

In [24]:
# Value counts for the tags
(
    df_songs
    .loc[:,'tags']
    .str.lower()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
)

Unnamed: 0_level_0,count
tags,Unnamed: 1_level_1
rock,10681
indie,7284
electronic,6592
alternative,6271
pop,4650
...,...
dark_ambient,602
japanese,489
polish,411
j_pop,213


In [25]:
# Value counts for the tags
(
    df_songs
    .loc[:,'tags']
    .str.lower()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
    .loc[lambda ser: ser >= 1000]
)

Unnamed: 0_level_0,count
tags,Unnamed: 1_level_1
rock,10681
indie,7284
electronic,6592
alternative,6271
pop,4650
...,...
ska,1088
gothic_metal,1072
grindcore,1040
french,1018


In [26]:
! pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [27]:
# Importing necessary libraries
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from category_encoders.count import CountEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

In [28]:
df_content_filtering.head()

Unnamed: 0,artist,tags,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,the killers,"rock, alternative, indie, alternative_rock, in...",2004,222200,0.355,0.918,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,oasis,"rock, alternative, indie, pop, alternative_roc...",2006,258613,0.409,0.892,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,nirvana,"rock, alternative, alternative_rock, 90s, grunge",1991,218920,0.508,0.826,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,franz ferdinand,"rock, alternative, indie, alternative_rock, in...",2004,237026,0.279,0.664,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,radiohead,"rock, alternative, indie, alternative_rock, in...",2008,238640,0.515,0.43,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [32]:
df_content_filtering[["duration_ms","loudness","tempo"]].describe()

Unnamed: 0,duration_ms,loudness,tempo
count,50674.0,50674.0,50674.0
mean,251153.6,-8.291007,123.508794
std,107589.2,4.548359,29.622349
min,1439.0,-60.0,0.0
25%,192733.0,-10.375,100.6825
50%,234933.0,-7.1995,121.989
75%,288182.8,-5.089,141.64225
max,3816373.0,3.642,238.895


In [33]:
df_content_filtering[["danceability","energy","speechiness","acousticness","instrumentalness","liveness","valence"]].describe()

Unnamed: 0,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence
count,50674.0,50674.0,50674.0,50674.0,50674.0,50674.0,50674.0
mean,0.493522,0.686507,0.076026,0.213798,0.225299,0.215439,0.433113
std,0.178833,0.251803,0.076012,0.302839,0.337067,0.184708,0.258767
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.364,0.514,0.0352,0.0014,1.8e-05,0.0984,0.214
50%,0.497,0.744,0.0482,0.0399,0.00563,0.138,0.405
75%,0.621,0.905,0.0835,0.34,0.441,0.289,0.634
max,0.986,1.0,0.954,0.996,0.999,0.999,0.993


In [34]:
# Cols to transform
frequency_encode_cols = ['year']
ohe_cols = ['artist',"time_signature","key"]
tfidf_col = 'tags'
standard_scale_cols = ["loudness"]
min_max_scale_cols = ["duration_ms","tempo","danceability","energy","speechiness","acousticness","instrumentalness","liveness","valence"]
# Small change - "duration_ms" and "tempo" cannot be -ve, so I added them into min_max_scaler which is already mentioned in standard_scaler

In [35]:
len(frequency_encode_cols + ohe_cols + standard_scale_cols + min_max_scale_cols)

14

In [36]:
# Transformation pipeline
transformer = ColumnTransformer(transformers=[
    ("frequency_encode", CountEncoder(normalize=True,return_df=True), frequency_encode_cols),
    ("ohe", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ("tfidf", TfidfVectorizer(max_features=85), tfidf_col),
    ("standard_scale", StandardScaler(), standard_scale_cols),
    ("min_max_scale", MinMaxScaler(), min_max_scale_cols)
],remainder='passthrough',n_jobs=-1,force_int_remainder_cols=False)

transformer

In [37]:
# fit the transformer
transformer.fit(df_content_filtering)

In [42]:
# transform the data
transformed_df = transformer.transform(df_content_filtering)

# Shape
print(transformed_df.shape)

# tfidf returns a sparse dataframe
transformed_df

(50674, 8431)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 907900 stored elements and shape (50674, 8431)>

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
# Building input vector
# Lets assume a user is hearing a song - "Whenever, Wherever"
df_songs[df_songs["name"] == "Whenever, Wherever"]
song_input = df_content_filtering[df_songs["name"] == "Whenever, Wherever"]
song_input

Unnamed: 0,artist,tags,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
1025,shakira,"rock, pop, female_vocalists, singer_songwriter...",2012,196826,0.787,0.828,1,-4.967,0,0.0474,0.298,5e-06,0.206,0.86,107.674,4


In [46]:
# Transforming our input song
input_vector = transformer.transform(song_input)
input_vector

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 20 stored elements and shape (1, 8431)>

In [50]:
# Calculate the similarity matrix
similarity_scores = cosine_similarity(transformed_df, input_vector)
similarity_scores.shape

(50674, 1)

In [51]:
similarity_scores

array([[0.99999937],
       [0.99999913],
       [0.99999923],
       ...,
       [0.99999916],
       [0.99999922],
       [0.99999907]])

In [52]:
top_10_songs_indexes = np.argsort(similarity_scores.ravel())[-11:][::-1]
top_10_songs_indexes

array([ 1025, 12305,  6046,  6129,  6818,  3373,  6089, 17241, 38383,
        6121,  2265])

In [53]:
# Top 10 Recommendations
top_10_songs_names = df_songs.iloc[top_10_songs_indexes]
top_10_songs_names

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
1025,TRLWDVU128F932B093,"Whenever, Wherever",Shakira,https://p.scdn.co/mp3-preview/09ddeb4ae33ee6e8...,07PHBDuUmOeZ7jeKSbAbKi,"rock, pop, female_vocalists, singer_songwriter...",,2012,196826,0.787,...,1,-4.967,0,0.0474,0.298,5e-06,0.206,0.86,107.674,4
12305,TRYFVKK128F4240FE8,Why Wait,Shakira,https://p.scdn.co/mp3-preview/d78c90c5cb5626be...,0HiJFRxWme9myvUiDlqQ8q,"pop, experimental, singer_songwriter, dance",,2001,221240,0.887,...,1,-5.535,0,0.0431,0.144,0.00059,0.123,0.399,129.943,4
6046,TRAAKDG128F42A0ECB,Hips Don't Lie,Shakira,https://p.scdn.co/mp3-preview/3859547944f57cfb...,01Yj2MCGpjZs34PRlGgz4K,"pop, female_vocalists, singer_songwriter, danc...",Pop,2001,217453,0.777,...,10,-5.867,0,0.0734,0.284,0.0,0.43,0.76,100.003,4
6129,TRBAUVN128F932FEF8,Oops!...I Did It Again,Britney Spears,https://p.scdn.co/mp3-preview/7fb86827422540ad...,095uakqDYR50Uza0mxvPWB,"pop, female_vocalists, dance, 00s",Pop,2014,211786,0.751,...,1,-5.351,0,0.0435,0.34,1.8e-05,0.255,0.886,95.045,4
6818,TRBAHID128F4278EAF,Objection (Tango),Shakira,https://p.scdn.co/mp3-preview/bf65095d5ce58358...,0p9QhtUdbyDAQ6k14hQ2i3,"pop, female_vocalists, singer_songwriter, danc...",Pop,2001,222533,0.603,...,11,-5.282,0,0.0677,0.0147,0.0,0.0246,0.705,179.344,4
3373,TRINUNP12903CD84D9,Did It Again,Shakira,https://p.scdn.co/mp3-preview/5477eae2283113ff...,0eMNEdcC5OImvrfn79J9dU,"electronic, pop, female_vocalists, experimenta...",,2009,227333,0.869,...,5,-5.069,0,0.0896,0.509,0.0,0.0741,0.599,137.955,4
6089,TROKRSJ128F92E08D5,We Can't Stop,Miley Cyrus,https://p.scdn.co/mp3-preview/38ac1429ee3574d6...,07bIq23j2xO9JSjngkILT1,"pop, female_vocalists, dance",Electronic,2014,231600,0.611,...,1,-5.575,0,0.0351,0.0465,0.0,0.332,0.727,80.043,4
17241,TRMBDIR128F4279C1F,Perfect Lover,Britney Spears,https://p.scdn.co/mp3-preview/52671e54d36f077e...,1BhxPx4evrx8X02RHGrLdi,"pop, dance, rnb, 00s",Rock,2007,182680,0.718,...,1,-3.959,0,0.036,0.353,0.00039,0.102,0.805,117.067,4
38383,TRWUWRZ128F42ADA4A,Dreams for Plans,Shakira,https://p.scdn.co/mp3-preview/6e2c021846087a88...,2ObxMmMaDINr0ynkqW2BlY,"pop, female_vocalists, guitar, pop_rock",Pop,2005,242760,0.689,...,1,-7.427,1,0.0286,0.18,3.8e-05,0.0844,0.548,96.098,4
6121,TRGZIMZ128F930A016,La Isla Bonita,Madonna,https://p.scdn.co/mp3-preview/d8f3cafe99c1f0cd...,0rpndqrkU9y9nckNCfjcq6,"pop, female_vocalists, dance, 80s",,2009,242946,0.708,...,1,-4.736,0,0.0362,0.392,1e-06,0.0561,0.968,99.953,4


In [63]:
def content_recommendation(song_name, songs_data, transformed_data, k=10):
    """
    Recommends top k songs similar to the given song based on content-based filtering.

    Parameters:
    song_name (str): The name of the song to base the recommendations on.
    songs_data (DataFrame): The DataFrame containing song information.
    transformed_data (ndarray): The transformed data matrix for similarity calculations.
    k (int, optional): The number of similar songs to recommend. Default is 10.

    Returns:
    DataFrame: A DataFrame containing the top k recommended songs with their names, artists, and Spotify preview URLs.
    """
    # filter out the song from data
    song_row = songs_data.loc[songs_data["name"] == song_name,:]
    if song_row.empty:
        print("Song not found in the dataset.")
    else:
        # get the index of song
        song_index = song_row.index[0]
        print("Current Song -", song_name) # Delete
        # print(song_index)
        # generate the input vector
        input_vector = transformed_data[song_index].reshape(1,-1)
        # calculate similarity scores
        similarity_scores = cosine_similarity(input_vector, transformed_data)
        # print(similarity_scores.shape)
        # get the top k songs
        top_k_songs_indexes = np.argsort(similarity_scores.ravel())[-k-1:][::-1]
        # print(top_k_songs_indexes)
        # get the top k songs names
        top_k_songs_names = songs_data.iloc[top_k_songs_indexes]
        # print the top k songs
        top_k_list = top_k_songs_names[['name','artist','spotify_preview_url']].reset_index(drop=True)
        return top_k_list

In [76]:
content_recommendation(
    song_name="Hips Don't Lie",
    songs_data=df_songs,
    transformed_data=transformed_df,
    k=10
)

Current Song - Hips Don't Lie


Unnamed: 0,name,artist,spotify_preview_url
0,Hips Don't Lie,Shakira,https://p.scdn.co/mp3-preview/3859547944f57cfb...
1,"Whenever, Wherever",Shakira,https://p.scdn.co/mp3-preview/09ddeb4ae33ee6e8...
2,Objection (Tango),Shakira,https://p.scdn.co/mp3-preview/bf65095d5ce58358...
3,I'm Outta Love,Anastacia,https://p.scdn.co/mp3-preview/feda5c101a29c254...
4,My Prerogative,Britney Spears,https://p.scdn.co/mp3-preview/9140e378563ce3e0...
5,Did It Again,Shakira,https://p.scdn.co/mp3-preview/5477eae2283113ff...
6,Naturally,Selena Gomez & the Scene,https://p.scdn.co/mp3-preview/b7b138f1ea2db51b...
7,Always Too Late,Annie,https://p.scdn.co/mp3-preview/7f2f89b82ed206ea...
8,Ready for the Good Times,Shakira,https://p.scdn.co/mp3-preview/cc7e2fa060501a1c...
9,Wonderland,Natalia Kills,https://p.scdn.co/mp3-preview/6f04fd6fe1f199e2...
