Building a Music Recommendation System Using the Spotify Dataset

In an era where digital music consumption is at an all-time high, the ability to tailor music recommendations to individual preferences has become increasingly important. This project aims to develop a music recommendation system that provides personalized song suggestions to users based on their listening habits and preferences. By leveraging data-driven techniques and machine learning algorithms, this system aspires to enhance user experience by delivering music that aligns with their tastes.

In [18]:
#Import Libraries

import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [19]:
# Main dataset (data.csv), fetched over HTTP from the project's GitHub repository.
data = pd.read_csv(
    'https://raw.githubusercontent.com/TheTheo1/spotify_recommendation/main/data.csv'
)

In [20]:
# Per-genre table (data_by_genres.csv) from the same GitHub repository.
genre_data = pd.read_csv(
    'https://raw.githubusercontent.com/TheTheo1/spotify_recommendation/main/data_by_genres.csv'
)

In [21]:
# Per-year table (data_by_year.csv) from the same GitHub repository.
year_data = pd.read_csv(
    'https://raw.githubusercontent.com/TheTheo1/spotify_recommendation/main/data_by_year.csv'
)

In [22]:
# Per-artist table (data_by_artist.csv) from the same GitHub repository.
artist_data = pd.read_csv(
    'https://raw.githubusercontent.com/TheTheo1/spotify_recommendation/main/data_by_artist.csv'
)

In [31]:
# Preview the first two rows of the main table.
# NOTE(review): the saved output of this cell already contains a 'decade' column,
# which is only added by a later cell. This cell was evidently executed out of
# order (In [31]); on a fresh Restart & Run All the column will not appear here.
data.head(n=2)


Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,decade
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954,1920
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936,1920


In [24]:
# Preview the first two rows of the per-genre table.
genre_data.head(n=2)

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5


In [25]:
# Preview the first two rows of the per-year table.
year_data.head(n=2)

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,1921,0.886896,0.418597,260537.166667,0.231815,0.344878,0.20571,-17.048667,0.073662,101.531493,0.379327,0.653333,2
1,1,1922,0.938592,0.482042,165469.746479,0.237815,0.434195,0.24072,-19.275282,0.116655,100.884521,0.535549,0.140845,10


In [26]:
# Preview the first two rows of the per-artist table.
artist_data.head(n=2)

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,"""Cats"" 1981 Original London Cast",0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,"""Cats"" 1983 Broadway Cast",0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5


In [27]:
# Print the column names, non-null counts and dtypes of the main table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [28]:
# Print the column names, non-null counts and dtypes of the per-genre table.
genre_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2973 entries, 0 to 2972
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              2973 non-null   int64  
 1   genres            2973 non-null   object 
 2   acousticness      2973 non-null   float64
 3   danceability      2973 non-null   float64
 4   duration_ms       2973 non-null   float64
 5   energy            2973 non-null   float64
 6   instrumentalness  2973 non-null   float64
 7   liveness          2973 non-null   float64
 8   loudness          2973 non-null   float64
 9   speechiness       2973 non-null   float64
 10  tempo             2973 non-null   float64
 11  valence           2973 non-null   float64
 12  popularity        2973 non-null   float64
 13  key               2973 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 325.3+ KB


In [29]:
# Add a 'decade' column derived from 'year'.
# Floor-dividing a year by 10 and multiplying back by 10 gives the first year
# of its decade (e.g. 1921 -> 1920). The arithmetic is applied to the whole
# column at once instead of row by row through a Python lambda; the resulting
# values are identical. pandas is already imported in the notebook's import
# cell, so it is not imported again here.
data['decade'] = (data['year'] // 10) * 10

# Show the first few rows to confirm the new column lines up with 'year'.
print(data[['year', 'decade']].head())


   year  decade
0  1921    1920
1  1921    1920
2  1921    1920
3  1921    1920
4  1921    1920


In [32]:
# Inspect 10 randomly chosen rows of the main table. random_state pins the draw
# so the same rows are shown on every Restart & Run All.
data.sample(n=10, random_state=42)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,decade
114372,0.0938,1967,0.845,['Burt Bacharach'],0.289,228667,0.136,0,5mH45SStWUZm5UN3ulDqKr,0.0134,1,0.126,-17.579,1,A House Is Not A Home,27,1967-01-01,0.0372,142.822,1960
124730,0.561,2020,0.416,['Eslabon Armado'],0.385,270668,0.645,0,7Aiq4g8tSExAM2qUNkiHbZ,0.0,8,0.0913,-8.032,1,Tal Vez,67,2020-07-17,0.034,182.809,2020
5140,0.71,1947,0.453,"['Lon Hoyt', 'Company']",0.637,309267,0.869,0,1bpwP0e6gS2nPWiJGsEAzb,0.0,9,0.235,-8.427,1,"You Can't Stop The Beat (from ""Hairspray"") - V...",4,1947,0.184,170.007,1940
37410,0.522,2014,0.0579,"['Nicki Minaj', 'Drake', 'Lil Wayne']",0.89,220467,0.667,1,7e44z4wOjkllaD8eulUeMd,3e-05,2,0.108,-6.862,1,Truffle Butter,65,2014-12-15,0.0521,105.099,2010
31230,0.571,1983,0.016,['Dio'],0.484,276373,0.947,0,1dlBMzm6CqwQIUQ8PbT3NP,0.00142,11,0.0691,-5.475,1,Straight Through the Heart,40,1983,0.0467,94.517,1980
61512,0.278,1952,0.995,['Lata Mangeshkar'],0.449,166133,0.0576,0,0CdXd57Obm6P9nyEZKx9LY,0.382,2,0.131,-20.045,1,Woh Paas Nahin Majboor Hai Dil,0,1952-01-01,0.0423,82.949,1950
103814,0.729,1995,0.148,['E-40'],0.826,251333,0.74,1,4R9rFOfgxa7xMCgXwgdMhR,0.0,8,0.298,-6.581,1,Da Bumble,36,1995-03-14,0.206,88.909,1990
136706,0.0972,1999,4e-05,['Cradle Of Filth'],0.342,396829,0.926,0,14JG4VbsqeLsFuDJlkybOT,0.683,7,0.186,-6.313,0,From the Cradle to Enslave,45,1999-06-05,0.0939,147.313,1990
61812,0.789,1953,0.882,['Lata Mangeshkar'],0.595,222707,0.3,0,07XHoD2XHmzgxGHhWw7fLx,0.0,9,0.347,-20.306,1,Kare Badra Tu Na Ja,0,1953-01-01,0.0481,116.461,1950
146390,0.703,1973,0.704,['The Isley Brothers'],0.392,186000,0.555,0,6Mbz2K522lOwpjhjqvLuPx,0.0,7,0.149,-11.378,1,You Walk Your Way,25,1973-08-21,0.166,157.505,1970
