# Imports

In [2]:
# Data import and manipulation
import numpy as np
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Model building
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# Ignoring warnings
import warnings
warnings.filterwarnings("ignore")

# Text preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading data

In [3]:
data = pd.read_csv('/content/dataset.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


The data have a shape of 114000x21.

## Data exploring and preprocessing

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [7]:
data.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

The useful columns from this datasets are: 'artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'track_genre'.

In [8]:
print('Number of unique albums ',data['album_name'].nunique())
print('Number of unique artists ',data['artists'].nunique())
print('Number of unique genres ',data['track_genre'].nunique())

Number of unique albums  46589
Number of unique artists  31437
Number of unique genres  114


Hare is a large number of unique albums. To make it numerical we must use tfidf vectorizer. So, I wont use 'album_name' because it will increase data dimentions and add less value.

In [9]:
data['text'] = data[['artists', 'track_genre']].astype(str).agg(' '.join, axis=1)
data['text'].head()

Unnamed: 0,text
0,Gen Hoshino acoustic
1,Ben Woodward acoustic
2,Ingrid Michaelson;ZAYN acoustic
3,Kina Grannis acoustic
4,Chord Overstreet acoustic


Here, I have added the 'artists' and 'track_genre' columns to create a new column named 'text'.

In [10]:
# Download necessary resources
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words (1-2 letters)
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatize & remove stopwords
    return ' '.join(words)

# Apply preprocessing
data['text'] = data['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


A function for text preprocessing was defined and used to preprocess the 'text' column.

In [11]:
data['text']

Unnamed: 0,text
0,gen hoshino acoustic
1,ben woodward acoustic
2,ingrid michaelson zayn acoustic
3,kina grannis acoustic
4,chord overstreet acoustic
...,...
113995,rainy lullaby world music
113996,rainy lullaby world music
113997,cesária evora world music
113998,michael smith world music


In [12]:
tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf.fit_transform(data['text'])

# Convert to DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df.head()


Unnamed: 0,aaron,abba,academy,acoustic,adam,adele,afrobeat,age,airport,alan,...,yungblud,yuvan,zaeden,zara,zeca,zedd,zero,zimmer,zion,zombie
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.629074,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.563876,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


A new dataframe for the text based features were formed, keeping the top 1000 features.

In [13]:
tfidf_df.describe()

Unnamed: 0,aaron,abba,academy,acoustic,adam,adele,afrobeat,age,airport,alan,...,yungblud,yuvan,zaeden,zara,zeca,zedd,zero,zimmer,zion,zombie
count,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,...,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0
mean,0.000548,0.000516,0.000256,0.006712,0.00177,0.000541,0.006857,0.004902,0.00098,0.001354,...,0.000393,0.001061,0.00054,0.000332,0.000628,0.000793,0.000461,0.000854,0.000346,0.00083
std,0.019112,0.020679,0.011847,0.074516,0.032877,0.021086,0.076157,0.052951,0.025291,0.027924,...,0.01776,0.023571,0.020984,0.014437,0.019987,0.023824,0.018113,0.021621,0.013962,0.02299
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.943212,0.828878,0.831609,1.0,0.874563,0.826979,1.0,0.716606,0.653498,0.834255,...,0.915412,0.55922,0.854642,0.829824,0.809234,0.87322,0.823394,0.795001,0.824769,0.854651


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [15]:
numerical_data = data[['popularity', 'duration_ms', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo']]
numerical_data.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,73,230666,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917
1,55,149610,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489
2,57,210826,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332
3,71,201933,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74
4,82,198853,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949


Another dataframe with the useful numerical columns from the actual dataset was created.

In [16]:
data['explicit'] = data['explicit'].astype(int)
numerical_data = pd.concat([numerical_data, data['explicit']], axis=1)


the column 'explicit' had boolean values. These were converted into integer values and this 'explicit' column was added to the 'numerical_data' dataframe.

In [17]:
numerical_data.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,explicit
0,73,230666,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,0
1,55,149610,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,0
2,57,210826,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,0
3,71,201933,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,0
4,82,198853,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,0


In [18]:
# Initialize the scaler
scaler = StandardScaler()

original_columns = numerical_data.columns

# Fit and transform the features
numerical_data_scaled = scaler.fit_transform(numerical_data)

numerical_data_scaled = pd.DataFrame(numerical_data_scaled, columns=original_columns, index=numerical_data.index)

The numerical data were scaled with the standard scaler.

In [19]:
merged_data = pd.concat([numerical_data_scaled, tfidf_df], axis=1)


Merged the scaled numerical data and the text based dataframe into a dataframe.

In [20]:
merged_data.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,yungblud,yuvan,zaeden,zara,zeca,zedd,zero,zimmer,zion,zombie
0,1.782627,0.024575,0.629244,-0.717148,-1.210442,0.300828,-1.326281,0.551848,-0.850202,-0.504109,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.975633,-0.730859,-0.845908,-1.88998,-1.210442,-1.784744,0.753988,-0.078993,1.831732,-0.504094,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.065299,-0.160332,-0.742186,-1.122669,-1.491343,-0.293288,0.753988,-0.273826,-0.315499,-0.504112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.692961,-0.243214,-1.733304,-2.312994,-1.491343,-2.039252,0.753988,-0.457309,1.774593,-0.503883,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.186125,-0.271919,0.29503,-0.788711,-0.929541,-0.28275,0.753988,-0.303145,0.463399,-0.504112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Correlation Matrix

In [21]:
data_corr = merged_data.corr()
data_corr

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,yungblud,yuvan,zaeden,zara,zeca,zedd,zero,zimmer,zion,zombie
popularity,1.000000,-0.007101,0.035448,0.001056,-0.003853,0.050423,-0.013931,-0.044927,-0.025472,-0.095139,...,0.007935,0.038559,0.020910,0.031825,0.010623,0.005573,0.012515,-0.000418,-0.019809,-0.035486
duration_ms,-0.007101,1.000000,-0.073426,0.058523,0.008114,-0.003470,-0.035556,-0.062600,-0.103788,0.124371,...,-0.007666,0.013120,-0.017006,-0.006935,-0.000860,-0.003088,0.002936,0.010457,-0.000013,-0.001163
danceability,0.035448,-0.073426,1.000000,0.134325,0.036469,0.259077,-0.069219,0.108626,-0.171533,-0.185606,...,-0.007984,0.032243,0.021458,0.010769,0.003313,0.017444,-0.005382,-0.070916,0.027305,-0.001932
energy,0.001056,0.058523,0.134325,1.000000,0.048006,0.761690,-0.078362,0.142509,-0.733906,-0.181879,...,0.015546,-0.000935,-0.009614,0.004841,0.006000,0.010705,0.011392,-0.061477,0.016453,0.039669
key,-0.003853,0.008114,0.036469,0.048006,1.000000,0.038590,-0.135916,0.020418,-0.040937,-0.006823,...,0.001003,-0.008973,0.003337,0.004693,-0.000928,-0.000735,0.001368,0.002673,0.001275,-0.002428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zedd,0.005573,-0.003088,0.017444,0.010705,-0.000735,0.027817,-0.006304,-0.011530,-0.013316,-0.016033,...,-0.000736,-0.001498,-0.000857,-0.000765,-0.001046,1.000000,-0.000847,-0.001315,-0.000826,-0.001202
zero,0.012515,0.002936,-0.005382,0.011392,0.001368,0.009346,-0.003925,-0.002670,-0.013408,-0.001784,...,-0.000563,-0.001146,-0.000656,-0.000585,-0.000800,-0.000847,1.000000,-0.001006,-0.000632,-0.000919
zimmer,-0.000418,0.010457,-0.070916,-0.061477,0.002673,-0.085056,-0.009595,-0.014499,0.026327,0.080503,...,-0.000874,-0.001778,-0.001018,-0.000908,-0.001242,-0.001315,-0.001006,1.000000,-0.000980,-0.001427
zion,-0.019809,-0.000013,0.027305,0.016453,0.001275,0.019377,0.016257,0.007429,-0.015278,-0.012337,...,-0.000548,-0.001117,-0.000639,-0.000570,-0.000780,-0.000826,-0.000632,-0.000980,1.000000,-0.000896


# Fitting K-means to data

In [22]:
# Define the number of clusters
n_clusters = 1140

# Initialize K-means
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit K-means to the data
data['cluster'] = kmeans.fit_predict(merged_data)


I  have fitted the k-means clustering algorithm to make 1140 clusters. Meaning each cluster will contain 100 items. Thus the recommender system will be capable of recommending 100 most similar songs for a particular song.

In [23]:
data

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,text,cluster
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,0,0.676,0.4610,...,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,4,acoustic,gen hoshino acoustic,370
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,0,0.420,0.1660,...,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,4,acoustic,ben woodward acoustic,441
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,0,0.438,0.3590,...,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,4,acoustic,ingrid michaelson zayn acoustic,910
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,0,0.266,0.0596,...,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,3,acoustic,kina grannis acoustic,602
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,0,0.618,0.4430,...,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,4,acoustic,chord overstreet acoustic,193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113995,113995,2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,21,384999,0,0.172,0.2350,...,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,5,world-music,rainy lullaby world music,1068
113996,113996,1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,22,385000,0,0.174,0.1170,...,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,4,world-music,rainy lullaby world music,851
113997,113997,6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,Best Of,Miss Perfumado,22,271466,0,0.629,0.3290,...,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,4,world-music,cesária evora world music,538
113998,113998,2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,Change Your World,Friends,41,283893,0,0.587,0.5060,...,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,4,world-music,michael smith world music,1133


# Music recommendation function

In [24]:
def get_kmeans_recommendations(song_name, data, top_n=10):
    # Find the cluster of the given song
    song_cluster = data.loc[data['track_name'] == song_name, 'cluster'].values

    if len(song_cluster) == 0:
        print(f"Song '{song_name}' not found in the dataset.")
        return []

    song_cluster = song_cluster[0]

    # Get the songs from the same cluster
    cluster_songs = data[data['cluster'] == song_cluster]

    # Exclude the input song itself
    cluster_songs = cluster_songs[cluster_songs['track_name'] != song_name]

    # Get the top N recommendations
    recommendations = cluster_songs.head(top_n)

    return recommendations['track_name'].tolist()


Here, I have created the song recommendation function.

# Recommendation example

In [25]:
# Example usage
song_name = 'Ghost - Acoustic'
recommended_songs = get_kmeans_recommendations(song_name, data, top_n=10)
print(recommended_songs)


['Back to You', 'When You Come Home', 'Back to You', "I'll Be Seeing You", 'Smile', 'Red Right Hand', 'How Have You Been?', 'For the Dancing and the Dreaming', 'Rock a Bye Baby', "Dodo l'enfant do"]


for the song 'Ghost - Acoustic' our recommender system recommended these 10 songs: 'Back to You', 'When You Come Home', 'Back to You', "I'll Be Seeing You", 'Smile', 'Red Right Hand', 'How Have You Been?', 'For the Dancing and the Dreaming', 'Rock a Bye Baby', "Dodo l'enfant do"