In [16]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
sns.set()
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv("spotify.csv")

## Data Exploration

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174389 entries, 0 to 174388
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      174389 non-null  float64
 1   artists           174389 non-null  object 
 2   danceability      174389 non-null  float64
 3   duration_ms       174389 non-null  int64  
 4   energy            174389 non-null  float64
 5   explicit          174389 non-null  int64  
 6   id                174389 non-null  object 
 7   instrumentalness  174389 non-null  float64
 8   key               174389 non-null  int64  
 9   liveness          174389 non-null  float64
 10  loudness          174389 non-null  float64
 11  mode              174389 non-null  int64  
 12  name              174389 non-null  object 
 13  popularity        174389 non-null  int64  
 14  release_date      174389 non-null  object 
 15  speechiness       174389 non-null  float64
 16  tempo             17

In [6]:
data.isnull().sum()

acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
dtype: int64

In [7]:
df = data.drop(columns=['id', 'name', 'artists', 'release_date', 'year'])
df.corr()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
acousticness,1.0,-0.263217,-0.089169,-0.750852,-0.208176,0.221956,-0.028028,-0.029654,-0.546639,0.064633,-0.396744,-0.022437,-0.22384,-0.166968
danceability,-0.263217,1.0,-0.100757,0.204838,0.200842,-0.215589,0.026266,-0.110033,0.249541,-0.048358,0.123746,0.239962,0.005479,0.536713
duration_ms,-0.089169,-0.100757,1.0,0.060516,-0.033808,0.103621,0.00202,0.028942,0.019791,-0.046849,0.024717,-0.097838,-0.008182,-0.183199
energy,-0.750852,0.204838,0.060516,1.0,0.102561,-0.17775,0.03578,0.134815,0.779267,-0.05616,0.328939,-0.112616,0.266448,0.326418
explicit,-0.208176,0.200842,-0.033808,0.102561,1.0,-0.130609,0.005282,0.037288,0.106249,-0.062503,0.152545,0.353872,0.008075,-0.009275
instrumentalness,0.221956,-0.215589,0.103621,-0.17775,-0.130609,1.0,-0.004619,-0.047941,-0.317562,-0.056731,-0.300625,-0.133966,-0.068656,-0.219188
key,-0.028028,0.026266,0.00202,0.03578,0.005282,-0.004619,1.0,-0.003368,0.025227,-0.127397,0.001951,0.009648,0.005009,0.025592
liveness,-0.029654,-0.110033,0.028942,0.134815,0.037288,-0.047941,-0.003368,1.0,0.062695,0.001677,-0.078959,0.122034,0.008586,-0.005781
loudness,-0.546639,0.249541,0.019791,0.779267,0.106249,-0.317562,0.025227,0.062695,1.0,-0.01925,0.337194,-0.213504,0.217914,0.30252
mode,0.064633,-0.048358,-0.046849,-0.05616,-0.062503,-0.056731,-0.127397,0.001677,-0.01925,1.0,0.007652,-0.040711,0.002438,0.021592


## Data Transformation

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [10]:
datatypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
normarization = data.select_dtypes(include=datatypes)
for col in normarization.columns:
    MinMaxScaler(col)

In [11]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
features = kmeans.fit_predict(normarization)
data['features'] = features
MinMaxScaler(data['features'])

MinMaxScaler(feature_range=0         9
1         9
2         9
3         5
4         9
         ..
174384    9
174385    9
174386    0
174387    0
174388    9
Name: features, Length: 174389, dtype: int32)

## Driver Code 

In [14]:
class Spotify_Recommendation():
    def __init__(self,dataset):
        self.dataset = dataset
    def recommend(self,songs, amount=1):
        distance = []
        song = self.dataset[(self.dataset.name.str.lower() == songs.lower())].head(1).values[0]
        rec = self.dataset[self.dataset.name.str.lower() != songs.lower()]
        for songs in tqdm(rec.values):
            d = 0 
            for col in np.arange(len(rec.columns)):
                if not col in [1, 6, 12, 14, 18]:
                    d = d + np.absolute(float(song[col])- float(songs[col]))
            distance.append(d)
        rec['distance'] = distance
        rec = rec.sort_values('distance')
        columns = ['artists', 'name']
        return rec[columns][:amount]


In [24]:
recommendations = Spotify_Recommendation(data)
recommendations.recommend("Love Story", 10)

100%|██████████| 174384/174384 [00:09<00:00, 18797.92it/s]


Unnamed: 0,artists,name
173785,['Wallows'],1980s Horror Film II
12358,['Patti LaBelle'],Lady Marmalade
54107,['Creed'],With Arms Wide Open - New Version With Strings
126433,"['Ozuna', 'Daddy Yankee', 'J Balvin', 'Farruko...",Baila Baila Baila - Remix
15682,['Foo Fighters'],Learn to Fly
108596,['Bombay Bicycle Club'],Shuffle
137074,['The Stranglers'],Skin Deep
173443,['Luke Bryan'],What Makes You Country
29949,['David Bowie'],Lady Grinning Soul - 2013 Remaster
74558,"['Kid Cudi', 'CeeLo Green']",Scott Mescudi Vs. The World
