# Spotify Graph Data Pre-Processing

In [1]:
# import
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from numpy.linalg import norm

In [2]:
# load the Spotify tracks table; the first CSV column is used as the row index
spotify_csv_path = 'spotify.csv'
df_original = pd.read_csv(spotify_csv_path, index_col=0)

# preview the top of the table
df_original.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


Below, we take a random sample of 1000 rows from the original dataframe. We then add the three Regina Spektor songs and reset the index.

In [3]:
# sample 1000 rows (random_state is fixed so the same rows are drawn on every run)
sampled_songs = df_original.sample(n=1000, random_state=25)

# add songs by artist Regina Spektor
# (exact match on the artists column, so tracks credited to several artists are not selected)
regina_spektor_songs = df_original[df_original['artists'] == 'Regina Spektor']

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat stacks the two frames the same way, and reset_index(drop=True)
# renumbers the rows 0..n-1 so the index can serve as a song id later on
df = pd.concat([sampled_songs, regina_spektor_songs]).reset_index(drop=True)

# remove duplicate songs based on song name and artist (currently disabled)
# df = df.drop_duplicates(subset=['artists', 'track_name'])


  df = df.append(df_original[df_original['artists'] == 'Regina Spektor']).reset_index(drop=True)


In [4]:
# columns fed into the similarity computation
# (explicit, mode, time_signature and track_genre are left out)
features = [
    'popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]

# min-max scale the feature columns; df itself is left untouched and the
# scaled values are stored in a copy
scaler = MinMaxScaler()
scaled_feature_values = scaler.fit_transform(df[features])
norm_df = df.copy()
norm_df[features] = scaled_feature_values

# show data
norm_df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5DVFVF9v4XZqzP5LdaZP0Q,Official HIGE DANdism,One-Man Tour 2021-2022 -Editorial-@Saitama Sup...,Hello - LIVE,0.305882,0.065666,False,0.515088,0.881877,0.090909,0.815867,0,0.037461,0.013453,0.0,0.309632,0.55317,0.602061,4,anime
1,4rMqvswE1TVZ8ta1hE6QJU,Hank Williams,"The Garden Spot Programs, 1950",I Can't Get You off My Mind,0.188235,0.033552,False,0.49948,0.266235,0.0,0.785385,1,0.040479,0.939759,5e-05,0.595907,0.810838,0.695822,4,honky-tonk
2,2iA4neLS2rg1dlciGgmt4X,Megadeth,Release Your ANGER,Symphony Of Destruction,0.0,0.05571,False,0.634755,0.889885,0.363636,0.832655,0,0.060874,0.000796,0.0865,0.046817,0.517382,0.663167,4,hard-rock
3,4QtkuEXUmPI29GYjm3vILr,Kvatro,Ты моя Москва,Ты моя Москва,0.070588,0.042809,False,0.775234,0.640625,0.090909,0.740747,1,0.052862,0.233935,0.0,0.203183,0.793456,0.42656,4,romance
4,35OKb5Kp34H1DHPU5JtIbE,DeadSquad,Horror Vision (original version 2009),Dominasi Belati,0.352941,0.050094,True,0.331946,0.949948,0.090909,0.786799,1,0.166493,0.033031,8e-05,0.503927,0.270961,0.664523,4,death-metal


In [5]:
# convert important features to a numpy array (one row per song, one column per feature)
# the array gets its own name so that norm_df stays a DataFrame: rebinding norm_df
# to the array made this cell fail when run a second time
feature_matrix = norm_df[features].to_numpy()

# calculate cosine similarity between every pair of rows (songs)
cosine_sim = cosine_similarity(X=feature_matrix)

# show cosine similarity
cosine_sim

array([[1.        , 0.76759422, 0.95068639, ..., 0.56980903, 0.80584581,
        0.59701575],
       [0.76759422, 1.        , 0.69925756, ..., 0.7270593 , 0.86252394,
        0.70495341],
       [0.95068639, 0.69925756, 1.        , ..., 0.58608707, 0.71990279,
        0.62240898],
       ...,
       [0.56980903, 0.7270593 , 0.58608707, ..., 1.        , 0.82150441,
        0.98877418],
       [0.80584581, 0.86252394, 0.71990279, ..., 0.82150441, 1.        ,
        0.79981538],
       [0.59701575, 0.70495341, 0.62240898, ..., 0.98877418, 0.79981538,
        1.        ]])

In [6]:
# add song as edge to song if cosine similarity is greater than 0.8 and edge does not already exist in other direction
min_similarity = 0.8
# pairs at or above this value are skipped; presumably they are near-identical
# duplicates of the same track - TODO confirm
max_similarity = 0.999

# k=1 selects only the entries above the diagonal, so each pair (i, j) with i < j is
# visited exactly once: no self-loops and no second edge in the opposite direction
# (the previous double loop emitted both (i, j) and (j, i) for every pair)
source_idx, target_idx = np.triu_indices(len(cosine_sim), k=1)
pair_similarity = cosine_sim[source_idx, target_idx]
keep = (pair_similarity > min_similarity) & (pair_similarity < max_similarity)

# list of (source, target, similarity) tuples, ordered by source and then target
edges = list(zip(
    source_idx[keep].tolist(),
    target_idx[keep].tolist(),
    pair_similarity[keep].tolist(),
))

# write edges to csv where each row is each element in edges
with open('edges.csv', 'w') as f:
    # header
    f.write('source,target,similarity\n')

    # write each edge
    f.writelines(f'{source},{target},{similarity}\n' for source, target, similarity in edges)

In [7]:
# save the song table to nodes.csv; the row index is written as the first
# column under the header "song_idx"
df.to_csv(path_or_buf='nodes.csv', index_label='song_idx')