In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [28]:
data = pd.read_csv("spotify.csv", index_col=0)

In [29]:
# Removes songs longer than 9 minutes so duration is more normally distributed
data = data.loc[data['duration_ms'] <= 540000]

# Converts popularity, tempo, and duration to percentages
cols = ["popularity", "tempo", "duration_ms"]
for c in cols:
    max_ = max(data[c])
    data[c] = data[c].apply(lambda x: x/max_)

# Converts loudness to a percentage
max_loud = max(data["loudness"])
min_loud = min(data["loudness"])
data["loudness"] = data["loudness"].apply(lambda x: (x - max_loud)/min_loud)

# Converts artists to a list
data["artists"] = data["artists"].apply(lambda x: str(x).split(";"))

In [30]:
data.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,[Gen Hoshino],Comedy,Comedy,0.73,0.427159,False,0.676,0.461,1,0.227696,0,0.143,0.0322,1e-06,0.358,0.715,0.361245,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,[Ben Woodward],Ghost (Acoustic),Ghost - Acoustic,0.55,0.277056,False,0.42,0.166,1,0.439462,1,0.0763,0.924,6e-06,0.101,0.267,0.318397,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,"[Ingrid Michaelson, ZAYN]",To Begin Again,To Begin Again,0.57,0.390419,False,0.438,0.359,0,0.288022,1,0.0557,0.21,0.0,0.117,0.12,0.313643,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,[Kina Grannis],Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,0.71,0.37395,False,0.266,0.0596,0,0.465305,1,0.0363,0.905,7.1e-05,0.132,0.143,0.746758,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,[Chord Overstreet],Hold On,Hold On,0.82,0.368246,False,0.618,0.443,2,0.286952,1,0.0526,0.469,0.0,0.0829,0.167,0.492863,4,acoustic


In [33]:
# Returns the similarity between 2 artist lists
def score_artists(a1, a2):
    return len(set(a1).intersection(set(a2)))

# Returns a value of the similarity between 2 rows
def row_similarity(r1, r2):
    # If the tows are the same row, returns 0
    if r1["track_id"] == r2["track_id"]:
        return 0
    else:
        score = 0
        score += score_artists(r1["artists"], r2["artists"])
        
        # A dict for determining the type of each field. True will cause it to be evauluated as a float, false as a category
        field_type = {"album_name":False, "popularity":True, "duration_ms":True, "explicit":False, "danceability":True,"energy":True, 
                      "key":False, "loudness":True, "speechiness":True, "acousticness":True, "instrumentalness":True,
                     "tempo":True, "time_signature":False, "track_genre":False}
        
        # Scores the values based on their similarity
        for field in field_type.keys():
            # If the value is a float, assumes it is a percentage and calculates similarity by taking the difference
            if field_type[field]:
                score += 1 - abs(r1[field] - r2[field])
            # Otherwise, simply does quality of the two fields
            elif r1[field] == r2[field]:
                score += .9
        
    return score

In [58]:
# Iterates through the df, trying every combination of rows. Creates anedge between 2 rows if the row similarity is 
# above a given threshold. The default threshold of 12 creates about 8 edges for every node, lowering the threshold will
# increase this number. This function has a really long runtime for larger values
def score_all(df, score_threshold=12):
    scores = {}
    
    for i1, r1 in df.iterrows():
        for i2, r2 in df.iterrows():
            score = row_similarity(r1,r2)
            
            if score > score_threshold:
                scores[(i1, i2)] = round(score,3)
                
    return scores

In [59]:
scors = score_all(data.head(1000))

In [60]:
print(len(scors))

7948
