In [26]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
from sklearn.cluster import KMeans
import pandas as pd
from bson.objectid import ObjectId
import IPython
from ipywidgets import interact, interactive, fixed, interact_manual
from math import ceil

In [29]:
# Pull settings (DB connection string, group size, ...) into the process
# environment via python-dotenv, then lift pandas' row-display cap so frames
# are rendered without row truncation.
load_dotenv()
pd.options.display.max_rows = None

In [17]:
# connect to db
# The connection string is read from the DB_CONNECTION_STRING environment
# variable; `songs` is the `songs` collection of the `music_analysis` database.
client = MongoClient(os.getenv('DB_CONNECTION_STRING'))
db = client['music_analysis']
songs = db['songs']


In [18]:
# Sanity check: print how many documents the `songs` collection holds
# (the empty filter {} matches every document).
print(songs.count_documents({}))

1


In [19]:
# Materialise every document returned by songs.find() into a list and build
# one DataFrame from it.
df = pd.DataFrame(list(songs.find()))

In [20]:
# Display one randomly chosen row of the raw frame as a quick look at the columns.
df.sample()

Unnamed: 0,_id,filename,highleveldanceabilityalldanceable,highlevelgenre_electronicallambient,highlevelgenre_electronicalldnb,highlevelgenre_electronicallhouse,highlevelgenre_electronicalltechno,highlevelgenre_electronicalltrance,highlevelgenre_rosamericaallcla,highlevelgenre_rosamericaalldan,...,highlevelmood_partyvalue,highlevelmood_relaxedvalue,highlevelmood_sadvalue,highlevelmoods_mirexvalue,highleveltimbrevalue,highlevelvoice_instrumentalvalue,tonalchords_key,tonalchords_scale,tonalkey_krumhanslkey,tonalkey_krumhanslscale
0,60d3ab4a05a085d7b46686e5,../music/ACO/Reload (The Remix Collection)/01 ...,0.841794,0.075549,0.008549,0.875557,0.021145,0.0192,0.003269,0.026249,...,party,relaxed,not_sad,Cluster4,dark,voice,A,minor,C,major


In [22]:
# drop invalid data
# TODO map strings to numbers
dfCleaned = df.drop(columns=[
    # string values
    '_id',
    'filename',
    'highlevelmood_partyvalue',
    'highlevelmood_relaxedvalue',
    'highlevelmood_sadvalue',
    'highlevelmoods_mirexvalue',
    'highleveltimbrevalue',
    'highlevelgenre_rosamericavalue',
    'highlevelgenre_electronicvalue',
    'highlevelmood_acousticvalue',
    'highlevelmood_aggressivevalue',
    'highleveldanceabilityvalue',
    'highlevelmood_electronicvalue',
    'highlevelmood_happyvalue',
    'highlevelvoice_instrumentalvalue',
    'tonalchords_key',
    'tonalchords_scale',
    'tonalkey_krumhanslkey',
    'tonalkey_krumhanslscale',
    # TODO testing -- temp disable numeric values
    #'rhythmbeats_count',
    #'rhythmbpm',
    'tonaltuning_frequency'
])

# normalize all data to average 100
# Done as one vectorised multiplication instead of a per-column loop:
# DataFrame.iteritems() was removed in pandas 2.0, so the old loop no longer runs.
columnMeans = dfCleaned.mean()
# A column whose mean is 0 cannot be rescaled to mean 100; dividing by it would
# fill the column with inf/NaN and break the clustering step. Such columns
# (and all-NaN columns) get a multiplier of 1, i.e. they are left unchanged.
normalizeMultipliers = (100 / columnMeans.where(columnMeans != 0)).fillna(1.0)
dfCleaned = dfCleaned.mul(normalizeMultipliers, axis='columns')

dfCleaned.sample()

Unnamed: 0,highleveldanceabilityalldanceable,highlevelgenre_electronicallambient,highlevelgenre_electronicalldnb,highlevelgenre_electronicallhouse,highlevelgenre_electronicalltechno,highlevelgenre_electronicalltrance,highlevelgenre_rosamericaallcla,highlevelgenre_rosamericaalldan,highlevelgenre_rosamericaallhip,highlevelgenre_rosamericaalljaz,...,lowleveldissonancemedian,lowleveldynamic_complexity,rhythmbeats_count,rhythmbeats_loudnessmedian,rhythmbpm,rhythmdanceability,rhythmonset_rate,tonalchords_changes_rate,tonalchords_number_rate,tonalchords_strengthmedian
0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [30]:
# cluster songs into groups
# MUSIC_GROUP_SIZE is the target number of songs per cluster. Validate it up
# front so a missing/invalid value fails with a clear message instead of an
# opaque TypeError from int(None) or a ZeroDivisionError below.
group_size = int(os.getenv('MUSIC_GROUP_SIZE') or 0)
if group_size < 1:
    raise ValueError('MUSIC_GROUP_SIZE must be set to a positive integer')

# Cluster only the numeric feature columns. The labelling cell later inserts
# 'id', 'song' and 'group' into dfCleaned; ignoring them here keeps this cell
# re-runnable after that cell has executed.
features = dfCleaned.drop(columns=['id', 'song', 'group'], errors='ignore')

# Derive the cluster count from the rows actually being clustered rather than
# from a second database query, so the two can never disagree.
n_clusters = int(ceil(len(features) / group_size))
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(features)
labels = kmeans.predict(features)

In [31]:
# add labels to results
# DataFrame.insert raises ValueError if the column already exists, so remove
# any label columns left over from a previous run of this cell first. This
# makes the cell safe to re-run (e.g. after re-clustering).
dfCleaned = dfCleaned.drop(columns=['id', 'song', 'group'], errors='ignore')
# Inserting at position 0 each time yields the column order: id, song, group, <features...>
dfCleaned.insert(0, 'group', labels)
dfCleaned.insert(0, 'song', df['filename'])
dfCleaned.insert(0, 'id', df['_id'])

In [36]:
# show a cluster
# Keep only the rows assigned to group 0, then order them ascending by the
# 'tonalchords_strengthmedian' column; the last expression renders the result.
cluster = dfCleaned.loc[dfCleaned['group'] == 0]
sorted_cluster = cluster.sort_values('tonalchords_strengthmedian')
sorted_cluster

Unnamed: 0,id,song,group,highleveldanceabilityalldanceable,highlevelgenre_electronicallambient,highlevelgenre_electronicalldnb,highlevelgenre_electronicallhouse,highlevelgenre_electronicalltechno,highlevelgenre_electronicalltrance,highlevelgenre_rosamericaallcla,...,lowleveldissonancemedian,lowleveldynamic_complexity,rhythmbeats_count,rhythmbeats_loudnessmedian,rhythmbpm,rhythmdanceability,rhythmonset_rate,tonalchords_changes_rate,tonalchords_number_rate,tonalchords_strengthmedian
0,60d3ab4a05a085d7b46686e5,../music/ACO/Reload (The Remix Collection)/01 ...,0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [37]:
# collect songs & only allow one song per artist
# TODO scrape tags instead of using file name
playlist = []
artists = []
for filename in sorted_cluster['song']:
    # File names look like '<root>/<artist>/<album>/<track>', so the artist is
    # the third path component counted from the end. Counting from the end
    # (instead of a fixed index from the start) keeps this independent of how
    # deep the music root is; for '../music/ACO/<album>/<track>' a fixed index
    # of 4 would pick the track file, not the artist.
    parts = filename.split("/")
    # Paths too short to contain an artist directory are keyed by the full
    # file name, so they are never treated as duplicates of each other.
    artist = parts[-3] if len(parts) >= 3 else filename
    #if artist == "Compilations" or artist not in artists:
    if artist not in artists:
        artists.append(artist)
        playlist.append(filename)

# show player
def play(song):
    """Display an inline audio player for one song.

    `song` is passed straight to IPython.display.Audio together with
    rate=44100. The parameter name must stay `song`: `interact` below binds
    the dropdown to it by keyword.
    """
    IPython.display.display(IPython.display.Audio(song, rate=44100))

# Build a dropdown over the playlist; selecting an entry calls play() with it.
interact(play, song=playlist);

interactive(children=(Dropdown(description='song', options=('../music/ACO/Reload (The Remix Collection)/01 Ais…