In [1]:
from pymongo import MongoClient
from sklearn.cluster import KMeans
import pandas as pd
from bson.objectid import ObjectId
import IPython
from ipywidgets import interact, interactive, fixed, interact_manual

In [2]:
# show every row when a DataFrame is rendered (no truncation)
pd.options.display.max_rows = None

In [3]:
# open a client against the MongoDB server and grab the song collection
client = MongoClient('mongodb://mongo:27017/')
db = client['music_analysis']
songs = db['songs']


In [4]:
# Collection.count() is deprecated and was removed in PyMongo 4;
# count_documents({}) with an empty filter counts every document.
print(songs.count_documents({}))

11140


  """Entry point for launching an IPython kernel.


In [5]:
# pull every song document out of the collection and build a DataFrame from them
records = list(songs.find())
df = pd.DataFrame(records)

In [7]:
# peek at one randomly chosen row of the raw data
df.sample(n=1)

Unnamed: 0,_id,filename,highleveldanceabilityalldanceable,highlevelgenre_electronicallambient,highlevelgenre_electronicalldnb,highlevelgenre_electronicallhouse,highlevelgenre_electronicalltechno,highlevelgenre_electronicalltrance,highlevelgenre_rosamericaallcla,highlevelgenre_rosamericaalldan,...,highlevelmood_partyvalue,highlevelmood_relaxedvalue,highlevelmood_sadvalue,highlevelmoods_mirexvalue,highleveltimbrevalue,highlevelvoice_instrumentalvalue,tonalchords_key,tonalchords_scale,tonalkey_krumhanslkey,tonalkey_krumhanslscale
2868,5e4d99274814f92fad32ac71,./../../music/David Bowie/Pin Ups/12 Where Hav...,0.729399,0.116023,0.05006,0.714548,0.012954,0.106415,0.009262,0.024917,...,party,not_relaxed,not_sad,Cluster4,dark,voice,D,minor,G,major


In [8]:
# drop invalid data
# TODO map strings to numbers
# `columns=` states the intent directly instead of relying on axis=1.
dfCleaned = df.drop(columns=[
    # string values
    '_id',
    'filename',
    'highlevelmood_partyvalue',
    'highlevelmood_relaxedvalue',
    'highlevelmood_sadvalue',
    'highlevelmoods_mirexvalue',
    'highleveltimbrevalue',
    'highlevelgenre_rosamericavalue',
    'highlevelgenre_electronicvalue',
    'highlevelmood_acousticvalue',
    'highlevelmood_aggressivevalue',
    'highleveldanceabilityvalue',
    'highlevelmood_electronicvalue',
    'highlevelmood_happyvalue',
    'highlevelvoice_instrumentalvalue',
    'tonalchords_key',
    'tonalchords_scale',
    'tonalkey_krumhanslkey',
    'tonalkey_krumhanslscale',
    # TODO testing -- temp disable numeric values
    #'rhythmbeats_count',
    #'rhythmbpm',
    'tonaltuning_frequency'
])

# normalize all data to average 100
# TODO weight certain values?
# Scale every column so that its mean becomes 100. The per-column multipliers
# are matched to the columns by label, which replaces the loop over
# DataFrame.iteritems() (removed in pandas 2.0).
dfCleaned = dfCleaned * (100 / dfCleaned.mean())

dfCleaned.sample()

Unnamed: 0,highleveldanceabilityalldanceable,highlevelgenre_electronicallambient,highlevelgenre_electronicalldnb,highlevelgenre_electronicallhouse,highlevelgenre_electronicalltechno,highlevelgenre_electronicalltrance,highlevelgenre_rosamericaallcla,highlevelgenre_rosamericaalldan,highlevelgenre_rosamericaallhip,highlevelgenre_rosamericaalljaz,...,lowleveldissonancemedian,lowleveldynamic_complexity,rhythmbeats_count,rhythmbeats_loudnessmedian,rhythmbpm,rhythmdanceability,rhythmonset_rate,tonalchords_changes_rate,tonalchords_number_rate,tonalchords_strengthmedian
4541,82.404192,183.890706,82.803136,38.141059,100.664256,25.708395,10.354466,8.640893,138.234582,88.974957,...,98.410851,98.453002,72.942147,66.95126,79.063285,116.297675,93.261213,71.632119,36.317634,93.061561


In [9]:
# cluster data
# TODO how to determine optimal # of clusters?
# Fit on the feature columns only: a later cell inserts the 'id', 'song' and
# 'group' columns into dfCleaned, so without this guard re-running this cell
# would try to cluster on strings and on stale cluster labels.
features = dfCleaned.drop(columns=['id', 'song', 'group'], errors='ignore')
kmeans = KMeans(n_clusters=250, random_state=0).fit(features)
labels = kmeans.predict(features)

In [10]:
# add labels to results
# Drop label columns left over from a previous run of this cell first:
# DataFrame.insert raises ValueError when the column already exists.
dfCleaned = dfCleaned.drop(columns=['id', 'song', 'group'], errors='ignore')
# Each insert goes to position 0, so the final leading order is id, song, group.
dfCleaned.insert(0, 'group', labels)
dfCleaned.insert(0, 'song', df['filename'])
dfCleaned.insert(0, 'id', df['_id'])

In [15]:
# pick out the songs of one cluster and order them by chord strength
in_cluster = dfCleaned['group'] == 69
cluster = dfCleaned[in_cluster]
sorted_cluster = cluster.sort_values('tonalchords_strengthmedian')
sorted_cluster

Unnamed: 0,id,song,group,highleveldanceabilityalldanceable,highlevelgenre_electronicallambient,highlevelgenre_electronicalldnb,highlevelgenre_electronicallhouse,highlevelgenre_electronicalltechno,highlevelgenre_electronicalltrance,highlevelgenre_rosamericaallcla,...,lowleveldissonancemedian,lowleveldynamic_complexity,rhythmbeats_count,rhythmbeats_loudnessmedian,rhythmbpm,rhythmdanceability,rhythmonset_rate,tonalchords_changes_rate,tonalchords_number_rate,tonalchords_strengthmedian
5405,5e4efef47ac2d042041f602b,./../../music/DJ Shadow/Camel Bobsled Race (Q-...,69,128.902044,199.52772,48.682147,11.145371,132.767758,56.717706,17.968441,...,105.171649,106.554461,591.794781,82.817241,76.244943,116.802867,119.209371,141.755903,17.183153,92.698259
7532,5e517d1e3f2129c97e1601dc,./../../music/Compilations/My CD/01 Track01.mp3,69,144.04279,205.701918,42.128168,11.638698,99.149145,49.12651,6.427857,...,104.420241,84.906932,635.835322,116.917269,82.036374,97.491132,123.863567,149.44741,13.837659,95.558085
1048,5e4d0f3d4814f92fad32a555,./../../music/LTJ Bukem/2 Step Drum & Bass/00.m4a,69,130.413417,201.277491,48.126135,10.985342,130.363405,52.778191,5.134595,...,101.738037,97.73709,589.435466,58.854475,73.366769,106.934457,110.574431,92.138334,15.294254,96.089654
1046,5e4d0e004814f92fad32a553,./../../music/LTJ Bukem/2 Step Drum & Bass/00 ...,69,141.829048,191.244794,59.743906,12.734238,141.109736,70.980563,7.492989,...,101.751795,59.730231,601.03543,93.14032,73.414222,100.313459,105.574077,89.951458,14.565957,96.51906
1216,5e4d1b414814f92fad32a5fd,./../../music/PghElectro/Almost Summer/01 Five...,69,142.112791,206.578928,48.676775,9.054414,92.440348,53.524812,10.340038,...,104.029233,70.989339,640.750561,75.953393,72.838379,96.081355,124.811903,125.874144,17.479149,97.147366
3997,5e4e0cb47ac2d042041f5aab,./../../music/DJ Shadow & Cut Chemist/Product ...,69,129.950483,225.493849,33.244368,4.379137,43.065862,29.74231,27.444704,...,103.805391,96.121841,665.916585,65.708922,86.873708,104.139862,123.327071,127.032131,19.817094,97.483513
7467,5e4fe255eeed0903c9e62918,./../../music/Compilations/Breathe Me - Single...,69,143.833556,227.339368,37.399904,4.077301,38.388753,24.952469,1.046295,...,100.271302,105.900156,781.523008,194.702345,101.653527,117.05613,124.035982,97.090973,14.565957,97.677622
1675,5e4d40a44814f92fad32a7c8,"./../../music/andhim/Body Language, Vol. 14/01...",69,143.833573,227.339368,37.399904,4.077301,38.388753,24.952469,1.046302,...,100.271302,105.900156,781.523008,194.702345,101.653527,117.05613,124.035982,97.090973,14.565957,97.677622
1215,5e4d1aa04814f92fad32a5fc,./../../music/PghElectro/Almost Summer/01 Almo...,69,95.784173,223.859432,31.90047,4.966513,49.723994,31.487334,25.784337,...,99.800296,75.83241,659.035251,82.230539,77.850142,98.689205,111.399612,88.890175,14.565957,99.149995
8937,5e5221ef256d2fedcab8f439,./../../music/The Allman Brothers Band/The Fil...,69,88.19636,233.567827,42.384474,2.32649,20.140227,14.790515,10.014537,...,103.040458,71.313786,911.481928,61.009143,129.331362,90.17509,102.975366,84.805859,10.19617,99.833461


In [16]:
# collect songs & only allow one song per artist
# TODO scrape tags instead of using file name
# TODO maybe allow several tracks from the "Compilations" folder
playlist = []
artists = []
seen_artists = set()  # O(1) membership test; `artists` keeps first-seen order
# Only the 'song' column is needed, so iterate it directly instead of
# building a full row Series per song with iterrows().
for filename in sorted_cluster['song']:
    # Paths look like ./../../music/<artist>/..., so the artist directory
    # is the fifth "/"-separated component.
    artist = filename.split("/")[4]
    if artist in seen_artists:
        continue
    seen_artists.add(artist)
    artists.append(artist)
    playlist.append(filename)

# interactive player: choose a song from the playlist to get an audio widget
def play(song):
    """Display an inline audio player for the selected playlist entry."""
    player = IPython.display.Audio(song, rate=44100)
    IPython.display.display(player)

interact(play, song=playlist);

interactive(children=(Dropdown(description='song', options=('./../../music/DJ Shadow/Camel Bobsled Race (Q-Ber…