<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Preliminaries" data-toc-modified-id="Preliminaries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preliminaries</a></span><ul class="toc-item"><li><span><a href="#Data_SRC" data-toc-modified-id="Data_SRC-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Data_SRC</a></span></li><li><span><a href="#Load-libraries" data-toc-modified-id="Load-libraries-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load libraries</a></span></li></ul></li><li><span><a href="#Demo-of-process_mpd" data-toc-modified-id="Demo-of-process_mpd-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Demo of process_mpd</a></span><ul class="toc-item"><li><span><a href="#counting-the-number-of-playlist" data-toc-modified-id="counting-the-number-of-playlist-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>counting the number of playlist</a></span></li><li><span><a href="#get-unique-tracks-in-all-playlist" data-toc-modified-id="get-unique-tracks-in-all-playlist-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>get unique tracks in all playlist</a></span></li><li><span><a href="#create-random-subset-of-1%-of-all-playlists" data-toc-modified-id="create-random-subset-of-1%-of-all-playlists-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>create random subset of 1% of all playlists</a></span></li></ul></li><li><span><a href="#Get-audio-features" data-toc-modified-id="Get-audio-features-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Get audio features</a></span><ul class="toc-item"><li><span><a href="#setup" data-toc-modified-id="setup-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>setup</a></span></li><li><span><a href="#Read-tracks" data-toc-modified-id="Read-tracks-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Read tracks</a></span></li><li><span><a href="#spotify-web-api" data-toc-modified-id="spotify-web-api-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>spotify-web api</a></span></li></ul></li></ul></div>

# Preliminaries
## Data_SRC

In [2]:
#location of the dataset
#DATA_SRC='/home/beangoben/ml_data/mpd.v1/data'
DATA_SRC='../data/raw/5k_subset'
# add src scripts to utilize
import sys
sys.path.append('../src')
sys.path.append('../src/data/')
sys.path.append('../src/models/')
sys.path.append('../src/features/')
sys.path.append('../src/visualization/')
# will reload any library
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
X[:50,:]

## Load libraries

In [3]:
import os
import json
# utility
from tqdm import tqdm_notebook as tqdm
# scientific python
import numpy as np
import pandas as pd
# nice printing
from pprint import pprint
# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Demo of process_mpd

Located in data/data_utils.py, **process_mpd** will take a directory path, a function to operate on a playlist and a object to store results (could be a dataframe or a dict for example). 

Optional is a *max_n* (max number of slices to operate on) and use *rand=False* for random or sorted files.


## counting the number of playlist

In [37]:
from data_utils import process_mpd

def count_playlist(playlist,results):
    results['count']=results['count']+1
    return

In [None]:
results={}
results['count']=0
process_mpd(DATA_SRC,count_playlist,results,max_n=10,rand=True)
pprint(results)

## get unique tracks in all playlist

In [9]:
from data_utils import process_mpd, normalize_name

def unique_trackids(playlist,results):
    pid = playlist['pid']
    for track in playlist['tracks']:
        results.append({'title':track['track_name'],'artist':track['artist_name'],'uri':track['track_uri'],'pid':pid})
    return

In [10]:
results=[]
process_mpd(DATA_SRC,unique_trackids,results,max_n=5)
print(len(results))
#pd.DataFrame({'track_uri':list(results)}).to_csv('../data/interim/5k_track_uri.csv',index=False,compression='gzip')

335094


In [16]:
df = pd.DataFrame(results)
print(df.shape)
df = df.drop_duplicates('uri')
print(df.shape)
df.to_csv('../data/interim/5k_track_uri.csv',index=False,compression='gzip')

(335094, 4)
(107781, 4)


## create random subset of 1% of all playlists

In [37]:
from data_utils import process_mpd
import random

def coinflip(percent=0.001):
    return random.uniform(0, 1) < percent

def random_subset(playlist,results):
    if coinflip(1):
        results.append(playlist)
    return

In [None]:
results=[]
process_mpd(DATA_SRC,random_subset,results)
print(len(results))
pprint(results[-1])

 86%|████████▋ | 863/1000 [06:15<00:59,  2.30it/s]

In [27]:
a_dict={"info": {
        "generated_on": "2017-12-03 08:41:42.057563", 
        "slice": "random subset of 0.1% of playlist", 
        "version": "v1"
    }, "playlists": results}
with open('../data/intetim/random_01_percent.json','w') as afile:
    afile.write(json.dumps(a_dict, indent=4))

# Get audio features
## setup

In [5]:
import spotify
import pandas as pd

## Read tracks

In [71]:
df = pd.read_csv('../data/interim/5k_track_uri.csv')
df['csv_id']=df.index.tolist()
print(df.columns)
print(df.shape)
df.head(1)

Index(['artist', 'pid', 'title', 'uri', 'csv_id'], dtype='object')
(108870, 5)


Unnamed: 0,artist,pid,title,uri,csv_id
0,missy elliott,0,lose control feat ciara & fat man scoop,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,0


## spotify-web api

In [80]:
def get_audio_features(chunk,sp):
    tids = chunk['uri'].apply(lambda x: x.replace('spotify:track:','')).tolist()
    afeatures = sp.audio_features(tids)
    return afeatures

def get_popularity(chunk,sp):
    tids = chunk['uri'].apply(lambda x: x.replace('spotify:track:','')).tolist()
    tracks = sp.tracks(tids)
    pop =[{'popularity':t['popularity']} for t in tracks['tracks']]
    return pop

def get_artist(chunk,sp):
    tids = chunk['uri'].apply(lambda x: x.replace('spotify:track:','')).tolist()
    tracks = sp.tracks(tids)
    arts =[{'artist_uri':t['artists'][0]['uri']} if t is not None else {'artitst_uri':None}  for t in tracks['tracks']]
    return arts

def get_artist_genres(chunk,sp):
    tids = chunk['artist_uri'].apply(lambda x: x.replace('spotify:track:','')).tolist()
    arts = sp.artists(tids)
    genres =[{'genres':a['genres']} if a is not None else {'genres':None}  for a in arts['artists']]
    return genres

In [83]:
rows = spotify.df_chunk_apply(df, 50, get_artist)
spotify.replace_none(rows)
a_df = pd.DataFrame(rows)




[{'artist_uri': 'spotify:artist:2wIVse2owClT7go1WT98tk'},
 {'artist_uri': 'spotify:artist:26dSoYclwsYLMAKD3tpOr4'},
 {'artist_uri': 'spotify:artist:6vWDO969PvNqNYHIOW5v0m'},
 {'artist_uri': 'spotify:artist:31TPClRtHm23RisEBtV3X7'},
 {'artist_uri': 'spotify:artist:5EvFsr3kj42KNv97ZEnqij'},
 {'artist_uri': 'spotify:artist:23zg3TcAtWQy7J6upgbUnj'},
 {'artist_uri': 'spotify:artist:23zg3TcAtWQy7J6upgbUnj'},
 {'artist_uri': 'spotify:artist:6wPhSqRtPu1UhRCDX5yaDJ'},
 {'artist_uri': 'spotify:artist:1Y8cdNmUJH7yBTd9yOvr5i'},
 {'artist_uri': 'spotify:artist:1G9G7WwrXka3Z1r7aIDjI7'},
 {'artist_uri': 'spotify:artist:2jw70GZXlAI8QzWeY2bgRc'},
 {'artist_uri': 'spotify:artist:2Hjj68yyUPiC0HKEOigcEp'},
 {'artist_uri': 'spotify:artist:2Hjj68yyUPiC0HKEOigcEp'},
 {'artist_uri': 'spotify:artist:2Hjj68yyUPiC0HKEOigcEp'},
 {'artist_uri': 'spotify:artist:27FGXRNruFoOdf1vP8dqcH'},
 {'artist_uri': 'spotify:artist:0f5nVCcR06GX8Qikz0COtT'},
 {'artist_uri': 'spotify:artist:0p4nmQO2msCgU4IF37Wi3j'},
 {'artist_uri'

In [87]:
n=10
[ (i,j) for i in range(n) for j in range(i+1,n)]

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (1, 9),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (2, 8),
 (2, 9),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (4, 5),
 (4, 6),
 (4, 7),
 (4, 8),
 (4, 9),
 (5, 6),
 (5, 7),
 (5, 8),
 (5, 9),
 (6, 7),
 (6, 8),
 (6, 9),
 (7, 8),
 (7, 9),
 (8, 9)]

In [84]:

def divirsity(plist,dist_f,norm_f):
    """ diversity of a playlist
    
    Common choices for dist_f are inverse cosine similarity, inverse
    Pearson correlation, or Hamming distance
    """"
    n = len(plist)
    sum_d = [ dist_f(plist[i],plist[j]) for i in range(n) for j in range(i+1,n)]
    p_norm = norm_f(plist)
    val = sum_df/(p_norm*(p_norm-1))

    return val

SyntaxError: invalid syntax (<ipython-input-84-d05056eb7c96>, line 1)