In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from os import listdir
from os.path import isfile, join

import time

sns.set()

In [2]:
# Load master song table with added metadata
master = pd.read_pickle('../data/master200.pkl')

In [3]:
print(master.shape)
display(master.head())

(999950, 25)


Unnamed: 0_level_0,artist_name,artist_uri,track_name,album_uri,duration_ms,album_name,count,track_uri,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,artist_genres,artist_popularity,album_genres,album_popularity,album_release_date
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Sidney Bechet's Blue Note Jazzmen,spotify:artist:2XouUSO0EAJ9gMMoHiXqMt,Muskrat Ramble,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,220293,Jazz Classics,1,spotify:track:0002yNGLtYSYtc0X6ZnFvp,0.455,0.623,...,0.903,0.634,0.951,182.345,4,[],18,[],37,1993-01-01
159583,Sidney Bechet,spotify:artist:1RsmXc1ZqW3WBs9iwxiSwk,Blue Horizon,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,264933,Jazz Classics,5,spotify:track:1EWPMNHfdVNJwBpG9BcxXB,0.327,0.372,...,0.835,0.153,0.38,66.036,4,"['bebop', 'big band', 'cool jazz', 'dixieland'...",52,[],37,1993-01-01
271702,Sidney Bechet,spotify:artist:1RsmXc1ZqW3WBs9iwxiSwk,Blame It On The Blues - Alternate Take,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,175893,Jazz Classics,1,spotify:track:26N4Y48EjprAtvlY6yWZTA,0.574,0.606,...,0.948,0.349,0.965,101.361,4,"['bebop', 'big band', 'cool jazz', 'dixieland'...",52,[],37,1993-01-01
445190,Sidney Bechet,spotify:artist:1RsmXc1ZqW3WBs9iwxiSwk,Summertime,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,251906,Jazz Classics,16,spotify:track:3RlJx8xwZEyToSuGrygilr,0.608,0.138,...,0.908,0.0853,0.318,83.124,4,"['bebop', 'big band', 'cool jazz', 'dixieland'...",52,[],37,1993-01-01
626275,Sidney Bechet,spotify:artist:1RsmXc1ZqW3WBs9iwxiSwk,Dear Old Southland,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,243693,Jazz Classics,1,spotify:track:4qwAa1rOm8iaegHzoM1b31,0.4,0.32,...,0.842,0.195,0.613,86.186,4,"['bebop', 'big band', 'cool jazz', 'dixieland'...",52,[],37,1993-01-01


Convert `album_release_year` to a continuous number of the release year. The day of month or month itself of the release date is likely much less important. Should ideally treat as categorical, but this would give a *lot* of levels and not unreasonble to treat year as ordinal. 

In [4]:
master['album_release_year'] = [reldate[0:4] for reldate in master['album_release_date']]

Several columns contain text (should be categorical):

In [5]:
master.dtypes

artist_name            object
artist_uri             object
track_name             object
album_uri              object
duration_ms             int64
album_name             object
count                   int64
track_uri              object
danceability          float64
energy                float64
key                     int64
loudness              float64
mode                    int64
speechiness           float64
acousticness          float64
instrumentalness      float64
liveness              float64
valence               float64
tempo                 float64
time_signature          int64
artist_genres          object
artist_popularity       int64
album_genres           object
album_popularity        int64
album_release_date     object
album_release_year     object
dtype: object

Remove columns that we do not need:
- `album_genres` is always empty. Seems like a field that used to exist, but no longer does.
- `track_uri` is the url to the track. This is unique to each song and cannot be used to model. It is saved in the stored dataframe for reference if we need it later.
- `album_uri` for the same reason.
- `artist_uri` for the same reason.

Until we can effecrtively one-hot encode as categoricals, any remaining predictors of type `object` can be dropped, so that we only have numeric features.

In [6]:
master.drop(labels = list(master.columns[np.where(master.dtypes == np.dtype('O'))]), 
            axis = 1, inplace = True)

In [7]:
master.dtypes

duration_ms            int64
count                  int64
danceability         float64
energy               float64
key                    int64
loudness             float64
mode                   int64
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
time_signature         int64
artist_popularity      int64
album_popularity       int64
dtype: object

There are some missing values here (throws an error if we try to fit a model). Drop rows with NA, as the remaining rows are those we intend to use.

In [17]:
master.dropna(axis = 0, inplace = True)

No point in running cross-validation here since we have unlabelled data. Hence, simple train test split not for the purpose of evaluating out-of-sample (since we have no labels), but so we can subjectively evaluate how our unsupervised methods and/or algorithms for playlist generation perform on unseen data (as unsupervised still does fit to the data it sees). Since we have a million songs and have to evaluate out-of-sample manually, a specific number of, say, $500$ songs held out is plenty.

In [18]:
test_size = 500

#test_ind = np.random.choice(np.arange(int(0.05*len(master))), test_size, replace = False)
test_songids = np.random.choice(master.index, test_size, replace = False)
train_songids = list(set(master.index) - set(test_songids))

# Verify that it works
assert set(np.append(train_songids, test_songids)) == set(master.index)

# Split up into train and test
train_df = master.loc[train_songids, :]
test_df = master.loc[test_songids, :]

print(train_df.shape, len(train_songids))
print(test_df.shape, len(test_songids))

(999449, 16) 999449
(500, 16) 500


### Min-Max scaling
Before we can pass the data through distance-based algorithm, we need to scale the data.

In [19]:
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()

train = minmax.fit_transform(np.array(train_df))
test = minmax.transform(np.array(test_df))

## K-Means Clustering
The number of clusters here maps directly to the number of families of playlists we can generate. Every song will be predicted to belong to a cluster, and the rest of the playlist generated with songs from that cluster.

In [25]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 10, random_state = 42, max_iter = 300, verbose = 0)

kmeans.fit(train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [62]:
# Verify that model did in fact converge
assert kmeans.n_iter_ < 300

# Verify we got a prediction for every observation in train
assert len(kmeans.labels_) == train.shape[0]

# Sum of squared distances of samples (train) to their closest cluster center
# kmeans.inertia_

# Centers of the clusters in the n-dimensions (n features we have)
cluster_centers = kmeans.cluster_centers_

train_labels = kmeans.labels_

train_labels

array([9, 2, 3, ..., 9, 1, 4], dtype=int32)

In [63]:
# Predict cluster of a new song
kmeans.predict(test[0:1])

array([6], dtype=int32)

In [64]:
# Get distance of each observation (row) to each cluster center (dimension/col)
assert np.argmin(kmeans.transform(test)[0, :]) == kmeans.predict(test[0:1,:])[0]

kmeans.transform(test)

array([[1.19418679, 1.39192074, 1.03917361, ..., 1.4095347 , 1.06730364,
        0.89983469],
       [1.49107138, 0.59710045, 1.39793652, ..., 0.85533106, 0.95907279,
        1.77437557],
       [1.00532207, 1.35506272, 0.70172183, ..., 1.22171067, 1.09147292,
        0.99120915],
       ...,
       [1.38477707, 0.65237897, 1.126919  , ..., 0.48515253, 0.9457774 ,
        1.77153209],
       [1.28888554, 1.34009808, 1.17008394, ..., 1.47127084, 1.43972095,
        1.52173539],
       [1.0275919 , 1.2998571 , 0.50306534, ..., 1.12705314, 1.41706658,
        1.4914206 ]])

### Use K-Means to generate playlist from cold start

Given a set of start songs for the cold start, there are some potential ways of populating a playlist. For each of those songs, predict the cluster to which it belongs, then either:
- Pull songs from the mode cluster (i.e. the most common), if tied pick a random cluster and get all new songs from there.
- **Our implementation**: Pull randomly from one of the predicted clusters each time, with probability weighted equal to the number of times each cluster was predicted.
- Instead of predicting the single cluster of each start song, get its distance to all clusters, sum them up, average them, and use that to determine the closet cluster(s) of the songs as a group, then pull from either the closest one or closest $n$ with some weighting scheme (e.g. by distance).

### TODO: implement algorithm for cold start

In [122]:
# Testing purposes
start_songs = [test[i:i+1, :] for i in range(3)]
#start_songs = test[0:3, :]

play_len = 20

In [126]:
# Ensure proper dimensions of songs: handles list of individual songs or set of songs
start_clusters = kmeans.predict(np.reshape(start_songs, (len(start_songs), -1)))
print('Starting songs clusters:', start_clusters)

# Get count of each start cluster and its respective weight of total
start_cluster, cluster_count = np.unique(start_clusters, return_counts = True)
cluster_weight = cluster_count / np.sum(cluster_count)
print('Cluster, Counts, Weights:', start_cluster, cluster_count, cluster_weight)

new_song_clusters = np.random.choice(start_cluster, size = play_len, p = cluster_weight)
print('New song clusters:', new_song_clusters)

Starting songs clusters: [6 1 6]
Cluster, Counts, Weights: [1 6] [1 2] [0.33333333 0.66666667]
New song clusters: [6 6 1 6 6 6 6 6 6 6 1 6 1 6 1 6 6 6 6 6]


In [None]:
def create_playlist(start_songs, playlist_length = 20):
    pass

### TODO: t-sne visualisation of fit clusters