In [32]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from os import listdir
from os.path import isfile, join

import time

sns.set()

In [33]:
%matplotlib inline

In [34]:
# Load master song table with added metadata
# NOTE(review): unpickling can execute code embedded in the file, so this path
# should only ever point at a pickle produced by this project's own pipeline.
master = pd.read_pickle('../data/master200.pkl')

In [35]:
print(master.shape)
display(master.head())

(999950, 25)


Unnamed: 0_level_0,artist_name,artist_uri,track_name,album_uri,duration_ms,album_name,count,track_uri,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,artist_genres,artist_popularity,album_genres,album_popularity,album_release_date
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Sidney Bechet's Blue Note Jazzmen,spotify:artist:2XouUSO0EAJ9gMMoHiXqMt,Muskrat Ramble,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,220293,Jazz Classics,1,spotify:track:0002yNGLtYSYtc0X6ZnFvp,0.455,0.623,...,0.903,0.634,0.951,182.345,4,[],18,[],37,1993-01-01
159583,Sidney Bechet,spotify:artist:1RsmXc1ZqW3WBs9iwxiSwk,Blue Horizon,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,264933,Jazz Classics,5,spotify:track:1EWPMNHfdVNJwBpG9BcxXB,0.327,0.372,...,0.835,0.153,0.38,66.036,4,"['bebop', 'big band', 'cool jazz', 'dixieland'...",52,[],37,1993-01-01
271702,Sidney Bechet,spotify:artist:1RsmXc1ZqW3WBs9iwxiSwk,Blame It On The Blues - Alternate Take,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,175893,Jazz Classics,1,spotify:track:26N4Y48EjprAtvlY6yWZTA,0.574,0.606,...,0.948,0.349,0.965,101.361,4,"['bebop', 'big band', 'cool jazz', 'dixieland'...",52,[],37,1993-01-01
445190,Sidney Bechet,spotify:artist:1RsmXc1ZqW3WBs9iwxiSwk,Summertime,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,251906,Jazz Classics,16,spotify:track:3RlJx8xwZEyToSuGrygilr,0.608,0.138,...,0.908,0.0853,0.318,83.124,4,"['bebop', 'big band', 'cool jazz', 'dixieland'...",52,[],37,1993-01-01
626275,Sidney Bechet,spotify:artist:1RsmXc1ZqW3WBs9iwxiSwk,Dear Old Southland,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,243693,Jazz Classics,1,spotify:track:4qwAa1rOm8iaegHzoM1b31,0.4,0.32,...,0.842,0.195,0.613,86.186,4,"['bebop', 'big band', 'cool jazz', 'dixieland'...",52,[],37,1993-01-01


There are some missing values here (throws an error if we try to fit a model). Drop rows with NA, as the remaining rows are those we intend to use.

In [36]:
# Discard every song (row) with at least one missing value and report how
# many rows that removed.
rows_before = master.shape[0]
master.dropna(axis = 'index', how = 'any', inplace = True)
print('Removed', rows_before - master.shape[0], 'observations')

Removed 9 observations


Convert `album_release_date` to `album_release_year`, the release year as a number. The day or month of the release date is likely much less important. The year should ideally be treated as categorical, but this would give a *lot* of levels, and it is not unreasonable to treat year as ordinal.

In [37]:
# Keep only the year from the release date string: its first four characters
# (e.g. '1993-01-01' -> 1993), stored as an integer column.
master['album_release_year'] = master['album_release_date'].str[:4].astype('int')
# `columns=` replaces the positional axis argument (`drop([...], 1, ...)`),
# which pandas deprecated in 1.3 and rejects from 2.0 onwards.
master.drop(columns = ['album_release_date'], inplace = True)

Several columns contain text (should be categorical):

In [38]:
master.dtypes

artist_name            object
artist_uri             object
track_name             object
album_uri              object
duration_ms             int64
album_name             object
count                   int64
track_uri              object
danceability          float64
energy                float64
key                     int64
loudness              float64
mode                    int64
speechiness           float64
acousticness          float64
instrumentalness      float64
liveness              float64
valence               float64
tempo                 float64
time_signature          int64
artist_genres          object
artist_popularity       int64
album_genres           object
album_popularity        int64
album_release_year      int64
dtype: object

Remove columns that we do not need:
- `album_genres` is always empty. Seems like a field that used to exist, but no longer does.
- `track_uri` is the url to the track. This is unique to each song and cannot be used to model. It is saved in the stored dataframe for reference if we need it later.
- `album_uri` for the same reason.
- `artist_uri` for the same reason.

Until we can effectively one-hot encode as categoricals, any remaining predictors of type `object` can be dropped, so that we only have numeric features.

In [39]:
# Remove the always-empty album genre field and the three URI identifier
# columns listed above.
unused_columns = ['album_genres', 'track_uri', 'album_uri', 'artist_uri']
master.drop(columns = unused_columns, inplace = True)

Artist genres is missing for about $18\%$ of songs. For those that have it, it's a list of sometimes extremely specific genres. Hard to do something useful with, so drop it.

In [40]:
# Share of songs whose artist genre field is the literal string '[]'
(master['artist_genres'] == '[]').mean()

0.17600038402265733

We also can't do anything interesting with track name (unless we want to try to use NLP to look for similarities in titles, but that seems like a stretch), so drop that too.

In [41]:
# Drop the free-text title and the genre list, which are not modelled.
# `columns=` replaces the positional axis argument (`drop([...], 1, ...)`),
# which pandas deprecated in 1.3 and rejects from 2.0 onwards.
master.drop(columns = ['track_name', 'artist_genres'], inplace = True)

Now whatever column is of type `object` is one we want to treat as categorical and one-hot-encode. Songs in the same album and/or by the same artist likely fit well together.  
  
**Note:** Perhaps redundant to keep both? If a song is in the same album, most of the time it will also be by the same artist? At the same time, collection albums may be a great way to indirectly capture genre information and relatedness beyond being by the same artist. There's just a *lot* of albums, i.e. we'll get a stupid amount of features with one-hot-encoding.

In [42]:
master.dtypes

artist_name            object
duration_ms             int64
album_name             object
count                   int64
danceability          float64
energy                float64
key                     int64
loudness              float64
mode                    int64
speechiness           float64
acousticness          float64
instrumentalness      float64
liveness              float64
valence               float64
tempo                 float64
time_signature          int64
artist_popularity       int64
album_popularity        int64
album_release_year      int64
dtype: object

# Fancy model alternatives

## Multiple Correspondence Analysis (MCA)
Not as a model, but as an alternative to PCA that works on categorical (binary) data, either for widened song-in-playlist indicators, or one-hot-encoded `artist` and/or `album` for songs.

Read more here: http://vxy10.github.io/2016/06/10/intro-MCA/  
Package here: https://pypi.org/project/mca/  
Why PCA isn't appropriate: https://www.reddit.com/r/MachineLearning/comments/3nuh7g/is_it_effective_to_use_one_hot_encoding_of/

Options:
1. ~~MCA to reduce dimensionality of song-in-playlist indicators (array takes too much memory and sparse doesn't seem to work)~~: even with sparse and $n>100$ this runs OOM. See notebook `sparse matrix with MCA -- OOM error.ipynb` for the various attempts. 
2. MCA to reduce dimensionality of one-hot-encoded `artist` and/or `album` indicators

## Cosine Similarity and/or Euclidean Distance
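Cosine similarity is the cosine of the angle between two vectors, in our case between the vectors of playlist indicators of two songs. Because the indicators are non-negative it lies in $[0, 1]$: a similarity of 1 means the songs appear in the exact same playlists, and 0 means they share no playlist.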

Note that similarity is 1 - distance, and vice versa.

Can be done in chunks or subsets, but is still *very* computationally expensive due to the size of the vectors and sheer number of songs.

**Update:** Is now very fast using sparse `scipy` matrices and `sklearn` distance algorithms. Still runs OOM if try to do it pairwise across entire dataset, but not really any need for that: better to do one-vs-all (across entire dataset or across K-Means cluster) when populating playlists with seed songs. See details below.

## Multi-label models
From sklearn documentation: **Multiclass** classification algorithms require that each observation belongs to one – and only one, though potentially among many possible – class. **Multilabel**, in contrast, assigns to each sample a *set* of target labels. This can be thought as predicting properties of a data-point that are not mutually exclusive, such as topics that are relevant for a document. A text might be about any of religion, politics, finance or education at the same time or none of these.

Read more here: https://scikit-learn.org/stable/modules/multiclass.html  
Multi-label models in `sklearn` on text data: https://towardsdatascience.com/multi-label-text-classification-with-scikit-learn-30714b7819c5

1. Treat playlists as labels and assign each song multiple, according to which playlists it's in. Challenge: 200,000 classes, many with very few observations

## HDBSCAN: big-boy clustering
Alternative to K-Means.

Docs: https://hdbscan.readthedocs.io/en/latest/  
How HDBSCAN works: https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html  
Lightning Talk, Clustering with HDBScan: https://towardsdatascience.com/lightning-talk-clustering-with-hdbscan-d47b83d1b03a

## Matrix decomposition
`sklearn` has a great library for matrix decomposition, including on sparse matrices: https://scikit-learn.org/stable/modules/decomposition.html

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition

Options to look into which is appropriate, depending on whether on just songs-in-playlist indicators, one-hot-encoded categoricals (artist, album, etc.), and/or the continuous data:
- Truncated Singular Value Decomposition
- Latent Dirichlet Allocation
- Mini-batch sparse PCA (looks like we can get PCA to run on everything after all)
- Kernel PCA

## Embedding
*"Formally, an embedding is a mapping of a categorical variable into an n-dimensional vector.
This provides us with 2 advantages. First, we limit the number of columns we need per category. Second, embeddings by nature intrinsically group similar variables together."*

*"Traditionally, the best way to deal with categorical data has been one hot encoding — a method where the categorical variable is broken into as many features as the unique number of categories for that feature and for every row, a 1 is assigned for the feature representing that row’s category and rest of the features are marked 0.
There are a lot of issues with this method. For categories with lots of unique features we get a lot of sparse data. Also each vector is equidistant from every other vector which causes us to lose the value of relationships between variables.
Embeddings are a solution to dealing with categorical variables while avoiding a lot of the pitfalls of one hot encoding."*

https://medium.com/@davidheffernan_99410/an-introduction-to-using-categorical-embeddings-ee686ed7e7f9

https://towardsdatascience.com/building-a-recommendation-system-using-neural-network-embeddings-1ef92e5c80c9

Options
- Can be used on text as a supervised problem
- Unclear how it would work on songs-in-playlists indicators: nothing to make it supervised against?

## ~~Normalised Mutual Information~~
Score to evaluate unsupervised clustering: https://www.analyticsvidhya.com/blog/2018/05/essentials-of-deep-learning-trudging-into-unsupervised-deep-learning/

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html

NMI only works with known labels; compares clusters to labels, which we don't have.

## Auto-Encoder into K-Means (or other)
https://www.analyticsvidhya.com/blog/2018/05/essentials-of-deep-learning-trudging-into-unsupervised-deep-learning/  
Many clustering algorithms, K-Means included, struggle on high-dimensional data. Use an auto-encoder to reduce dimensionality first, then run K-means on the predictions of that autoencoder (i.e. lower-dimensional data).

How does it handle extremely sparse indicators?

## Deep Embedding for Clustering Analysis (DEC)
https://www.analyticsvidhya.com/blog/2018/05/essentials-of-deep-learning-trudging-into-unsupervised-deep-learning/

https://arxiv.org/abs/1511.06335  
  
State of the art model related to the above. Trains both clustering and autoencoder models to get better performance.

## Song-in-playlist indicators as sparse matrix
https://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#sparse-migration

In [103]:
# NOTE(review): in the visible notebook `playlists` is only assigned by the
# read_pickle cell that follows, so on a fresh kernel this cell has to be run
# after that one.
playlists

284_0      [340039, 125250, 881533, 653897, 49614, 356319...
284_1      [738782, 7646, 142078, 900881, 533258, 429837,...
284_2      [552361, 135177, 507876, 865927, 638474, 55164...
284_3      [214695, 27387, 700562, 448130, 1000188, 37723...
284_4      [576080, 600, 170841, 842370, 450149, 8624, 89...
                                 ...                        
283_995    [387321, 550498, 904203, 123847, 811636, 17392...
283_996    [744373, 636897, 939642, 839829, 731677, 88964...
283_997    [16947, 679919, 17738, 263393, 313648, 66922, ...
283_998    [615912, 323911, 855546, 150903, 539581, 18282...
283_999    [985680, 723192, 786619, 812661, 423261, 14162...
Length: 200000, dtype: object

In [43]:
# Read the playlists: one list of song IDs per playlist
playlists = pd.read_pickle('../data/playlists_song_ids_200.pkl')

# Expand into [playlist, song] pairs, numbering the playlists 0, 1, 2, ... in
# stored order, i.e. a 100-song playlist contributes 100 pairs
pairs = [[playlist_pos, song_id]
         for playlist_pos, song_ids in enumerate(playlists)
         for song_id in song_ids]

# Unzip into parallel coordinate tuples for the sparse matrix:
# `col` is the playlist position, `row` is the song ID
col, row = zip(*pairs)
assert len(row) == len(col)

In [44]:
# Create sparse matrix
from scipy.sparse import csr_matrix, coo_matrix
# Row index = song ID, column index = playlist position, with a 1 supplied for
# every (song, playlist) pair. No explicit shape is passed, so scipy infers it
# from the largest row and column index.
# NOTE(review): duplicate (row, col) pairs are presumably summed on
# construction, so a song repeated within one playlist would be stored as a
# count > 1 rather than a 0/1 indicator -- confirm against the scipy docs.
mat = csr_matrix((np.ones(len(col), dtype = 'int'), (row, col)))
mat.shape

(1003760, 200000)

**Warning:** Usually `mat.A` gets you a dense matrix with zeros as zeros instead of simply being left out, *but* that will make Jupyter shit the bed due to the crazy memory requirements.

Various songs were ditched from the master songs table when adding metadata

In [45]:
# Select the matrix rows named by `master`'s index values (presumably the
# song IDs), in `master`'s order, so row i of `mat` lines up with row i of
# `master`.
# NOTE: this overwrites `mat`; re-running the cell without rebuilding `mat`
# first would apply the selection to the already-subset matrix.
mat = mat[master.index.values, :]
assert mat.shape[0] == master.shape[0]

Seems like it works: first playlist has 13 stored elements, corresponding to `len(playlists[0])`:

In [46]:
# Songs in the first playlist vs. stored entries in the first matrix column.
# `.iloc[0]` selects by position explicitly: the Series is labelled with
# strings, so `playlists[0]` relies on pandas' deprecated positional fallback.
len(playlists.iloc[0]), mat[:, 0]

(13, <999941x1 sparse matrix of type '<class 'numpy.int64'>'
 	with 13 stored elements in Compressed Sparse Row format>)

#### Try Numpy operations on sparse matrix to see if we can do anything useful with it
**Works like a charm, and is *very* fast! :D**

In [47]:
# Row totals: how many times each song occurs across all playlists
mat.sum(axis = 1)

matrix([[1],
        [5],
        [1],
        ...,
        [1],
        [1],
        [1]])

In [48]:
# Column totals: how many songs each playlist contains
mat.sum(axis = 0)

matrix([[ 13,  85, 156, ...,  48,   8,  25]], dtype=int64)

#### Drop songs that occur below a certain threshold $n$

In [49]:
n = 10

In [50]:
# Row positions of the songs that occur at least n times across playlists
song_counts = np.asarray(mat.sum(axis = 1)).ravel()
songs_keep_ind = np.flatnonzero(song_counts >= n)

In [51]:
mat_sub = mat[songs_keep_ind, :]
mat_sub.shape

(123328, 200000)

In [52]:
# Sparsity original
1 - (np.sum(mat)/(mat.shape[0]*mat.shape[1]))

0.9999346299381664

In [53]:
# Sparsity new subset
1 - (np.sum(mat_sub)/(mat_sub.shape[0]*mat_sub.shape[1]))

0.9995422149876751

Drop the same songs from the master songs metadata table

In [54]:
# `songs_keep_ind` holds row positions in `mat`, which was aligned row-for-row
# with `master` earlier, so positional `.iloc` picks the matching metadata rows.
master_sub = master.iloc[songs_keep_ind, :]
master_sub.shape

(123328, 19)

Some playlists now have no songs or are very short. Drop those containing less than $m$ songs:

In [55]:
m = 10

In [56]:
# Songs remaining in each playlist, flattened to a 1-D array
p_sums = np.asarray(mat_sub.sum(axis = 0)).ravel()

In [57]:
# Boolean mask over the columns: keep playlists that still hold at least m songs.
# NOTE: this overwrites `mat_sub`; `p_sums` was computed on the matrix before
# filtering, so recompute it before re-running this cell.
mat_sub = mat_sub[:, p_sums >= m]
mat_sub.shape

(123328, 183939)

**Warning:** Some songs may now never appear in any remaining playlist (happens at least once). Their rows are all zeros, so their cosine similarity to any other song is undefined and comes out as NaN.

E.g. we have 

In [58]:
np.sum(mat_sub[0, :].A), mat_sub[0, :]

(12, <1x183939 sparse matrix of type '<class 'numpy.int64'>'
 	with 12 stored elements in Compressed Sparse Row format>)

In [59]:
from scipy.spatial.distance import cosine, euclidean, cdist

In [60]:
# Cosine here is DISTANCE, not similarity, hence the `1 -`.
# scipy's `cosine` expects 1-D vectors, so flatten each 1 x n_playlists row
# rather than passing the 2-D array that `.A` returns.
1 - cosine(mat_sub[0, :].toarray().ravel(), mat_sub[1, :].toarray().ravel())

0.0

In [61]:
# Sanity check: a song compared with itself should have similarity 1.
# Rows are flattened to 1-D because scipy's `cosine` expects 1-D vectors.
1 - cosine(mat_sub[0, :].toarray().ravel(), mat_sub[0, :].toarray().ravel())

1.0

Computing cosine similarity of even one song across the entire dataset is extremely slow. Probably best to do this from within the relevant cluster when pulling new songs to populate playlist (but need a lot of clusters for this to pay off, since time becomes original time divided by number of clusters).

#### ~~TODO: find a faster way to calculate this~~
#### DONE, see below

**Note:** The method is slow, but we'd technically only have to do this once per cluster (in total the entire dataset once) if we combine all seed songs into one vector.

In [None]:
# Cosine similarity of the first song against every song, one row at a time.
# The seed row never changes, so it is densified once outside the loop, and
# every vector is flattened to 1-D as scipy's `cosine` expects.
seed_vec = mat_sub[0, :].toarray().ravel()
similarities = [1 - cosine(seed_vec, mat_sub[i, :].toarray().ravel())
                for i in range(mat_sub.shape[0])]

In [32]:
# Replace NaN with 0: occurs if a song never appears in any remaining playlist.
# This also turns the Python list into a NumPy array, which the array-based
# indexing of `similarities` further down relies on.
similarities = np.nan_to_num(similarities, nan = 0)

In [33]:
assert len(similarities) == mat_sub.shape[0]

Song ID of first song (for which we want to find closest neighbours):

In [64]:
master_sub.index.values[0]

445190

#### TODO: find a good way to combine all seed songs into one vector and calculate cosine similarity against all songs in the cluster from which we pull new songs for efficiency, preferably weighting shared playlists across the seed songs more heavily (if multiple seed songs are in a playlist, that's likely a good playlist to pull new songs from)
Way to weight more important songs: e.g. if $n$ songs appear in the most frequent playlist, weight that by $1$ and all playlists with only one seed song by $1/n$ when calculating distance. Hence, a song that appears in the most frequent playlist gets a distance of $0$ for that playlist, $1$ for playlists that aren't a match, and $1-1/n$ for playlists with only one seed song.

In [60]:
# Find the ten songs most similar to the first song in master (including self)
similarities[np.argsort(similarities)[::-1][0:10]]

array([1.        , 0.2236068 , 0.20412415, 0.19867985, 0.19364917,
       0.19245009, 0.18257419, 0.16012815, 0.15430335, 0.1490712 ])

In [62]:
# Song IDs of the closest songs
master_sub.index.values[np.argsort(similarities)[::-1][0:10]]

array([445190, 421694, 883140, 135113, 800583, 657505, 717728, 644240,
       820522, 842045])

#### ~~TODO: Maybe Euclidean distance is faster and/or makes more sense?~~
#### Not faster, boils down to judgment call which makes more sense. See below.
That's what the paper uses in their probabilistic embedding of playlists

In [None]:
# Euclidean distance of the first song to every song, one row at a time.
# The seed row never changes, so it is densified once outside the loop, and
# every vector is flattened to 1-D as scipy's `euclidean` expects.
seed_vec = mat_sub[0, :].toarray().ravel()
euc_dist = [euclidean(seed_vec, mat_sub[i, :].toarray().ravel())
            for i in range(mat_sub.shape[0])]

In [65]:
# Euclidean distances of 10 closest songs in indicator vector space
np.array(euc_dist)[np.argsort(euc_dist)][0:10]

array([0.        , 3.46410162, 3.60555128, 3.60555128, 3.60555128,
       3.74165739, 3.74165739, 3.74165739, 3.74165739, 3.87298335])

In [66]:
master_sub.index.values[np.argsort(euc_dist)[0:10]]

array([445190, 348315, 231929, 613065, 355371, 538055, 202803, 513149,
       242376,   1975])

Cosine similarity and Euclidean distance give different results. Which is better?

### This is super-efficient! Runs in virtually an instant
**Note:** it is not the most precise way of doing the computation, trading off precision for speed, but it is so very nearly identical as to be negligibly imprecise.

**Note:** Still shits the bed if try to do it for the entire dataset. Need to do one or a few vs all.

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html

In [62]:
from sklearn.metrics.pairwise import euclidean_distances

In [63]:
euc_dist_sklearn = euclidean_distances(mat_sub, mat_sub[0, :])

In [64]:
euc_dist_sklearn = euc_dist_sklearn.reshape((-1,))

In [None]:
# Elementwise comparison of the scipy results (a Python list) with the sklearn
# array. This is exact floating-point equality, so distances that differ only
# by rounding count as unequal.
print('Exactly equal to scipy in % of cases:', 
      100*np.sum(euc_dist == euc_dist_sklearn)/len(euc_dist))

In [66]:
# Sorted closest songs by distance
euc_dist_sklearn[np.argsort(euc_dist_sklearn)][0:10]

array([0.        , 3.46410162, 3.60555128, 3.60555128, 3.60555128,
       3.74165739, 3.74165739, 3.74165739, 3.74165739, 3.87298335])

In [67]:
# Song IDs of the ten closest songs by Euclidean distance, nearest first
master_sub.index.values[np.argsort(euc_dist_sklearn)[0:10]]

array([445190, 348315, 231929, 613065, 355371, 538055, 202803, 513149,
       242376,   1975])

#### Try same speed-up for cosine distance / similarity
Also very fast
  
*"Cosine distance is defined as 1.0 minus the cosine similarity."*  
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_distances.html#sklearn.metrics.pairwise.cosine_distances

In [68]:
from sklearn.metrics.pairwise import cosine_distances

In [114]:
cos_dist_sklearn = cosine_distances(mat_sub, mat_sub[0, :])

In [115]:
cos_dist_sklearn = cos_dist_sklearn.reshape((-1,))

In [116]:
# Sorted closest songs by cosine distance
cos_dist_sklearn[np.argsort(cos_dist_sklearn)][0:10]

array([0.        , 0.7763932 , 0.79587585, 0.80132015, 0.80635083,
       0.80754991, 0.81742581, 0.83987185, 0.84569665, 0.8509288 ])

In [72]:
# Song IDs of the ten closest songs by cosine distance, nearest first
master_sub.index.values[np.argsort(cos_dist_sklearn)[0:10]]

array([445190, 421694, 883140, 135113, 800583, 657505, 717728, 644240,
       820522, 842045])

In [117]:
# Metadata of the ten closest songs by cosine distance, nearest first.
# `argsort` already yields row positions in `master_sub`, so select by position
# directly instead of converting positions to labels and looking them up with
# `.loc` (which would return extra rows if an index label were repeated).
master_sub.iloc[np.argsort(cos_dist_sklearn)[0:10]]

Unnamed: 0_level_0,artist_name,duration_ms,album_name,count,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artist_popularity,album_popularity,album_release_year
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
445190,Sidney Bechet,251906,Jazz Classics,16,0.608,0.138,3,-17.379,1,0.0409,0.979,0.908,0.0853,0.318,83.124,4,52,37,1993
421694,Kenny Dorham,190400,Quiet Kenny,15,0.336,0.0619,5,-20.494,0,0.0363,0.985,0.891,0.11,0.197,60.705,4,44,41,1992
883140,Joe Henderson,482440,Page One,13,0.531,0.366,5,-11.697,0,0.0496,0.911,0.88,0.0916,0.785,81.867,4,46,39,1999
135113,Chet Baker,318466,Chet,19,0.505,0.0251,3,-21.22,1,0.0393,0.941,0.906,0.142,0.175,113.162,4,68,1,1959
800583,Chet Baker,413786,Chet [Keepnews Collection],21,0.441,0.0319,2,-19.168,0,0.0357,0.872,0.898,0.088,0.113,110.471,4,68,1,1959
657505,Oliver Nelson,526826,The Blues And The Abstract Truth,41,0.536,0.249,0,-14.381,0,0.0431,0.884,0.854,0.123,0.35,116.289,4,43,0,1961
717728,Herb Ellis,251600,Ellis In Wonderland,10,0.798,0.212,0,-10.529,1,0.0436,0.933,0.95,0.102,0.592,118.149,4,51,0,1956
644240,J.J. Johnson,548240,The Trombone Master,16,0.466,0.395,5,-18.022,0,0.0432,0.442,0.00531,0.278,0.664,118.5,4,37,28,1989
820522,Lee Morgan,339333,Candy,16,0.416,0.201,8,-11.461,1,0.0379,0.975,0.803,0.107,0.372,165.869,3,51,36,1957
842045,Gerry Mulligan,442506,Gerry Mulligan Meets Ben Webster,15,0.514,0.0141,1,-24.166,1,0.0448,0.958,0.458,0.111,0.112,111.552,4,52,25,1963


## Truncated SVD
https://scikit-learn.org/stable/modules/decomposition.html#lsa  
*"TruncatedSVD is very similar to PCA, but differs in that it works on sample matrices $X$ directly instead of their covariance matrices. When the columnwise (per-feature) means of $X$ are subtracted from the feature values, truncated SVD on the resulting matrix is equivalent to PCA. In practical terms, this means that the TruncatedSVD transformer accepts scipy.sparse matrices without the need to densify them, as densifying may fill up memory even for medium-sized document collections."*

In [81]:
from sklearn.decomposition import TruncatedSVD

In [94]:
tsvd = TruncatedSVD(n_components = 100, n_iter = 10, random_state = 42)

In [95]:
tsvd.fit(mat_sub)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=10,
             random_state=42, tol=0.0)

In [96]:
tsvd.explained_variance_ratio_

array([0.01383844, 0.00809865, 0.00609205, 0.0057266 , 0.00520378,
       0.00450739, 0.00359385, 0.00280834, 0.00265275, 0.00251792,
       0.00230597, 0.00213145, 0.00207328, 0.00198966, 0.00192167,
       0.00186485, 0.00181185, 0.00170267, 0.00169155, 0.00155155,
       0.00149177, 0.00140797, 0.00137596, 0.00134795, 0.0013379 ,
       0.00127013, 0.00124342, 0.0011895 , 0.00117219, 0.00115459,
       0.00113761, 0.00109757, 0.00106658, 0.00101405, 0.00101133,
       0.00100415, 0.00095823, 0.00095001, 0.000926  , 0.00091934,
       0.00088557, 0.00085431, 0.00084836, 0.00083353, 0.00081405,
       0.00080045, 0.0007799 , 0.00076834, 0.00075532, 0.00074718,
       0.00071362, 0.0007062 , 0.00070509, 0.00070138, 0.00069256,
       0.00066742, 0.00066651, 0.0006448 , 0.00063775, 0.00063527,
       0.00062548, 0.00061817, 0.00061363, 0.00060879, 0.00060188,
       0.00059092, 0.00058793, 0.0005846 , 0.00057103, 0.00057008,
       0.00056493, 0.0005547 , 0.00055175, 0.00055225, 0.00054

In [99]:
np.cumsum(tsvd.explained_variance_ratio_)

array([0.01383844, 0.02193709, 0.02802914, 0.03375574, 0.03895952,
       0.04346691, 0.04706076, 0.0498691 , 0.05252185, 0.05503977,
       0.05734574, 0.05947719, 0.06155047, 0.06354013, 0.0654618 ,
       0.06732665, 0.06913851, 0.07084117, 0.07253272, 0.07408427,
       0.07557604, 0.076984  , 0.07835996, 0.07970791, 0.08104582,
       0.08231595, 0.08355936, 0.08474887, 0.08592106, 0.08707565,
       0.08821326, 0.08931083, 0.09037741, 0.09139145, 0.09240279,
       0.09340694, 0.09436517, 0.09531517, 0.09624118, 0.09716052,
       0.09804609, 0.09890041, 0.09974876, 0.10058229, 0.10139634,
       0.10219679, 0.10297669, 0.10374503, 0.10450035, 0.10524753,
       0.10596115, 0.10666735, 0.10737244, 0.10807382, 0.10876638,
       0.1094338 , 0.11010031, 0.11074511, 0.11138286, 0.11201812,
       0.1126436 , 0.11326177, 0.1138754 , 0.11448418, 0.11508606,
       0.11567699, 0.11626491, 0.11684952, 0.11742055, 0.11799063,
       0.11855556, 0.11911027, 0.11966202, 0.12021426, 0.12075

100 singular values only account for a total of 13% of the variation. Likely because of the extreme sparsity.

# TODO: MCA one-hot-encoded artist and album names

In [73]:
master_sub.dtypes

artist_name            object
duration_ms             int64
album_name             object
count                   int64
danceability          float64
energy                float64
key                     int64
loudness              float64
mode                    int64
speechiness           float64
acousticness          float64
instrumentalness      float64
liveness              float64
valence               float64
tempo                 float64
time_signature          int64
artist_popularity       int64
album_popularity        int64
album_release_year      int64
dtype: object

Convert type object to categorical and one-hot-encode with pandas

# TODO: Autoencoder into K-Means

# TODO: Embedding song-in-playlist indicators