In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
class SpotifyPlaylistDataset:
    """Random-access reader over playlist slice files stored as JSON.

    Each slice file is named ``mpd.slice.<start>-<end>.json`` and holds
    ``SLICE_SIZE`` playlists under the top-level ``'playlists'`` key. Only one
    slice is kept in memory at a time; reading a playlist from another slice
    replaces the cached one.

    Parameters
    ----------
    verbose : bool
        When true, print a message every time a slice file is loaded.
    data_dir : str
        Directory containing the slice files. Defaults to ``'../data'``.
    """

    # Number of playlists stored in each slice file.
    SLICE_SIZE = 1000
    # Total number of playlists reported by ``len()``.
    NUM_PLAYLISTS = 1_000_000

    def __init__(self, verbose=True, data_dir='../data'):
        self.verbose = verbose
        self.data_dir = data_dir
        self.playlists = None
        self.current_slice = 0
        # Eagerly load the first slice so ``playlists`` is usable right away.
        self._load_slice(0)

    def _load_slice(self, data_slice):
        """Load slice number ``data_slice`` from disk and make it the cached slice."""
        start = data_slice * self.SLICE_SIZE
        end = start + self.SLICE_SIZE - 1
        if self.verbose:
            print(f"Carregando dados da slice {start}-{end}")
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(f'{self.data_dir}/mpd.slice.{start}-{end}.json', encoding='utf-8') as f:
            self.playlists = json.load(f)['playlists']
        self.current_slice = data_slice

    def read_playlist(self, index):
        """Return the playlist dict at global position ``index``.

        Loads the slice that contains ``index`` if it is not the cached one.

        Raises
        ------
        IndexError
            If ``index`` is outside ``0 <= index < len(self)``.
        """
        if not 0 <= index < len(self):
            # Fail early instead of trying to open a slice file that cannot exist.
            raise IndexError(f"playlist index {index} out of range [0, {len(self)})")
        slice_number, offset = divmod(index, self.SLICE_SIZE)
        if slice_number != self.current_slice:
            self._load_slice(slice_number)
        return self.playlists[offset]

    def __len__(self):
        """Return the total number of playlists across all slices."""
        return self.NUM_PLAYLISTS
    
        
            

In [3]:
from scipy.sparse import csr_matrix, lil_matrix, save_npz, load_npz

In [4]:
# Only the 'id' column is kept, so parse just that column instead of the whole file.
tracks = pd.read_csv('../tracks/tracks.csv', usecols=['id'])['id']

In [5]:
def create_sparse_matrix(n_playlists=10_000, dataset=None, track_ids=None):
    """Build a playlist-by-track membership matrix.

    Row ``i`` corresponds to playlist ``i`` and column ``j`` to the ``j``-th
    entry of ``track_ids``; a cell is set to 1 when the playlist contains a
    track whose ``'track_uri'`` equals that entry.

    Parameters
    ----------
    n_playlists : int
        Number of leading playlists to fill in (capped at ``len(dataset)``).
        Rows beyond this stay empty.
    dataset : object, optional
        Playlist source exposing ``__len__`` and ``read_playlist(i)``.
        Defaults to a new ``SpotifyPlaylistDataset()``.
    track_ids : sequence of str, optional
        Track identifiers defining the columns. Defaults to the module-level
        ``tracks`` Series.

    Returns
    -------
    scipy.sparse.lil_matrix
        Matrix of shape ``(len(dataset), len(track_ids))``.
    """
    if dataset is None:
        dataset = SpotifyPlaylistDataset()
    if track_ids is None:
        track_ids = tracks

    # Build the URI -> column lookup once; scanning every track id for each
    # playlist would cost O(playlists * tracks).
    columns_by_uri = {}
    for column, uri in enumerate(track_ids):
        # A list per URI keeps every column if an id appears more than once.
        columns_by_uri.setdefault(uri, []).append(column)

    sparse_matrix = lil_matrix((len(dataset), len(track_ids)))
    for i in range(min(n_playlists, len(dataset))):
        for track in dataset.read_playlist(i)['tracks']:
            # Tracks missing from ``track_ids`` are skipped.
            for column in columns_by_uri.get(track['track_uri'], ()):
                sparse_matrix[i, column] = 1
    return sparse_matrix

In [6]:
# Build the playlist-by-track matrix, convert it to CSR, and persist it to disk.
sm = create_sparse_matrix().tocsr()
save_npz('sparse_matrix.npz', sm)

Carregando dados da slice 0-999
Carregando dados da slice 1000-1999
Carregando dados da slice 2000-2999
Carregando dados da slice 3000-3999
Carregando dados da slice 4000-4999
Carregando dados da slice 5000-5999
Carregando dados da slice 6000-6999
Carregando dados da slice 7000-7999
Carregando dados da slice 8000-8999
Carregando dados da slice 9000-9999


In [7]:
# Reload the matrix from 'sparse_matrix.npz', the file written by save_npz.
sm = load_npz('sparse_matrix.npz')

In [8]:
# Show the matrix summary (shape, dtype, number of stored elements).
sm

<1000000x2261616 sparse matrix of type '<class 'numpy.float64'>'
	with 656134 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.metrics import pairwise_distances

In [48]:
# Row 1020 of the matrix, i.e. the row filled in for playlist index 1020.
sm[1020]

<1x2261616 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [58]:
# Cosine distance from row 1020 to every row of the matrix; sorting the
# distances in ascending order lists the closest rows first.
argsort = pairwise_distances(sm[1020], sm, metric='cosine').argsort()
argsort

array([[  1020,   7224,    885, ..., 333342, 333344, 999999]])

In [56]:
# Create a fresh reader (the constructor loads slice 0), then fetch playlist
# 1020, which makes the reader load the slice covering 1000-1999.
dataset = SpotifyPlaylistDataset()
dataset.read_playlist(1020)

Carregando dados da slice 0-999
Carregando dados da slice 1000-1999


{'name': 'vaporwave',
 'collaborative': 'false',
 'pid': 1020,
 'modified_at': 1483056000,
 'num_tracks': 17,
 'num_albums': 15,
 'num_followers': 1,
 'tracks': [{'pos': 0,
   'artist_name': 'Blank Banshee',
   'track_uri': 'spotify:track:6NaPhrkmnrmo2dpW2HHyBk',
   'artist_uri': 'spotify:artist:1oR9pQhucVTJyi5lH2Y2iT',
   'track_name': 'Ammonia Clouds',
   'album_uri': 'spotify:album:5AcOa8jiTLbwlBaIIbCjYn',
   'duration_ms': 163282,
   'album_name': 'Blank Banshee 0'},
  {'pos': 1,
   'artist_name': 'VAPERROR',
   'track_uri': 'spotify:track:1XUTJKIdDGFTGgXJMppvrm',
   'artist_uri': 'spotify:artist:0AEVuiJFczDd4dkmZh2Kha',
   'track_name': 'Surf',
   'album_uri': 'spotify:album:0sgm1uRNjgNSCggcnZfiR3',
   'duration_ms': 130588,
   'album_name': 'Mana Pool'},
  {'pos': 2,
   'artist_name': 'CYBEREALITYライフ',
   'track_uri': 'spotify:track:2NKqrYeZC5uirATpR2CwMB',
   'artist_uri': 'spotify:artist:5wxQkBn8VmHggkeH94BdYf',
   'track_name': 'サービスのカタログ http\u200b:\u200b/\u200b/\u200bwww\u20

In [61]:
# Inspect playlist 7224, which the argsort output lists right after 1020 itself.
dataset.read_playlist(7224)

Carregando dados da slice 7000-7999


{'name': 'V A P O R W A V E',
 'collaborative': 'false',
 'pid': 7224,
 'modified_at': 1509408000,
 'num_tracks': 15,
 'num_albums': 10,
 'num_followers': 1,
 'tracks': [{'pos': 0,
   'artist_name': '18 Carat Affair',
   'track_uri': 'spotify:track:61QkubO5ny4DtTFoOaIRGg',
   'artist_uri': 'spotify:artist:0va8U409dr4gA1zQ2gSnD5',
   'track_name': 'Desire',
   'album_uri': 'spotify:album:1l3ok2CuYB8wWKUTm4jssh',
   'duration_ms': 98771,
   'album_name': 'Vintage Romance'},
  {'pos': 1,
   'artist_name': 'luxury elite',
   'track_uri': 'spotify:track:38k3otUd41zKhvUvZU7iUZ',
   'artist_uri': 'spotify:artist:28516pIwBLUO62yBiLAfdI',
   'track_name': 'Totally Rad',
   'album_uri': 'spotify:album:0LOSJF9SLsu9UPamEbfkpj',
   'duration_ms': 78251,
   'album_name': 'With Love'},
  {'pos': 2,
   'artist_name': 'luxury elite',
   'track_uri': 'spotify:track:2TjAHMFLFsKdXXqaePxwC6',
   'artist_uri': 'spotify:artist:28516pIwBLUO62yBiLAfdI',
   'track_name': 'Midnight',
   'album_uri': 'spotify:alb

In [60]:
# Row 1020 of the matrix, i.e. the row filled in for playlist index 1020.
sm[1020]

<1x2261616 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [37]:
# Load the tracks CSV with the 'id' column as the index so rows can be looked up by id.
tracks2 = pd.read_csv('../tracks/tracks.csv', index_col='id')

In [42]:
# Look up the metadata row for a single track by its id.
tracks2.loc['spotify:track:7tLWiJdD0kmejSuAQb5WJd']

name                                                         Ay Yo
duration_ms                                                 197533
artists             {"(4O2YL4ygn6eTBC0w1hyWUM,\"Melanie Fiona\")"}
explicit                                                         f
popularity                                                      21
acousticness                                                 0.185
danceability                                                 0.485
energy                                                       0.728
instrumentalness                                          0.000001
key                                                              1
liveness                                                     0.143
loudness                                                    -4.903
mode                                                             1
speechiness                                                 0.0524
tempo                                                       87