# Etapa 0 - Processamento de Dados

In [2]:
import pickle
import os

from json import load
from tqdm.notebook import tqdm

In [3]:
class IncrementalEncoder:
    """Assign consecutive integer ids to values in first-seen order.

    Attributes are plain and public:
    - ``encoding``: dict mapping each known value to its integer id.
    - ``last_index``: the id that the next unseen value will receive.
    """

    def __init__(self):
        self.encoding = {}
        self.last_index = 0

    def fit_one(self, value):
        """Return the id of ``value``, registering it first if it is new.

        Args:
            value: hashable value to encode.

        Returns:
            The integer id associated with ``value``.
        """
        if value not in self.encoding:
            self.encoding[value] = self.last_index
            self.last_index += 1

        return self.encoding[value]

    def transform(self, values):
        """Return the ids of already-known ``values`` as a list.

        Raises:
            KeyError: if any value was never registered.
        """
        return [self.encoding[x] for x in values]

    def load_encoding(self, filepath):
        """Replace the current mapping with the one stored in a JSON file.

        JSON object keys are always strings, so a mapping that was saved
        with non-string keys comes back keyed by their string form.

        Args:
            filepath: path of a JSON file written by ``save_encoding``.
        """
        from json import load

        with open(filepath, 'r', encoding='utf-8') as _file:
            self.encoding = load(_file)

        # Resume numbering right after the highest loaded id; without this a
        # later fit_one() would start again at 0 and reuse ids that are
        # already taken by loaded values.
        self.last_index = max(self.encoding.values(), default=-1) + 1

    def save_encoding(self, filepath):
        """Write the current mapping to ``filepath`` as JSON.

        Args:
            filepath: destination path; an existing file is overwritten.
        """
        from json import dump

        # Plain write mode is enough: the file is never read back here.
        with open(filepath, 'w', encoding='utf-8') as _file:
            dump(self.encoding, _file)


### Dados de Treinamento

In [83]:
# Encode every playlist of the training slices as
# (playlist name, [(position, artist id, track id), ...]).
artists_encoder = IncrementalEncoder()
tracks_encoder = IncrementalEncoder()

# Build the path with os.path.join instead of a backslash literal: "\d", "\s"
# and "\m" are invalid escape sequences, and a hard-coded backslash only
# resolves on Windows. On Windows the joined path is the same string as before.
slices_dir = os.path.join("..", "dados", "spotify_million_playlist_dataset", "data")

dataset = []
# One file per block of 1000 playlists: mpd.slice.0-999.json, 1000-1999, ...
for slice_start in tqdm(range(0, 1000000, 1000)):
    slice_end = slice_start + 999
    slice_path = os.path.join(slices_dir, f"mpd.slice.{slice_start}-{slice_end}.json")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(slice_path, 'r', encoding='utf-8') as _file:
        playlists = load(_file)['playlists']

    for playlist in playlists:
        playlist_encoded_content = [
            (
                track['pos'],
                artists_encoder.fit_one(track['artist_uri']),
                tracks_encoder.fit_one(track['track_uri']),
            )
            for track in playlist['tracks']
        ]

        dataset.append((playlist['name'], playlist_encoded_content))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [2]:
# Make sure the output directory exists. exist_ok=True replaces the separate
# isdir() check, so there is no window between checking and creating.
os.makedirs('../dados-processados/', exist_ok=True)

In [84]:
# Persist the encoded training playlists as a binary pickle.
dataset_path = "../dados-processados/dataset.pickle"
with open(dataset_path, 'wb') as pickle_out:
    pickle.dump(dataset, pickle_out)

In [85]:
# Write both id mappings to disk, tracks first and then artists.
for encoder, encoding_path in (
    (tracks_encoder, "../dados-processados/encoding_tracks.json"),
    (artists_encoder, "../dados-processados/encoding_artists.json"),
):
    encoder.save_encoding(encoding_path)

### Dados do Desafio

In [4]:
# Start from fresh encoders and fill them with the mappings saved on disk.
artists_encoder, tracks_encoder = IncrementalEncoder(), IncrementalEncoder()

artists_encoder.load_encoding("../dados-processados/encoding_artists.json")
tracks_encoder.load_encoding("../dados-processados/encoding_tracks.json")

In [7]:
# Encode the challenge playlists in the same
# (playlist name, [(position, artist id, track id), ...]) layout.
dataset_challenge = []

# Build the path with os.path.join instead of a backslash literal: "\d", "\s"
# and "\c" are invalid escape sequences, and a hard-coded backslash only
# resolves on Windows. On Windows the joined path is the same string as before.
challenge_path = os.path.join(
    "..", "dados", "spotify_million_playlist_dataset_challenge", "challenge_set.json"
)

# JSON is UTF-8 by specification; do not depend on the platform default.
with open(challenge_path, 'r', encoding='utf-8') as _file:
    playlists = load(_file)['playlists']

for playlist in playlists:
    playlist_encoded_content = [
        (
            track['pos'],
            artists_encoder.fit_one(track['artist_uri']),
            tracks_encoder.fit_one(track['track_uri']),
        )
        for track in playlist['tracks']
    ]

    # 'name' is read with .get(), so a playlist without one is stored as None.
    dataset_challenge.append((playlist.get('name'), playlist_encoded_content))

len(dataset_challenge)

10000

In [10]:
# Persist the encoded challenge playlists as a binary pickle.
challenge_pickle_path = "../dados-processados/dataset_challenge.pickle"
with open(challenge_pickle_path, 'wb') as pickle_out:
    pickle.dump(dataset_challenge, pickle_out)