# Etapa 0 - Processamento de Dados

In [1]:
import pickle
import os
import pandas as pd

from json import load
from tqdm.notebook import tqdm

from utils.incremental_encoder import IncrementalEncoder

## Dados de Treinamento

In [3]:
# Location and layout of the Million Playlist Dataset slices.
# Forward slashes are used on purpose: they work on every OS (including
# Windows) and avoid the invalid escape sequences ("\d", "\s", "\m") that a
# backslash-separated path produces inside a normal string literal.
MPD_DIR     = "../dados/spotify_million_playlist_dataset/data"
N_PLAYLISTS = 1_000_000   # total number of playlists covered by the slice files
SLICE_SIZE  = 1_000       # playlists per "mpd.slice.<min>-<max>.json" file

# Maps each track URI to the id returned by `fit_one`
# (presumably a compact integer id; verify against utils.incremental_encoder).
tracks_encoder = IncrementalEncoder()

# One tuple per playlist: (pid, list of encoded track ids, number of tracks).
dataset = []
for _min in tqdm(range(0, N_PLAYLISTS, SLICE_SIZE)):
    _max = _min + SLICE_SIZE - 1

    # JSON is UTF-8; be explicit so the platform's default encoding is never used.
    with open(f"{MPD_DIR}/mpd.slice.{_min}-{_max}.json", 'r', encoding='utf-8') as _file:
        playlists = load(_file)['playlists']

    for playlist in playlists:
        # Encode the tracks in playlist order; the order matters later on,
        # when playlists are split into a leading query and a held-out tail.
        playlist_encoded_content = [
            tracks_encoder.fit_one(track['track_uri'])
            for track in playlist['tracks']
        ]

        dataset.append((playlist['pid'], playlist_encoded_content, len(playlist_encoded_content)))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [4]:
# Create the output directory if needed; `exist_ok=True` replaces the manual
# isdir check and is a no-op when the directory is already there.
os.makedirs('../dados-processados/', exist_ok=True)

# Persist the list of (pid, encoded tracks, length) tuples built above.
with open("../dados-processados/dataset.pickle", 'wb') as _file:
    pickle.dump(dataset, _file)

# Persist the encoder via its own `save` method so the track ids stored in
# the dataset can be interpreted later.
tracks_encoder.save("../dados-processados/encoding_tracks")

## Separação em Experimentos

In [12]:
# `makedirs` also creates the parent '../dados-processados/' when it is missing,
# so this cell no longer fails if it is run before the parent directory exists.
os.makedirs('../dados-processados/experimentos/', exist_ok=True)

In [29]:
def pega_X(playlist, k):
    """Return the first `k` items of `playlist` (the query part).

    Standard slice semantics apply: if the playlist has fewer than `k`
    items, all of them are returned.
    """
    query_part = playlist[:k]
    return query_part

def pega_y(playlist, k):
    """Return every item of `playlist` after the first `k` (the held-out part).

    Standard slice semantics apply: if the playlist has `k` items or fewer,
    the result is empty.
    """
    held_out_part = playlist[k:]
    return held_out_part

In [33]:
# Experiment configuration, one row per test set:
#   nome        - experiment identifier (used in the output file name)
#   n           - number of playlists sampled for the experiment
#   min_musicas - playlists must have MORE tracks than this to be eligible
#   tam_query   - number of leading tracks kept as the query (X)
experimentos = pd.DataFrame(
    [
        (1, 2000, 150, 100),
        (2, 2000,  38,  20),
        (3, 2000,  20,  10),
        (4, 2000,  10,   5),
    ],
    columns=['nome', 'n', 'min_musicas', 'tam_query'],
)

experimentos

Unnamed: 0,nome,n,min_musicas,tam_query
0,1,2000,150,100
1,2,2000,38,20
2,3,2000,20,10
3,4,2000,10,5


In [38]:
# Fixed seed so the train/test split is the same on every Restart & Run All.
SEED = 42

# Pool of playlists still available; it shrinks as each experiment takes its sample.
dataset_ = pd.DataFrame(dataset, columns=['pid','playlist','length'])

# Experiments are processed in table order, so earlier rows sample first and
# later rows only see the playlists that are left.
for exp in experimentos.itertuples(index=False):
    # Only playlists strictly longer than `min_musicas` are eligible. Every
    # configured `min_musicas` exceeds its `tam_query`, so 'y' is never empty.
    elegiveis = dataset_[dataset_.length > exp.min_musicas]
    dados_experimento = elegiveis.sample(n=exp.n, random_state=SEED)

    # Remove the sampled playlists from the pool: test sets stay disjoint from
    # each other and from the training data written at the end.
    dataset_ = dataset_[~dataset_.pid.isin(dados_experimento.pid)]

    k = exp.tam_query
    dados_experimento = (
        dados_experimento
        .assign(
            X=dados_experimento.playlist.apply(lambda p: pega_X(p, k)),  # first k tracks
            y=dados_experimento.playlist.apply(lambda p: pega_y(p, k)),  # remaining tracks
        )
        .drop(columns=['playlist', 'length'])
    )

    dados_experimento.to_pickle(f"../dados-processados/experimentos/teste_{exp.nome}.pickle")

# Whatever was never sampled becomes the training set (columns: pid, playlist).
dataset_ = dataset_.drop(columns=['length'])
dataset_.to_pickle("../dados-processados/experimentos/treino.pickle")

### Preview: último conjunto de teste gerado e dados de treino restantes

In [41]:
# First rows of the last test set produced by the split loop (columns: pid, X, y).
dados_experimento.head(n=5)

Unnamed: 0,pid,X,y
250501,250501,"[11190, 14449, 6702, 4704, 37055]","[12705, 308, 13586, 4620, 2414, 17918, 4899, 7..."
189219,189219,"[1980, 2152, 4587, 34305, 12905]","[16909, 8331, 29, 16960, 7971, 35, 8162, 33, 1..."
5280,5280,"[6517, 6519, 52341, 6518, 13807]","[13503, 30376, 13555, 1811, 1260, 1828, 49918,..."
650243,650243,"[96467, 10842, 17527, 229282, 254204]","[3964, 19071, 7268, 323065, 87980, 1300, 2930,..."
903925,903925,"[9127, 232095, 67193, 891554, 13494]","[13498, 6859, 58083, 1130, 9537, 220, 3682, 36..."


In [40]:
# First rows of the remaining training data (columns: pid, playlist).
dataset_.head(n=5)

Unnamed: 0,pid,playlist
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,1,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6..."
2,2,"[90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ..."
3,3,"[154, 155, 156, 157, 158, 159, 160, 161, 162, ..."
4,4,"[280, 281, 282, 283, 284, 285, 286, 287, 288, ..."
