# Etapa 1 - Treinamento de Modelo Personalized PageRank

In [1]:
import pickle
import json
import dgl
import torch
import numpy as np
import pandas as pd
import os
from scipy.sparse import dok_matrix

from tqdm.notebook import tqdm

from utils.incremental_encoder import IncrementalEncoder

## Preparação de Dados

In [None]:
# Create the track encoder and call its `load` on the saved encoding JSON file.
encoding_tracks_path = "../dados-processados/encoding_tracks.json"

tracks_encoder = IncrementalEncoder()
tracks_encoder.load(encoding_tracks_path)

In [2]:
# Load the processed training data from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files
# produced by this project's own pipeline.
treino_path = "../dados-processados/experimentos/treino.pickle"
with open(treino_path, mode='rb') as pickle_fh:
    dataset = pickle.load(pickle_fh)

# Last expression of the cell: rich display of the loaded object.
dataset

Unnamed: 0,pid,playlist
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,1,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6..."
2,2,"[90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ..."
3,3,"[154, 155, 156, 157, 158, 159, 160, 161, 162, ..."
4,4,"[280, 281, 282, 283, 284, 285, 286, 287, 288, ..."
...,...,...
999995,999995,"[113082, 5782, 48787, 5783, 61875, 17695, 5046..."
999996,999996,"[2262284, 2262285, 2262286, 2262287, 2262288, ..."
999997,999997,"[39642, 41142, 5433, 22055, 3859, 1042, 9151, ..."
999998,999998,"[5767, 5871, 14503, 3935, 68970, 5837, 5837, 1..."


### Calcula Rede 1 - Tipo Pixie

In [None]:
# Matrix side length: the encoder's `last_index` plus the number of dataset rows.
tam_matriz = tracks_encoder.last_index + dataset.shape[0]

# Square sparse matrix that receives the playlist -> track membership edges.
network_pixie = dok_matrix((tam_matriz, tam_matriz), dtype=int)

# NOTE(review): `pid` and track ids are used as raw indices into the same
# 0..tam_matriz range, so playlist p and track p land on the same node.
# The matrix size suggests playlists were meant to occupy their own index
# range (e.g. offset by the number of tracks) -- TODO confirm before
# changing, since the saved graph's node numbering would change.
# Only the [pid, track] direction is written; no reverse edge is added.

# Mark every (playlist, track) membership with 1. The two columns are
# iterated directly instead of `iterrows()`, which builds a Series per row;
# `total` gives tqdm a determinate progress bar.
for pid, playlist in tqdm(
    zip(dataset['pid'], dataset['playlist']),
    total=len(dataset),
):
    # Link the playlist to each of its tracks (repeated tracks rewrite the same 1).
    for track in playlist:
        network_pixie[pid, track] = 1

In [None]:
# Build a DGL graph from the sparse adjacency matrix.
network_pixie_dgl = dgl.from_scipy(network_pixie)

# Persist the graph object with pickle.
pixie_graph_path = "../dados-processados/network_pixie_dgl.pickle"
with open(pixie_graph_path, mode='wb') as graph_fh:
    pickle.dump(network_pixie_dgl, graph_fh)

### Calcula Rede 2 - CMTD

In [3]:
# Matrix side length: the encoder's `last_index` plus the number of dataset rows.
# NOTE(review): only track ids are used as indices in this cell, so the
# playlist-count term looks unnecessary here -- kept as-is so the resulting
# graph keeps the same number of nodes; confirm before shrinking.
tam_matriz = tracks_encoder.last_index + dataset.shape[0]

# Square sparse matrix that accumulates track-to-track transition counts.
network_cmtd = dok_matrix((tam_matriz, tam_matriz), dtype=int)

# Count transitions between consecutive tracks of every playlist.
# Iterating the DataFrame itself yields its column labels (strings), not
# rows, so the track lists are taken from the `playlist` column directly.
for tracklist in tqdm(dataset['playlist']):
    # Pair every track with its successor; playlists with fewer than two
    # tracks produce no pairs.
    for current_track_id, next_track_id in zip(tracklist, tracklist[1:]):
        # Count the transition in both directions (symmetric matrix).
        # When a track follows itself, both updates hit the same diagonal
        # cell, which therefore grows by 2.
        network_cmtd[current_track_id, next_track_id] += 1
        network_cmtd[next_track_id, current_track_id] += 1

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [6]:
# Build a DGL graph from the sparse transition-count matrix.
network_cmtd_dgl = dgl.from_scipy(
    sp_mat = network_cmtd
)

# Source and destination node ids of every edge, in the graph's edge order.
init_nodes, final_nodes = network_cmtd_dgl.edges()

# Look up the count of every edge in `network_cmtd` (the original cell read
# from an undefined name `network`). The lookup is one vectorized CSR
# fancy-index instead of a Python loop over every edge; assumes the graph
# tensors live on the CPU, which `.numpy()` requires.
network_cmtd_csr = network_cmtd.tocsr()
weights = np.asarray(
    network_cmtd_csr[init_nodes.numpy(), final_nodes.numpy()]
).ravel()

# Attach the counts as a float64 edge feature named 'weights'.
network_cmtd_dgl.edata['weights'] = torch.tensor(weights, dtype=float)

# Persist the weighted graph object with pickle.
with open("../dados-processados/network_cmtd_dgl.pickle", 'wb') as _file:
    pickle.dump(network_cmtd_dgl, _file)