# Etapa 2 - Avaliação de Modelo Personalized PageRank

Variações:
- Weighted / Unweighted
- Directed / Undirected
- Visit Length Target: [5, 10, 20]
- Query Target Size: [100, 200, 300]

In [1]:
import pickle

import dgl
import pandas as pd
from tqdm import tqdm

from models.personalized_pagerank import PersonalizedPageRank
from utils.incremental_encoder import IncrementalEncoder

In [2]:
# Deserialize the pickled network object used by the model.
# Only unpickle files produced by this project: pickle can execute arbitrary code.
with open("../dados-processados/dgl_network.pickle", "rb") as network_file:
    dgl_network = pickle.load(network_file)

In [42]:
# Instantiate the Personalized PageRank recommender on the loaded network
# with the parameter values chosen for this run.
model = PersonalizedPageRank(
    visit_target=8,
    query_target_size=150,
    dgl_network=dgl_network,
)

In [4]:
# Deserialize the challenge set; each entry is unpacked below as (pid, name, playlist).
# Only unpickle files produced by this project: pickle can execute arbitrary code.
with open("../dados-processados/dataset_challenge.pickle", "rb") as testset_file:
    testset = pickle.load(testset_file)

In [46]:
# Run the model over every challenge entry, collecting (pid, query, recommendation).
preds = []

for pid, name, playlist in tqdm(testset[:]):
    # An empty playlist provides no seed, so the query falls back to the single id 0.
    seeds = [0] if len(playlist) == 0 else playlist
    prediction = model.predict([seeds])

    # predict() is called with a one-query batch; keep the first (only) result.
    preds.append((pid, playlist, prediction[0]))

100%|██████████| 10000/10000 [03:34<00:00, 46.69it/s]


In [47]:
def _count_tracks(track_ids):
    """Return how many items a result holds, coercing each one to int on the way."""
    return len([int(track_id) for track_id in track_ids])


# One row per challenge playlist: its pid, the query tracks and the recommendation.
preds_df = pd.DataFrame(preds, columns=['pid', 'query', 'result'])
# 'tam' stores the number of recommended tracks in each result.
preds_df['tam'] = preds_df['result'].apply(_count_tracks)

preds_df

Unnamed: 0,pid,query,result,tam
0,1000002,[],"[34, 6688, 10, 16119, 17437, 246735, 1090019, ...",500
1,1000003,[],"[516088, 937542, 953384, 4572, 8051, 8100, 456...",500
2,1000004,[],"[3075, 301, 6410, 12059, 310, 1607, 5539, 1390...",500
3,1000006,[],"[13705, 28397, 20060, 133662, 65296, 8887, 997...",500
4,1000007,[],"[3061, 32184, 11629, 41425, 16429, 76, 8554, 3...",500
...,...,...,...,...
9995,1006767,[752323],"[393649, 280457, 41179, 74965, 181702, 41141, ...",500
9996,1006771,[2739],"[4101, 1340, 5579, 36771, 295673, 132847, 2774...",500
9997,1006773,[7181],"[36433, 3928, 54529, 13411, 20456, 14371, 775,...",500
9998,1006775,[11262],"[16212, 114752, 25322, 14618, 130488, 115430, ...",500


In [48]:
# Playlists whose recommendation list has fewer than 500 tracks.
preds_df.loc[preds_df['tam'] < 500]

Unnamed: 0,pid,query,result,tam
2685,1006003,"[65513, 171265, 433136, 433137, 433138]","[313237, 433139, 313238, 65512, 111129, 9829, ...",486
8945,1048425,"[25665, 174338, 236800, 123564, 123576, 123563...","[128440, 1379464, 1379468, 1379529, 1379534, 1...",316
9000,1002313,[849293],"[849294, 1615140, 1615141, 1615142, 1615143, 1...",401


## Formatação para Teste

The first non-commented, non-blank line should start with "team_info", followed by the team name and the contact email address.

For each challenge playlist there should be a line of the form:
- pid, trackuri_1, trackuri_2, trackuri_3, ..., trackuri_499, trackuri_500

Before submission, the CSV should be gzipped.

In [49]:
# Load the saved track encoding. `encoder.labels` is indexed further down by the
# integer track id of each prediction to obtain the value written to the submission.
# IncrementalEncoder is imported in the notebook's import cell at the top.
encoder = IncrementalEncoder()
encoder.load("../dados-processados/encoding_tracks.json")

In [50]:
# Turn every prediction into one wide record: the pid followed by
# trackuri_1..trackuri_N, where each predicted track id is mapped back to its
# label through the encoder.
records = []

for row_idx, row in preds_df.iterrows():
    record = {'pid': row['pid']}
    record.update({
        f'trackuri_{position}': encoder.labels[int(track)]
        for position, track in enumerate(row['result'], start=1)
    })
    records.append(record)

# Rows with fewer predictions than the longest one get NaN in the trailing columns.
formatted_result = pd.DataFrame.from_records(records)

formatted_result.head()

Unnamed: 0,pid,trackuri_1,trackuri_2,trackuri_3,trackuri_4,trackuri_5,trackuri_6,trackuri_7,trackuri_8,trackuri_9,...,trackuri_491,trackuri_492,trackuri_493,trackuri_494,trackuri_495,trackuri_496,trackuri_497,trackuri_498,trackuri_499,trackuri_500
0,1000002,spotify:track:3uoQULcUWfnt6nc6J7Vgai,spotify:track:4KTtYhxFtFL7mBwnjkKfLm,spotify:track:2gam98EZKrF9XuOkU13ApN,spotify:track:4wH4dJgrsxONID6KS2tDQM,spotify:track:77dgyxbuL53WfkLZU3fk3o,spotify:track:2RssA66mESJXjhdZO8Rrkl,spotify:track:3eW4afPMJdCN7bGVM56aWH,spotify:track:5fv7SEAPH6EYuhFujpzVyv,spotify:track:1BLz7nBzRElSPfg3ndJkHz,...,spotify:track:2gQ3lDcFYa3yFOkaw8PtuO,spotify:track:1CvhKmrutTAta5awpJcFDn,spotify:track:25khomWgBVamSdKw7hzm3l,spotify:track:04DwTuZ2VBdJCCC5TROn7L,spotify:track:0cOBMETjhxublnnwhbnzJO,spotify:track:4KKLTVEcvfc4zKOq7Zixcg,spotify:track:7t2bFihaDvhIrd2gn2CWJO,spotify:track:2MYl0er3UZ1RlKwRb5LODh,spotify:track:34Fulx6Umr9LoA4UKdcjVP,spotify:track:1Nh5WUisukhSNgwXLRoyZf
1,1000003,spotify:track:7BDFC2k8JXeiPcKRgG0NUC,spotify:track:0oWJiWYnKrq4fHUZAuZmDq,spotify:track:3nTO0bvRQvVT2NhBKAu2Qm,spotify:track:2rkVoKVEMuct8SmEIGKzBw,spotify:track:6x53SJOV1PAZ8ZUwH4NFXQ,spotify:track:6PyLw3l6ndolBrTPQp3Ibm,spotify:track:7Je4aDuchYqv2YWMudH5ZM,spotify:track:2rX8226UEM7si6DiOzyDRw,spotify:track:6CaExy6jpVBIHxVdmubJw8,...,spotify:track:1nOcjkKgryMCwU1QNFlAgn,spotify:track:5eFxwmqKrHpSQDOEIFYlgY,spotify:track:5MuB7HgZW9HF01Y6NWD4ku,spotify:track:6FTtI7HJnPE9jtFnhAshre,spotify:track:56srYJ7KjvldWCF2CDv9VO,spotify:track:7q096gH9G0OjuUduheqM4l,spotify:track:2kPNfupzN6RAk2ehFcmjPG,spotify:track:5hlWtrnVUyAze5WkxF9P56,spotify:track:6RcQOut9fWL6FSqeIr5M1r,spotify:track:5hydb8abMsDWNefmuqDYeK
2,1000004,spotify:track:7KIbDUwumrpG5f30kEYW1v,spotify:track:0CAfXk7DXMnon4gLudAp7J,spotify:track:6DbqCKweKwVkHgRv1CI53D,spotify:track:0vupCkmy497h49a74Xsxj1,spotify:track:1dzQoRqT5ucxXVaAhTcT0J,spotify:track:495O1Affo7AurEPQcvcr18,spotify:track:2bJvI42r8EF3wxjOuDav4r,spotify:track:7oVEtyuv9NBmnytsCIsY5I,spotify:track:7uKcScNXuO3MWw6LowBjW1,...,spotify:track:3rbNV2GI8Vtd8byhUtXZID,spotify:track:6TrNRd98WksT9Kkmx9uj6R,spotify:track:6A5NlmBCsCGbJ27jHQgKV5,spotify:track:4ACUTWppRnO7hTFDJed6MB,spotify:track:6mjEABpi5cQ5gqFFOkR1Cc,spotify:track:6C7RJEIUDqKkJRZVWdkfkH,spotify:track:1uDjaezEbalGyGnuH80zDK,spotify:track:5lnsCyEKWofnC00U4Ax0ti,spotify:track:4WjH9Bzt3kx7z8kl0awxh4,spotify:track:1sNSG13fsK6KPKKNIQXXrh
3,1000006,spotify:track:7hfRrdFJgFKK3cJ4rmkecE,spotify:track:4lh1PamTsomWbFpkOPyfrD,spotify:track:44n97yHySt0Z9rqPaXgjCK,spotify:track:6Ck8Lq4srTWQP1PXZ9P8aZ,spotify:track:3BRO4QR9ZDj9Ae3VtbjeUr,spotify:track:7y9iMe8SOB6z3NoHE2OfXl,spotify:track:6j0OqIhUxvOgGPTXxNj124,spotify:track:7lLAOmWJXM9csJkg3yKrph,spotify:track:6zsk6uF3MxfIeHPlubKBvR,...,spotify:track:4MeDnO5yA2Zi6IMlVApRci,spotify:track:4BHSjbYylfOH5WAGusDyni,spotify:track:1EaKU4dMbesXXd3BrLCtYG,spotify:track:54OEArtp5D4bmOITIPjuw0,spotify:track:5yuShbu70mtHXY0yLzCQLQ,spotify:track:5iDYsWcIA9m4LeJmdelR6H,spotify:track:0VgkVdmE4gld66l8iyGjgx,spotify:track:04KTF78FFg8sOHC1BADqbY,spotify:track:60WcXjimfbhk6T0gj1Eufg,spotify:track:4qikXelSRKvoCqFcHLB2H2
4,1000007,spotify:track:6A5NlmBCsCGbJ27jHQgKV5,spotify:track:2KOt2JrCB720UxIbyzweQo,spotify:track:0qxYx4F3vm1AOnfux6dDxP,spotify:track:51KKQAgYFoJHgVIuJWHdHb,spotify:track:3f0U5NaD1bCk8nmKpn2ZJY,spotify:track:4CJVkjo5WpmUAKp3R44LNb,spotify:track:3RlsVPIIs5KFhLFhxZ4iDF,spotify:track:3rjM7GhxdVq1YySsHBs21i,spotify:track:0k6DnZMLoEUH8NGD5zh2SE,...,spotify:track:6RsWqX8zABZLhZydXxEFOm,spotify:track:66UVpCZ5aH3VV3Ic3PBUrP,spotify:track:3fLBmhcgWkPI47LfVQ8paB,spotify:track:5hJFhO9dvhJoDvUZZ9iWSw,spotify:track:48LwbDH7u5UMrsY3sjfjhe,spotify:track:0NBiC3zLXoBQXBjsbnbwJq,spotify:track:0HPqEqr2tG0VOHGtj8PM4t,spotify:track:03tqyYWC9Um2ZqU0ZN849H,spotify:track:5ORf8BZMbq4xN9kiumrPcQ,spotify:track:1uDjaezEbalGyGnuH80zDK


In [51]:
# Rows that do not reach a 500th track (their last column is missing).
formatted_result.loc[formatted_result['trackuri_500'].isna()]

Unnamed: 0,pid,trackuri_1,trackuri_2,trackuri_3,trackuri_4,trackuri_5,trackuri_6,trackuri_7,trackuri_8,trackuri_9,...,trackuri_491,trackuri_492,trackuri_493,trackuri_494,trackuri_495,trackuri_496,trackuri_497,trackuri_498,trackuri_499,trackuri_500
2685,1006003,spotify:track:7gW1u5rbxe0rRTu5KgXClq,spotify:track:34AAWc81DPK1lmyJWmWnTy,spotify:track:5F5edI2fPe7kYqyUSxTNHD,spotify:track:5YRvdslvtxh3aWJcne4Tm2,spotify:track:5uFpMA47DLi9dyLT9ztGjN,spotify:track:3T7dNA7O8c3Axj5WyDNcH3,spotify:track:6XAdcAseYtijN0QUnQFsH2,spotify:track:0tXPhc8LvM4dPvoRwI66XQ,spotify:track:0L3aeM2PNlDVxibSADK8Oq,...,,,,,,,,,,
8945,1048425,spotify:track:5qI5Nj9DZG0Dsk1NXYfEKC,spotify:track:6JKaT97eiXhmty2ooBiqRo,spotify:track:3YsQ068w8PU5OkTwemRlvY,spotify:track:3ocEKeC7iGviXbp29jMREl,spotify:track:4F1RQKtn3OkC4c7iUuPR4G,spotify:track:1jyOP0ejeOy5uVj4v88ru8,spotify:track:0vQkeZWE4locWuvT9ct3Ig,spotify:track:1UgNMcVy0rJP74Uj2Bpdm0,spotify:track:02jkTsctQzIK1aIjCozlkE,...,,,,,,,,,,
9000,1002313,spotify:track:0qz7eW2ar2tt7xClWOOvB1,spotify:track:5fcoza8YqZcVyZ9o9eTI05,spotify:track:0XkVBGVQLteaS7vAmE40L8,spotify:track:4N0xhDafP6nJTQTsqX9TkG,spotify:track:44xCMp7l6iNdd9tqYzGfGm,spotify:track:6hGwC7jyzcBp9JngAC6nDt,spotify:track:2ATFO1Q4st0H6BA0SReqUy,spotify:track:1DphZcamyzA6NZlA5QzqIp,spotify:track:7ydpQQZjRnNZdGcxEYaJ6H,...,,,,,,,,,,


In [52]:
# Temporary: replace every missing value with the placeholder string '-1'.
# NOTE(review): '-1' is not a track URI, so the padded rows will presumably
# fail the challenge's submission validation — TODO pad with real tracks.
# NOTE(review): this overwrites `formatted_result`, so re-running the NaN
# inspection cell afterwards will no longer show the incomplete rows.
formatted_result = formatted_result.fillna('-1')

In [53]:
# First line of the submission file: the team_info marker, team name and contact e-mail.
header = 'team_info, pedrocaio, pedromn@cos.ufrj.br'
# One CSV line per playlist (pid followed by its track columns), without index or column names.
body = formatted_result.to_csv(index=False, header=False)

submission_path = "../dados/spotify_million_playlist_dataset_challenge/submission.csv"
with open(submission_path, "w+") as _file:
    _file.write("\n".join([header, body]))

In [54]:
# Total number of labels held by the encoder.
len(encoder.labels)

2262292

In [55]:
# Number of distinct labels; a value equal to the total count means there are no duplicates.
len(set(encoder.labels))

2262292

In [56]:
# Spot check on the first submission row only: print its pid, the number of
# track columns and the number of distinct values among them.
for row_idx, row in formatted_result.iterrows():
    track_uris = row.to_list()[1:]  # skip the leading pid column
    print(row.pid, len(track_uris), len(set(track_uris)))
    break

1000002 500 500


In [57]:
# Spot check on the first prediction: count the distinct items both as the raw
# objects returned by the model and after coercing each one to int.
# (The original bare `preds_df.iloc[0].pid` expression was dropped: its value
# was neither displayed nor used.)
first_result = preds_df.iloc[0].result

first_result_ids = [int(item) for item in first_result]
print(len(set(first_result)), len(set(first_result_ids)))

500 500
