In [3]:
from collections import namedtuple

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as td

import pytorch_lightning as pl
import os
import tqdm
import json
import sklearn.metrics as sm

import tensorboardX as tb
# import tensorflow as tf
import datetime, os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dataset
import model
np.random.seed(31337)

In [4]:
DATA_DIR = "data"

In [5]:
train_data = pd.read_csv(os.path.join(DATA_DIR, 'train_data_artist.csv'))
val_data = pd.read_csv(os.path.join(DATA_DIR, 'val_data_artist.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test_data_artist.csv'))
features = ['start', 'track', 'artist_context', 'artist_track']

In [6]:
dm = dataset.ContextualRankerData(train_data=train_data,
                                      val_data=val_data,
                                      test_data=test_data,
                                      features=features)


## Compute top recommendations

In [7]:
track_meta = pd.read_json("/home/lolvista/MADE/sem2/recsys_course/recsys-itmo-spring-2023/botify/data/tracks.json", lines=True)

In [8]:
track_meta

Unnamed: 0,artist,title,track
0,Jack Johnson,The Cove,0
1,Billy Preston,Nothing from Nothing,1
2,Paco De Lucia,Entre Dos Aguas,2
3,Josh Rouse,Under Cold Blue Stars,3
4,The Dead 60s,Riot Radio (Soundtrack Version),4
...,...,...,...
49995,Alain Bashung,Osez Joséphine,49995
49996,Habib Koité,Kanawa,49996
49997,De Saloon,Morder,49997
49998,She Wants Revenge,Red Flags And Long Nights,49998


In [9]:
artist_meta = pd.read_csv('data/artists.csv').set_index('artist')
artist_meta.head()

Unnamed: 0_level_0,Unnamed: 0,artist_id
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Jack Johnson,0,0
Billy Preston,1,1
Paco De Lucia,2,2
Josh Rouse,3,3
The Dead 60s,4,4


In [10]:
track_meta = track_meta.join(artist_meta, on='artist')

## Load model and predict recommendations for each track

In [12]:
best_path = 'checkpoints/2emb_epoch=78-val_loss=0.1026.ckpt'
best = model.ContextualRanker.load_from_checkpoint(best_path)

In [13]:
best

ContextualRanker(
  (net): Net(
    (context_emb): Embedding(50000, 100)
    (track_emb): Embedding(50000, 100)
    (artist_context_emb): Embedding(11464, 100)
    (artist_track_emb): Embedding(11464, 100)
  )
)

In [14]:
trainer = pl.Trainer(
    max_epochs=300,
    accelerator='gpu', 
    devices=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
trainer.test(best, datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 170.98it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        avg_loss            0.1544792354106903
        rdm_loss            0.2778107523918152
        test_loss           0.10292837768793106
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.10292837768793106,
  'avg_loss': 0.1544792354106903,
  'rdm_loss': 0.2778107523918152}]

In [16]:
context_embeddings = dict(best.named_parameters())["net.context_emb.weight"].data.cpu().numpy()
track_embeddings = dict(best.named_parameters())["net.track_emb.weight"].data.cpu().numpy()
artist_context_embeddings = dict(best.named_parameters())["net.artist_context_emb.weight"].data.cpu().numpy()
artist_track_embeddings = dict(best.named_parameters())["net.artist_track_emb.weight"].data.cpu().numpy()


inp_emb = context_embeddings[track_meta.track] + artist_context_embeddings[track_meta.artist_id]
print(inp_emb.shape)

out_emb = track_embeddings[track_meta.track] + artist_track_embeddings[track_meta.artist_id]
print(out_emb.shape)

(50000, 100)
(50000, 100)


In [17]:
k = 100
with open("tracks_my_nn_recs.json", "w") as rf:
    for i, track in tqdm.tqdm(track_meta.iterrows()):
       
        neighbours = np.argpartition(-np.dot(out_emb, inp_emb[i]), k)[:k]
        
        recommendation = dict(track)
        recommendation["recommendations"] = neighbours.tolist()
        
        rf.write(json.dumps(recommendation) + "\n")

50000it [07:56, 104.88it/s]


In [20]:
track = 234
embedding = context_embeddings[track]
track_meta.loc[track_meta["track"] == track, ["artist", "title"]]

Unnamed: 0,artist,title
234,Daft Punk,The Brainwasher


In [21]:
k = 10
neighbours = np.argpartition(-np.dot(track_embeddings, embedding), k)[:k]
track_meta.loc[track_meta["track"].isin(neighbours), ["artist", "title"]]

Unnamed: 0,artist,title
225,Daft Punk,Digital Love
625,Simian Mobile Disco,Simple
1290,Black Eyed Peas,Pump It
1997,Justin Bieber,Up
1998,LMFAO / Lil Jon,Shots
3692,Klaus Schulze,North Of the Yukon
5198,Metallica / Marianne Faithfull,The Memory Remains
6759,Taylor Swift,Tied Together With A Smile
9086,Nickelback,Never Again
20692,Eminem,Curtains Close
