In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from gensim.models import Word2Vec, KeyedVectors
from tqdm.auto import tqdm
import random




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _parse_track_uris(raw):
    """Split a stringified list such as "['a', 'b']" into ['a', 'b']."""
    return raw.strip('[]').replace("'", "").split(', ')


# Track embeddings, opened memory-mapped in read-only mode.
wv = KeyedVectors.load("word2vec128.wordvectors", mmap='r')

# Training playlists: the track_uris column is stored as text and is
# turned back into a list of track ids per playlist.
df_playlists_train = pd.read_csv('data/train_playlists.csv', sep='\t')
df_playlists_train['track_uris'] = df_playlists_train['track_uris'].apply(_parse_track_uris)

# Validation sample: rows with missing values are dropped before parsing.
df_playlists_val = pd.read_csv('data/validation_playlists_sample.csv', sep='\t')
df_playlists_val = df_playlists_val.dropna().reset_index(drop=True)
df_playlists_val['track_uris'] = df_playlists_val['track_uris'].apply(_parse_track_uris)

# Reorder the vocabulary by frequency; presumably this is what lets the
# adaptive-softmax cutoffs used later treat low indices as frequent tracks
# (TODO confirm against the gensim documentation).
wv.sort_by_descending_frequency()

### Next token prediction

In [8]:
class PlaylistDataset(Dataset):
    """Next-token training pairs built from playlists of track keys.

    Args:
        playlists: indexable collection; ``playlists[idx]`` yields an iterable
            of keys present in ``wv.key_to_index``.
        wv: object exposing a ``key_to_index`` mapping from key to integer id.
    """

    def __init__(self, playlists, wv):
        self.playlists = playlists
        self.wv = wv

    def __len__(self):
        return len(self.playlists)

    def __getitem__(self, idx):
        """Return ``(src, tgt)`` long tensors: ids[:-1] and ids[1:]."""
        lookup = self.wv.key_to_index
        token_ids = [lookup[track] for track in self.playlists[idx]]
        # Input drops the final token, target drops the first one.
        inputs = torch.tensor(token_ids[:-1], dtype=torch.long)
        labels = torch.tensor(token_ids[1:], dtype=torch.long)
        return inputs, labels

def collate_fn(batch):
    """Zero-pad a batch of ``(src, tgt)`` pairs to the longest source.

    Args:
        batch: sequence of ``(src, tgt)`` 1-D tensors.

    Returns:
        ``(padded_src, padded_tgt, mask)`` where the first two are long tensors
        of shape (batch, max_src_len) and ``mask`` is a bool tensor of the same
        shape that is True on the real (non-padded) source positions.
    """
    sources, labels = zip(*batch)
    lengths = torch.tensor([len(seq) for seq in sources])
    n_rows = len(sources)
    width = max(len(seq) for seq in sources)

    padded_src = torch.zeros((n_rows, width), dtype=torch.long)
    padded_tgt = torch.zeros((n_rows, width), dtype=torch.long)
    for row, (seq, lab) in enumerate(zip(sources, labels)):
        padded_src[row, :len(seq)] = seq
        padded_tgt[row, :len(lab)] = lab

    # Position j of row i is real iff j < length of source i.
    mask = torch.arange(width).unsqueeze(0) < lengths.unsqueeze(1)
    return padded_src, padded_tgt, mask

class TestDataset(Dataset):
    """Seed / ground-truth pairs read from a DataFrame with ``x`` and ``y`` columns.

    Args:
        playlists: DataFrame-like object supporting ``.iloc[idx]``; each row
            holds iterables of keys under ``"x"`` and ``"y"``.
        wv: object exposing a ``key_to_index`` mapping from key to integer id.
    """

    def __init__(self, playlists, wv):
        self.playlists = playlists
        self.wv = wv

    def __len__(self):
        return len(self.playlists)

    def __getitem__(self, idx):
        """Return ``(x_ids, y_ids)`` as long tensors for row ``idx``."""
        row = self.playlists.iloc[idx]
        lookup = self.wv.key_to_index

        def encode(keys):
            return torch.tensor([lookup[key] for key in keys], dtype=torch.long)

        return encode(row["x"]), encode(row["y"])

class LSTMModel(nn.Module):
    """LSTM over frozen track embeddings with an adaptive-softmax output head.

    Args:
        vocab_size: number of classes of the adaptive softmax.
        embedding_dim: feature size fed to the LSTM (must match what
            ``wv`` / ``embedding_model`` produce).
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers.
        cutoffs: cluster cutoffs passed to ``nn.AdaptiveLogSoftmaxWithLoss``.
        embedding_model: optional module called as ``embedding_model(x, cat)``
            on a 2-D ``(rows, dim)`` tensor of raw vectors.
        wv: keyed vectors exposing a ``vectors`` array indexable by token id.
        cat: flag forwarded to ``embedding_model``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout, cutoffs, embedding_model = None, wv = None, cat = True):
        super().__init__()
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            hidden_dim, vocab_size, cutoffs=cutoffs, div_value=4.0
        )
        self.embedding = embedding_model
        self.wv = wv
        self.cat = cat

    def _embed(self, src):
        """Look up (and optionally re-encode) embeddings for a (batch, seq) id tensor."""
        # Follow the LSTM's device instead of assuming CUDA is available.
        device = self.lstm.weight_ih_l0.device
        batch_size, seq_len = src.shape
        with torch.no_grad():  # embeddings are frozen
            # One vectorised gather replaces a Python loop over every token.
            token_ids = src.detach().cpu().numpy()
            src_emb = torch.as_tensor(self.wv.vectors[token_ids], dtype=torch.float32, device=device)
            if self.embedding is not None:
                # The encoder is applied to 2-D (rows, dim) input, exactly as when
                # it was called once per sequence, then reshaped back.
                flat = src_emb.reshape(batch_size * seq_len, -1)
                src_emb = self.embedding(flat, self.cat).reshape(batch_size, seq_len, -1)
        return src_emb

    def forward(self, src, src_lengths, targets=None):
        """Run the LSTM and either score or predict.

        Args:
            src: (batch, seq) long tensor of token ids, zero-padded.
            src_lengths: true length of every row of ``src`` (tensor or list).
            targets: optional (batch, seq) long tensor aligned with ``src``.

        Returns:
            With ``targets``: ``(hidden, loss)`` where ``hidden`` is the LSTM
            output without the last time step, flattened to
            (batch * (seq - 1), hidden_dim), and ``loss`` is the mean adaptive
            softmax loss over non-padded positions only.
            Without ``targets``: log-probabilities of shape
            (batch, seq - 1, vocab_size).
        """
        lengths = torch.as_tensor(src_lengths, dtype=torch.long).cpu()
        src_emb = self._embed(src)

        packed_input = nn.utils.rnn.pack_padded_sequence(src_emb, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)

        # Shift targets for the next-token prediction: output step t is paired
        # with targets[:, t + 1].
        # NOTE(review): PlaylistDataset already returns tgt = playlist[1:], so
        # together with this shift step t is trained on playlist[t + 2]. Left
        # as is because existing checkpoints were trained this way - confirm
        # whether the double shift is intended.
        hidden_dim = output.size(-1)
        steps = output.size(1) - 1
        output = output[:, :-1, :].contiguous().view(-1, hidden_dim)

        if targets is None:
            log_probs = self.adaptive_softmax.log_prob(output)
            return log_probs.view(src.size(0), steps, log_probs.size(-1))

        shifted = targets[:, 1:steps + 1].contiguous().to(output.device)
        # Padded target slots hold 0, which is also a real class id, so they
        # must be excluded from the loss explicitly.
        positions = torch.arange(1, steps + 1, device=output.device)
        valid = (positions.unsqueeze(0) < lengths.to(output.device).unsqueeze(1)).view(-1)
        flat_targets = shifted.view(-1)
        if valid.any():
            loss = self.adaptive_softmax(output[valid], flat_targets[valid]).loss
        else:
            # No scorable position in the batch: zero loss that keeps the graph.
            loss = output.sum() * 0.0
        return output, loss
class WordEmbeddingEncoder(nn.Module):
    """Two-layer MLP that maps a vector to a unit-length embedding.

    Args:
        input_dim: size of the incoming vectors.
        hidden_dim: size of both the hidden layer and the output embedding.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x, cat = False):
        """Encode ``x``; with ``cat`` the input is prepended along the last axis."""
        hidden = F.relu(self.fc1(x))
        # L2-normalise along dim 1 so every row of a 2-D input has unit length.
        encoded = F.normalize(self.fc2(hidden), p=2, dim=1)
        if not cat:
            return encoded
        return torch.cat((x, encoded), dim=-1)
def top_k_accuracy(probs, targets, k=5):
    """Fraction of rows whose target id is among the ``k`` best-scored ids.

    Args:
        probs: (batch, classes) score tensor.
        targets: (batch,) tensor of target class ids.
        k: number of top-scored classes considered per row.

    Returns:
        The accuracy as a Python float.
    """
    best_ids = probs.topk(k, dim=1).indices
    # Broadcast the (batch, 1) targets against the (batch, k) candidates.
    hits = best_ids == targets.view(-1, 1)
    accuracy = hits.sum().float() / targets.size(0)
    return accuracy.item()
def NDCG(probs, targets, k=5):
    """Normalised discounted cumulative gain of the top-``k`` predictions.

    Gains are binary: a predicted id scores 1 if it occurs in ``targets``.
    DCG  = rel_1 + sum_{i=2..k} rel_i / log2(i)
    IDCG = 1 + sum_{i=2..hits} 1 / log2(i), with hits the number of relevant
    ids found in the top ``k`` (the ideal ordering of the retrieved hits).

    Args:
        probs: 1-D score tensor over all candidate ids.
        targets: tensor of relevant ids (any shape).
        k: cut-off rank; clamped to the number of scores available.

    Returns:
        NDCG in [0, 1] as a numpy float; 0 when nothing relevant is retrieved.
    """
    # torch.topk raises if k exceeds the number of candidates.
    k = min(k, probs.size(-1))
    _, top_k_indices = torch.topk(probs, k)
    top_k_indices = top_k_indices.cpu().numpy()
    relevant = targets.cpu().numpy()

    relevance_scores = np.isin(top_k_indices, relevant).astype(float)
    dcg = relevance_scores[0] + np.sum(relevance_scores[1:] / np.log2(np.arange(2, k + 1)))

    # Ranks 2..hits inclusive. The previous `np.arange(2, hits) + 1` skipped
    # rank 2 and stopped one term short, which let NDCG exceed 1.
    num_hits = int(relevance_scores.sum())
    idcg = 1 + np.sum(1 / np.log2(np.arange(2, num_hits + 1)))
    return dcg / idcg

def r_precision(probs, targets, R=None):
    """R-precision: share of relevant ids found among the top-R predictions.

    Args:
        probs: 1-D score tensor over all candidate ids.
        targets: tensor of relevant ids; any shape, it is flattened first so a
            (1, n) batch slice counts n relevant items, not 1.
        R: number of predictions to inspect; defaults to the number of
            relevant ids.

    Returns:
        0-dim tensor ``hits / R`` (0 when there is nothing to retrieve).
    """
    relevant_items = targets.reshape(-1)

    # Determine R if not provided (default to the number of relevant items).
    # numel() on the flattened tensor, because len() of a (1, n) tensor is 1.
    if R is None:
        R = relevant_items.numel()
    if R <= 0:
        # Avoid 0 / 0 -> nan for an empty ground truth.
        return torch.zeros((), device=probs.device)

    # torch.topk raises if asked for more entries than there are scores.
    _, top_R_indices = torch.topk(probs, k=min(R, probs.size(-1)))

    num_relevant = min(R, relevant_items.numel())
    top_k_for_example = top_R_indices[:num_relevant]

    # Count how many relevant items are in the inspected predictions.
    correct = torch.sum(torch.isin(top_k_for_example, relevant_items))
    return correct / R


In [5]:
# Pick the device first so everything below follows it instead of assuming CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Frozen track-embedding encoder restored from a checkpoint.
# Set word_embedding_encoder = None to feed the raw word2vec vectors instead
# (embedding_dim must then match the raw vector size).
# word_embedding_encoder = None
word_embedding_encoder = WordEmbeddingEncoder(128, 128)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
word_embedding_encoder.load_state_dict(torch.load('models/triplet_1_vec128_40.pt', map_location=device))
word_embedding_encoder.eval().to(device)

# Model hyper-parameters.
vocab_size = len(wv)
embedding_dim = 256  # 128 raw dims + 128 encoded dims (LSTMModel's cat defaults to True)
hidden_dim = 128
num_layers = 2
dropout = 0.5
cutoffs = [vocab_size//1000, vocab_size // 100, vocab_size // 10, 3 * vocab_size // 4]

# Data pipelines.
train_dataset = PlaylistDataset(df_playlists_train['track_uris'], wv)
dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn, num_workers=8)
val_dataset = PlaylistDataset(df_playlists_val['track_uris'], wv)
# No shuffling for validation: identical batches every epoch keep the
# validation metrics comparable between epochs.
val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn, num_workers=8)

# Model, optimiser and TensorBoard logging.
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout, cutoffs, wv = wv, embedding_model=word_embedding_encoder).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
writer = SummaryWriter("runs/lstm_lambda_1_vec128_")
num_epochs = 10
max_it_per_epoch = 10000




In [6]:
num_epochs = 5


def run_validation(model, loader, device):
    """Evaluate ``model`` on ``loader``.

    Returns:
        ``(mean_loss, mean_top10, mean_top50)`` averaged over batches. The
        accuracies compare the prediction at each sequence's last real step
        with that sequence's last real target.
    """
    model.eval()
    total_loss, top_10, top_50 = 0.0, 0.0, 0.0
    with torch.no_grad():
        for src, tgt, src_attention_masks in tqdm(loader, total=len(loader)):
            src, tgt, src_attention_masks = src.to(device), tgt.to(device), src_attention_masks.to(device)
            src_lengths = src_attention_masks.sum(dim=1).cpu()

            # A single forward pass: the loss call also returns the flattened
            # LSTM states, so the model does not have to be run a second time.
            hidden, loss = model(src, src_lengths, targets=tgt)
            total_loss += loss.item()

            # Use each sequence's own last step. Indexing [:, -1] would read
            # padding for every sequence shorter than the longest in the batch.
            hidden = hidden.view(src.size(0), -1, hidden.size(-1))
            rows = torch.arange(src.size(0), device=device)
            lengths = src_lengths.to(device)
            last_step = (lengths - 2).clamp(min=0)  # the model drops the final step
            last_target = tgt[rows, (lengths - 1).clamp(min=0)]

            # Log-probabilities are only needed for that one step per sequence.
            last_log_probs = model.adaptive_softmax.log_prob(hidden[rows, last_step])
            top_10 += top_k_accuracy(last_log_probs, last_target, k=10)
            top_50 += top_k_accuracy(last_log_probs, last_target, k=50)

    n_batches = len(loader)
    return total_loss / n_batches, top_10 / n_batches, top_50 / n_batches


def log_validation(writer, metrics, global_step):
    """Write the ``run_validation`` metrics to TensorBoard at ``global_step``."""
    val_loss, top_10_acc, top_50_acc = metrics
    writer.add_scalar("Loss/Validation", val_loss, global_step)
    writer.add_scalar("Top-10 Accuracy/Validation", top_10_acc, global_step)
    writer.add_scalar("Top-50 Accuracy/Validation", top_50_acc, global_step)


# Baseline metrics of the untrained model, logged as step 0.
print("Base Validation")
torch.manual_seed(42069)
log_validation(writer, run_validation(model, val_dataloader, device), 0)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}")
    model.train()
    # An "epoch" is capped at max_it_per_epoch batches of the shuffled loader.
    for step, (src, tgt, src_attention_masks) in tqdm(enumerate(dataloader), total=max_it_per_epoch):
        if step >= max_it_per_epoch:
            break
        src, tgt, src_attention_masks = src.to(device), tgt.to(device), src_attention_masks.to(device)
        src_lengths = src_attention_masks.sum(dim=1).cpu()
        optimizer.zero_grad()
        output, loss = model(src, src_lengths, targets=tgt)
        loss.backward()
        optimizer.step()
        writer.add_scalar("Loss/Training", loss.item(), step + epoch * max_it_per_epoch)

    print("Validation")
    log_validation(writer, run_validation(model, val_dataloader, device), epoch + 1)
    torch.save(model.state_dict(), f"models/lstm_lambda_1_vec128_{epoch}.pt") # Should be 0.5!
            

Base Validation


100%|██████████| 4995/4995 [05:59<00:00, 13.89it/s]


Epoch 1


100%|██████████| 10000/10000 [28:08<00:00,  5.92it/s]


Validation


100%|██████████| 4995/4995 [06:00<00:00, 13.85it/s]


Epoch 2


100%|██████████| 10000/10000 [25:52<00:00,  6.44it/s]


Validation


100%|██████████| 4995/4995 [04:47<00:00, 17.39it/s]


Epoch 3


100%|██████████| 10000/10000 [27:58<00:00,  5.96it/s]


Validation


100%|██████████| 4995/4995 [05:35<00:00, 14.90it/s]


Epoch 4


100%|██████████| 10000/10000 [27:29<00:00,  6.06it/s]


Validation


100%|██████████| 4995/4995 [05:58<00:00, 13.94it/s]


Epoch 5


100%|██████████| 10000/10000 [30:07<00:00,  5.53it/s]


Validation


100%|██████████| 4995/4995 [06:02<00:00, 13.78it/s]


### Test set prediction

In [26]:
# Read the test playlists (comma is read_csv's default separator) and show
# how many playlists there are per ground-truth size.
test_df = pd.read_csv('data/test_set_final.csv')
test_df['num_ground_truth_final'].value_counts()

num_ground_truth_final
1     2000
5     2000
10    2000
25    2000
Name: count, dtype: int64

In [5]:
def _track_ids(serialized_tracks):
    """Return the id after the last ':' of every entry's "track_uri"."""
    # NOTE(review): eval() executes whatever text is stored in the CSV cell.
    # Only acceptable for trusted files; consider ast.literal_eval if the
    # cells are plain Python literals.
    return [entry["track_uri"].split(":")[-1] for entry in eval(serialized_tracks)]


# x: tracks given as the playlist seed, y: held-out tracks to be predicted.
test_df['x'] = test_df['tracks_incomplete'].apply(_track_ids)
test_df['y'] = test_df['ground_truth'].apply(_track_ids)
test_df.reset_index(drop=True, inplace=True)

In [18]:
# Reload the vocabulary in the same order that was used for training.
wv = KeyedVectors.load("word2vec128.wordvectors", mmap='r')
wv.sort_by_descending_frequency()

# Hyper-parameters must match the checkpoint loaded below.
vocab_size = len(wv)
embedding_dim = 256
hidden_dim = 128
num_layers = 2
dropout = 0.5
cutoffs = [vocab_size//1000, vocab_size // 100, vocab_size // 10, 3 * vocab_size // 4]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The encoder is created untrained here; as a sub-module of the LSTM model its
# weights are part of the model state_dict restored below.
embedding_model = WordEmbeddingEncoder(128, 128).to(device)

Net = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout, cutoffs, wv = wv, embedding_model=embedding_model).to(device)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
Net.load_state_dict(torch.load('models/lstm_lambda_1_vec128_4.pt', map_location=device))

Net.eval()
test_dataset = TestDataset(test_df, wv)
# batch_size=1: seed and ground-truth lengths differ per playlist and no
# padding collate_fn is used here.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=8)



In [19]:
# Evaluate the model on the test set.
# `targets` buckets the metrics by ground-truth size:
#   {n_ground_truth: {"count", "r_precision", "ndcg"}}
targets = {}
total_test_ndcg = 0.0
total_test_r_precision = 0.0
with torch.no_grad():  # inference only: no autograd graph needed
    for src, tgt in tqdm(test_dataloader, total=len(test_dataloader)):
        src, tgt = src.to(device), tgt.to(device)
        # batch_size is 1, so row 0 holds the whole ground-truth list.
        ground_truth = tgt[0]
        bucket = targets.setdefault(len(ground_truth), {"count": 0, "r_precision": 0, "ndcg": 0})

        src_lengths = torch.full((src.size(0),), src.size(1), dtype=torch.long)
        probs = Net(src, src_lengths)
        # NOTE(review): LSTMModel.forward drops its final time step, so
        # probs[0, -1] is the prediction made before the last seed track.
        scores = probs[0, -1]

        # Score once against the full ground truth and reuse the values for
        # both the overall and the per-bucket totals. float() keeps tensors
        # out of the accumulators and of results.txt.
        ndcg = float(NDCG(scores, ground_truth, k=500))
        r_prec = float(r_precision(scores, ground_truth))

        total_test_ndcg += ndcg
        total_test_r_precision += r_prec
        bucket["count"] += 1
        bucket["r_precision"] += r_prec
        bucket["ndcg"] += ndcg

with open ("results.txt", "a") as f:
    f.write("--------------------------------\n")
    f.write("LSTM Adaptive Softmax Lambda 1\n")
    f.write("--------------------------------\n")
    f.write(f"Test NDCG: {total_test_ndcg / len(test_dataloader)}\n")
    f.write(f"Test R-Precision: {total_test_r_precision / len(test_dataloader)}\n")
    for k, v in targets.items():
        f.write(f"Test R-Precision for {k}: {v['r_precision'] / v['count']}\n")
        f.write(f"Test NDCG for {k}: {v['ndcg'] / v['count']}\n")


100%|██████████| 8000/8000 [01:39<00:00, 80.80it/s] 
