In [1]:
import pandas as pd
import numpy as np
import os
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import torch.optim as optim
from tensorboardX  import SummaryWriter # conflict with tensorboard using tf.io

### Importing and prepare features dataset

In [2]:
# Import song features
features = pd.read_pickle("features_preprocessed.pkl")
features= features.reset_index(drop=True)

In [3]:
# Mapping of track id and index
track_id_to_index = {track_id: idx for idx, track_id in enumerate(features["track_id"])}
features_without_ids= features.drop(["track_id"], axis= 1).values

In [4]:
# Transform to numpy arrays
features_without_ids = np.array([[np.array(cell, dtype=np.float32) for cell in row] for row in features_without_ids], dtype=np.float32)

### Making sequences from playlists dataset

In [5]:
# Function to find match between playlist song id and features song id
def song_match(playlist, track_id_to_index):
    # Extract IDs from playlist
    playlist_ids=  [track["track_uri"].split(":")[-1] for track in playlist["tracks"]]
    # Return list of playlist song IDs that exist in features
    return [id for id in playlist_ids if id in track_id_to_index.keys()]


In [6]:
# Function to encode playlist name and L2 normalize it
def encode_and_normalise(playlist_name, transformer):
    embeddings= transformer.encode(playlist_name)
    norm = np.linalg.norm(embeddings)
    return embeddings / norm if norm != 0 else embeddings

In [7]:
# Function to create sequence
def create_sequence(sequence_length, playlist_ids, i):
    # Input sequence: k consecutive songs 
    # Without ID
    input_sequence = [features_without_ids[track_id_to_index[song_id]] for song_id in playlist_ids[i:i + sequence_length]]
    input_sequence = torch.tensor(np.array(input_sequence), dtype=torch.float32)
    
    return input_sequence


In [8]:
# Create sequences and targets
# Work on subset for development
min_playlist_length = 30
sequence_length = 5  # Sliding window size (k)
sequences = []
targets = []
playlist_name_embs= []
playlist_ids= []
target_song_ids = []
sequences_made= 0
# Sentence transformer
transformer = SentenceTransformer("all-MiniLM-L6-v2")

# Read file
with open("clean_playlists.json", "r") as f:
    content= json.load(f)
    
    # Iterate on playlists
    for playlist_pid in content.keys():
        # Encode playlist name
        playlist_name_emb= encode_and_normalise(content[playlist_pid]["name"], transformer)
        tracks_ids= content[playlist_pid]["tracks"]
        # Make sequences of k window size
        for i in range(len(tracks_ids) - sequence_length):
            # Sequence: k song (without song ID)
            sequence= create_sequence(sequence_length, tracks_ids, i)
            sequences.append(sequence)

            # Target: k+1 song (without song ID)
            target= features_without_ids[track_id_to_index[tracks_ids[i + sequence_length]]].tolist()
            targets.append(target)
            
            # Store playlist name emb for soft gating later
            playlist_name_embs.append(playlist_name_emb)

            # Store the target song ID for evaluation
            target_song_ids.append(tracks_ids[i + sequence_length])

            playlist_ids.append(playlist_pid)
            # Verbosity
            sequences_made +=1
            print(f"{sequences_made} sequences made")

1 sequences made
2 sequences made
3 sequences made
4 sequences made
5 sequences made
6 sequences made
7 sequences made
8 sequences made
9 sequences made
10 sequences made
11 sequences made
12 sequences made
13 sequences made
14 sequences made
15 sequences made
16 sequences made
17 sequences made
18 sequences made
19 sequences made
20 sequences made
21 sequences made
22 sequences made
23 sequences made
24 sequences made
25 sequences made
26 sequences made
27 sequences made
28 sequences made
29 sequences made
30 sequences made
31 sequences made
32 sequences made
33 sequences made
34 sequences made
35 sequences made
36 sequences made
37 sequences made
38 sequences made
39 sequences made
40 sequences made
41 sequences made
42 sequences made
43 sequences made
44 sequences made
45 sequences made
46 sequences made
47 sequences made
48 sequences made
49 sequences made
50 sequences made
51 sequences made
52 sequences made
53 sequences made
54 sequences made
55 sequences made
56 sequences made
5

In [9]:
# Convert to tensors
sequences= torch.tensor(np.array(sequences), dtype=torch.float32)
targets = torch.tensor(np.array(targets), dtype=torch.float32)
playlist_name_embs = torch.tensor(np.array(playlist_name_embs), dtype=torch.float32)

### Defining dataset class

In [None]:
# Defining dataset class for dataloader and automatic stacking
class PlaylistDataset(Dataset):
    def __init__(self, sequences, targets, name_embs):
        self.sequences = sequences
        self.targets = targets
        self.name_embs = name_embs
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx], self.name_embs[idx]
    
    # DataLoader will automatically stack same-shaped tensors
dataset = PlaylistDataset(sequences, targets, playlist_name_embs)

### Train val test split

In [11]:
# Get all unique title embeddings (these define playlists)
unique_titles = torch.unique(dataset.name_embs, dim=0)

# Split titles into train/val/test
train_titles, test_titles = train_test_split(unique_titles, test_size=0.2, random_state=42)
train_titles, val_titles = train_test_split(train_titles, test_size=0.125, random_state=42)  # 70/10/20 split

# Create boolean masks for sequences
train_mask = torch.isin(dataset.name_embs, train_titles)
val_mask = torch.isin(dataset.name_embs, val_titles)
test_mask = torch.isin(dataset.name_embs, test_titles)

# Create subsets
train_dataset = Subset(dataset, torch.where(train_mask)[0])
val_dataset = Subset(dataset, torch.where(val_mask)[0])
test_dataset = Subset(dataset, torch.where(test_mask)[0])

In [12]:
# Check no title appears in multiple splits
assert not set(train_titles) & set(val_titles)
assert not set(train_titles) & set(test_titles)
assert not set(val_titles) & set(test_titles)

# Check sequence counts
print(f"Train: {len(train_dataset)} sequences")
print(f"Val: {len(val_dataset)} sequences")
print(f"Test: {len(test_dataset)} sequences")

Train: 7135844 sequences
Val: 941024 sequences
Test: 1524498 sequences


In [13]:
# Creating dataloaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [14]:
# Verfying the shape
batch = next(iter(train_loader))
print([b.shape for b in batch]) 

[torch.Size([64, 5, 19, 384]), torch.Size([64, 19, 384]), torch.Size([64, 384])]


### Defining Deep Learning Model

In [15]:
# Defining RNN model with GRU and Soft Gating
class MusicRec(nn.Module):
    def __init__(self, target_feature_count= 19):
        super().__init__()
        self.target_feature_count= target_feature_count
        self.gru = nn.GRU(19*384, 512, batch_first=True)
        self.name_gate = nn.Linear(384, 19)  # 19 features to modulate
        self.head = nn.Sequential(
            nn.Dropout(0.3), # dropout for overfitting
            nn.Linear(512, target_feature_count * 384))  # Predict all features

    def forward(self, x, name_emb):
        # x: [batch, 5, 19, 384]
        # name_emb: [batch, 384]

        # Soft gating 
        gates= torch.sigmoid(self.name_gate(name_emb)) # [batch, 19]
        x = x * gates.unsqueeze(1).unsqueeze(-1)  # [batch, 5, 19, 384]

        # flatten x to [batch, 5, 19*384]
        x = x.flatten(2)  

        # Process sequence
        _, hidden = self.gru(x)
        
        pred = self.head(hidden.squeeze(0))  # [batch, 19*384]
        return pred.view(-1, self.target_feature_count, 384)  # Reshape to [batch, 19, 384]

### Hyperparameters

In [16]:
# Config 
config = {
    "batch_size": 128,
    "lr": 3e-4,
    "epochs": 10,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu")
}
# Initialize
model = MusicRec().to(config["device"])
optimizer = optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)
writer = SummaryWriter()  # TensorBoard logging

### Dataloaders subset for debugging

In [17]:
# Create subsets because the full dataset would take 308 hours
subset_pct= 10
debug_train = Subset(train_dataset, indices=range(len(train_dataset) // subset_pct))
debug_val = Subset(val_dataset, indices=range(len(val_dataset) // subset_pct))
debug_test = Subset(test_dataset, indices=range(len(test_dataset) // subset_pct))

# Tiny dataloaders (disable shuffling for reproducibility)
debug_train_loader = DataLoader(debug_train, batch_size=config["batch_size"], shuffle=False)
debug_val_loader = DataLoader(debug_val, batch_size=config["batch_size"], shuffle=False)
debug_test_loader = DataLoader(debug_test, batch_size=config["batch_size"], shuffle=False)

In [26]:
debug_train[0][0].shape

torch.Size([5, 19, 384])

### Training and validation

In [66]:
def cosine_loss(pred, target):
    return 1 - torch.mean(F.cosine_similarity(pred, target, dim=-1))

best_val_loss = float('inf')
early_stop_counter = 0

for epoch in range(config["epochs"]):
    # --- Training Phase ---
    model.train()
    #debug_model.train()
    train_loss = 0
    for batch_idx, (seq, target, name_emb) in enumerate(debug_train_loader):
        # Print progress every batch
        print(f"Epoch {epoch+1} | Batch {batch_idx+1}/{len(debug_train_loader)}")
        seq, target, name_emb = seq.to(config["device"]), target.to(config["device"]), name_emb.to(config["device"])
        
        optimizer.zero_grad()
        
        pred = model(seq, name_emb)
        #pred= debug_model(seq, name_emb)
        loss = cosine_loss(pred, target)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    # --- Validation Phase ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for seq, target, name_emb in debug_val_loader:
            seq, target, name_emb = seq.to(config["device"]), target.to(config["device"]), name_emb.to(config["device"])
            pred = model(seq, name_emb)
            #pred = debug_model(seq, name_emb)
            val_loss += cosine_loss(pred, target).item()
    
    # Update scheduler and check early stopping
    scheduler.step(val_loss)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")
        #torch.save(debug_model.state_dict(), "best_debug_model.pth")
        early_stop_counter = 0
    else:
        early_stop_counter += 1
        if early_stop_counter >= 5:
            print("Early stopping triggered")
            break

    # --- Logging ---
    #writer.add_scalars('Loss', {
    #    'train': train_loss / len(debug_train_loader),
    #    'val': val_loss / len(debug_val_loader)
    #}, epoch)
        writer.add_scalars('Loss', {
        'train': train_loss / len(debug_train_loader),
        'val': val_loss / len(debug_val_loader)
    }, epoch)
    
    #print(f"Epoch {epoch+1}: "
    #      f"Train Loss: {train_loss/len(debug_train_loader):.4f} | "
    #      f"Val Loss: {val_loss/len(debug_val_loader):.4f}")
    print(f"Epoch {epoch+1}: "
          f"Train Loss: {train_loss/len(debug_train_loader):.4f} | "
          f"Val Loss: {val_loss/len(debug_val_loader):.4f}")



Epoch 1 | Batch 1/5575
Epoch 1 | Batch 2/5575
Epoch 1 | Batch 3/5575
Epoch 1 | Batch 4/5575
Epoch 1 | Batch 5/5575
Epoch 1 | Batch 6/5575
Epoch 1 | Batch 7/5575
Epoch 1 | Batch 8/5575
Epoch 1 | Batch 9/5575
Epoch 1 | Batch 10/5575
Epoch 1 | Batch 11/5575
Epoch 1 | Batch 12/5575
Epoch 1 | Batch 13/5575
Epoch 1 | Batch 14/5575
Epoch 1 | Batch 15/5575
Epoch 1 | Batch 16/5575
Epoch 1 | Batch 17/5575
Epoch 1 | Batch 18/5575
Epoch 1 | Batch 19/5575
Epoch 1 | Batch 20/5575
Epoch 1 | Batch 21/5575
Epoch 1 | Batch 22/5575
Epoch 1 | Batch 23/5575
Epoch 1 | Batch 24/5575
Epoch 1 | Batch 25/5575
Epoch 1 | Batch 26/5575
Epoch 1 | Batch 27/5575
Epoch 1 | Batch 28/5575
Epoch 1 | Batch 29/5575
Epoch 1 | Batch 30/5575
Epoch 1 | Batch 31/5575
Epoch 1 | Batch 32/5575
Epoch 1 | Batch 33/5575
Epoch 1 | Batch 34/5575
Epoch 1 | Batch 35/5575
Epoch 1 | Batch 36/5575
Epoch 1 | Batch 37/5575
Epoch 1 | Batch 38/5575
Epoch 1 | Batch 39/5575
Epoch 1 | Batch 40/5575
Epoch 1 | Batch 41/5575
Epoch 1 | Batch 42/5575
E

In [67]:
# Check sample predictions vs. targets
model.eval()
with torch.no_grad():
    sample_seq, sample_target, sample_name_emb = next(iter(debug_train_loader))
    pred = model(sample_seq, sample_name_emb)
    print("Sample Predictions (First 30 Targets):")
    for i in range(5):
        print(f"  Target {i}: {sample_target[0, i, :3].tolist()}...")
        print(f"  Pred {i}:   {pred[0, i, :3].tolist()}...\n")

# Check cosine similarity
print("Cosine Similarity (should be 0.0-0.3 initially):")
print(F.cosine_similarity(pred.flatten(), sample_target.flatten(), dim=0).item())

Sample Predictions (First 30 Targets):
  Target 0: [-0.037026118487119675, 0.029057607054710388, -0.0015366016887128353]...
  Pred 0:   [-0.9563893675804138, 1.3291209936141968, -0.08672542124986649]...

  Target 1: [0.07062146067619324, -0.06726048141717911, -0.042237602174282074]...
  Pred 1:   [-1.8494404554367065, 1.4275233745574951, 0.15021438896656036]...

  Target 2: [-0.04580943286418915, -0.012545882724225521, 0.07738617062568665]...
  Pred 2:   [0.42305511236190796, -1.9909782409667969, 1.491255521774292]...

  Target 3: [0.7145289778709412, -0.42815670371055603, -0.033558107912540436]...
  Pred 3:   [1.580798864364624, -1.3324084281921387, 2.845816135406494]...

  Target 4: [0.17125454545021057, 0.06785944104194641, 0.7632662057876587]...
  Pred 4:   [1.1306989192962646, 0.4293002486228943, 4.8036932945251465]...

Cosine Similarity (should be 0.0-0.3 initially):
0.8185421228408813


In [45]:
# Check if any target song matches the input songs in the sequence
for seq, target, _ in debug_train_loader:
    for song_in_seq in seq.unbind(dim=1):  # Iterate through input songs
        if torch.allclose(song_in_seq, target, atol=1e-4):
            print("LEAKAGE DETECTED: Target matches an input song!")
            
# Check gate values - they should NOT be near 0.0 or 1.0
print("Gate values (min, max, mean):", 
      torch.sigmoid(model.name_gate.weight).min().item(),
      torch.sigmoid(model.name_gate.weight).max().item(),
      torch.sigmoid(model.name_gate.weight).mean().item())

# Compare predictions to the last input song
last_input_songs = sample_seq[:, -1, :, :]  # Shape: [batch, 19, 384]
similarity = F.cosine_similarity(
    pred.flatten(), 
    last_input_songs.flatten(), 
    dim=0
).item()
print("Prediction vs. Last Input Song Similarity:", similarity)


# Compare predictions to random targets
rand_pred = torch.randn_like(pred)  # Random noise
true_loss = cosine_loss(pred, target).item()
rand_loss = cosine_loss(rand_pred, target).item()
print(f"Model loss: {true_loss:.3f} | Random loss: {rand_loss:.3f}")


sims = F.cosine_similarity(pred, target, dim=-1)  # [batch, 19]
print(f"Feature similarity - Min: {sims.min().item():.3f} | Max: {sims.max().item():.3f}")

Gate values (min, max, mean): 0.4696260094642639 0.5302415490150452 0.5000720620155334
Prediction vs. Last Input Song Similarity: 0.7622901201248169
Model loss: 0.124 | Random loss: 1.000
Feature similarity - Min: 0.379 | Max: 0.996


### Evaluation

In [68]:
def evaluate_model(model, loader, device):
    model.eval()
    total_loss = 0
    feature_cosines = torch.zeros(19, device=device)  # For all 19 features
    
    with torch.no_grad():
        for seq, target, name_emb in loader:
            seq, target, name_emb = seq.to(device), target.to(device), name_emb.to(device)
            pred = model(seq, name_emb)
            
            # Overall loss
            total_loss += cosine_loss(pred, target).item()
            
            # Per-feature metrics
            feature_cosines += F.cosine_similarity(pred, target, dim=-1).mean(0)
    
    metrics = {
        f"feat_{i}": feature_cosines[i].item() / len(loader) 
        for i in range(19)
    }
    return total_loss / len(loader), metrics

# --- Final Test Evaluation ---
model.load_state_dict(torch.load("best_model.pth"))
test_loss, test_metrics = evaluate_model(model, debug_test_loader, config["device"])
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Per-Feature Cosine: {test_metrics}")

# Cleanup
writer.close()

  model.load_state_dict(torch.load("best_model.pth"))



Test Loss: 0.1394
Per-Feature Cosine: {'feat_0': 0.47869903769269084, 'feat_1': 0.4328717737389891, 'feat_2': 0.8418727593133913, 'feat_3': 0.8512045329049129, 'feat_4': 0.9910203562486892, 'feat_5': 0.9312814802131397, 'feat_6': 0.9529970924326238, 'feat_7': 0.8910180418283348, 'feat_8': 0.8976175193018561, 'feat_9': 0.835459254732068, 'feat_10': 0.9293094097367869, 'feat_11': 0.8460709744651845, 'feat_12': 0.9488892011194421, 'feat_13': 0.9055224937080537, 'feat_14': 0.9817418860109061, 'feat_15': 0.9152936487389891, 'feat_16': 0.8749158206402056, 'feat_17': 0.8712543257130873, 'feat_18': 0.9744820818805054}


In [70]:
pred.shape

torch.Size([128, 19, 384])