## 0. Imports

In [1]:
import utils.dataset_functions as df
import utils.user_features as uf
import utils.two_towers as ttn
import pandas as pd
import ast
import torch
import numpy as np
from threading import Thread
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm as progress_bar


(Path("dataset") / "processed").mkdir(parents=True, exist_ok=True)
data_dir = Path("dataset") / "unprocessed"
data_dir.mkdir(parents=True, exist_ok=True)

## 1. Download and write locally to CSV's

In [2]:
# Write files locally
dataset_types = ["likes", "listens", "dislikes", "unlikes", "undislikes"]
dataset = df.YambdaDataset('flat', '50m')
for dt in dataset_types:
    df.download_df(dataset=dataset, dataset_type=dt)


if not (data_dir / "embeddings.csv").exists():
    embeddings = dataset.audio_embeddings().to_pandas()
    embeddings.to_csv(data_dir / "embeddings.csv", index=False)
    del embeddings

## 2. Load Dataframes

In [3]:
# If not done before
if (Path("dataset") / "processed" / "merged.csv").exists():
    user_item_data = pd.read_csv(Path("dataset") / "processed" / "merged.csv", index_col=False)
    user_item_data["normalized_embed"] = user_item_data["normalized_embed"].apply(df.parse_embedding)
    user_item_data['labels'] = user_item_data['labels'].apply(ast.literal_eval)
    #user_item_data['labels'] = user_item_data['labels'].apply(lambda l: np.array(l))

else:
    # User like-dislike interactions
    likes = pd.read_csv(data_dir / "likes.csv", usecols=['uid', 'timestamp', 'item_id'])
    dislikes = pd.read_csv(data_dir / "dislikes.csv", usecols=['uid', 'timestamp', 'item_id'])
    unlikes = pd.read_csv(data_dir / "unlikes.csv", usecols=['uid', 'timestamp', 'item_id'])
    undislikes = pd.read_csv(data_dir / "dislikes.csv", usecols=['uid', 'timestamp', 'item_id'])

    # User listen interactions
    listens = pd.read_csv(data_dir / "listens.csv", index_col=False)
    listens.drop(columns=['is_organic'])

    # due to computational limitations, we constrain our dataset to users to have between 500 and 5000 timestamps.
    listens = listens.groupby('uid').filter(lambda x: 500 <= len(x) <= 5000)

    # Embeddings
    embeddings = pd.read_csv(data_dir/'embeddings.csv', usecols=['item_id', 'normalized_embed'], index_col=False)
    embeddings["normalized_embed"] = embeddings["normalized_embed"].apply(df.parse_embedding)

    # Merge the song embeddings and user listens dataset 
    user_item_data = pd.merge(listens, embeddings, on='item_id', how='inner')
    
    
    # save memory
    del listens
    del embeddings

    
    # Determine the labels under different conditions using this function.
    user_item_data[["labels", "net_interactions"]] = user_item_data.apply(
    uf.get_song_label_and_user_interacton,
    axis=1,
    args=(likes, user_item_data, unlikes, dislikes, undislikes),
    result_type="expand") 


    

    # Save our processed dataset.
    user_item_data.to_csv(Path("dataset") / "processed" / "merged.csv", index=False)

    # save memory again
    del likes
    del dislikes
    del unlikes
    del undislikes


user_item_data

Unnamed: 0,uid,timestamp,item_id,is_organic,played_ratio_pct,track_length_seconds,normalized_embed,labels,net_interactions
0,100,39420,8326270,0,100,170,"[-0.169998115, -0.0959603293, 0.0354052303, -0...","[0, 0, 1, 1]",0
1,100,39420,1441281,0,100,105,"[-0.11168661, -0.06717089, -0.01324262, -0.075...","[0, 0, 1, 1]",0
2,100,39625,286361,0,100,185,"[-0.08362152, 0.01492759, 0.04177505, -0.07362...","[0, 0, 1, 1]",0
3,100,40110,732449,0,100,240,"[-0.09272503, 0.00863106, 0.00500664, -0.07165...","[0, 0, 1, 1]",0
4,100,40360,3397170,0,46,130,"[0.00739911, 0.02237171, -0.05895943, -0.04705...","[0, 0, 0, 0]",0
...,...,...,...,...,...,...,...,...,...
295,100,3233610,7371186,1,100,205,"[-0.21943855, -0.0690354, 0.06722355, -0.13852...","[1, 1, 1, 1]",1
296,100,3233800,7239034,1,100,190,"[-0.06149148, 0.05783159, -0.06702066, -0.0260...","[0, 0, 1, 1]",0
297,100,3235445,8661238,1,100,170,"[-0.04264607, -0.04572908, -0.07309788, -0.190...","[0, 1, 1, 1]",0
298,100,3235445,4509561,1,1,230,"[-0.1307608, -0.05088218, 0.00834938, -0.01987...","[0, 1, 0, 0]",0


## 3. Create and save user features
We do this in train/val/test splits

In [5]:
users = user_item_data#['uid'].unique()

__train_val_set, test_set = train_test_split(
    users,
    test_size=0.10,   # 10 % is test data
    random_state=42,        # reproducible shuffling
    shuffle=True
)

train_set, val_set = train_test_split(
    __train_val_set,
    test_size=0.22,   # ~20% validation
    random_state=42,
    shuffle=True
)


print("train:", len(train_set), "users")
print("val  :", len(val_set), "users")
print("test :", len(test_set), "users")

# poorly designed split, but I'm okay with losing a few users (we have more than enough anyway).
# Added + 1 in the indices to ensure no duplicates (still horrible method though).
train_split = len(train_set)//4 

# # Multithread it to make it somewhat time managable
t1 = Thread(target=uf.extract_and_save_features, args=(train_set[0:train_split], user_item_data, 'train', Path("dataset")/ "processed" / "train"))
t2 = Thread(target=uf.extract_and_save_features, args=(train_set[train_split+1:2*train_split], user_item_data, 'train', Path("dataset")/ "processed" / "train"))
t3 = Thread(target=uf.extract_and_save_features, args=(train_set[2*train_split+1:3*train_split], user_item_data, 'train', Path("dataset")/ "processed" / "train"))
t4 = Thread(target=uf.extract_and_save_features, args=(train_set[3*train_split+1:4*train_split], user_item_data, 'train', Path("dataset")/ "processed" / "train"))
t5 = Thread(target=uf.extract_and_save_features, args=(val_set, user_item_data, 'val', Path("dataset")/ "processed" / "val"))
t6 = Thread(target=uf.extract_and_save_features, args=(test_set, user_item_data, 'test', Path("dataset")/ "processed" / "test"))

t1.start()
t2.start()
t3.start()
t4.start()
t5.start()
t6.start()


t1.join()
t2.join()
t3.join()
t4.join()
t5.join()
t6.join()


# free up memory, we don't need this anymore
del user_item_data 

train: 210 users
val  : 60 users
test : 30 users


train:   0%|          | 0/52 [00:00<?, ?it/s]

Exception in thread Thread-4 (extract_and_save_features):
Traceback (most recent call last):
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/home/paul/Documents/uni/ir/TTIR/utils/user_features.py", line 138, in extract_and_save_features
    save_processed_data(user_feats, label_specific_feats, user_ids, song_embeds, song_labels, file=user_file)
  File "/home/paul/Documents/uni/ir/TTIR/utils/dataset_functions.py", line 72, in save_processed_data
    user_feats = torch.from_numpy(np.stack(user_feats)).float().clone()
                                  ^^^^^^^^^^^^^^^^^^^^
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/site-packages

train:   0%|          | 0/51 [00:00<?, ?it/s]

train:   0%|          | 0/51 [00:00<?, ?it/s]

train:   0%|          | 0/51 [00:00<?, ?it/s]

Exception in thread Thread-5 (extract_and_save_features):
Traceback (most recent call last):
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
Exception in thread Thread-6 (extract_and_save_features):
Traceback (most recent call last):
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    self.run()
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
Exception in thread Thread-7 (extract_and_save_features):
Traceback (most recent call last):
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    _threading_Thread_run(self)
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/threading.py", line 982, in run
    _threading_Thread_run(self)
  File 

val:   0%|          | 0/60 [00:00<?, ?it/s]

test:   0%|          | 0/30 [00:00<?, ?it/s]

    self.run()
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    save_processed_data(user_feats, label_specific_feats, user_ids, song_embeds, song_labels, file=user_file)
  File "/home/paul/Documents/uni/ir/TTIR/utils/dataset_functions.py", line 72, in save_processed_data
Exception in thread Thread-8 (extract_and_save_features):
Traceback (most recent call last):
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
Exception in thread Thread-9 (extract_and_save_features):
Traceback (most recent call last):
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    user_feats = torch.from_numpy(np.stack(user_feats)).float().clone()
                                  ^^^^^^^^^^^^^^^^^^^^
  File "/home/paul/miniconda3/envs/uni/lib/python3.11/site-packages/numpy/_core/shape_base.py", line 456, in stack
    self._target(*self._args

### 3.1: merge the seperate user files

In [None]:
for data_type in ['train']:#, 'val', 'test']:
    combined_file = Path("dataset") / "processed" / f'{data_type}.pt' 
    
    if combined_file.exists():
        files = Path("dataset") / "processed" / data_type
        user_feats = []
        label_feats = []
        user_ids = []
        song_embeds = []
        labels = []

        for file in progress_bar(list(files.glob("*.pt")), desc=f"loading and merging {data_type} files"):
            loaded = torch.load(file, map_location="cpu", weights_only=False)
            user_feats.append(loaded["user_feats"])
            label_feats.append(loaded["label_specific_feats"])
            user_ids.append(loaded["user_ids"])
            song_embeds.append(loaded["song_embeds"])
            labels.append(loaded["labels"])

            del loaded
        
        
        uf.save_processed_data(user_feats, label_feats, user_ids, song_embeds, labels, file=combined_file)

    

In [None]:
train_set = df.load_tensor_dataloader("train", Path("dataset") / 'processed', 5, 0)

In [None]:
for user_feats, label_feats, song_embed, label in train_set:
    print(user_feats.shape)
    print(label_feats.shape)
    print(song_embed.shape)
    print(label.shape)
    break

## 4. Training the model

### 4.1 set the parameters

In [None]:
import utils.dataset_functions as df
import utils.user_features as uf
import utils.two_towers as ttn
import pandas as pd

import torch
from threading import Thread
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm as progress_bar



data_dir = Path("dataset") / "unprocessed"
data_dir.mkdir(parents=True, exist_ok=True)

# Input dimensions
                # item_dim + user features
user_dim        = 128 + 6
item_dim        = 128
aug_dim         = 32

# Dimensions of the tower FFN
hidden_dim      = 64
embed_dim       = 32

# lambda1 for loss_u & lambda2 for loss_V
lambda1         = 1
lambda2         = 1

# Training
num_epochs      = 200
learning_rate   = 1e-3
batch_size      = 128
device='cuda' if torch.cuda.is_available() else 'cpu'
print("Training models on:", device)

In [None]:
train_set = df.load_tensor_dataloader("train", Path("dataset"), batch_size, 0)
val_set = df.load_tensor_dataloader("val", Path("dataset"), batch_size, 0)


for user_features, user_id, song_embedding, labels in train_set:
    print(user_features.shape)
    break

### 4.2 Training our models

In [None]:
models = ["interactions_model", "multiple_listens_model", "pct_100_model", "pct_80_model"]

for label_id, model_name in enumerate(models):
    train_set = df.load_tensor_dataloader("train", Path("dataset"), batch_size, label_id)
    val_set = df.load_tensor_dataloader("val", Path("dataset"), batch_size, label_id)


    model = ttn.DualAugmentedTwoTower(model_name, user_dim, item_dim, aug_dim, hidden_dim, embed_dim)
    optimiser = torch.optim.Adam(model.parameters(), lr = learning_rate)
    ttn.train_model(model, train_set, val_set, optimiser, num_epochs=num_epochs, device=device)