## 0. Imports

In [1]:
import utils.dataset_functions as df
import utils.user_features as uf
import utils.two_towers as ttn
import pandas as pd
import torch
from threading import Thread
from pathlib import Path
from tqdm.notebook import tqdm as progress_bar


(Path("dataset") / "processed").mkdir(parents=True, exist_ok=True)
data_dir = Path("dataset") / "unprocessed"
data_dir.mkdir(parents=True, exist_ok=True)

## 1. Download and write locally to CSV's

In [2]:
# Write files locally
dataset_types = ["likes", "listens", "dislikes", "unlikes", "undislikes"]
dataset = df.YambdaDataset('flat', '50m')
for dt in dataset_types:
    df.download_df(dataset=dataset, dataset_type=dt)


if not (data_dir / "embeddings.csv").exists():
    embeddings = dataset.audio_embeddings().to_pandas()
    embeddings.to_csv(data_dir / "embeddings.csv", index=False)
    del embeddings

## 2. Load Dataframes

In [None]:
# If not done before
if (Path("dataset") / "processed" / "merged.csv").exists():
    user_item_data = pd.read_csv(Path("dataset") / "processed" / "merged.csv", index_col=False)
    user_item_data["normalized_embed"] = user_item_data["normalized_embed"].apply(df.parse_embedding)
    # User like-dislike interactions
    likes = pd.read_csv(Path("dataset") / "processed" / "likes.csv", usecols=['uid', 'timestamp', 'item_id'])
    dislikes = pd.read_csv(Path("dataset") / "processed" / "dislikes.csv", usecols=['uid', 'timestamp', 'item_id'])

else:
    # User listen interactions
    listens = pd.read_csv(data_dir / "listens.csv", index_col=False)
    listens[listens["is_organic"] == 1]
    listens.drop("is_organic", axis=1, inplace=True)

    # User like-dislike interactions
    cols = ["uid", "timestamp", "item_id"]


    likes = pd.read_csv(data_dir / "likes.csv", usecols=['uid', 'timestamp', 'item_id'])
    dislikes = pd.read_csv(data_dir / "dislikes.csv", usecols=['uid', 'timestamp', 'item_id'])
    unlikes = pd.read_csv(data_dir / "unlikes.csv", usecols=['uid', 'timestamp', 'item_id'])
    undislikes = pd.read_csv(data_dir / "undislikes.csv", usecols=['uid', 'timestamp', 'item_id'])


    # due to computational limitations, we constrain our dataset to users to have between 500 and 5000 timestamps.
    listens = listens.groupby('uid').filter(lambda x: 100 <= len(x) <=5000)
    # We only take data containing songs that appear at least a 1000 times in the interaction dataset
    cut_off = 500
    counts = listens['item_id'].value_counts()
    temp = []
    for id, count in counts.items():
        if count >= cut_off:
            temp.append(id)
    listens  = listens[listens['item_id'].isin(temp)]
    

    # Embeddings
    embeddings = pd.read_csv(data_dir/'embeddings.csv', usecols=['item_id', 'normalized_embed'], index_col=False)
    embeddings["normalized_embed"] = embeddings["normalized_embed"].apply(df.parse_embedding)

    # Merge the song embeddings and user listens dataset 
    user_item_data = pd.merge(listens, embeddings, on='item_id', how='inner')

    valid_items = user_item_data['item_id'].unique()
    likes  = likes[likes['item_id'].isin(valid_items)]
    dislikes  = dislikes[dislikes['item_id'].isin(valid_items)]
    unlikes  = unlikes[unlikes['item_id'].isin(valid_items)]
    undislikes  = undislikes[undislikes['item_id'].isin(valid_items)]

    # Filter rows in liked that are unliked and rows in disliked that are in undisliked
    likes = likes[~likes.set_index(cols).index.isin(unlikes.set_index(cols).index)].reset_index(drop=True).drop_duplicates()
    dislikes = dislikes[~dislikes.set_index(cols).index.isin(undislikes.set_index(cols).index)].reset_index(drop=True).drop_duplicates()


    # save memory
    del listens
    del embeddings
    del temp
    del unlikes
    del undislikes


    uid_map = {}

    for id, uid in enumerate(user_item_data['uid'].unique()):
        uid_map[uid] = id

    item_id_map = {}
    for id, sid in enumerate(user_item_data['item_id'].unique()):
        item_id_map[sid] = id
    
    
    
    user_item_data['uid'] = user_item_data["uid"].replace(uid_map)
    user_item_data['item_id'] = user_item_data["item_id"].replace(item_id_map)

    likes['uid'] = likes["uid"].replace(uid_map)
    likes['item_id'] = likes["item_id"].replace(item_id_map)

    dislikes['uid'] = dislikes["uid"].replace(uid_map)
    dislikes['item_id'] = dislikes["item_id"].replace(item_id_map)


    # Convert array column to a hashable type
    df = user_item_data.copy()
    df['normalized_embed_tuple'] = df['normalized_embed'].apply(lambda x: tuple(x))

    # Keep only unique rows based on item_id + embed
    df_unique = df.drop_duplicates(subset=['item_id', 'normalized_embed_tuple'])

    # Keep only the original two columns
    df_unique = df_unique[['item_id', 'normalized_embed']]


    # Save our processed dataset.
    user_item_data.to_csv(Path("dataset") / "processed" / "merged.csv", index=False)
    likes.to_csv(Path("dataset") / "processed" / "likes.csv", index=False)
    dislikes.to_csv(Path("dataset") / "processed" / "dislikes.csv", index=False)


user_item_data

Unnamed: 0,uid,timestamp,item_id,played_ratio_pct,track_length_seconds,normalized_embed
0,0,39420,0,100,170,"[-0.169998115, -0.0959603293, 0.0354052303, -0..."
1,0,40380,1,100,205,"[-0.13720872, -0.07012163, -0.05605687, -0.220..."
2,0,40640,2,100,205,"[-0.201428336, -0.0348165734, -0.115619186, -0..."
3,0,41130,3,100,245,"[-0.08191244, -0.0886203, -0.09830238, -0.1605..."
4,0,42115,4,1,200,"[-0.0704478517, -0.0471708378, -0.0264391324, ..."
...,...,...,...,...,...,...
3880035,5425,25961230,979,100,200,"[-0.13856041, 0.07792329, -0.08014846, -0.0198..."
3880036,5425,25961615,1117,99,200,"[-0.1448793, 0.08461289, 0.02028048, 0.0217482..."
3880037,5425,25961805,1054,99,190,"[-0.0609348332, 0.0349645983, -0.0275787699, 0..."
3880038,5425,25962060,1629,100,255,"[0.0730053227, 0.0352250292, 0.0868287894, 0.0..."


In [4]:
likes

Unnamed: 0,uid,timestamp,item_id
0,0,3939350,49
1,0,6903355,52
2,0,10314390,63
3,0,23471875,171
4,0,23540595,93
...,...,...,...
247535,5424,19821670,1228
247536,5424,24634975,11
247537,5424,25710740,2924
247538,5425,14299485,1354


In [5]:
dislikes

Unnamed: 0,uid,timestamp,item_id
0,400,5519800,253
1,800,5668430,2522
2,800,10649375,1290
3,800,12673195,145
4,800,13626180,628
...,...,...,...
28328,5420,13508230,2294
28329,5420,13893340,495
28330,5422,21879830,1154
28331,5422,21880010,489


## 3. Create and save user features
We do this in train/val/test splits

In [4]:
users = user_item_data['uid'].unique()

# It is HIGHLY recommended to use more than 1 thread per set
# You can split the data equally over threads like so:

todo_users = [u for u in users if not (Path("dataset")/ "processed" / "users"/ f"{u}.pt").exists()]
num_threads = 14
k, m = divmod(len(todo_users), num_threads)
user_split = [todo_users[i*k + min(i, m) : (i+1)*k + min(i+1, m)] for i in range(num_threads)]



# Multithread it to make it somewhat time managable
t1 = Thread(target=uf.extract_and_save_features, args=(user_split[0], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t2 = Thread(target=uf.extract_and_save_features, args=(user_split[1], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t3 = Thread(target=uf.extract_and_save_features, args=(user_split[2], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t4 = Thread(target=uf.extract_and_save_features, args=(user_split[3], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t5 = Thread(target=uf.extract_and_save_features, args=(user_split[4], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t6 = Thread(target=uf.extract_and_save_features, args=(user_split[5], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t7 = Thread(target=uf.extract_and_save_features, args=(user_split[6], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t8 = Thread(target=uf.extract_and_save_features, args=(user_split[7], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t9 = Thread(target=uf.extract_and_save_features, args=(user_split[8], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t10 = Thread(target=uf.extract_and_save_features, args=(user_split[9], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t11 = Thread(target=uf.extract_and_save_features, args=(user_split[10], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t12 = Thread(target=uf.extract_and_save_features, args=(user_split[11], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t13 = Thread(target=uf.extract_and_save_features, args=(user_split[12], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))
t14 = Thread(target=uf.extract_and_save_features, args=(user_split[13], Path("dataset")/ "processed" / "users", user_item_data, likes, dislikes))



t1.start()
t2.start()
t3.start()
t4.start()
t5.start()
t6.start()
t7.start()
t8.start()
t9.start()
t10.start()
t11.start()
t12.start()
t13.start()
t14.start()



t1.join()
t2.join()
t3.join()
t4.join()
t5.join()
t6.join()
t7.join()
t8.join()
t9.join()
t10.join()
t11.join()
t12.join()
t13.join()
t14.join()


  0%|          | 0/387 [00:00<?, ?it/s]

  0%|          | 0/388 [00:00<?, ?it/s]

  0%|          | 0/388 [00:00<?, ?it/s]

  0%|          | 0/388 [00:00<?, ?it/s]

  0%|          | 0/387 [00:00<?, ?it/s]

  0%|          | 0/388 [00:00<?, ?it/s]

  0%|          | 0/388 [00:00<?, ?it/s]

  0%|          | 0/387 [00:00<?, ?it/s]

  0%|          | 0/388 [00:00<?, ?it/s]

  0%|          | 0/388 [00:00<?, ?it/s]

  0%|          | 0/387 [00:00<?, ?it/s]

  0%|          | 0/387 [00:00<?, ?it/s]

  0%|          | 0/387 [00:00<?, ?it/s]

  0%|          | 0/388 [00:00<?, ?it/s]

In [5]:
# free up memory, we don't need this anymore
del user_item_data 
del likes
del dislikes

NameError: name 'user_item_data' is not defined

* 1.1 Kijk of we goeie split krijgen op deze manier
* 1.2 Anderrs maak weer 70% 15% 15% split (van git pakken)

* 2.0 Hertrain bin_model

* evaluate zie whatsapp


### 3.1: merge the seperate user files

In [4]:
files = Path("dataset") / "processed" / "users"
train = []
val = []
test = []


user_feats = []
ids = []
songids = []
embeddings = []
labels = []
interactions = []


files = list(files.glob("*.pt"))

for file in progress_bar(files, desc="Processing users"):
    data = torch.load(file, map_location="cpu")
    f = data['user_feats']         # shape: [N, F]
    uid = data['user_ids']        # shape: [N]
    sid = data['song_ids']        # shape: [N]
    em = data['song_embeds']       # shape: [N, E]
    lb = data['labels']            # shape: [N, L]
    it = data['interactions'] # shape: [N]


    # Slice per user
    user_feats.append(f)
    ids.append(uid)
    songids.append(sid)
    embeddings.append(em)
    labels.append(lb)
    interactions.append(it)


user_feats = torch.cat(user_feats, dim=0)
ids = torch.cat(ids, dim=0)
songids = torch.cat(songids, dim=0)
embeddings = torch.cat(embeddings, dim=0)
labels = torch.cat(labels, dim=0)
interactions = torch.cat(interactions, dim=0)

Processing users:   0%|          | 0/5426 [00:00<?, ?it/s]

In [5]:
# Boolean mask
train_mask = interactions == 0        # True where interaction > 0
valtest_mask = ~train_mask                 # interaction <= 0

# Train 
train_ufs = user_feats[train_mask]
train_uid = ids[train_mask]
train_sid = songids[train_mask]
train_emb = embeddings[train_mask]
train_lab = labels[train_mask]
train_int = interactions[train_mask]

# Non-positive interactions
val_sample_count = 629926//2

val_ufs = user_feats[valtest_mask][:val_sample_count]
val_uid = ids[valtest_mask][:val_sample_count]
val_sid = songids[valtest_mask][:val_sample_count]
val_emb = embeddings[valtest_mask][:val_sample_count]
val_lab = labels[valtest_mask][:val_sample_count]
val_int = interactions[valtest_mask][:val_sample_count]


test_ufs = user_feats[valtest_mask][val_sample_count:]
test_uid = ids[valtest_mask][val_sample_count:]
test_sid = songids[valtest_mask][val_sample_count:]
test_emb = embeddings[valtest_mask][val_sample_count:]
test_lab = labels[valtest_mask][val_sample_count:]
test_int = interactions[valtest_mask][val_sample_count:]

combined_ufs = user_feats[valtest_mask]
combined_uid = ids[valtest_mask]
combined_sid = songids[valtest_mask]
combined_emb = embeddings[valtest_mask]
combined_lab = labels[valtest_mask]
combined_int = interactions[valtest_mask]

print("Number of train samples:", train_ufs.shape[0])
print("Number of val samples:", val_ufs.shape[0])
print("Number of test samples:", test_ufs.shape[0])


torch.save({
    "user_feats":           train_ufs,
    "user_ids":             train_uid,
    "song_ids":             train_sid,
    "song_embeds":          train_emb,
    "labels":               train_lab,
    "interactions":         train_int
}, Path("dataset") / "processed"/ 'train.pt')

torch.save({
    "user_feats":           val_ufs,
    "user_ids":             val_uid,
    "song_ids":             val_sid,
    "song_embeds":          val_emb,
    "labels":               val_lab,
    "interactions":         val_int
}, Path("dataset") / "processed"/ 'val.pt')

torch.save({
    "user_feats":           test_ufs,
    "user_ids":             test_uid,
    "song_ids":             test_sid,
    "song_embeds":          test_emb,
    "labels":               test_lab,
    "interactions":         test_int
}, Path("dataset") / "processed"/ 'test.pt')

torch.save({
    "user_feats":           combined_ufs,
    "user_ids":             combined_uid,
    "song_ids":             combined_sid,
    "song_embeds":          combined_emb,
    "labels":               combined_lab,
    "interactions":         combined_int
}, Path("dataset") / "processed"/ 'combined.pt')



Number of train samples: 3142861
Number of val samples: 314963
Number of test samples: 314963


## 4. Training the model

### 4.1 set the parameters

In [23]:
import utils.dataset_functions as df
import utils.user_features as uf
import utils.two_towers as ttn
import pandas as pd
import torch
from torch.nn import MSELoss
from torch.nn.functional import binary_cross_entropy
from threading import Thread
from pathlib import Path
from tqdm.notebook import tqdm as progress_bar


(Path("dataset") / "processed").mkdir(parents=True, exist_ok=True)
data_dir = Path("dataset") / "unprocessed"
data_dir.mkdir(parents=True, exist_ok=True)
# Dimensions of the tower FFN
output_dim      = 16
hidden_dim      = 256
id_dim          = 16


# Training
num_epochs      = 60
learning_rate   = 1e-3
batch_size      = 256
patience        = 20
device='cuda' if torch.cuda.is_available() else 'cpu'
print("Training models on:", device)


Training models on: cuda


### 4.2 Training our models

In [24]:
models = ["binary_label", "continueous_label"]

for label_id, model_name in enumerate(models):
    train_set = df.load_tensor_dataloader("train", Path("dataset")/"processed", batch_size, label_id)
    val_set = df.load_tensor_dataloader("val", Path("dataset")/"processed", batch_size, label_id)


    model = ttn.DualAugmentedTwoTower(model_name, hidden_dim, output_dim, id_dim)
    optimiser = torch.optim.Adam(model.parameters(), lr = learning_rate)
    if label_id == 1:
        loss = MSELoss()
    else:
        loss = binary_cross_entropy

    ttn.train_model(model, train_set, val_set, optimiser, patience, num_epochs=50, device=device, loss_function=loss)



Epoch: [31/50]
early stopped
