## 0. Imports

In [None]:
import utils.dataset_functions as df
import utils.user_features as uf
import numpy as np
import pandas as pd
from threading import Thread
from pathlib import Path
from sklearn.model_selection import train_test_split




# Folder that holds the raw CSV exports; created (with parents) if missing.
data_dir = Path("Dataset", "unprocessed")
data_dir.mkdir(exist_ok=True, parents=True)

## 1. Download and write locally to CSV files

In [None]:
# Fetch every interaction table of the flat / 50m Yambda dataset and hand it
# to df.download_df (presumably writes one CSV per type — confirm in utils).
dataset = df.YambdaDataset('flat', '50m')
dataset_types = ["likes", "listens", "dislikes", "unlikes", "undislikes"]
for kind in dataset_types:
    df.download_df(dataset=dataset, dataset_type=kind)

# The audio embeddings are exported only once; an existing file is reused.
embeddings_path = data_dir / "embeddings.csv"
if not embeddings_path.exists():
    # Chain the conversion so no reference to the frame outlives the export.
    dataset.audio_embeddings().to_pandas().to_csv(embeddings_path, index=False)

## 2. Load Dataframes

In [None]:
# Columns read from each like/dislike-style interaction file.
interaction_cols = ['uid', 'timestamp', 'item_id']

# User like-dislike interactions
likes = pd.read_csv(data_dir / "likes.csv", usecols=interaction_cols)
dislikes = pd.read_csv(data_dir / "dislikes.csv", usecols=interaction_cols)
unlikes = pd.read_csv(data_dir / "unlikes.csv", usecols=interaction_cols)
# Fixed: this used to re-read "dislikes.csv", which made `undislikes` a
# silent copy of `dislikes`.
undislikes = pd.read_csv(data_dir / "undislikes.csv", usecols=interaction_cols)

# Embeddings: the CSV stores each vector as text, so parse it back with the
# project helper df.parse_embedding.
embeddings = pd.read_csv(data_dir / 'embeddings.csv', usecols=['item_id', 'normalized_embed'])
embeddings["normalized_embed"] = embeddings["normalized_embed"].apply(df.parse_embedding)

# User listen interactions
listens = pd.read_csv(data_dir / "listens.csv", index_col=False)
# Fixed: DataFrame.drop returns a new frame; without the assignment the
# column was never actually removed.
listens = listens.drop(columns=['is_organic'])
# Keep only users with at least 10 listen interactions.
listens_per_user = listens['uid'].map(listens['uid'].value_counts())
listens = listens[listens_per_user >= 10]
del listens_per_user

# Merge the song embeddings and user listens dataset.
# NOTE(review): how='outer' also keeps embeddings of items nobody listened to
# (rows with a missing uid) and listens without an embedding — confirm this
# is intended rather than how='left' / how='inner'.
user_item_data = pd.merge(listens, embeddings, on='item_id', how='outer')

# Save memory: only the merged frame is kept. `listens` and `embeddings` are
# not available after this point.
del listens
del embeddings

## 3. Create and save user features
Users are split into train/validation/test sets, and features are extracted and saved separately for each split.

In [None]:
# Fixed: `listens` is deleted right after the merge, so reading it here raised
# NameError. Take the user ids from the merged frame instead.
# The outer merge can produce rows with a missing uid (which also upcasts the
# column to float), so drop those and restore an integer dtype
# (assumes uid values are integral — TODO confirm).
# Sorting makes the seeded split independent of the merge's row order.
users = np.sort(user_item_data['uid'].dropna().astype('int64').unique())

# Split on users (not on rows) so each user ends up in exactly one split.
_train_set, test_set = train_test_split(
    users,
    test_size=0.10,    # 10% of all users are held out for testing
    random_state=42,   # reproducible shuffling
    shuffle=True
)

train_set, val_set = train_test_split(
    _train_set,
    test_size=0.22,    # 22% of the remaining 90% ~= 20% of all users
    random_state=42,
    shuffle=True
)


print("train:", len(train_set), "users")
print("val  :", len(val_set), "users")
print("test :", len(test_set), "users")

# The training users are processed as two halves, saved under separate names.
train_split = len(train_set) // 2

# (user ids, output name) for every feature-extraction job.
jobs = [
    (train_set[:train_split], 'train1'),
    (train_set[train_split:], 'train2'),
    (val_set, 'val'),
    (test_set, 'test'),
]

# Multithread it to make it somewhat manageable: one worker thread per job.
# NOTE(review): an exception raised inside a worker does not propagate to
# this cell; check that all expected outputs were written.
threads = [
    Thread(
        target=uf.extract_and_save_features,
        args=(user_ids, user_item_data, likes, unlikes, dislikes, undislikes, name),
    )
    for user_ids, name in jobs
]

for worker in threads:
    worker.start()

# Block until every split has been processed.
for worker in threads:
    worker.join()