In [1]:
import utils.dataset_functions as df
import utils.user_features as uf
import utils.two_towers as ttn
import torch
import numpy as np
from pathlib import Path
import pandas as pd
import faiss
 
# Directory that receives the result DataFrames; created up front so later writes succeed.
save_path = Path("results")
save_path.mkdir(parents=True, exist_ok=True)

# Sizes handed to the two-tower model constructors further down.
output_dim, hidden_dim, id_dim = 16, 256, 16

# Use the GPU when torch can see one, otherwise fall back to the CPU.
# NOTE(review): no cell visible in this notebook passes `device` to a model or tensor — confirm
# whether the models are placed on a device inside utils.two_towers.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Training models on:", device)

Training models on: cuda


In [2]:
# User/item interaction table; the "normalized_embed" column is dropped right after reading.
user_item_data = (
    pd.read_csv(Path("dataset", "processed", "merged.csv"), index_col=False)
    .drop(columns=["normalized_embed"])
)

# `songs` holds a pre-computed forward pass over all songs, stored per model under the
# 'binary' and 'continueous' keys.
# NOTE(review): torch.load unpickles the file, so only load artifacts from a trusted source.
songs = torch.load(Path("dataset", "processed", "song_subset.pt"))
binary_embeddings, non_binary_embeddings = songs["binary"], songs["continueous"]

# Build both two-tower models with the shared dimensions, then load their saved weights.
binary_model = ttn.DualAugmentedTwoTower("binary_label", hidden_dim, output_dim, id_dim)
model = binary_model  # second name for the same object, kept from the original assignment
binary_model.load(Path("models", "binary_label.pt"))

non_binary_model = ttn.DualAugmentedTwoTower("continueous_label", hidden_dim, output_dim, id_dim)
non_binary_model.load(Path("models", "continueous_label.pt"))

# Evaluation samples used as the "test set" in the following cells.
# NOTE(review): the split requested here is "val" — confirm that this is the intended split.
# The positional 1 is presumably the batch size (later cells call .item() on each field) — TODO confirm.
test_set = df.load_tensor_dataloader("val", Path("dataset", "processed"), 1, label_id=2)

In [3]:
# Per-user listening history, used in the next step to filter the evaluation samples.
# Maps uid -> list of the distinct item_ids found in the first 70% of that user's rows
# (rows are taken in file order).
# NOTE(review): 0.7 presumably mirrors the train split ratio used when the dataset was built — confirm.
#
# A single groupby pass replaces filtering the whole frame once per user. sort=False keeps the
# users in order of first appearance, and each group keeps its rows in their original order.
# groupby skips rows whose uid is missing.
already_seen = {
    uid: rows["item_id"].iloc[: int(0.7 * len(rows))].unique().tolist()
    for uid, rows in user_item_data.groupby("uid", sort=False)
}


In [4]:
# Build the per-user evaluation state in one pass over the test loader (samples are expected to be
# chronological per user).
#
# * Samples whose song is in already_seen[user] are skipped: such a song has been listened to
#   before, which would mean it was used for training.
# * The first remaining sample of a user is treated as "present time": its user features are
#   embedded with the user tower of each model (user_pass). That sample's own label and
#   interaction are NOT stored in 'song_data'.
# * Every later remaining sample of that user stores (label list, interaction) under its song id.

# Convert each history to a set once, so the per-sample membership test does not scan a list.
seen_lookup = {uid: set(item_ids) for uid, item_ids in already_seen.items()}

users = {}
for user_features, song_embedding, label, interaction, user_id, song_id in test_set:
    # Pull the Python scalars out once instead of calling .item() for every dictionary access.
    uid, sid = user_id.item(), song_id.item()

    if sid in seen_lookup[uid]:
        continue

    profile = users.get(uid)
    if profile is None:
        users[uid] = {
            'user_embed_bin': binary_model.user_pass(user_features, user_id).detach().squeeze(0),
            'user_embed_nbin': non_binary_model.user_pass(user_features, user_id).detach().squeeze(0),
            # NOTE(review): feature index 4 is taken to be the user's interaction ratio — confirm
            # against the feature layout in utils.user_features.
            'user_interact_ratio': user_features.squeeze(0)[4].item(),
            'song_data': {},
        }
    else:
        profile['song_data'][sid] = (label.squeeze(0).tolist(), interaction.item())


# Turn each model's pre-computed song embeddings into one array: row i is the i-th value of the
# embedding dict, in the dict's iteration order.
# NOTE(review): later code looks embeddings up by the ids returned from recommendations(); that
# only lines up if those ids equal the dict keys — confirm in utils.two_towers.
binary_index = np.array([vector.tolist() for vector in binary_embeddings.values()])
non_binary_index = np.array([vector.tolist() for vector in non_binary_embeddings.values()])

# Register the arrays with their models so recommendations can be queried afterwards.
non_binary_model.create_index(non_binary_index)
binary_model.create_index(binary_index)

In [None]:
# For each user we request 'num_rec' recommendations from both models and score every
# recommendation with the dot product of the user embedding and the song embedding.
#
# Each recommendation starts with label -1 and interaction 0, which marks "this user has no
# history with the recommended song in the test set". If a recommended song IS among the user's
# collected test samples, the defaults are overwritten with the recorded label and interaction.


def _score_recommendations(query, rec_ids, song_embeddings, song_data, num_rec):
    """Score one model's recommendations for one user and attach the known ground truth.

    Args:
        query: user embedding with a leading axis added; multiplied element-wise with each
            song embedding.
        rec_ids: ids returned by the model's recommendations() call; used both as keys into
            `song_embeddings` and as keys into `song_data`.
        song_embeddings: mapping from id to that song's embedding.
        song_data: the user's mapping of song id -> (label list, interaction).
        num_rec: number of recommendations requested; fixes the length of the label and
            interaction lists.

    Returns:
        (scores, labels, interactions): `scores` has one entry per id in `rec_ids`; `labels`
        and `interactions` have `num_rec` entries, -1 / 0 where the user has no recorded
        sample for the song.
    """
    scores = [(query * song_embeddings[rec_id]).sum().item() for rec_id in rec_ids]
    labels = [-1] * num_rec
    interactions = [0] * num_rec

    for position, rec_id in enumerate(rec_ids):
        observed = song_data.get(rec_id)
        if observed is not None:
            # observed is (label list, interaction); the second entry of the label list is kept.
            # NOTE(review): confirm which label that entry represents.
            labels[position] = observed[0][1]
            interactions[position] = observed[1]

    return scores, labels, interactions


# Number of recommendations to make
num_rec = 100

total_users = []
total_nbin_song_ids = []
total_nbin_score = []
total_nbin_labels = []
total_nbin_interactions = []
total_bin_song_ids = []
total_bin_score = []
total_bin_labels = []
total_bin_interactions = []

for u, user in users.items():

    # 1. Make binary and non-binary recommendations.
    nbin_query = user["user_embed_nbin"][np.newaxis, :]
    nbin_distances, nbin_indices = non_binary_model.recommendations(nbin_query, num_rec)

    bin_query = user["user_embed_bin"][np.newaxis, :]
    bin_distances, bin_indices = binary_model.recommendations(bin_query, num_rec)

    # 2. + 3. Score them and fill in the real label / interaction wherever the recommended song
    #         is one of this user's collected test samples.
    nbin_score, nbin_labels, nbin_interactions = _score_recommendations(
        nbin_query, nbin_indices, non_binary_embeddings, user["song_data"], num_rec
    )
    bin_score, bin_labels, bin_interactions = _score_recommendations(
        bin_query, bin_indices, binary_embeddings, user["song_data"], num_rec
    )

    # Append this user's results to the flat lists that back the result DataFrames.
    total_users += [u] * num_rec

    # The recommended ids are recorded as well, so each score can be traced back to its song.
    total_nbin_song_ids.extend(nbin_indices)
    total_nbin_score += nbin_score
    total_nbin_labels += nbin_labels
    total_nbin_interactions += nbin_interactions

    total_bin_song_ids.extend(bin_indices)
    total_bin_score += bin_score
    total_bin_labels += bin_labels
    total_bin_interactions += bin_interactions

In [6]:
# Results of the binary model as a table: 'num_rec' rows per user, one per recommendation.
bin_df = pd.DataFrame({
    "uid": total_users,
    "score": total_bin_score,
    "label": total_bin_labels,
    "interaction": total_bin_interactions,
})

bin_df

Unnamed: 0,uid,score,label,interaction
0,4485,0.000000,-1.0,0
1,4485,0.504869,-1.0,0
2,4485,0.397719,-1.0,0
3,4485,0.290814,-1.0,0
4,4485,0.327991,-1.0,0
...,...,...,...,...
484095,2465,-0.145337,-1.0,0
484096,2465,-0.228459,-1.0,0
484097,2465,0.093757,-1.0,0
484098,2465,-0.199833,-1.0,0


In [7]:
# Results of the non-binary model as a table: 'num_rec' rows per user, one per recommendation.
nbin_df = pd.DataFrame({
    "uid": total_users,
    "score": total_nbin_score,
    "label": total_nbin_labels,
    "interaction": total_nbin_interactions,
})

nbin_df

Unnamed: 0,uid,score,label,interaction
0,4485,0.893750,-1.0,0
1,4485,0.824787,-1.0,0
2,4485,0.558120,-1.0,0
3,4485,1.096835,-1.0,0
4,4485,0.000000,-1.0,0
...,...,...,...,...
484095,2465,1.293205,-1.0,0
484096,2465,0.418250,-1.0,0
484097,2465,0.693795,-1.0,0
484098,2465,0.125826,-1.0,0


In [8]:
# Write both result tables into the results directory, leaving the row index out of the files.
bin_df.to_csv(save_path.joinpath("bin_df.csv"), index=False)
nbin_df.to_csv(save_path.joinpath("nbin_df.csv"), index=False)

In [11]:
# Read the saved result tables back from disk (binary first, then non-binary).
bin_df, nbin_df = (
    pd.read_csv(save_path / csv_name, index_col=False)
    for csv_name in ("bin_df.csv", "nbin_df.csv")
)

In [12]:
# Show the binary-model results; pandas abbreviates long frames in its display.
bin_df

Unnamed: 0,uid,score,label,interaction
0,4485,0.000000,-1.0,0
1,4485,0.504869,-1.0,0
2,4485,0.397719,-1.0,0
3,4485,0.290814,-1.0,0
4,4485,0.327991,-1.0,0
...,...,...,...,...
484095,2465,-0.145337,-1.0,0
484096,2465,-0.228459,-1.0,0
484097,2465,0.093757,-1.0,0
484098,2465,-0.199833,-1.0,0


In [10]:
# Show the non-binary-model results; pandas abbreviates long frames in its display.
nbin_df

Unnamed: 0,uid,score,label,interaction
0,4485,0.893750,-1.0,0
1,4485,0.824787,-1.0,0
2,4485,0.558120,-1.0,0
3,4485,1.096835,-1.0,0
4,4485,0.000000,-1.0,0
...,...,...,...,...
484095,2465,1.293205,-1.0,0
484096,2465,0.418250,-1.0,0
484097,2465,0.693795,-1.0,0
484098,2465,0.125826,-1.0,0
