# Notebook with a basic example of how to run the code for diversification
## Assumes availability of "repro" folder including its contents (pre-computed matrices), that are available from our OSF repository (link in README)

In [27]:
import json
import os
import sys
import random

import numpy as np

%load_ext autoreload
%autoreload 2


# Adjust the path so that we can import packages from src folder (sibling to examples)
sys.path.append("..")

K = 10
SEED = 42

import metrics, diversification

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
random.seed(SEED)
np.random.seed(SEED)

## Path setting, adjust accordingly

In [29]:
# Adjust accordingly
path_to_repro = "./repro"
path_to_data = os.path.join(path_to_repro)

os.listdir(path_to_data)

['item_index_to_id.npy',
 'item_item.npy',
 'movie_data_small.json',
 'rating_matrix.npy']

## Data loading

In [30]:
# Load the item_item matrix for EASE^R algorithm
item_item = np.load(os.path.join(path_to_data, "item_item.npy"))

1. CF-raw-ILD corresponded to intra_list_diversity over distance_matrix_rating.npy (available in OSF), see notes above on CF-ILD
2. CB-plot-ILD corresponded to intra_list_diversity over CLIP extracted features from item plots, see notes above on CB-ILD
3. MD-Genres-BinDiv corresponded to binomial_diversity, see notes above
4. MD-Genres-ILD corresponded to intra_list_diversity evaluated over distance_matrix_genres.npy (available in OSF)
5. CF-latent-ILD corresponded to intra_list_diversity evaluated over distance_matrix_ease.npy (available in OSF)
6. MD-Tags-ILD corresponded to intra_list_diversity evaluated over distance_matrix_tags.npy (available in OSF)

In [31]:

rating_matrix = np.load(os.path.join(path_to_data, "rating_matrix.npy"))

data_small_path = os.path.join(path_to_data, "movie_data_small.json")

with open(data_small_path, "r") as f:
    data_small = json.load(f)
    data_small = {int(k) : v for k,v in data_small.items()}

# Prepare genre data needed for BIN-DIV
all_genres = set()
for m_id, x in data_small.items():
    if x and "genres" in x:
        all_genres.update(x["genres"])

item_index_to_id = np.load(os.path.join(path_to_data, "item_index_to_id.npy"))
        
all_categories = all_genres
item_to_genres = {item_id : x["genres"] if x is not None and "genres" in x else [] for item_id, x in data_small.items()}
get_item_categories = lambda item_idx: item_to_genres[item_index_to_id[item_idx]]

diversity_func = metrics.binomial_diversity(all_categories, get_item_categories, rating_matrix, 0.0, "movielens")
all_items = np.arange(rating_matrix.shape[1])

In [32]:
## NOTE: if you are using binomial diversity, then this might be extremely slow (~minutes) for the first time, before full-cache is built
K = 10
diversity_func(np.random.choice(all_items, K))

np.float64(0.37023154113539514)

In [33]:
ease = diversification.EASER_pretrained(all_items)
ease.load(os.path.join(path_to_data, "item_item.npy"))

<diversification.EASER_pretrained at 0x1ca238377f0>

In [34]:
# Simulate that user selected 20 random items during elicitation
elicitation_selected = np.random.choice(all_items, 20)
rel_scores, user_vector, ease_pred = ease.predict_with_score(elicitation_selected, elicitation_selected, K)

In [35]:
user_vector.sum() # The vector has 1 at positions corresponding to items selected during elicitation

np.float32(20.0)

In [36]:
ease_pred # Recommendation generated by ease

[492, 7795, 3738, 1681, 9664, 7099, 12898, 7937, 300, 2911]

In [37]:
rel_scores # For each item, the estimated relevance

array([-1.6217396 , -0.47954893,  0.6084669 , ..., -0.00264691,
        0.03774971, -0.08172837], shape=(15685,), dtype=float32)

In [38]:
diversity_func(ease_pred)

np.float64(0.45790431443081014)

In [39]:
# Now run the diversification

def relevance_func(top_k):
    return rel_scores[top_k].sum()

print(f"Running diversification BIN-DIV")
print(f"Initial relevance-only recommendation: {np.array(ease_pred)}")
print("")

for alpha in [0.0, 0.5, 1.0]:
    diversified_top_k = diversification.diversify(K, rel_scores, alpha, all_items, relevance_func, diversity_func, rating_row=user_vector, filter_out_items=elicitation_selected, n_items_subset=500)
    print(f"alpha={alpha}, gives: {diversified_top_k}")
    print(f"\twhich has relevance={relevance_func(diversified_top_k)}, and diversity: {diversity_func(diversified_top_k)}")

Running diversification BIN-DIV
Initial relevance-only recommendation: [  492  7795  3738  1681  9664  7099 12898  7937   300  2911]

alpha=0.0, gives: [  492  7795  3738  1681  9664  7099 12898  7937   300  2911]
	which has relevance=51.865089416503906, and diversity: 0.45790431443081014
alpha=0.5, gives: [ 9706  7795  4086 12898   497  1681   492  6343    88  7039]
	which has relevance=43.560516357421875, and diversity: 0.8534637648438821
alpha=1.0, gives: [ 9706  7970 14173 12829 10009  9486  7908 13579  1681  2180]
	which has relevance=12.157386779785156, and diversity: 0.973088997001406
