In [1]:
import pandas as pd
import numpy as np
import random
import time

# Functions Used

## For Train Test split

In [2]:
def leave_users_out(full_data, leave_out, seed=1234):
    np.random.seed(seed)
    full_data['index'] = full_data.index
    user_index_df = full_data.groupby('user')['index'].apply(list)
    users = np.random.choice(list(user_index_df.index), leave_out, replace=False)
    users_indices = []
    
    for user in users:
        users_indices.extend(user_index_df.loc[user])
    
    sub_set = full_data.loc[users_indices]
    remaining = full_data.drop(users_indices)
    
    return remaining.drop(columns=['index']), sub_set.drop(columns=['index'])

In [20]:
def leave_last_x_out(full_data, n_users, leave_out=1, seed=1234):
    # Input: data must contain user_id
    # Output: full_data = without all last (time order) entries in leave one out set
    #         leave_one_out_set = data with one user and one item from full_data
    np.random.seed(seed)
    
    full_data['index'] = full_data.index
    user_items_ind = full_data.groupby('user_id')['index'].apply(list)
    users = full_data.user_id.unique()
    leave_out_indices = []
    users_picked = []
    
    for i in range(n_users):
        random_user = np.random.choice(users)
        item_indices = user_items_ind[random_user] # random user's items indices
        while random_user not in users_picked and len(item_indices) <= leave_out: # needs to have more items than to leave out, or deleting users
            random_user = np.random.choice(users)
            item_indices = user_items_ind[random_user]
            
        users_picked.append(random_user)
        leave_out_indices.extend(item_indices[-leave_out:])
    
    leave_out_set = full_data.loc[leave_out_indices] # the last items of n_users users with n_item > leave_out
    full_data_leave_one_out = full_data.drop(leave_out_indices) # drops last items for n_users users
    
    return full_data_leave_one_out.drop(columns=['index']), leave_out_set.drop(columns=['index'])

## Get Final Metrics

In [4]:
def get_metrics(ranked_df, steps, max_rank):
    s = time.time()
    ranks_at = [1] + [i for i in range(steps, max_rank + steps, steps)]
    hitcounts = []
    recs_at = []
    precs_at = []
    metrics = pd.DataFrame(columns=['rank_at', 'hitcounts', 'recall', 'precision'])
    for rank in ranks_at:
        hitcount = 0
        for i, row in ranked_df.iterrows():
            hitcount +=  len(set(row['true_id']) & set(row['pred_items_ranked'][:rank]))

        prec_at = hitcount / rank / len(ranked_df)
        rec_at = hitcount / len(ranked_df.iloc[0]['true_id']) / len(ranked_df)

        hitcounts.append(hitcount)                     
        recs_at.append(rec_at)
        precs_at.append(prec_at)

    metrics['rank_at'] = ranks_at
    metrics['hitcounts'] = hitcounts
    metrics['recall'] = recs_at
    metrics['precision'] = precs_at
    print('Obtaining metrics time:', round(time.time() - s,2))
    return metrics

## Popularity Benchmark

In [38]:
def get_pop_bench(dataset, train_set, rank_at, steps):
    counts = train_set.groupby('item_id')['item'].count()
    most_pop_items = counts.sort_values(ascending=False)[:20]
    users = dataset.user_id.unique()
    pop_df = pd.DataFrame(columns=['pred_items_ranked', 'true_id'], index=users)    
    
    for u in users:
        pop_df.loc[u]['pred_items_ranked'] = list(most_pop_items)
        pop_df.loc[u]['true_id'] = list(dataset[dataset['user_id']==u]['item_id'])
    
    metrics = get_metrics(pop_df, steps, rank_at)
    return metrics

## Random Benchmark

In [39]:
def get_random_bench(dataset, total_items, rank_at, steps):
    users = dataset.user_id.unique()
    random_df = pd.DataFrame(columns=['pred_items_ranked', 'true_id'], index=users)
    
    for u in users:
        random_df.loc[u]['pred_items_ranked'] = np.random.choice(total_items, size=rank_at)
        random_df.loc[u]['true_id'] = list(dataset[dataset['user_id']==u]['item_id'])
        
    metrics = get_metrics(random_df, steps, rank_at)
    return metrics

## Creating Both Benchmark Metrics Function

In [40]:
def create_benchmarks(path, file_name, rank_at, steps):   
    # Read
    df = pd.read_pickle(path + file_name)
    df.head()

    df['item_id'] = df.item.astype('category').cat.codes
    df['user_id'] = df.user.astype('category').cat.codes

    # Create train test splits
    BATCH_SIZE = 64

    df_og = df

    users_to_remove = len(df_og.user_id.unique())%BATCH_SIZE #Batch size compatible for CFRNN
    df, deleted_users = leave_users_out(df_og, users_to_remove)

    total_users = len(df_og.user_id.unique()) # Need all users for BPR
    total_items = len(df_og.item_id.unique()) # Need all items for CFRNN

    test_users = int(0.1*total_users) # Number of users to be used for testing
    test_last_items = 1 # Items to be removed from test users in train set and used in test set

    val_users = int(0.1*total_users) -1
    val_last_items = 1

    # Split
    train_set, test_set = leave_last_x_out(df, test_users, test_last_items)
    train_set, val_set = leave_last_x_out(train_set, val_users, val_last_items)

    print('Total number of items:', total_items)
    print('Total users:', total_users)
    print('Number of train users:', len(train_set.user_id.unique()))
    print('Number of test users:', test_users)
    print('Number of validation users:', val_users, '\n')
    print('Users deleted:', len(deleted_users.user_id.unique()))

    # Get random bench
    random_bench = get_random_bench(test_set, df_og.item_id.unique(), rank_at, steps)
    pop_bench = get_pop_bench(test_set, train_set, rank_at, steps)
    
    return random_bench, pop_bench

# Creating Benchmarks

## Init Path and Names

In [41]:
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'
names_am = ['Amazon_01_users', 'Amazon_005_users']
names_ml = ['ML_01_users', 'ML_005_users']

## Create all metrics for Benchmarks
- Random
- Popularity

In [42]:
amazon_path = path + 'Data/Amazon/'
ml_path = path + 'Data/ML/'
res_path = path + 'Results/Benchmarks/'

file_paths = [amazon_path]*len(names_am) + [ml_path]*len(names_ml)
file_names = names_am + names_ml

rank_at = 20
steps = 5

for file_name, file_path in zip(file_names, file_paths):
    print('\n','='*50)
    print(file_name)
    random_bench, pop_bench = create_benchmarks(file_path, file_name, rank_at, steps)
    random_bench.to_pickle(res_path + 'rand_bench_' + file_name)
    pop_bench.to_pickle(res_path + 'pop_bench_' + file_name)
    print('RANDOM\n', random_bench)
    print('POPULAR\n', pop_bench)
    


Amazon_01_users
Total number of items: 247465
Total users: 121372
Number of train users: 121344
Number of test users: 12137
Number of validation users: 12136 

Users deleted: 28
Obtaining metrics time: 10.5
Obtaining metrics time: 9.51
RANDOM
    rank_at  hitcounts    recall  precision
0        1          0  0.000000   0.000000
1        5          1  0.000087   0.000017
2       10          1  0.000087   0.000009
3       15          2  0.000173   0.000012
4       20          2  0.000173   0.000009
POPULAR
    rank_at  hitcounts    recall  precision
0        1          0  0.000000   0.000000
1        5          1  0.000087   0.000017
2       10          1  0.000087   0.000009
3       15          2  0.000173   0.000012
4       20          2  0.000173   0.000009

Amazon_005_users
Total number of items: 176152
Total users: 60686
Number of train users: 60672
Number of test users: 6068
Number of validation users: 6067 

Users deleted: 14
Obtaining metrics time: 4.22
Obtaining metrics time: 4

In [9]:
rank_at = 20
steps = 5

In [21]:
ml_path = path + 'Data/ML/'
df = pd.read_pickle(ml_path + 'ML_full')

In [22]:
df = df.sample(frac=0.3)

In [23]:
len(df)

7500028

In [24]:
df['item_id'] = df.item.astype('category').cat.codes
df['user_id'] = df.user.astype('category').cat.codes

# Create train test splits
BATCH_SIZE = 64

df_og = df

users_to_remove = len(df_og.user_id.unique())%BATCH_SIZE #Batch size compatible for CFRNN
df, deleted_users = leave_users_out(df_og, users_to_remove)

total_users = len(df_og.user_id.unique()) # Need all users for BPR
total_items = len(df_og.item_id.unique()) # Need all items for CFRNN

test_users = int(0.1*total_users) # Number of users to be used for testing
test_last_items = 1 # Items to be removed from test users in train set and used in test set

val_users = int(0.1*total_users) -1
val_last_items = 1

# Split
train_set, test_set = leave_last_x_out(df, test_users, test_last_items)
train_set, val_set = leave_last_x_out(train_set, val_users, val_last_items)

In [31]:
random_df, random_bench = get_random_bench(test_set, df_og.item_id.unique(), rank_at, steps)
pop_df, pop_bench = get_pop_bench(test_set, train_set, rank_at, steps)

Obtaining metrics time: 26.92
Obtaining metrics time: 16.23


In [35]:
pop_df.iloc[0]['pred_items_ranked']

[24358,
 24245,
 23875,
 22141,
 21757,
 20492,
 19188,
 18045,
 17704,
 17423,
 17224,
 17062,
 17013,
 16731,
 16671,
 16384,
 16318,
 15792,
 15505,
 15267]

# Popularity Benchmark SHOULD BE COUNTS OF TRAIN SET
Popularity decides rank of item for everyone

In [None]:
file_names

In [None]:
max_rank_at = 20
steps = 5
ranks_at = [1] + [i for i in range(steps, rank_at + steps, steps)]
items_in_test_set = 1
pop_results = pd.DataFrame(columns=file_names)

for name in file_names:
    print('\n', name)
    df = pd.read_pickle('Data/' + name)
    df_new_ids = transform(df)
    df_new_ids['item_counts'] = df_new_ids.groupby('item_id')['user_id'].transform('count') #for populairty
    train_set, test_set = leave_x_out(df_new_ids, items_in_test_set*2)
    val_set, test_set = leave_x_out(test_set, items_in_test_set)
    
    most_pop_items = test_set.sort_values('item_counts')['item_id'].unique()[-max_rank_at:]
    user_items = test_set.groupby('user_id')['item_id'].apply(list)
    
    hitcounts = []
    for rank in ranks_at:
        hitcount = 0
        for u in test_set.user_id.unique():
            for item in user_items[u]:
                if item in most_pop_items[:rank]:
                    hitcount += 1
        print('rank_at', rank, ' hitcount:', hitcount)
        hitcounts.append(hitcount)
    
    pop_results[name] = hitcounts

In [None]:
pop_results_old = pd.read_pickle('Results/BPR/pop_rank_hits')

In [None]:
pop_results_old['ml_0.7_u_above_5_3_r_thres'] = pop_results['ml_0.7_u_above_5_3_r_thres']

In [None]:
pop_results.to_pickle('Results/BPR/pop_rank_hits')

In [None]:
pop_results_old

In [None]:
len(df)