In [124]:
import pandas as pd
import numpy as np
import random
import time

# Functions Used

## Train Test Split
### Leave last item out of subset of users

In [125]:
def leave_users_out(full_data, leave_out, seed=1234):
    np.random.seed(seed)
    full_data['index'] = full_data.index
    user_index_df = full_data.groupby('user')['index'].apply(list)
    users = np.random.choice(list(user_index_df.index), leave_out, replace=False)
    users_indices = []
    
    for user in users:
        users_indices.extend(user_index_df.loc[user])
    
    sub_set = full_data.loc[users_indices]
    remaining = full_data.drop(users_indices)
    
    return remaining.drop(columns=['index']), sub_set.drop(columns=['index'])

In [126]:
def leave_last_x_out(full_data, n_users, leave_out=1, seed=1234):
    # Input: data must contain user_id
    # Output: full_data = without all last (time order) entries in leave one out set
    #         leave_one_out_set = data with one user and one item from full_data
    np.random.seed(seed)
    
    full_data['index'] = full_data.index
    user_items_ind = full_data.groupby('user_id')['index'].apply(list)
    users = full_data.user_id.unique()
    leave_out_indices = []
    users_picked = []
    
    for i in range(len(full_data.user_id.unique())):
        random_user = np.random.choice(users)
        item_indices = user_items_ind[random_user] # random user's items indices
        if random_user in users_picked or len(item_indices) <= leave_out:
            random_user = np.random.choice(users)
            item_indices = user_items_ind[random_user] # random user's items indices
        else:
            users_picked.append(random_user)
            leave_out_indices.extend(item_indices[-leave_out:])
        
        if len(users_picked) == n_users:
            break
        
    if len(users_picked) < n_users:
        error = 'Cannot pick ' + str(n_users) + ' users with more than ' + str(leave_out) + ' items'
        solution = '\nTry a smaller test and/or validation percentage of the data'
        raise ValueError(error + solution) 
            
    leave_out_set = full_data.loc[leave_out_indices] # the last items of n_users users with n_item > leave_out
    full_data_leave_one_out = full_data.drop(leave_out_indices) # drops last items for n_users users
    
    return full_data_leave_one_out.drop(columns=['index']), leave_out_set.drop(columns=['index'])

In [127]:
def train_val_test_split(df, batch_size, val_perc, test_perc, n_items_val, n_items_test, stats=True):
    # Input: df with user and item id, batch size for CFRNN data, val and test perc of users
    #        number of last items to leave out for val and test set
    # Output:full_data = total users and items of the original df, 
    #        Train, validation and test sets
    
    total_users = len(df.user_id.unique()) # Need all users for BPR
    total_items = len(df.item_id.unique()) # Need all items for CFRNN
    
    users_to_remove = len(df.user_id.unique())%batch_size #Batch size compatible for CFRNN
    df_new, deleted_users = leave_users_out(df, users_to_remove)

    test_users = int(test_perc*total_users / 64 + 1) * 64 # Number of users to be used for testing
    test_last_items = n_items_test # Items to be removed from test users in train set and used in test set

    val_users = int(val_perc*total_users / 64 + 1) * 64
    val_last_items = n_items_val
    
    train_set, test_set = leave_last_x_out(df_new, test_users, test_last_items)
    train_set, val_set = leave_last_x_out(train_set, val_users, val_last_items)
    
    if stats:
        print('Total number of items:', total_items)
        print('Total users:', total_users)
        print('Number of train users:', len(train_set.user_id.unique()))
        print('Number of test users:', test_users)
        print('Number of validation users:', val_users, '\n')
        print('Users deleted:', len(deleted_users.user_id.unique()))
    
    return total_users, total_items, train_set, val_set, test_set

## Get Final Metrics

In [128]:
def get_metrics(ranked_df, steps, max_rank, stats=False):
    s = time.time()
    ranks_at = [1] + [i for i in range(steps, max_rank + steps, steps)]
    hitcounts = []
    recs_at = []
    precs_at = []
    metrics = pd.DataFrame(columns=['rank_at', 'hitcounts', 'recall', 'precision'])
    for rank in ranks_at:
        hitcount = 0
        for i, row in ranked_df.iterrows():
            hitcount +=  len(set(row['true_id']) & set(row['pred_items_ranked'][:rank]))

        prec_at = hitcount / rank / len(ranked_df)
        rec_at = hitcount / len(ranked_df.iloc[0]['true_id']) / len(ranked_df)

        hitcounts.append(hitcount)                     
        recs_at.append(rec_at)
        precs_at.append(prec_at)

    metrics['rank_at'] = ranks_at
    metrics['hitcounts'] = hitcounts
    metrics['recall'] = recs_at
    metrics['precision'] = precs_at
    if stats:
        print('Obtaining metrics time:', round(time.time() - s,2))
    return metrics

## Popularity Benchmark

In [171]:
def get_pop_bench(dataset, train_set, rank_at, steps):
    """
    Creates pop_df (pandas) in which pred_items_ranked will be filled with the most popular items,
    ranked based on their rate count in train_set. True_id in random_df is a list of the items per 
    user from dataset.
    
    Args:
    dataset: the data to create a popularity benchmark with (most likely a test/val set)
    train_set: data on which to base the counts 
    steps: stepsize for the ranking of the results
    rank_at: max rank to produce the metrics on
    """
    most_pop = list(train_set.groupby('item_id')['rating'].count().sort_values(ascending=False)[:20].index)
    users = dataset.user_id.unique()
    pop_df = pd.DataFrame(columns=['pred_items_ranked', 'true_id'], index = users)
    
    for u in users:
        pop_df.loc[u]['pred_items_ranked'] = most_pop
        pop_df.loc[u]['true_id'] = list(dataset[dataset['user_id']==u]['item_id'])
    
    metrics = get_metrics(pop_df, steps, rank_at)
    return metrics

## Random Benchmark

In [172]:
def get_random_bench(dataset, total_items, rank_at, steps):
    """
    Creates random_df (pandas) in which pred_items_ranked will be filled with a randomized subset of 
    the item_id's of size rank_at. true_id in random_df is a list of the items per user from dataset.
    
    Args:
    dataset: the data to create a random benchmark with (most likely a test/val set)
    total_items: all items a user can pick from
    steps: stepsize for the ranking of the results
    rank_at: max rank to produce the metrics on
    """
    users = dataset.user_id.unique()
    random_df = pd.DataFrame(columns=['pred_items_ranked', 'true_id'], index=users)
    
    for u in users:
        random_df.loc[u]['pred_items_ranked'] = np.random.choice(total_items, size=rank_at)
        random_df.loc[u]['true_id'] = list(dataset[dataset['user_id']==u]['item_id'])
        
    metrics = get_metrics(random_df, steps, rank_at)
    return metrics

## Creating Both Benchmark Metrics Function

In [173]:
def create_benchmarks(path, file_name, rank_at, steps, rand_trials, cut_ml_rate): 
    """
    Given the path and file_name, creates a popularity benchmark and a random benchmark, benchmarks show 
    Recall@steps, Precision@steps and Hitcount@steps
    
    Args:
    path: the path where the dataset is located
    file_name: name of file to read (should be a ratings pickle file with cols: item, user, rating)
    steps: stepsize for the ranking of the results
    rank_at: max rank to produce the metrics on
    
    Returns:
    final_random_bench based on rand_trials randomized result metrics
    pop_bench, benchmark based on the number of rating counts of the dataset
    """
    # Read
    df = pd.read_pickle(path + file_name)
    df.head()
    
    # Keep only ratings > 3 for movielens dataset
    if cut_ml_rate and file_name[0] == 'M':
        print('prev length:', len(df))
        df = df[df['rating'] > 3]
        print('new length:', len(df))
    
    # Get new user and item ids
    df['item_id'] = df.item.astype('category').cat.codes
    df['user_id'] = df.user.astype('category').cat.codes

    # Create train test splits
    BATCH_SIZE = 64
    val_perc = test_perc = 0.1
    n_last_items_val = n_last_items_test = 1
    total_users, total_items, train_set, val_set, test_set = \
    train_val_test_split(df, BATCH_SIZE, val_perc, test_perc, n_last_items_val, n_last_items_test)

    # Get random bench, avg of 10 runs
    random_bench = get_random_bench(test_set, len(df.item_id.unique()), 20, 5)
    if rand_trials > 0:
        for i in range(rand_trials - 1):
            random_bench += get_random_bench(test_set, len(df.item_id.unique()), 20, 5)

    final_random_bench = random_bench / trials
    # Get pop bench, based on counts in train set
    pop_bench = get_pop_bench(test_set, train_set, rank_at, steps)
    
    return final_random_bench, pop_bench

# Creating Benchmarks

## Init Path and Names

In [177]:
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'
names_am = ['Amazon_01_users']
names_ml = ['ML_01_users']

## Create all metrics for Benchmarks
- Random
- Popularity

In [179]:
amazon_path = path + 'Data/Amazon/'
ml_path = path + 'Data/ML/'
res_path = path + 'Results/Results_17_04/'

file_paths = [amazon_path]*len(names_am) + [ml_path]*len(names_ml)
file_names = names_am + names_ml

file_paths = [ml_path]*len(names_ml)
file_names = names_ml

rank_at = 20
steps = 5
rand_trials = 10

for file_name, file_path in zip(file_names, file_paths):
    print('\n','='*50)
    print(file_name)
    random_bench, pop_bench = create_benchmarks(file_path, file_name, rank_at, steps, rand_trials, True)
    random_bench.to_pickle(res_path + 'rand_bench_' + file_name + '_rate_above_3')
    pop_bench.to_pickle(res_path + 'pop_bench_' + file_name + '_rate_above_3')
    print('RANDOM\n', random_bench)
    print('POPULAR\n', pop_bench)
    


ML_01_users
prev length: 2446037
new length: 1542930
Total number of items: 21569
Total users: 16241
Number of train users: 16192
Number of test users: 1664
Number of validation users: 1664 

Users deleted: 49
RANDOM
    rank_at  hitcounts    recall  precision
0      1.0        0.0  0.000000   0.000000
1      5.0        0.4  0.000240   0.000048
2     10.0        0.8  0.000481   0.000048
3     15.0        1.0  0.000601   0.000040
4     20.0        1.2  0.000721   0.000036
POPULAR
    rank_at  hitcounts    recall  precision
0        1          4  0.002404   0.002404
1        5         35  0.021034   0.004207
2       10         61  0.036659   0.003666
3       15         92  0.055288   0.003686
4       20        119  0.071514   0.003576
