In [2]:
import pandas as pd
import numpy as np
import random

# Data

## Amazon Fashion

In [3]:
#Full data
# file_name = 'amazon_clothing_shoes_jewelry_data' 

#2m user above 5 ratings
# file_name = 'amazon_csj_2m'

#0.63m user above 5 ratings
# file_name = 'df_amazon_csj_with_styles_0.63m_u_above_5_rui' 

## MovieLens

In [4]:
# Full data
# file_name = '25m_ml'

# 2m subset
# file_name = '2m-ml', 

# 0.7m subset
file_name = 'ml_0.7_u_above_5'

In [19]:
file_names = ['amazon_csj_2m', 'df_amazon_csj_with_styles_0.63m_u_above_5_rui', '2m-ml', 'ml_0.7_u_above_5', 'ml_0.7_u_above_5_3_r_thres', '2m-ml_3_r_thres']

In [254]:
path = 'Data/'
df = pd.read_pickle(path + file_name)
df.head()

Unnamed: 0,user,item,rating,verified
19412305,126018,1247,5.0,1
19670428,127741,27706,3.5,1
23908501,155314,1203,3.5,1
4673811,30576,541,5.0,1
15299153,99133,1220,4.0,1


# Data Prep
Create new ids for users and items that match the row and column indices of the user-item interaction matrix

In [20]:
def transform(df):
    items = df['item'].unique()
    itemsDF = pd.DataFrame(data=items, columns=['original_item_id'])
    itemsDF['new_item_id'] = itemsDF.index

    users = df['user'].unique()
    usersDF = pd.DataFrame(data=users, columns=['original_user_id'])
    usersDF['new_user_id'] = usersDF.index

    ratingDF = df.merge(itemsDF, left_on='item', right_on='original_item_id')
    ratingDF = ratingDF.drop(columns=['original_item_id'])

    ratingDF = ratingDF.merge(usersDF, left_on='user', right_on='original_user_id')
    ratingDF = ratingDF.drop(columns=['original_user_id'])

    df_new_ids = ratingDF
    print('Full data #row: ', df_new_ids.shape[0])
    
    return df_new_ids

# df_new_ids = transform(df)

## Leave item train test split

In [21]:
def leave_x_out(full_data, leave_out):
    # Input: data must be formatted by func: tranfsorm
    # Output: full_data = without all entries in leave one out set
    #         leave_one_out_set = data with one user and one item from full_data
    
    full_data['index'] = full_data.index
    user_items_ind = full_data.groupby('new_user_id')['index'].apply(list)
    index_to_drop = []
    
    for indices in user_items_ind:
        if len(indices) > leave_out:
            for to_leave_out in range(leave_out):
                index = indices[- to_leave_out]
                index_to_drop.append(index)
    
    leave_one_out_set = full_data.loc[index_to_drop]
    full_data_leave_one_out = full_data.drop(index_to_drop)
    
    return full_data_leave_one_out.drop(columns=['index']), leave_one_out_set.drop(columns=['index'])

In [22]:
def create_matrices(data, n_users, n_items):
        r = data['new_user_id']
        c = data['new_item_id']
        d = data['rating']
        m = sparse.csr_matrix((d, (r, c)), shape=(n_users, n_items))
        m_ones = m.copy()
        m_ones[m_ones > 0] = 1
                               
        return m, m_ones

# Random Benchmark

## Random user item dict

In [23]:
def create_random_ui_dict(test_set, rank_at=20):
    users = test_set.new_user_id.unique()
    items = test_set.new_item_id.unique()

    user_item_dict = {}

    random.shuffle(items)
    for u in users:
        item_index = random.randint(rank_at, len(items))
        user_item_dict[u] = items[item_index-rank_at:item_index]        

    return user_item_dict

In [25]:
steps = 5
rank_at = 20
ranks_at = [1] + [i for i in range(steps, rank_at + steps, steps)]
random_results = pd.DataFrame(columns=file_names)

for name in file_names:
    print(name)
    df = pd.read_pickle('Data/' + name)
    df_new_ids = transform(df)
    train_set, test_set = leave_x_out(df_new_ids, 2)
    val_set, test_set = leave_x_out(test_set, 1)
    
    user_items = test_set.groupby('new_user_id')['new_item_id'].apply(list)
    hits = np.zeros((1,5))
    iterations = 10

    for iteration in range(iterations):
        ui_rand_dict = create_random_ui_dict(test_set)
        hitcounts = []
        recs_at = []
        precs_at = []
        for rank in ranks_at:
            hitcount = 0
            for u in test_set.new_user_id.unique():
                for item in user_items[u]:
                    if item in ui_rand_dict[u][:rank]: #for 1 item test sets only
                        hitcount += 1

            hitcounts.append(hitcount)                     
        hits += np.array(hitcounts).T

    avg_hits = hits / iterations
    random_results[name] = avg_hits[0]
    print(avg_hits)

amazon_csj_2m
Full data #row:  2059552
[[  6.3  37.2  75.  113.7 150.4]]
df_amazon_csj_with_styles_0.63m_u_above_5_rui
Full data #row:  629889
[[  5.5  27.2  51.9  74.4 103.4]]
2m-ml
Full data #row:  1974692
[[  74.3  371.2  740.2 1107.5 1478.5]]
ml_0.7_u_above_5
Full data #row:  707447
[[ 43.1 200.  414.2 626.1 841.2]]
ml_0.7_u_above_5_3_r_thres
Full data #row:  574132
[[ 38.8 182.9 365.9 544.7 732.5]]
2m-ml_3_r_thres
Full data #row:  1614609
[[  67.3  330.8  666.8  979.9 1310.2]]


In [None]:
pd.read_pickle('Results/BPR/rand_rank_hits')

In [None]:
random_results['rank_at'] = ranks_at

In [26]:
random_results.to_pickle('Results/BPR/rand_rank_hits')

# Popularity Benchmark
Popularity decides rank of item for everyone

In [27]:
file_names

['amazon_csj_2m',
 'df_amazon_csj_with_styles_0.63m_u_above_5_rui',
 '2m-ml',
 'ml_0.7_u_above_5',
 'ml_0.7_u_above_5_3_r_thres',
 '2m-ml_3_r_thres']

In [29]:
max_rank_at = 20
steps = 5
ranks_at = [1] + [i for i in range(steps, rank_at + steps, steps)]
items_in_test_set = 1
pop_results = pd.DataFrame(columns=file_names)

for name in file_names:
    print('\n', name)
    df = pd.read_pickle('Data/' + name)
    df_new_ids = transform(df)
    df_new_ids['item_counts'] = df_new_ids.groupby('new_item_id')['new_user_id'].transform('count') #for populairty
    train_set, test_set = leave_x_out(df_new_ids, items_in_test_set*2)
    val_set, test_set = leave_x_out(test_set, items_in_test_set)
    
    most_pop_items = test_set.sort_values('item_counts')['new_item_id'].unique()[-max_rank_at:]
    user_items = test_set.groupby('new_user_id')['new_item_id'].apply(list)
    
    hitcounts = []
    for rank in ranks_at:
        hitcount = 0
        for u in test_set.new_user_id.unique():
            for item in user_items[u]:
                if item in most_pop_items[:rank]:
                    hitcount += 1
        print('rank_at', rank, ' hitcount:', hitcount)
        hitcounts.append(hitcount)
    
    pop_results[name] = hitcounts


 amazon_csj_2m
Full data #row:  2059552
rank_at 1  hitcount: 15
rank_at 5  hitcount: 1390
rank_at 10  hitcount: 5840
rank_at 15  hitcount: 6460
rank_at 20  hitcount: 10185

 df_amazon_csj_with_styles_0.63m_u_above_5_rui
Full data #row:  629889
rank_at 1  hitcount: 23
rank_at 5  hitcount: 256
rank_at 10  hitcount: 1314
rank_at 15  hitcount: 3843
rank_at 20  hitcount: 6959

 2m-ml
Full data #row:  1974692
rank_at 1  hitcount: 184
rank_at 5  hitcount: 1900
rank_at 10  hitcount: 6933
rank_at 15  hitcount: 9426
rank_at 20  hitcount: 17776

 ml_0.7_u_above_5
Full data #row:  707447
rank_at 1  hitcount: 14
rank_at 5  hitcount: 1777
rank_at 10  hitcount: 3853
rank_at 15  hitcount: 5296
rank_at 20  hitcount: 7063

 ml_0.7_u_above_5_3_r_thres
Full data #row:  574132
rank_at 1  hitcount: 275
rank_at 5  hitcount: 2690
rank_at 10  hitcount: 4016
rank_at 15  hitcount: 5979
rank_at 20  hitcount: 8040

 2m-ml_3_r_thres
Full data #row:  1614609
rank_at 1  hitcount: 3483
rank_at 5  hitcount: 5015
rank_

In [28]:
pop_results_old = pd.read_pickle('Results/BPR/pop_rank_hits')

FileNotFoundError: [Errno 2] No such file or directory: 'Results/BPR/pop_rank_hits'

In [None]:
pop_results_old['ml_0.7_u_above_5_3_r_thres'] = pop_results['ml_0.7_u_above_5_3_r_thres']

In [30]:
pop_results.to_pickle('Results/BPR/pop_rank_hits')

In [None]:
pop_results_old

In [None]:
len(df)