In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import time
import copy
import calc_vector
import multiprocessing as mp
import random

from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.sparse.linalg import spsolve

pd.set_option('display.max_columns', 100)

In [None]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

# Read Data

## Amazon Fashion

### Full CSJ fashion file

In [None]:
# df = pd.read_pickle('Data/df_amazon_csj_with_styles')
# df.head()

### CSJ 0.63m user above 5, r_u_i

In [None]:
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
# Presumably the Amazon CSJ subset (~0.63m rows, users with more than 5 ratings),
# judging by the file name -- TODO confirm against the script that produced the file.
df = pd.read_pickle('Data/df_amazon_csj_with_styles_0.63m_u_above_5_rui')
df.head()

## MovieLens

### 0.7m user above 5 r_u_i

In [None]:
# df = pd.read_pickle('Data/ml_0.7_u_above_5')
# print('rating interval:', df.rating.unique().min(), ',', df.rating.unique().max())
# df.head()

# Data Exploration

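First, count the ratings per user and per item and compute the sparseness of the user-item matrix (the loaded data has already been filtered to active users with x or more ratings):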

In [None]:
# Ratings given per user and received per item, plus their averages.
user_ratings = df.groupby('user')['rating'].count()
item_ratings = df.groupby('item')['rating'].count()
norpu, norpi = user_ratings.mean(), item_ratings.mean()

# Distinct users / items define the size of the user-item interaction matrix.
total_users = df['user'].unique().size
total_items = df['item'].unique().size

# Fraction of user-item cells that hold no rating (a value in [0, 1], not a percentage).
sparseness = 1 - len(df) / (total_users * total_items)

## Sub Sample

In [None]:
# df = df.sample(frac=0.9, random_state=1234)

In [None]:
# Summary statistics of the ratings frame. norpu, norpi and sparseness are the
# values computed in the "Data Exploration" cell, so they describe the frame as
# it was at that point.
print(
    'rows ', len(df),
    '\n#ratings', len(df[df['rating'] != 0]),
    '\n#ratings/user', round(norpu,2),
    '\n#ratings/item', round(norpi,2),
    '\naverage rating', "{0:.2f}".format(np.average(df['rating'])),
    '\n#users ', df['user'].unique().size,
    '\n#items ', df['item'].unique().size,
    # sparseness is a fraction in [0, 1]; scale it so the '%' label is correct.
    '\nsparse ', round(sparseness * 100, 3), '%',
)

# NOTE(review): the output file names end in "_ml" (MovieLens?) although the
# frame loaded above is the Amazon CSJ file -- confirm the intended names.
df.hist(column='rating', bins=5, grid=False)
plt.title('Rating Distribution')
plt.xlabel('Rating')
plt.xticks(range(1,6))
plt.savefig('Plots/Deliverables/rating_dist_ml')
plt.show()

# Histogram over items: the x-axis is the number of ratings an item received,
# the y-axis is how many items fall into each bin.
plt.hist(item_ratings, bins = 1000)
plt.xlim([0,100])
plt.title('#ratings per item distribution (1000 bins)')
plt.xlabel('#ratings per item')
plt.ylabel('#items')
plt.savefig('Plots/Deliverables/#ratings_per_item_dist_ml')
plt.show()

# Histogram over users: the x-axis is the number of ratings a user gave,
# the y-axis is how many users fall into each bin.
plt.hist(user_ratings, bins = 100)
plt.xlim([0,30])
plt.title('#ratings per user distribution (100 bins)')
plt.xlabel('#ratings per user')
plt.ylabel('#users')
plt.savefig('Plots/Deliverables/#ratings_per_user_dist_ml')
plt.show()

## Verification Analysis
Only keep verified ratings

In [None]:
# Share of verified ratings before filtering.
print('verified:', df['verified'].sum() / len(df))
# Keep only verified ratings. The .copy() makes the result an independent frame,
# so later column assignments on df do not trigger SettingWithCopyWarning.
df = df[df['verified']==True].copy()
# Share of verified ratings after filtering.
print('verified:', df['verified'].sum() / len(df))

In [None]:
# Overwrite the 'verified' column with the constant 1 for every remaining row.
# NOTE(review): presumably so it can serve as a binary interaction indicator -- confirm.
df['verified'] = 1

In [None]:
# r = df['rating']
# df['rating'] = df['verified']

In [None]:
# Keep a handle on the full frame, then continue with a reproducible 10% sample.
df_og = df
df = df_og.sample(frac=0.1, random_state=1234)

# Recount on the sample; these sizes are later passed to the model.
total_users = len(df['user'].unique())
total_items = len(df['item'].unique())

In [None]:
# Preview the first rows of the sampled frame.
df.head()

# Data Prep
Create new ids for users and items that match the row and column indices of the user-item interaction matrix

In [None]:
def transform(df):
    """Attach consecutive integer ids for users and items to a ratings frame.

    Every distinct value of ``df['item']`` / ``df['user']`` gets an id equal to
    its position in order of first appearance (0, 1, 2, ...). The ids are
    merged back onto the ratings as the columns ``new_item_id`` and
    ``new_user_id``; the temporary lookup key columns are dropped again.

    Prints the number of rows of the result and returns the merged frame.
    """
    # Lookup tables: original id -> positional id.
    item_lookup = pd.DataFrame({'original_item_id': df['item'].unique()})
    item_lookup['new_item_id'] = item_lookup.index

    user_lookup = pd.DataFrame({'original_user_id': df['user'].unique()})
    user_lookup['new_user_id'] = user_lookup.index

    # Items first, then users, so the column order of the result is unchanged.
    df_new_ids = (
        df.merge(item_lookup, left_on='item', right_on='original_item_id')
          .drop(columns=['original_item_id'])
          .merge(user_lookup, left_on='user', right_on='original_user_id')
          .drop(columns=['original_user_id'])
    )
    print('Full data #row: ', df_new_ids.shape[0])

    return df_new_ids

# Ratings frame with the extra new_item_id / new_user_id columns added by transform.
df_new_ids = transform(df)

## Leave-one-out train test split

In [None]:
def leave_x_out(full_data, leave_out):
    """Hold out the last ``leave_out`` rows of every sufficiently active user.

    Rows are grouped by ``new_user_id``, so ``full_data`` must already carry
    the ids added by ``transform``. For each user with MORE than ``leave_out``
    rows, that user's final ``leave_out`` rows (in the row order of
    ``full_data``) are moved to the held-out set. Users with ``leave_out`` or
    fewer rows keep all of their rows in the remaining set.

    The input frame is not modified.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Ratings with a ``new_user_id`` column. Index labels are used to select
        rows, so they are expected to be unique.
    leave_out : int
        Number of rows to hold out per user; values <= 0 hold out nothing.

    Returns
    -------
    (remaining, held_out) : tuple of pandas.DataFrame
        ``remaining`` is ``full_data`` without the held-out rows and
        ``held_out`` contains exactly the held-out rows.
    """
    held_out_labels = []

    if leave_out > 0:
        # .groups maps each user id to the index labels of that user's rows,
        # listed in the order in which the rows appear in full_data.
        for labels in full_data.groupby('new_user_id').groups.values():
            if len(labels) > leave_out:
                # Slice from the end: the last `leave_out` labels. (Indexing with
                # `-k` for k starting at 0 would pick the FIRST row for k == 0.)
                held_out_labels.extend(labels[-leave_out:])

    leave_one_out_set = full_data.loc[held_out_labels]
    full_data_leave_one_out = full_data.drop(held_out_labels)

    return full_data_leave_one_out, leave_one_out_set

In [None]:
def create_matrices(data, n_users, n_items):
    """Build the sparse user-item rating matrix and its binary counterpart.

    ``data`` must provide the columns ``new_user_id`` (row index),
    ``new_item_id`` (column index) and ``rating`` (cell value).

    Returns ``(ratings, interactions)``: two CSR matrices of shape
    ``(n_users, n_items)``. ``interactions`` is a copy of ``ratings`` in which
    every entry greater than zero is replaced by 1.
    """
    rows, cols, values = data['new_user_id'], data['new_item_id'], data['rating']
    ratings = sparse.csr_matrix((values, (rows, cols)), shape=(n_users, n_items))

    # Binarise a copy so the rating values themselves stay available.
    interactions = ratings.copy()
    interactions[interactions > 0] = 1

    return ratings, interactions

In [None]:
# Hold out 2 rows for every user that has more than 2 rows; the rest is the training set.
train_set, test_set = leave_x_out(df_new_ids, 2)
# Split the held-out rows again: leave_x_out returns (remaining, held_out), so one
# row per user ends up in test_set and the remaining held-out rows form val_set.
val_set, test_set = leave_x_out(test_set, 1)

# Model

## Bayesian Personalized Ranking
- Paper: https://arxiv.org/pdf/1205.2618.pdf
- Code:  https://github.com/valerystrizh/bpr/blob/master/BPR.java

In [None]:
class BPR():
    """Matrix-factorisation recommender trained with Bayesian Personalized Ranking.

    Training draws (user, positive item, negative item) triples and performs
    stochastic gradient ascent on ``log(sigmoid(score(u, i) - score(u, j)))``
    with L2 regularisation, where ``score`` is the dot product of the user and
    item latent vectors.

    Parameters
    ----------
    total_users, total_items : int
        Number of rows / columns of the user-item matrix. User and item ids in
        the training data must lie in ``[0, total_users)`` / ``[0, total_items)``.
    params : mapping
        Required keys: ``nolf`` (number of latent factors), ``n_iterations``
        (number of passes), ``alpha`` (initial learning rate), ``reg_user`` and
        ``reg_item`` (L2 regularisation strengths).
        Optional key: ``seed`` (seed for the random generator used for
        initialisation and sampling; omitted or ``None`` means unseeded).

    After ``fit``, ``self.model`` holds ``p`` (user factors), ``q`` (item
    factors), ``train_loss`` (loss per iteration) and ``learning_rate``
    (learning rate per iteration).
    """

    def __init__(self, total_users, total_items, params):
        self.total_users = total_users
        self.total_items = total_items
        self.nolf = params['nolf']
        self.n_iterations = params['n_iterations']
        self.alpha = params['alpha']
        self.reg_user = params['reg_user']
        self.reg_item = params['reg_item']
        # Relative step by which update_alpha grows or shrinks the learning rate.
        self.alpha_decay = self.alpha / self.n_iterations
        # Set to True by update_alpha once the loss has increased for the first time.
        self.incr_to_decay = False
        # Private generator so that runs are reproducible when a seed is supplied.
        self.rng = np.random.default_rng(params.get('seed'))
        self.model = {}

    def fit(self, train_set, val_set):
        """Train the latent factors on ``train_set``.

        ``train_set`` must provide the columns ``new_user_id``, ``new_item_id``
        and ``rating``. ``val_set`` is accepted for interface compatibility but
        is not used. Note that ``self.alpha`` is adapted in place during
        training, so a second call to ``fit`` starts from the adapted value.

        Raises
        ------
        ValueError
            If there are training samples but no user has both an observed and
            an unobserved item, i.e. no training triple can be drawn.
        """
        rng = self.rng

        # Init
        p = rng.normal(0, .1, (self.total_users, self.nolf))  # users
        q = rng.normal(0, .1, (self.total_items, self.nolf))  # items

        # One sampled triple per stored user-item entry and iteration.
        _, train_ones = self.create_matrices(train_set)
        n_samples = train_ones.nnz

        # Per user: the observed items as an array (for drawing the positive
        # item) and as a set (for O(1) rejection of negative candidates).
        user_items = train_set.groupby('new_user_id')['new_item_id'].apply(list)
        observed = {}
        for user, items in user_items.items():
            item_array = np.asarray(items)
            item_set = set(item_array.tolist())
            # A user who interacted with every item has no negative item;
            # sampling one for such a user would never terminate.
            if 0 < len(item_set) < self.total_items:
                observed[int(user)] = (item_array, item_set)
        eligible_users = list(observed)

        if n_samples > 0 and not eligible_users:
            raise ValueError(
                'BPR needs at least one user with both an observed and an unobserved item'
            )

        # Loop
        loss_list = []
        alphas = []
        for iteration in range(self.n_iterations):
            it_loss = 0.0
            for _ in range(n_samples):
                u = eligible_users[int(rng.integers(len(eligible_users)))]
                item_array, item_set = observed[u]
                i = int(item_array[int(rng.integers(len(item_array)))])  # pos item

                j = int(rng.integers(self.total_items))  # neg item
                while j in item_set:  # j must be an item the user has not interacted with
                    j = int(rng.integers(self.total_items))

                diff = self.predict(u, i, p, q) - self.predict(u, j, p, q)

                # -log(sigmoid(diff)) written as log(1 + exp(-diff)); this form
                # stays finite for strongly negative diff.
                it_loss += float(np.logaddexp(0.0, -diff))

                diff_deriv = self.sigmoid(-diff)

                # Simultaneous gradient step: all three updates use the user
                # vector as it was before this step.
                p_u = p[u].copy()
                p[u] += self.alpha * (diff_deriv * (q[i] - q[j]) - self.reg_user * p_u)
                q[i] += self.alpha * (diff_deriv * p_u - self.reg_item * q[i])
                q[j] += self.alpha * (-diff_deriv * p_u - self.reg_item * q[j])

                # Regularisation part of the loss, evaluated on the updated vectors.
                it_loss += float(
                    self.reg_user * np.dot(p[u], p[u])
                    + self.reg_item * (np.dot(q[i], q[i]) + np.dot(q[j], q[j]))
                )

            if iteration > 0:
                self.update_alpha(loss_list[-1], it_loss)

            print('iteration:', iteration, 'loss:', round(it_loss,2), 'alpha:', self.alpha)
            alphas.append(self.alpha)
            loss_list.append(it_loss)

        self.model['p'] = p
        self.model['q'] = q
        self.model['train_loss'] = loss_list
        self.model['learning_rate'] = alphas

    def create_matrices(self, data):
        """Return the sparse rating matrix of ``data`` and a binarised copy.

        Both are CSR matrices of shape ``(total_users, total_items)``; in the
        second one every entry greater than zero is replaced by 1.
        """
        r = data['new_user_id']
        c = data['new_item_id']
        d = data['rating']
        m = sparse.csr_matrix((d, (r, c)), shape=(self.total_users, self.total_items))
        m_ones = m.copy()
        m_ones[m_ones > 0] = 1

        return m, m_ones

    def sigmoid(self, x):
        """Logistic function of a scalar, evaluated without overflow.

        For negative ``x`` the equivalent form ``exp(x) / (1 + exp(x))`` is
        used, because ``math.exp(-x)`` raises OverflowError for large ``-x``.
        """
        if x >= 0:
            return 1 / (1 + math.exp(-x))
        z = math.exp(x)
        return z / (1 + z)

    def predict(self, u, i, p, q):
        """Using MF now but can be any prediction algorithm"""
        # Dot product of the latent vectors of user u and item i.
        return np.dot(p[u], q[i])

    def update_alpha(self, last_loss, it_loss):
        """Adapt the learning rate from the change in loss between two iterations.

        * Loss increased: halve ``alpha`` and switch to decay mode for good.
        * Loss did not increase, decay mode: scale ``alpha`` by ``1 - alpha_decay``.
        * Loss did not increase, not yet in decay mode: scale by ``1 + alpha_decay``.
        """
        if last_loss < it_loss:
            self.alpha = 0.5 * self.alpha
            self.incr_to_decay = True
        elif self.incr_to_decay:
            self.alpha = (1 - self.alpha_decay) * self.alpha
        else:
            self.alpha = (1 + self.alpha_decay) * self.alpha

## Params

In [None]:
# Hyper-parameters read by BPR.__init__.
params = {
"nolf":20, # Number of latent factors per user / item vector
"n_iterations":10, # Number of training iterations
"alpha":0.004, # Initial learning rate of the gradient updates (adapted by BPR.update_alpha)
          
# Regularizers, still tweaking the values
"reg_user":0.01,
"reg_item":0.01,
}

# Train Model

In [None]:
# Build the model for the current user / item counts and train it.
# val_set is passed along, but BPR.fit does not read it.
model = BPR(total_users, total_items, params)
model.fit(train_set, val_set)
# Dict filled by fit: 'p', 'q', 'train_loss' and 'learning_rate'.
result = model.model

In [None]:
# Second name for the same result dict that `result` refers to (no copy is made).
res = model.model

In [None]:
# Predicted score for user 0 and item 0: dot product of their latent vectors.
np.dot(res['p'][0], res['q'][0].T)