In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import time
import copy
import calc_vector
import multiprocessing as mp

from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.sparse.linalg import spsolve

pd.set_option('display.max_columns', 100)

In [2]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

# Read Data

## Amazon Fashion

### Full CSJ fashion file

In [3]:
# df = pd.read_pickle('Data/df_amazon_csj_with_styles')
# df.head()

### CSJ 0.63m user above 5, r_u_i

In [4]:
# Load the pre-filtered Amazon CSJ ratings frame.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that come
# from a trusted source.
df = pd.read_pickle('Data/df_amazon_csj_with_styles_0.63m_u_above_5_rui')
# Preview the first rows.
df.head()

Unnamed: 0,user,item,rating,verified
0,A2OLY7TMIYHOQQ,B00EAKJUUW,5.0,True
1,A3F6ZP5VM8QUC6,B00D98EGE6,5.0,True
2,A21PFJA2O7Z5GY,B01DTEXSHA,2.0,True
3,AV9HIUYXBZODJ,B0045DBUBQ,3.0,True
4,A73X3PFCRTJVX,B00DEWBMU8,5.0,True


## MovieLens

### 0.7m user above 5 r_u_i

In [5]:
# df = pd.read_pickle('Data/ml_0.7_u_above_5')
# print('rating interval:', df.rating.unique().min(), ',', df.rating.unique().max())
# df.head()

# Data Exploration

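First, count the ratings per user and per item (the basis for filtering active users and rated items with x or more ratings) and compute the sparseness of the user-item matrix: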

In [6]:
# Number of ratings per user and per item, together with their averages.
user_ratings = df.groupby('user')['rating'].count()
item_ratings = df.groupby('item')['rating'].count()
norpu, norpi = user_ratings.mean(), item_ratings.mean()

# Distinct users / items, and the share of empty cells in the user-item matrix.
total_users = len(df['user'].unique())
total_items = len(df['item'].unique())
sparseness = 1 - len(df) / (total_users * total_items)

## Sub Sample

In [7]:
# df = df.sample(frac=0.9, random_state=1234)

In [8]:
# print('rows ', len(df), '\n#ratings', len(df[df['rating'] != 0]), '\n#ratings/user', round(norpu,2), '\n#ratings/item', round(norpi,2), '\naverage rating', "{0:.2f}".format(np.average(df['rating'])), '\n#users ', df['user'].unique().size, '\n#items ', df['item'].unique().size, '\nsparse ', round(sparseness,5), '%')
# df.hist(column='rating', bins=5, grid=False)
# plt.title('Rating Distribution')
# plt.xlabel('Rating')
# plt.xticks(range(1,6))
# plt.savefig('Plots/Deliverables/rating_dist_ml')
# plt.show()

# plt.hist(item_ratings, bins = 1000)
# plt.xlim([0,100])
# plt.title('#ratings per item distribution (1000 bins)')
# plt.xlabel('Items')
# plt.ylabel('Count')
# plt.savefig('Plots/Deliverables/#ratings_per_item_dist_ml')
# plt.show()

# plt.hist(user_ratings, bins = 100)
# plt.xlim([0,30])
# plt.title('#ratings per user distribution (100 bins)')
# plt.xlabel('Users')
# plt.ylabel('Count')
# plt.savefig('Plots/Deliverables/#ratings_per_user_dist_ml')
# plt.show()

## Verification Analysis
Only keep verified ratings

In [9]:
# Share of verified ratings before filtering.
print('verified:', df['verified'].sum() / len(df))
# Keep only the verified ratings. The explicit copy makes `df` an independent frame,
# so later column assignments on it do not trigger pandas' SettingWithCopyWarning.
df = df[df['verified']==True].copy()
# Sanity check: after filtering the share has to be 1.0.
print('verified:', df['verified'].sum() / len(df))

verified: 0.9118257343754217
verified: 1.0


In [10]:
# Every remaining row is verified (non-verified rows were dropped in the previous
# cell), so the boolean flag is replaced by the constant integer 1.
df['verified'] = 1

In [11]:
# r = df['rating']
# df['rating'] = df['verified']

In [12]:
# Keep a reference to the full verified frame, then continue with a tiny,
# reproducible sample (0.01 % of the rows, fixed seed).
df_og = df
df = df_og.sample(frac=0.0001, random_state=1234)

# Matrix dimensions have to match the sampled data.
total_users = len(df['user'].unique())
total_items = len(df['item'].unique())

In [13]:
df.head()

Unnamed: 0,user,item,rating,verified
24161,ABMRK1O13436Y,B00JBJCLG2,5.0,1
396411,A36IZ3GX3RI33W,B00VWKWWMG,5.0,1
426457,AZWGNYTKSK9JS,B000KPXYCG,5.0,1
587848,AQLT5XRHA958G,B000CEM6FG,5.0,1
198173,A31IV9AHRLVY3O,B0077BX3GI,5.0,1


# Models

## Direct Implicit Feedback Implementation
Slow and inaccurate; the cells below are kept commented out for reference only.

In [14]:
# def create_matrix(data, n_users, n_items):
#         r = data['new_user_id']
#         c = data['new_item_id']
#         d = data['rating']
#         train_matrix = sparse.coo_matrix((d, (r, c)), shape=(n_users, n_items))
    
#         return train_matrix.tocsr()

In [15]:
# #init
# m_train = create_matrix(train_set, total_users, total_items)
# m_val = create_matrix(train_set, total_users, total_items)
# #
# p = np.random.normal(0, .1, (total_users, params['nolf']))  # users
# q = np.random.normal(0, .1, (total_items, params['nolf']))  # items

In [16]:
# # iterate
# s = time.time()
# errors = []
# for e in range(params['n_epochs']):
#     for u in range(total_users):
#         for i in range(total_items):
#             b_ui = m_train[u,i]
#             error = b_ui - np.dot(p[u], q[i])

#             #update
#             p[u] += params['alpha'] * (error * q[i])# - self.pu_reg * p[u])
#             q[i] += params['alpha'] * (error * p[u])# - self.qi_reg * q[i])
#             errors.append(np.square(error))
        
# t = time.time() - s
# print(t)

In [17]:
# t/60

## ALS Implicit Feedback Implementation with Confidence
- From Paper: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.167.5120&rep=rep1&type=pdf
- Code Found: https://github.com/MrChrisJohnson/implicit-mf/blob/master/mf.py

In [18]:
# Hyper-parameters consumed by ALS_impl_fb.
params = dict(
    nolf=20,      # size of the latent feature vectors
    n_epochs=10,  # number of training epochs
    alpha=5,      # impact of confidence
    reg=0.001,    # regulariser, value still being tweaked
)

In [37]:
#init
class ALS_impl_fb():
    """Implicit-feedback matrix factorisation solved with alternating least squares.

    The per-epoch solves are delegated to ``calc_vector.calc_vector``, which is run
    in separate processes and is expected to store its result in a shared
    manager dict under the key it is given ('p' for users, 'q' for items).

    Parameters
    ----------
    total_users, total_items : int
        Number of rows / columns of the user-item interaction matrix.
    params : dict
        Hyper-parameters; the keys 'nolf', 'n_epochs', 'alpha' and 'reg' are read.
    """

    def __init__(self, total_users, total_items, params):
        self.total_users = total_users
        self.total_items = total_items
        self.nolf = params['nolf']
        self.n_epochs = params['n_epochs']
        self.alpha = params['alpha']
        self.reg = params['reg']
        # Filled by fit() with the keys 'p', 'q' and 'time'.
        self.model = {}

    def fit(self, train_set, val_set):
        """Learn the user and item factor matrices.

        Parameters
        ----------
        train_set, val_set : pandas.DataFrame
            Frames with the columns 'new_user_id', 'new_item_id' and 'rating'.
            ``val_set`` is only converted to a matrix; the validation step itself
            is currently disabled.

        Returns
        -------
        dict
            ``self.model`` with the user factors under 'p', the item factors
            under 'q' and the training time in seconds under 'time'.

        Raises
        ------
        RuntimeError
            If one of the worker processes exits with a non-zero exit code.
        """
        m_train, m_ones_train = self.create_matrices(train_set, self.total_users, self.total_items)
        # Only needed by the (currently disabled) validation step at the end.
        m_val, m_ones_val = self.create_matrices(val_set, self.total_users, self.total_items)

        # Size the factors from the instance, not from notebook-level globals.
        p = np.random.normal(0, .1, (self.total_users, self.nolf))  # users
        q = np.random.normal(0, .1, (self.total_items, self.nolf))  # items

        # Snapshots of the initial factors; the per-epoch diagnostics below are
        # differences against these initial values.
        p_old = p.copy()
        q_old = q.copy()
        self_items = {'total_users': self.total_users, 'total_items': self.total_items,
                      'alpha': self.alpha, 'reg': self.reg, 'nolf': self.nolf}

        # The context manager shuts the manager's server process down once the
        # final factors have been copied out of the shared dict.
        with mp.Manager() as manager:
            res = manager.dict([('p', p), ('q', q)])
            s = time.time()
            for _ in range(self.n_epochs):
                # NOTE(review): both jobs are created from the factors of the
                # previous epoch and run concurrently, so the user and item solves
                # are not strictly alternating -- confirm this is intended.
                user_job = mp.Process(
                    target=calc_vector.calc_vector,
                    args=(True, sparse.csr_matrix(res['q']), m_train, m_ones_train, res, 'p', self_items))
                item_job = mp.Process(
                    target=calc_vector.calc_vector,
                    args=(False, sparse.csr_matrix(res['p']), m_train, m_ones_train, res, 'q', self_items))

                user_job.start()
                print('solving for user vectors')
                item_job.start()
                print('solving for item vectors')

                user_job.join()
                item_job.join()
                # A crashed worker would otherwise leave stale factors in `res`.
                for job_name, job in (('user', user_job), ('item', item_job)):
                    if job.exitcode != 0:
                        raise RuntimeError(
                            '{} worker exited with code {}'.format(job_name, job.exitcode))

                print('p', np.sum(p_old - res['p']))
                print('q', np.sum(q_old - res['q']))
                # Debug value that the worker may or may not publish.
                if 'test' in res:
                    print(res['test'])

            t = time.time() - s
            # Copy the results out while the manager is still alive.
            p = res['p']
            q = res['q']

        print('Total time:', t)
        self.model['p'] = p
        self.model['q'] = q
        self.model['time'] = t

#         self.evaluate(m_ones_val, val=True)
        return self.model

    def create_matrices(self, data, n_users, n_items):
        """Build the sparse rating matrix and its binarised counterpart.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame with the columns 'new_user_id', 'new_item_id' and 'rating'.
        n_users, n_items : int
            Shape of the resulting matrices.

        Returns
        -------
        tuple
            ``(m, m_ones)``: CSR matrix of ratings, and a copy in which every
            stored value greater than zero is replaced by 1.
        """
        r = data['new_user_id']
        c = data['new_item_id']
        d = data['rating']
        m = sparse.csr_matrix((d, (r, c)), shape=(n_users, n_items))
        m_ones = m.copy()
        # Only stored entries can be positive, so editing .data is sufficient.
        m_ones.data[m_ones.data > 0] = 1

        return m, m_ones

    def evaluate(self, data, val):
        """Root-mean-squared error of the reconstruction over every matrix cell.

        Parameters
        ----------
        data : scipy sparse matrix or numpy.ndarray
            Target matrix with one row per user and one column per item.
        val : bool
            Accepted for interface compatibility; not used.

        Returns
        -------
        float
            ``sqrt(mean((data[u, i] - p[u] . q[i]) ** 2))``; 0.0 for an empty matrix.
        """
        # assumes the stored factors are dense arrays of shape (n, nolf) -- TODO confirm
        p = np.asarray(self.model['p'])
        q = np.asarray(self.model['q'])
        n_users, n_items = data.shape
        n_cells = n_users * n_items
        if n_cells == 0:
            return 0.0

        sq_error = 0.0
        for u in range(n_users):
            row = data[u]
            observed = row.toarray().ravel() if sparse.issparse(row) else np.asarray(row).ravel()
            # Predictions of user u for all items at once: q[i] . p[u] for every i.
            predicted = np.dot(q[:n_items], p[u])
            sq_error += np.sum(np.square(observed - predicted))

        rmse = np.sqrt(sq_error / n_cells)

        return rmse

# Data Prep
Create new ids for users and items that match the row and column indices of the user-item interaction matrix

In [38]:
def transform(df):
    """Attach consecutive integer ids for users and items to a ratings frame.

    Each distinct value of ``df['item']`` / ``df['user']`` gets the position of its
    first occurrence among the unique values as ``new_item_id`` / ``new_user_id``.
    The number of resulting rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings frame with at least the columns 'user' and 'item'.

    Returns
    -------
    pandas.DataFrame
        The input rows with the extra columns 'new_item_id' and 'new_user_id'.
    """
    # Lookup tables: original id -> running index.
    item_lookup = pd.DataFrame(data=df['item'].unique(), columns=['original_item_id'])
    item_lookup['new_item_id'] = item_lookup.index

    user_lookup = pd.DataFrame(data=df['user'].unique(), columns=['original_user_id'])
    user_lookup['new_user_id'] = user_lookup.index

    # Join both lookups onto the ratings and drop the helper key columns again.
    df_new_ids = (
        df.merge(item_lookup, left_on='item', right_on='original_item_id')
        .drop(columns=['original_item_id'])
        .merge(user_lookup, left_on='user', right_on='original_user_id')
        .drop(columns=['original_user_id'])
    )
    print('Full data #row: ', df_new_ids.shape[0])

    return df_new_ids

# Add the integer ids (new_user_id / new_item_id) that serve as row and column
# indices of the sparse interaction matrices.
df_new_ids = transform(df)

Full data #row:  57


## Train Test split
Train 0.8, validation 0.1, test 0.1: the 20% hold-out is split in half into a validation set and a test set.

In [39]:
random_state = 1234

# 80 % of the rows go to training; the remaining 20 % is halved into a
# validation and a test set, always with the same seed.
train_set, test_set = train_test_split(
    df_new_ids, test_size=0.20, random_state=random_state, shuffle=True
)
val_set, test_set = train_test_split(
    test_set, test_size=0.50, random_state=random_state, shuffle=True
)

print('Size of train set: ', len(train_set))
print('Size of validation set: ', len(val_set))
print('Size of test set: ', len(test_set))

Size of train set:  45
Size of validation set:  6
Size of test set:  6


# Train Model

In [40]:
# Project-local helper used only by this smoke test.
# NOTE(review): consider moving this import to the import cell at the top.
import worker

# Smoke test for the multiprocessing set-up: two worker processes each write one
# value into a dict shared through a manager.
if __name__ == '__main__':
    # Create the manager inside the guard and as a context manager, so its server
    # process is shut down as soon as the results have been printed.
    with mp.Manager() as manager:
        res = manager.dict([('x1', 0), ('x2', 0)])
        user_job = mp.Process(target=worker.worker, args=(1, 'x1', res))
        item_job = mp.Process(target=worker.worker, args=(2, 'x2', res))

        user_job.start()
        print('solving for user vectors')
        item_job.start()
        print('solving for item vectors')

        user_job.join()
        item_job.join()
        print(res['x1'], res['x2'])

solving for user vectors
solving for item vectors
1 4


In [41]:
# Fit the ALS model on the training split. fit() returns a dict with the learned
# user factors under 'p', the item factors under 'q' and the run time under 'time'.
model = ALS_impl_fb(total_users, total_items, params)
results = model.fit(train_set, val_set)

solving for user vectors
solving for item vectors
p 4.343974563958723
q 1.8207970694697098
10
solving for user vectors
solving for item vectors
p 2.4113212314136057
q -0.5329522997048056
10
solving for user vectors
solving for item vectors
p 4.621371619471093
q 1.7096724106778138
10
solving for user vectors
solving for item vectors
p 2.324573277305411
q -0.4747176432695819
10
solving for user vectors
solving for item vectors
p 4.669185392501221
q 1.5495860229997054
10
solving for user vectors
solving for item vectors
p 2.299261266398002
q -0.4684401769521378
10
solving for user vectors
solving for item vectors
p 4.670849374274292
q 1.4976544649467045
10
solving for user vectors
solving for item vectors
p 2.294387055225246
q -0.46815757799711033
10
solving for user vectors
solving for item vectors
p 4.664996340704826
q 1.4840854956133
10
solving for user vectors
solving for item vectors
p 2.2951937461423983
q -0.468502736505068
10
Total time: 4.025933027267456


In [42]:
def evaluate(data, val, model):
    """Root-mean-squared error of the reconstruction over every matrix cell.

    Parameters
    ----------
    data : scipy sparse matrix or numpy.ndarray
        Target matrix with one row per user and one column per item.
    val : bool
        Accepted for interface compatibility; not used.
    model : dict
        Holds the user factors under 'p' and the item factors under 'q'.

    Returns
    -------
    float
        ``sqrt(mean((data[u, i] - p[u] . q[i]) ** 2))``; 0.0 for an empty matrix.
    """
    # assumes the factors are dense arrays of shape (n, n_factors) -- TODO confirm
    p = np.asarray(model['p'])
    q = np.asarray(model['q'])
    n_users, n_items = data.shape
    n_cells = n_users * n_items
    if n_cells == 0:
        # Matches the result the zero-user case always produced and avoids a
        # division by zero when there are no item columns.
        return 0.0

    sq_error = 0.0
    for u in range(n_users):
        # One vectorised row at a time: avoids per-cell indexing into the sparse
        # matrix without materialising the full dense prediction matrix.
        row = data[u]
        observed = row.toarray().ravel() if sparse.issparse(row) else np.asarray(row).ravel()
        predicted = np.dot(q[:n_items], p[u])
        sq_error += np.sum(np.square(observed - predicted))

    rmse = np.sqrt(sq_error / n_cells)

    return rmse

In [43]:
def create_matrices(data, n_users, n_items):
    """Build the sparse rating matrix and its binarised counterpart.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns 'new_user_id', 'new_item_id' and 'rating'.
    n_users, n_items : int
        Shape of the resulting matrices.

    Returns
    -------
    tuple
        ``(m, m_ones)``: CSR matrix of ratings, and a copy in which every stored
        value greater than zero is replaced by 1.
    """
    coordinates = (data['new_user_id'], data['new_item_id'])
    m = sparse.csr_matrix((data['rating'], coordinates), shape=(n_users, n_items))

    m_ones = m.copy()
    # Only stored entries can be positive, so rewriting them in .data is enough.
    positive = m_ones.data > 0
    m_ones.data[positive] = 1

    return m, m_ones

In [48]:
# Score the fitted factors against the binarised validation matrix. The second
# argument (val) is accepted by evaluate() but not used by it.
m_val, m_ones_val = create_matrices(val_set, total_users, total_items)
evaluate(m_ones_val, True, results)

0.06611126332603486