# Recommender system

__UserID, ProfileID, Rating__

- __UserID__ is the user who provided the rating
- __ProfileID__ is the user who has been rated
- __UserIDs__ range between 1 and 135,359
- __ProfileIDs__ range between 1 and 220,970 (not every profile has been rated)
- __Ratings__ are on a 1-10 scale where 10 is best (integer ratings only)
- Only users who provided at least 20 ratings were included
- Users who provided constant ratings were excluded

In [57]:
import pandas as pd

# Load the ratings table and drop every column whose name starts with
# 'Unnamed'.
df = pd.read_csv("ratings_data.csv")
unnamed_columns = [name for name in df.columns if name.startswith('Unnamed')]
df = df.drop(columns=unnamed_columns)

# Work on a subset only: rows whose UserID is at most 1000.
df = df[df['UserID'] <= 1000]
df.head()

Unnamed: 0,UserID,ProfileID,Ratings
0,1,133,8
1,1,720,6
2,1,971,10
3,1,1095,7
4,1,1616,10


## Data preprocessing

In [63]:
import numpy as np

# Map raw user / profile ids to contiguous 0-based indices. Sorting the ids
# makes the id -> index assignment deterministic from run to run.
users = sorted(set(df['UserID']))
user_id_index = {user_id: index for index, user_id in enumerate(users)}
items = sorted(set(df['ProfileID']))
item_id_index = {item_id: index for index, item_id in enumerate(items)}

# Shuffle the rows. The seed is fixed so that anything derived from the row
# order (e.g. a positional train/validation/test split) is reproducible.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
old_data = df.values

# Columns are read by position: 0 = user id, 1 = profile id, 2 = rating.
# Each row becomes [user index, item index, rating] as integers.
data = np.array(
    [[user_id_index[row[0]], item_id_index[row[1]], int(row[2])] for row in old_data],
    dtype=int,
)

## Building a matrix for our train dataset

In [67]:
# Split the rows of `data` by position: the first `ratio` share is used for
# training and the remainder is halved into validation and test.
ratio = 0.6
num_rows = data.shape[0]
train_end = int(ratio*num_rows)
vali_end = int((ratio+(1-ratio)/2)*num_rows)
train_data, vali_data, test_data = np.split(data, [train_end, vali_end])

# Matrix dimensions: number of distinct users and distinct rated profiles.
NUM_USERS = len(df['UserID'].unique())
NUM_ITEMS = len(df['ProfileID'].unique())
density_pct = len(df)/(NUM_USERS*NUM_ITEMS)*100
print('Dataset density: {}%'.format(round(density_pct, 3)))

# Dense user x item rating matrix filled from the training rows only;
# cells without a training rating stay 0.
R = np.zeros((NUM_USERS, NUM_ITEMS))
for user_idx, item_idx, rating in train_data:
    R[user_idx, item_idx] = float(rating)
R

Dataset density: 0.31%


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Define model and evaluation

In [74]:
def RMSE(preds, truth):
    """Return the root-mean-square error between `preds` and `truth`.

    Both arguments must support element-wise subtraction (e.g. numpy arrays
    of the same shape).
    """
    residual = preds - truth
    mean_squared_error = np.mean(np.square(residual))
    return np.sqrt(mean_squared_error)

class PMF():
    """Matrix factorization of a rating matrix, fit by gradient descent with momentum.

    The non-zero entries of ``R`` are approximated by ``U @ V.T`` where ``U``
    has one latent row per row of ``R`` and ``V`` one latent row per column of
    ``R``. Entries of ``R`` equal to 0 are treated as unobserved and are
    masked out of the loss and the gradients.
    """

    def __init__(self, R, lambda_alpha=1e-2, lambda_beta=1e-2, latent_size=50, momuntum=0.8,
                 lr=0.001, iters=200, seed=None):
        """Store hyperparameters and randomly initialise the latent factors.

        Args:
            R: 2-D rating matrix (rows x columns); 0 marks a missing rating.
            lambda_alpha: L2 regularisation weight applied to ``U``.
            lambda_beta: L2 regularisation weight applied to ``V``.
            latent_size: number of latent dimensions.
            momuntum: momentum coefficient (spelling kept so existing
                keyword callers keep working).
            lr: learning rate.
            iters: maximum number of training iterations.
            seed: seed for the random initialisation of ``U`` and ``V``.
        """
        self.lambda_alpha = lambda_alpha
        self.lambda_beta = lambda_beta
        self.momuntum = momuntum
        # np.asarray returns the same object for an ndarray and additionally
        # accepts nested lists.
        self.R = np.asarray(R)
        # Referenced through `np` so the class does not depend on names
        # imported elsewhere in the notebook.
        self.random_state = np.random.RandomState(seed)
        self.iterations = iters
        self.lr = lr
        # Indicator matrix: 1 where a rating is present (non-zero), else 0.
        self.I = (self.R != 0).astype(self.R.dtype)

        # Latent factors start as uniform samples in [0, 0.1).
        self.U = 0.1*self.random_state.rand(np.size(self.R, 0), latent_size)
        self.V = 0.1*self.random_state.rand(np.size(self.R, 1), latent_size)

    def loss(self):
        """Return the masked squared error plus the L2 penalties on ``U`` and ``V``."""
        squared_error = self.I*(self.R-np.dot(self.U, self.V.T))**2
        penalty_u = self.lambda_alpha*np.sum(np.square(self.U))
        penalty_v = self.lambda_beta*np.sum(np.square(self.V))
        return np.sum(squared_error) + penalty_u + penalty_v

    def predict(self, data):
        """Predict a rating for every row of `data`.

        Args:
            data: 2-D array-like whose first two columns are the row index
                into ``U`` and the row index into ``V``; any further columns
                are ignored.

        Returns:
            1-D array with one prediction per row (empty for empty input).
        """
        data = np.asarray(data)
        if data.size == 0:
            return np.zeros(0)
        user_idx = data[:, 0].astype(int)
        item_idx = data[:, 1].astype(int)
        # Row-wise dot product of the selected latent vectors.
        return np.sum(self.U[user_idx]*self.V[item_idx], 1)

    def train(self, train_data=None, vali_data=None):
        """Run gradient descent with momentum on the matrix given to the constructor.

        Args:
            train_data: unused; the model is fit on ``self.R``. Kept for
                interface compatibility.
            vali_data: optional 2-D array of ``[row index, column index,
                rating]`` rows. When given, the validation RMSE is tracked
                and training stops as soon as it fails to improve. When
                ``None``, all iterations are run.

        Returns:
            Tuple ``(U, V, train_loss_list, vali_rmse_list)``;
            ``vali_rmse_list`` is empty when no validation data is given.
        """
        train_loss_list = []
        vali_rmse_list = []
        last_vali_rmse = None

        if vali_data is not None:
            vali_data = np.asarray(vali_data)

        momuntum_u = np.zeros(self.U.shape)
        momuntum_v = np.zeros(self.V.shape)

        for it in range(self.iterations):
            # Masked residual, computed once and shared by both gradients.
            masked_error = self.I*(self.R-np.dot(self.U, self.V.T))
            grads_u = np.dot(masked_error, -self.V) + self.lambda_alpha*self.U
            grads_v = np.dot(masked_error.T, -self.U) + self.lambda_beta*self.V

            momuntum_u = (self.momuntum * momuntum_u) + self.lr * grads_u
            momuntum_v = (self.momuntum * momuntum_v) + self.lr * grads_v
            self.U = self.U - momuntum_u
            self.V = self.V - momuntum_v

            train_loss = self.loss()
            train_loss_list.append(train_loss)

            if vali_data is None:
                print('traning iteration:{: d} ,loss:{: f}'.format(it, train_loss))
                continue

            vali_preds = self.predict(vali_data)
            vali_rmse = RMSE(vali_data[:, 2], vali_preds)
            vali_rmse_list.append(vali_rmse)

            print('traning iteration:{: d} ,loss:{: f}, vali_rmse:{: f}'.format(it, train_loss, vali_rmse))

            # Compare against None explicitly: a previous RMSE of exactly 0.0
            # is a valid value and must still take part in the comparison.
            if last_vali_rmse is not None and (last_vali_rmse - vali_rmse) <= 0:
                print('convergence at iterations:{: d}'.format(it))
                break
            last_vali_rmse = vali_rmse

        return self.U, self.V, train_loss_list, vali_rmse_list

## Train

In [66]:
from numpy.random import RandomState
import copy

# Hyperparameters for this run.
lambda_alpha = 0.01
lambda_beta = 0.01
latent_size = 20
lr = 3e-5
iters = 1000

model = PMF(
    R=R,
    lambda_alpha=lambda_alpha,
    lambda_beta=lambda_beta,
    latent_size=latent_size,
    momuntum=0.9,
    lr=lr,
    iters=iters,
    seed=1,
)
print('parameters are: ratio={}, reg_u={}, reg_v={}, latent_size={}, lr={}, iters={}'.format(
    ratio, lambda_alpha, lambda_beta, latent_size, lr, iters))

# Fit the model; the validation split is passed to train() alongside the
# training rows.
U, V, train_loss_list, vali_rmse_list = model.train(train_data=train_data, vali_data=vali_data)

# Score the fitted model on the held-out test rows.
print('Testing model.')
preds = model.predict(data=test_data)
test_targets = test_data[:, 2]
test_rmse = RMSE(preds, test_targets)

print('Test RMSE: {}'.format(test_rmse))

Training model.
parameters are: ratio=0.6, reg_u=0.01, reg_v=0.01, latent_size=20, lr=3e-05, iters=1000
traning iteration: 0 ,loss: 3813857.186933, vali_rmse: 6.583944
traning iteration: 1 ,loss: 3804321.630254, vali_rmse: 6.575810
traning iteration: 2 ,loss: 3790722.909650, vali_rmse: 6.564212
traning iteration: 3 ,loss: 3773447.194736, vali_rmse: 6.549491
traning iteration: 4 ,loss: 3752806.358725, vali_rmse: 6.531937
traning iteration: 5 ,loss: 3729031.304247, vali_rmse: 6.511790
traning iteration: 6 ,loss: 3702269.650224, vali_rmse: 6.489237
traning iteration: 7 ,loss: 3672588.647574, vali_rmse: 6.464409
traning iteration: 8 ,loss: 3639983.622504, vali_rmse: 6.437392
traning iteration: 9 ,loss: 3604391.899214, vali_rmse: 6.408221
traning iteration: 10 ,loss: 3565711.937038, vali_rmse: 6.376893
traning iteration: 11 ,loss: 3523827.221521, vali_rmse: 6.343376
traning iteration: 12 ,loss: 3478634.167392, vali_rmse: 6.307618
traning iteration: 13 ,loss: 3430072.835287, vali_rmse: 6.269

traning iteration: 126 ,loss: 405884.393284, vali_rmse: 3.427385
traning iteration: 127 ,loss: 401420.862162, vali_rmse: 3.422237
traning iteration: 128 ,loss: 397054.149508, vali_rmse: 3.417207
traning iteration: 129 ,loss: 392781.605200, vali_rmse: 3.412293
traning iteration: 130 ,loss: 388600.648094, vali_rmse: 3.407492
traning iteration: 131 ,loss: 384508.758455, vali_rmse: 3.402800
traning iteration: 132 ,loss: 380503.475892, vali_rmse: 3.398215
traning iteration: 133 ,loss: 376582.402407, vali_rmse: 3.393733
traning iteration: 134 ,loss: 372743.207258, vali_rmse: 3.389352
traning iteration: 135 ,loss: 368983.629886, vali_rmse: 3.385068
traning iteration: 136 ,loss: 365301.479061, vali_rmse: 3.380878
traning iteration: 137 ,loss: 361694.629086, vali_rmse: 3.376779
traning iteration: 138 ,loss: 358161.015587, vali_rmse: 3.372769
traning iteration: 139 ,loss: 354698.633241, vali_rmse: 3.368844
traning iteration: 140 ,loss: 351305.535987, vali_rmse: 3.365002
traning iteration: 141 ,l

traning iteration: 253 ,loss: 168810.273999, vali_rmse: 3.196720
traning iteration: 254 ,loss: 167978.265262, vali_rmse: 3.196330
traning iteration: 255 ,loss: 167152.995756, vali_rmse: 3.195948
traning iteration: 256 ,loss: 166334.399805, vali_rmse: 3.195575
traning iteration: 257 ,loss: 165522.411972, vali_rmse: 3.195210
traning iteration: 258 ,loss: 164716.967019, vali_rmse: 3.194853
traning iteration: 259 ,loss: 163917.999891, vali_rmse: 3.194504
traning iteration: 260 ,loss: 163125.445718, vali_rmse: 3.194162
traning iteration: 261 ,loss: 162339.239833, vali_rmse: 3.193829
traning iteration: 262 ,loss: 161559.317802, vali_rmse: 3.193503
traning iteration: 263 ,loss: 160785.615471, vali_rmse: 3.193184
traning iteration: 264 ,loss: 160018.069025, vali_rmse: 3.192873
traning iteration: 265 ,loss: 159256.615050, vali_rmse: 3.192569
traning iteration: 266 ,loss: 158501.190606, vali_rmse: 3.192273
traning iteration: 267 ,loss: 157751.733312, vali_rmse: 3.191983
traning iteration: 268 ,l

The RMSE on the validation dataset is approximately the same as the test RMSE, which is good!
Unfortunately, the dataset has a very low density (0.31%) and we used only part of it (the first 1000 user IDs), so the results could likely be improved by training on more of the data.