In [1]:
import pandas as pd
import numpy as np
import time 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
import timeit

In [2]:
# Load the ratings table from the local copy of the dataset.
ratings_csv = '../data/ml-latest-small/ratings.csv'
data = pd.read_csv(ratings_csv)

In [3]:
# Hold out 20% of the rating rows for evaluation.
# A fixed random_state keeps the partition identical across kernel restarts,
# so downstream numbers can be reproduced with Restart & Run All.
train, test = train_test_split(data, test_size = 0.2, random_state = 42)

In [4]:
def sgd_bias(data, train, test, f = 10, lam = 0.3, lrate = 0.01, epoch = 10, stopping_deriv = 0.01):
    """Fit a biased matrix-factorisation rating model with stochastic gradient descent.

    The predicted rating for user u and movie i is
    ``total_mean + user_bias[u] + item_bias[i] + q[:, i] . p[:, u]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'userId', 'movieId' and 'rating' columns. It defines the
        set of users and movies, the global mean, and the initial biases
        (per-user / per-movie mean rating minus the global mean).
        NOTE(review): the initial biases are computed from ``data``, not from
        ``train``; if ``data`` also contains the held-out rows this leaks test
        information into the initialisation -- confirm this is intended.
    train : array-like
        Converted with ``np.array``; columns 0, 1, 2 are read as userId,
        movieId and rating respectively. Every id must also occur in ``data``.
    test : any
        Not used by this function; kept so existing callers keep working.
    f : int
        Number of latent factors.
    lam : float
        L2 regularisation weight.
    lrate : float
        SGD learning rate.
    epoch : int
        Number of passes over ``train`` (rows are reshuffled on every pass).
    stopping_deriv : float
        A parameter is updated only when its gradient exceeds this magnitude
        (for the factor vectors: when at least one component does).

    Returns
    -------
    p : numpy.ndarray, shape (f, U)
        User factors; columns follow ascending userId.
    q : numpy.ndarray, shape (f, I)
        Movie factors; columns follow the order in which each movieId first
        appears in ``data``.
    r_ij : numpy.ndarray, shape (I, U)
        Full prediction matrix; rows use the same order as the columns of
        ``q`` and columns the same order as the columns of ``p``.

    Raises
    ------
    KeyError
        If ``train`` contains a userId or movieId that is absent from ``data``.
    """
    # User layout: ascending userId. For ids that are exactly 1..U this is the
    # same position as the former ``userId - 1`` shortcut, but it also works
    # for sparse or shifted id ranges.
    user_mean = data.groupby('userId')['rating'].mean()
    user_pos = {uid: pos for pos, uid in enumerate(user_mean.index.tolist())}

    # Movie layout: order of first appearance in ``data``.
    movie_ids = data['movieId'].unique().tolist()
    movie_pos = {mid: pos for pos, mid in enumerate(movie_ids)}
    # groupby sorts by movieId, so realign the means to the layout used for q;
    # otherwise biases and factors of different movies would be combined.
    item_mean = data.groupby('movieId')['rating'].mean().reindex(movie_ids)

    U = len(user_pos)
    I = len(movie_pos)

    p = np.random.randn(f, U)
    q = np.random.randn(f, I)

    total_mean = data['rating'].mean()
    user_bias = user_mean.to_numpy() - total_mean
    item_bias = item_mean.to_numpy() - total_mean

    train_data = np.array(train)
    sample_index = list(range(train_data.shape[0]))

    for _ in range(epoch):
        random.shuffle(sample_index)
        for index in sample_index:
            u = user_pos[int(train_data[index, 0])]
            i = movie_pos[int(train_data[index, 1])]
            r_ui = train_data[index, 2]
            bias_u = user_bias[u]
            bias_i = item_bias[i]
            e_ui = r_ui - total_mean - bias_u - bias_i - np.dot(q[:, i], p[:, u])

            # Movie-factor step. The previous guard was a generator expression,
            # which is always truthy, so the threshold was never applied.
            grad_q = e_ui * p[:, u] - lam * q[:, i]
            if np.any(np.abs(grad_q) > stopping_deriv):
                q[:, i] = q[:, i] + lrate * grad_q

            # User-factor step; evaluated after the movie step, so it reads the
            # refreshed q column.
            grad_p = e_ui * q[:, i] - lam * p[:, u]
            if np.any(np.abs(grad_p) > stopping_deriv):
                p[:, u] = p[:, u] + lrate * grad_p

            grad_user_bias = e_ui - lam * bias_u
            if np.abs(grad_user_bias) > stopping_deriv:
                user_bias[u] = bias_u + lrate * grad_user_bias

            grad_item_bias = e_ui - lam * bias_i
            if np.abs(grad_item_bias) > stopping_deriv:
                item_bias[i] = bias_i + lrate * grad_item_bias

    # Rows = movies, columns = users; biases are broadcast along the matching axis.
    r_ij = total_mean + item_bias[:, np.newaxis] + user_bias[np.newaxis, :] + np.dot(q.T, p)
    return p, q, r_ij

In [5]:
# sgd_bias draws its initial factors from np.random and shuffles rows with the
# stdlib random module; seed both so the fitted model is reproducible.
# Seeding happens before the timer starts so it is not part of the measurement.
np.random.seed(42)
random.seed(42)

start1 = timeit.default_timer()
p,q,r_ij = sgd_bias(data, train, test)
stop1 = timeit.default_timer()

print('Running Time for SGD+R1+R2: ', stop1 - start1, 's')

Running Time for SGD+R1+R2:  17.993965900000003 s


In [6]:
def RMSE(rating, est_rating):
    """Root-mean-square error between observed and estimated ratings.

    Parameters
    ----------
    rating : 2-D array
        One row per observation; column 0 is read as the user id, column 1 as
        the item id (both truncated with ``int``) and column 2 as the rating.
    est_rating : object indexable as ``est_rating[item, user]``
        Source of the estimated rating for each observation.
        NOTE(review): the raw ids from ``rating`` are used directly as
        indices -- confirm the caller passes an estimate table laid out that way.

    Returns
    -------
    float
        ``sqrt(mean((observed - estimated) ** 2))`` over all rows.
    """
    n_obs = rating.shape[0]
    squared_residuals = [
        (rating[k, 2] - est_rating[int(rating[k, 1]), int(rating[k, 0])]) ** 2
        for k in range(n_obs)
    ]
    return np.sqrt(np.mean(squared_residuals))

In [7]:
# Wrap the prediction matrix in a DataFrame (default integer row/column labels).
est_rating = pd.DataFrame(r_ij)