In [67]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
from matplotlib import pyplot as plt

In [24]:
header = ['itemmean', 'usermean', 'itemid', 'rating', 'userid', ]
pdf = pd.read_csv('./csv/Pre_27k.csv', sep = ',', names = header)

In [25]:
pdf

Unnamed: 0,itemmean,usermean,itemid,rating,userid
0,5.00000,5.0,26,5.0,584
1,1.00000,1.0,29,1.0,986
2,5.00000,5.0,474,5.0,181
3,4.47619,4.0,65,4.0,1038
4,4.47619,5.0,65,5.0,909
5,4.47619,5.0,65,5.0,271
6,4.47619,3.0,65,3.0,865
7,4.47619,5.0,65,5.0,901
8,4.47619,5.0,65,5.0,250
9,4.47619,5.0,65,5.0,1155


In [26]:
n_users = pdf.userid.unique().shape[0]
n_items = pdf.itemid.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)

Number of users = 1156 | Number of movies = 695


In [56]:
data_matrix = np.zeros((n_users, n_items))
usermean = np.zeros(n_users)
itemmean = np.zeros(n_items)
for line in pdf.itertuples():
    data_matrix[line[5] - 1, line[3] - 1] = line[4]
    usermean[line[5] - 1] = line[2]
    itemmean[line[3] - 1] = line[1]

In [30]:
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [31]:
for i in range(n_users):
    user_similarity[i][i] = -1
    
for i in range(n_items):
    item_similarity[i][i] = -1

In [85]:
k_user = 20
k_item = 20
alpha_list = [0.0015, 0.002, 0.0025]
beta_list = [0.0015, 0.002, 0.0025] 

In [33]:
user_kNN = np.argsort(user_similarity, axis = 1)[:, -k_user:]
item_kNN = np.argsort(item_similarity, axis = 1)[:, -k_item:]

In [61]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [86]:
rmse_record = np.zeros((len(alpha_list), len(beta_list)))
abs_record = np.zeros((len(alpha_list), len(beta_list)))
for ai in range(len(alpha_list)):
    for bi in range(len(beta_list)):
        alpha = alpha_list[ai]
        beta = beta_list[bi]
        prediction = np.zeros((n_users, n_items))
        for i in range(n_users):
            for j in range(n_items):
                prediction[i][j] = (usermean[i] + itemmean[j]) / 2.0
                for k in range(k_user):
                    prediction[i][j] = prediction[i][j] + \
                    + alpha * (user_similarity[i][k] * (data_matrix[k, j] - itemmean[j])) 
                
                for k in range(k_item):
                    prediction[i][j] = prediction[i][j] + \
                    + beta * (user_similarity[i][k] * (data_matrix[k, j] - itemmean[j])) 
                    
        rmse_record[ai][bi] = rmse(prediction, data_matrix)
        abs_record[ai][bi] = abs(rmse_record[ai][bi] - 0.5)
        print 'RMSE for alpha = ' + str(alpha) + ', beta = ' + str(beta) + ' = ' + str(rmse_record[ai][bi])

RMSE for alpha = 0.0015, beta = 0.0015 = 0.443923325065
RMSE for alpha = 0.0015, beta = 0.002 = 0.470380049007
RMSE for alpha = 0.0015, beta = 0.0025 = 0.499148112264
RMSE for alpha = 0.002, beta = 0.0015 = 0.470380049007
RMSE for alpha = 0.002, beta = 0.002 = 0.499148112264
RMSE for alpha = 0.002, beta = 0.0025 = 0.529851168683
RMSE for alpha = 0.0025, beta = 0.0015 = 0.499148112264
RMSE for alpha = 0.0025, beta = 0.002 = 0.529851168683
RMSE for alpha = 0.0025, beta = 0.0025 = 0.562172268473


In [87]:
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(alpha_list, abs_record.T[0])
title = 'beta = ' + str(beta_list[0])
plt.title(title)
plt.subplot(212)
plt.plot(alpha_list, rmse_record.T[0])
plt.title(title)
filename = 'b' + str(beta_list[0]) + '.png'
plt.savefig(filename)
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(alpha_list, abs_record.T[1])
title = 'beta = ' + str(beta_list[1])
plt.title(title)
plt.subplot(212)
plt.plot(alpha_list, rmse_record.T[1])
plt.title(title)
filename = 'b' + str(beta_list[1]) + '.png'
plt.savefig(filename)
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(alpha_list, abs_record.T[2])
title = 'beta = ' + str(beta_list[2])
plt.title(title)
plt.subplot(212)
plt.plot(alpha_list, rmse_record.T[2])
plt.title(title)
filename = 'b' + str(beta_list[2]) + '.png'
plt.savefig(filename)
'''
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(alpha_list, abs_record.T[3])
title = 'beta = ' + str(beta_list[3])
plt.title(title)
plt.subplot(212)
plt.plot(alpha_list, rmse_record.T[3])
plt.title(title)
filename = 'b' + str(beta_list[3]) + '.png'
plt.savefig(filename)
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(alpha_list, abs_record.T[4])
title = 'beta = ' + str(beta_list[4])
plt.title(title)
plt.subplot(212)
plt.plot(alpha_list, rmse_record.T[4])
plt.title(title)
filename = 'b' + str(beta_list[4]) + '.png'
plt.savefig(filename)
'''
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(beta_list, abs_record[0])
title = 'alpha = ' + str(alpha_list[0])
plt.title(title)
plt.subplot(212)
plt.plot(beta_list, rmse_record[0])
plt.title(title)
filename = 'a' + str(beta_list[0]) + '.png'
plt.title(title)
plt.savefig(filename)
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(beta_list, abs_record[1])
title = 'alpha = ' + str(alpha_list[1])
plt.title(title)
plt.subplot(212)
plt.plot(beta_list, rmse_record[1])
plt.title(title)
filename = 'a' + str(beta_list[1]) + '.png'
plt.title(title)
plt.savefig(filename)
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(beta_list, abs_record[2])
title = 'alpha = ' + str(alpha_list[2])
plt.title(title)
plt.subplot(212)
plt.plot(beta_list, rmse_record[2])
plt.title(title)
filename = 'a' + str(beta_list[2]) + '.png'
plt.title(title)
plt.savefig(filename)
'''
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(beta_list, abs_record[3])
title = 'alpha = ' + str(alpha_list[3])
plt.title(title)
plt.subplot(212)
plt.plot(beta_list, rmse_record[3])
plt.title(title)
filename = 'a' + str(beta_list[3]) + '.png'
plt.title(title)
plt.savefig(filename)
plt.figure(figsize = (5, 8))
plt.subplot(211)
plt.plot(beta_list, abs_record[4])
title = 'alpha = ' + str(alpha_list[4])
plt.title(title)
plt.subplot(212)
plt.plot(beta_list, rmse_record[4])
plt.title(title)
filename = 'a' + str(beta_list[4]) + '.png'
plt.title(title)
plt.savefig(filename)
'''


"\nplt.figure(figsize = (5, 8))\nplt.subplot(211)\nplt.plot(beta_list, abs_record[3])\ntitle = 'alpha = ' + str(alpha_list[3])\nplt.title(title)\nplt.subplot(212)\nplt.plot(beta_list, rmse_record[3])\nplt.title(title)\nfilename = 'a' + str(beta_list[3]) + '.png'\nplt.title(title)\nplt.savefig(filename)\nplt.figure(figsize = (5, 8))\nplt.subplot(211)\nplt.plot(beta_list, abs_record[4])\ntitle = 'alpha = ' + str(alpha_list[4])\nplt.title(title)\nplt.subplot(212)\nplt.plot(beta_list, rmse_record[4])\nplt.title(title)\nfilename = 'a' + str(beta_list[4]) + '.png'\nplt.title(title)\nplt.savefig(filename)\n"

In [60]:
prediction

array([[ 3.435 ,  4.05  ,  3.23  , ...,  3.23  ,  4.05  ,  4.05  ],
       [ 3.935 ,  4.55  ,  3.73  , ...,  3.73  ,  4.55  ,  4.55  ],
       [ 3.435 ,  4.05  ,  3.23  , ...,  3.23  ,  4.05  ,  4.05  ],
       ..., 
       [ 3.9   ,  4.5   ,  3.7   , ...,  3.7   ,  4.5   ,  4.5   ],
       [ 3.9175,  4.525 ,  3.715 , ...,  3.715 ,  4.525 ,  4.525 ],
       [ 3.4   ,  4.    ,  3.2   , ...,  3.2   ,  4.    ,  4.    ]])

In [51]:
usermean

array([ 4.33333333,  5.        ,  4.47619048, ...,  5.        ,
        4.47619048,  4.33333333])

In [57]:
itemmean[0]

3.5