# 协同过滤算法(item-item, user-item)

# load data

In [1]:
import pandas as pd

In [2]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=header)
print(df.head(5))
print(len(df))

n_users = df.user_id.unique().shape[0]  
n_items = df.item_id.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))
#split
from sklearn import model_selection as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
100000
Number of users = 943 | Number of movies = 1682


In [3]:
import numpy as np

In [4]:
train_data_matrix = np.zeros((n_users, n_items))
print(train_data_matrix.shape)
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

(943, 1682)


In [5]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
print(user_similarity.shape)
print(item_similarity.shape)

(943, 943)
(1682, 1682)


# predict

In [6]:
mean_matrix=train_data_matrix.mean(axis=1)
mean_matrix_T=mean_matrix[:, np.newaxis]
mean_matrix_T.shape
print('mean_matrix_T.shape',mean_matrix_T.shape)
p=train_data_matrix-mean_matrix_T
p_0=p[:,2][:,np.newaxis]
print('p_0.shape',p_0.shape)
p_1=(train_data_matrix[:,2][:,np.newaxis]-mean_matrix_T)
print('train_data_matrix[:,2].shape',train_data_matrix[:,2][:,np.newaxis].shape)
print(p_1.shape)
print(p_0[:2])
print(p_1[:2])


mean_matrix_T.shape (943, 1)
p_0.shape (943, 1)
train_data_matrix[:,2].shape (943, 1)
(943, 1)
[[ 3.55588585]
 [-0.10166468]]
[[ 3.55588585]
 [-0.10166468]]


In [7]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])    
    return pred

In [8]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

# NMSE

In [9]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print ('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))   

User-based CF RMSE: 3.122628847248923
Item-based CF RMSE: 3.44787300609786
