# Collaborative Filtering-movieLen

movieLen数据集 
- [ml-100k](http://files.grouplens.org/datasets/movielens/ml-100k.zip)

In [25]:
import numpy as np
import pandas as pd
 
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv("./data/ml-100k/u.data", sep='\t', names=header)

In [26]:
df.head(5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


观察数据前两行。接下来，让我们统计其中的用户和电影总数。

In [32]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


# Create two user-item matrices

In [57]:
from sklearn import model_selection as cv

train_data, test_data = cv.train_test_split(df, test_size=0.25)

train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [58]:
train_data_matrix.shape

(943, 1682)

In [65]:
train_data_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [59]:
test_data_matrix.shape

(943, 1682)

# user和item余弦相似性

In [36]:
from sklearn.metrics.pairwise import pairwise_distances

user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine') # 矩阵的转置实现主题的相似度

In [37]:
user_similarity.shape

(943, 943)

In [60]:
user_similarity

array([[0.        , 0.87114049, 0.94588932, ..., 0.8994177 , 0.88007823,
        0.68353484],
       [0.87114049, 0.        , 0.91578244, ..., 0.92669684, 0.81804738,
        0.892803  ],
       [0.94588932, 0.91578244, 0.        , ..., 0.92878526, 0.8682469 ,
        0.98485194],
       ...,
       [0.8994177 , 0.92669684, 0.92878526, ..., 0.        , 0.90634613,
        0.90337934],
       [0.88007823, 0.81804738, 0.8682469 , ..., 0.90634613, 0.        ,
        0.81501853],
       [0.68353484, 0.892803  , 0.98485194, ..., 0.90337934, 0.81501853,
        0.        ]])

In [38]:
item_similarity.shape

(1682, 1682)

In [61]:
item_similarity

array([[0.        , 0.67555716, 0.75414516, ..., 1.        , 0.94551642,
        0.94551642],
       [0.67555716, 0.        , 0.79852957, ..., 1.        , 0.90719677,
        1.        ],
       [0.75414516, 0.79852957, 0.        , ..., 1.        , 1.        ,
        0.88835156],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.94551642, 0.90719677, 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.94551642, 1.        , 0.88835156, ..., 1.        , 1.        ,
        0.        ]])

# 预测

构造了相似度矩阵user_similarity和item_similarity，可以利用下面公式为user-based CF做一个预测： 

$\hat{x_{k,m}}=\bar{x_k}+\dfrac{\sum_{u_a}sim_u(u_k,u_a)(x_{a,m}-\bar{x_{u_a}})}{\sum_{u_a}|sim_u(u_k,u_a)|}$

用户k和用户a之间的相似度根据一个相似用户a的一系列评价的乘积（修正为该用户的平均评价）的权重。你将需要标准化相似度这样可以使评价维持在1到5之间，最后统计你想预测用户平均评价的总和。 

这里考虑到的问题是一些用户评价所有电影时可能要么给最高分，要么给最低分。这些用户给出评价的相对不同比绝对值更重要。例如：设想，用户k对他最喜欢的电影评价4颗星，其他的好电影则评价3颗星。假设现在另一个用户t对他/她喜欢的一部电影评价为5颗星，看了想睡觉的一部电影评价为3颗星。这两位用户电影口味可能很相似但使用评价体系的方法不同。 

当为item-based CF做一个推荐时候，你不要纠正用户的平均评价，因为用户本身用查询来做预测。

$\hat{x_{k,m}}=\dfrac{\sum_{i_b}sim_i(i_k,i_b)(x_{b,m}-\bar{x_{i_b}})}{\sum_{i_b}|sim_i(i_k,i_b)|}$

In [104]:
def predict_user(ratings, similarity):
    mean_user_rating = ratings.mean(axis=1) # row mean
    #You use np.newaxis so that mean_user_rating has same format as ratings
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    return pred

In [105]:
def predict_item(ratings, similarity):
    pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [106]:
user_prediction = predict_user(train_data_matrix, user_similarity)
item_prediction = predict_item(train_data_matrix, item_similarity)

In [107]:
user_prediction.shape

(943, 1682)

In [108]:
user_prediction

array([[ 1.56156579,  0.60676171,  0.49835842, ...,  0.31215363,
         0.31455243,  0.31444702],
       [ 1.28191167,  0.29965098,  0.14308548, ..., -0.07539772,
        -0.07222772, -0.07195249],
       [ 1.27060571,  0.26220891,  0.11659383, ..., -0.10373083,
        -0.10053755, -0.10032376],
       ...,
       [ 1.16485093,  0.23163663,  0.08001336, ..., -0.12762628,
        -0.12436611, -0.12450773],
       [ 1.32378914,  0.33349654,  0.20826655, ..., -0.00581088,
        -0.00298969, -0.00261213],
       [ 1.36764051,  0.40923845,  0.29760172, ...,  0.10947574,
         0.1120059 ,  0.11188933]])

In [109]:
item_prediction.shape

(943, 1682)

In [110]:
item_prediction

array([[0.38639639, 0.40190997, 0.41328004, ..., 0.46586371, 0.4571401 ,
        0.45194305],
       [0.07808381, 0.09175547, 0.08859339, ..., 0.09337027, 0.09433321,
        0.09499032],
       [0.06557679, 0.06705845, 0.06641595, ..., 0.06335952, 0.0655608 ,
        0.0666864 ],
       ...,
       [0.03098591, 0.03628655, 0.03492048, ..., 0.04003142, 0.03903108,
        0.0393101 ],
       [0.13088014, 0.14228256, 0.14907797, ..., 0.15474453, 0.15286004,
        0.15472241],
       [0.21099939, 0.21750349, 0.22850706, ..., 0.26464826, 0.25621177,
        0.25507174]])

# 评价指标Root Mean Squared Error (RMSE)

In [56]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    # nonzero(a)返回数组a中值不为零的元素的下标,相当于对稀疏矩阵进行提取
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [52]:
print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print ('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.132518607280199
Item-based CF RMSE: 3.462469030387912


# ref

In [87]:
import numpy as np

data=np.arange(0, 9).reshape(3,3)

In [94]:
data

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [100]:
data.sum(axis=1)

array([ 3, 12, 21])

In [102]:
np.array([data.sum(axis=1)]).T

array([[ 3],
       [12],
       [21]])

In [96]:
data.mean(axis=1)

array([1., 4., 7.])

In [97]:
data.mean(axis=1)[:, np.newaxis]

array([[1.],
       [4.],
       [7.]])

In [98]:
data-data.mean(axis=1)[:, np.newaxis]

array([[-1.,  0.,  1.],
       [-1.,  0.,  1.],
       [-1.,  0.,  1.]])