In [4]:
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds
from sklearn.model_selection import cross_validate, train_test_split
import sys
import os
import numpy as np
import pandas as pd
apps_floder = os.path.dirname(os.path.abspath('.'))
ratings_file = os.path.join(apps_floder, 'data\\ml-1m\\ratings.csv')
ratings_df = pd.read_csv(ratings_file, sep=',', engine='python')
ratings_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
n_users = max(ratings_df.UserID.unique())
n_movies = max(ratings_df.MovieID.unique())

print('Number of users = ' + str(n_users) +
      ' | Number of movies = ' + str(n_movies))

Number of users = 6040 | Number of movies = 3952


In [6]:
# 按照1：3的比例分割数据
train_data = ratings_df.head(round(len(ratings_df)*0.25))
test_data = ratings_df.tail(round(len(ratings_df)*0.75))

In [7]:
# 计算数据集的稀疏度
sparsity = round(ratings_df.size/float(n_users*n_movies), 3)
print('The sparsity level of MovieLens is ' + str(sparsity))

The sparsity level of MovieLens is 0.168


In [8]:
# 创建uesr-item矩阵，此处需创建训练和测试两个UI矩阵,6040 cols * 3952 rows
train_data_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

In [9]:
u, s, vt = svds(train_data_matrix, k=20)
u.shape, s.shape, vt.shape

((6040, 20), (20,), (20, 3952))

In [10]:
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

In [11]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 3.7586157404436826


In [14]:
pd.DataFrame(X_pred)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951
0,2.976352,0.744767,-0.009963,-0.057408,0.022610,-0.195839,-0.138229,0.200984,-0.131726,0.076522,...,-0.031364,0.007606,0.081938,-0.043994,-0.166404,0.385526,-0.133773,-0.041047,-0.002533,0.230722
1,0.931130,0.389267,0.364006,0.052644,0.124922,0.607960,0.141177,0.007597,0.250530,1.153990,...,-0.054179,-0.005977,-0.011452,0.082114,-0.059308,0.091251,-0.210865,-0.108354,-0.021863,-0.228336
2,1.247751,0.213155,0.167924,0.011876,0.054110,0.084708,-0.087069,0.036980,0.039084,0.411934,...,-0.024923,-0.023607,0.026514,0.009862,-0.022911,0.403922,-0.117882,-0.074477,-0.034485,-0.286855
3,0.152505,-0.182852,-0.028998,0.047482,0.044121,0.276155,-0.025682,-0.005139,-0.037145,0.263491,...,-0.001420,0.008835,-0.002732,-0.025536,-0.063314,0.066459,0.087017,0.033660,0.016800,-0.176528
4,1.325901,0.024249,-0.243512,0.215841,-0.346452,1.241074,-0.313412,-0.055865,0.006133,0.320826,...,0.069419,0.005063,-0.026947,0.017156,0.027916,0.014633,0.497768,-0.014404,0.113194,0.120356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6036,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6037,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6038,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
