## 10.基于协同过滤、矩阵分解推荐算法的电影的评分预测


In [1]:
import pandas as pd
from surprise import SVD
from surprise import KNNBasic 
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

In [2]:
data_df=pd.read_csv('data/data_u.csv',encoding='gbk')
data_df

Unnamed: 0,用户原始ID,电影原始ID,评分,时间戳
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [3]:
data_df[data_df['用户原始ID']==192]['电影原始ID'][0:10]

269       476
555      1061
725       948
4333     1160
7210      127
8995      301
10129     252
11609     340
12119    1405
12917     302
Name: 电影原始ID, dtype: int64

In [4]:
data_df[data_df['电影原始ID']==242]['用户原始ID'][0:10]

0       196
253      63
629     226
1232    154
2159    306
2335    296
2600     34
3484    271
4082    201
5548    209
Name: 用户原始ID, dtype: int64

In [5]:
data_df=data_df.drop(['时间戳'],axis=1)
reader = Reader()
dataset = Dataset.load_from_df(data_df,reader)

In [6]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.02,0.005],'reg_all': [0.2,0.4]}
model_svdGridSearch = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3,refit='rmse')

In [7]:
model_svdGridSearch.fit(dataset)
results_df = pd.DataFrame.from_dict(model_svdGridSearch.cv_results)
results_df[['mean_test_rmse','rank_test_rmse','mean_test_mae','rank_test_mae','param_n_epochs','param_lr_all','param_reg_all']]

Unnamed: 0,mean_test_rmse,rank_test_rmse,mean_test_mae,rank_test_mae,param_n_epochs,param_lr_all,param_reg_all
0,0.952777,2,0.758671,2,5,0.02,0.2
1,0.96269,5,0.77142,5,5,0.02,0.4
2,0.966064,7,0.771428,6,5,0.005,0.2
3,0.973715,8,0.781783,8,5,0.005,0.4
4,0.949842,1,0.755769,1,10,0.02,0.2
5,0.960653,4,0.769434,4,10,0.02,0.4
6,0.954877,3,0.7609,3,10,0.005,0.2
7,0.964023,6,0.772614,7,10,0.005,0.4


In [8]:
uid=196
iid=242
r_ui=r_ui=data_df[(data_df['用户原始ID']==uid)&( data_df['电影原始ID']==iid)]['评分']
y_pred = model_svdGridSearch.predict(uid,iid,r_ui[0])
y_pred

Prediction(uid=196, iid=242, r_ui=3, est=3.8779056876039353, details={'was_impossible': False})

In [9]:
uid=196
iid=302
y_pred = model_svdGridSearch.predict(uid,iid)
y_pred

Prediction(uid=196, iid=302, r_ui=None, est=4.029077185102199, details={'was_impossible': False})

In [10]:
model_userCF = KNNBasic()

In [11]:
cross_validate(model_userCF, dataset, measures=['RMSE','MAE'],cv=3,verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9847  0.9894  0.9906  0.9882  0.0025  
MAE (testset)     0.7768  0.7829  0.7841  0.7813  0.0032  
Fit time          0.69    0.72    0.72    0.71    0.01    
Test time         9.69    9.64    9.85    9.73    0.09    


{'test_rmse': array([0.98468372, 0.9893994 , 0.99056603]),
 'test_mae': array([0.77679368, 0.78289859, 0.78409387]),
 'fit_time': (0.6896541118621826, 0.7181503772735596, 0.7181098461151123),
 'test_time': (9.690216541290283, 9.637779951095581, 9.84878158569336)}

In [12]:
uid=196
iid=242
r_ui=r_ui=data_df[(data_df['用户原始ID']==uid)&( data_df['电影原始ID']==iid)]['评分']
y_pred = model_userCF.predict(uid,iid,r_ui[0])
y_pred

Prediction(uid=196, iid=242, r_ui=3, est=3.6725013020279658, details={'actual_k': 40, 'was_impossible': False})

In [13]:
uid=196
iid=302
y_pred = model_userCF.predict(uid,iid)
y_pred

Prediction(uid=196, iid=302, r_ui=None, est=4.163020132457883, details={'actual_k': 40, 'was_impossible': False})

In [14]:
sim_options = {'name': 'pearson_baseline', 'user_based': False}
model_itemCF = KNNBasic(sim_options=sim_options)

In [15]:
trainset,testset = train_test_split(dataset)
model_itemCF.fit(trainset)
predictions = model_itemCF.test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [16]:
accuracy.rmse(predictions)

RMSE: 0.9929


0.9928937938643257

In [17]:
accuracy.mae(predictions)

MAE:  0.7799


0.7799317205010555

In [18]:
uid=196
iid=242
r_ui=r_ui=data_df[(data_df['用户原始ID']==uid)&( data_df['电影原始ID']==iid)]['评分']
y_pred = model_userCF.predict(uid,iid,r_ui[0])
y_pred

Prediction(uid=196, iid=242, r_ui=3, est=3.6725013020279658, details={'actual_k': 40, 'was_impossible': False})

In [19]:
uid=196
iid=302
y_pred = model_userCF.predict(uid,iid)
y_pred

Prediction(uid=196, iid=302, r_ui=None, est=4.163020132457883, details={'actual_k': 40, 'was_impossible': False})

In [29]:
name_rid_df=pd.read_csv('data/data_item.csv')
name_rid_df

Unnamed: 0,电影名称,电影原始ID
0,Toy Story (1995),1
1,GoldenEye (1995),2
2,Four Rooms (1995),3
3,Get Shorty (1995),4
4,Copycat (1995),5
...,...,...
1677,Mat' i syn (1997),1678
1678,B. Monkey (1998),1679
1679,Sliding Doors (1998),1680
1680,You So Crazy (1994),1681


In [30]:
rid_to_name=dict(zip(list(name_rid_df['电影原始ID'].values),list(name_rid_df['电影名称'].values)))
name_to_rid=dict(zip(list(name_rid_df['电影名称'].values),list(name_rid_df['电影原始ID'].values)))

In [31]:
print("字典rid_to_name中的前五个键值对如下所示:")
list(rid_to_name.items())[:5]

字典rid_to_name中的前五个键值对如下所示:


[(1, 'Toy Story (1995)'),
 (2, 'GoldenEye (1995)'),
 (3, 'Four Rooms (1995)'),
 (4, 'Get Shorty (1995)'),
 (5, 'Copycat (1995)')]

In [32]:
print("字典name_to_rid中的前五个键值对如下所示:")  
list(name_to_rid.items())[:5]

字典name_to_rid中的前五个键值对如下所示:


[('Toy Story (1995)', 1),
 ('GoldenEye (1995)', 2),
 ('Four Rooms (1995)', 3),
 ('Get Shorty (1995)', 4),
 ('Copycat (1995)', 5)]

In [33]:
raw_id = name_to_rid['GoldenEye (1995)']
print("电影原始ID为：",raw_id)

name = rid_to_name[raw_id]
print("电影名称为：",name)

电影原始ID为： 2
电影名称为： GoldenEye (1995)


In [34]:
inner_id = model_itemCF.trainset.to_inner_iid(raw_id)
print("GoldenEye的内部ID为：",inner_id)

GoldenEye的内部ID为： 732


In [35]:
neighbors_iid_temp = model_itemCF.get_neighbors(inner_id, k=10)
print("与GoldenEye最相似的10部电影的内部ID为：",neighbors_iid_temp)

与GoldenEye最相似的10部电影的内部ID为： [171, 140, 163, 470, 229, 446, 80, 154, 90, 268]


In [36]:
neighbors_rid_temp= (model_itemCF.trainset.to_raw_iid(inner_id)for inner_id in neighbors_iid_temp)
type(neighbors_rid_temp)

generator

In [37]:
neighbors_rid=list(neighbors_rid_temp)

In [38]:
print("最相似的10部电影原始ID为：",neighbors_rid)

最相似的10部电影原始ID为： [405, 233, 163, 684, 566, 239, 54, 89, 210, 274]


In [39]:
neighbors_name = (rid_to_name[rid] for rid in neighbors_rid)
print('The 10 nearest neighbors of GoldenEye are:')
for movie in neighbors_name:
    print(movie)

The 10 nearest neighbors of GoldenEye are:
Mission: Impossible (1996)
Under Siege (1992)
Return of the Pink Panther, The (1974)
In the Line of Fire (1993)
Clear and Present Danger (1994)
Sneakers (1992)
Outbreak (1995)
Blade Runner (1982)
Indiana Jones and the Last Crusade (1989)
Sabrina (1995)
