# 3. Collaborative Filtering (협업 필터링: 사용자 리뷰 기반)

In [5]:
import surprise

In [3]:
# Show the installed surprise version so the results below can be tied to a library release.
surprise.__version__

'1.1.3'

In [6]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold, cross_validate

In [7]:
# Load the user/movie rating records (userId, movieId, rating, timestamp) and preview them.
ratings_csv = 'ratings_small.csv'
ratings = pd.read_csv(ratings_csv)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [8]:
# Lowest rating present in the data; informs the lower bound of the Reader's rating_scale.
ratings['rating'].min()

0.5

In [9]:
# Highest rating present in the data; informs the upper bound of the Reader's rating_scale.
ratings['rating'].max()

5.0

In [10]:
# Tell surprise the rating range; (0.5, 5) matches the min and max ratings observed in `ratings`.
reader = Reader(rating_scale=(0.5, 5))

In [11]:
# Build a surprise Dataset from the user, item and rating columns (in that order);
# the timestamp column is not used.
rating_triples = ratings[['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(rating_triples, reader=reader)
data

<surprise.dataset.DatasetAutoFolds at 0x1b8a9116e00>

In [12]:
# Matrix-factorization (SVD) recommender; random_state seeds the model's RNG for repeatable fits.
svd = SVD(random_state=0)

In [13]:
# 5-fold cross-validation of the SVD model, reporting RMSE and MAE for every fold.
# Passing a bare int as `cv` makes surprise create an unseeded KFold, so the fold
# assignment (and therefore every reported score) would change from run to run even
# though the SVD itself is seeded. A KFold with a fixed random_state keeps the whole
# evaluation reproducible under Restart & Run All.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=0),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9016  0.8945  0.8998  0.8967  0.8937  0.8973  0.0030  
MAE (testset)     0.6915  0.6907  0.6942  0.6921  0.6847  0.6906  0.0032  
Fit time          1.34    1.35    1.33    1.34    1.39    1.35    0.02    
Test time         0.22    0.30    0.16    0.25    0.18    0.22    0.05    


{'test_rmse': array([0.90161835, 0.89452256, 0.89984947, 0.89670861, 0.8936803 ]),
 'test_mae': array([0.69147537, 0.69066798, 0.69420653, 0.69214778, 0.68468235]),
 'fit_time': (1.3423995971679688,
  1.3464348316192627,
  1.3278915882110596,
  1.3442506790161133,
  1.3948888778686523),
 'test_time': (0.2152104377746582,
  0.3002586364746094,
  0.16360187530517578,
  0.2480316162109375,
  0.17662525177001953)}

In [14]:
# Refit the model on all available ratings (no held-out fold) before making predictions.
trainset = data.build_full_trainset()
# fit() returns the fitted algorithm object, which the cell displays.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b8a8405990>

In [15]:
# Ratings submitted by userId 1, for comparison with the model's predictions.
ratings.loc[ratings['userId'].eq(1)]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [21]:
# Predicted rating of userId 1 for movieId 1061, given that the actual rating was 3.
svd.predict(uid=1, iid=1061, r_ui=3)

Prediction(uid=1, iid=1061, r_ui=3, est=2.766971602070149, details={'was_impossible': False})

In [22]:
# Ratings submitted by userId 100, for comparison with the model's predictions.
ratings.loc[ratings['userId'].eq(100)]

Unnamed: 0,userId,movieId,rating,timestamp
15273,100,1,4.0,854193977
15274,100,3,4.0,854194024
15275,100,6,3.0,854194023
15276,100,7,3.0,854194024
15277,100,25,4.0,854193977
15278,100,32,5.0,854193977
15279,100,52,3.0,854194056
15280,100,62,3.0,854193977
15281,100,86,3.0,854194208
15282,100,88,2.0,854194208


In [23]:
# Predicted rating of userId 100 for movieId 1061 (no actual rating is supplied).
svd.predict(uid=100, iid=1061)

Prediction(uid=100, iid=1061, r_ui=None, est=3.3929019948413752, details={'was_impossible': False})