In [1]:
import pandas as pd

In [2]:
# Load the ratings file and drop its last column by position.
# NOTE(review): the dropped column is presumably a timestamp -- confirm against ratings.csv.
rating = pd.read_csv('ratings.csv').iloc[:,:-1]

In [3]:
# Show the loaded frame; the rendered output is truncated to the first and last rows.
rating

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


## EDA

In [4]:
# Summary statistics of the rating column (count, mean, std, min, quartiles, max).
rating['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [5]:
# Users ranked by how many distinct movies they have rated (most first)
(
    rating
    .groupby('userId')['movieId']
    .nunique()
    .sort_values(ascending=False)
)

userId
414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
442      20
569      20
320      20
576      20
53       20
Name: movieId, Length: 610, dtype: int64

In [6]:
# Users ranked by the average of the ratings they gave (highest average first)
(
    rating
    .groupby('userId')['rating']
    .mean()
    .sort_values(ascending=False)
)

userId
53     5.000000
251    4.869565
515    4.846154
25     4.807692
30     4.735294
         ...   
567    2.245455
153    2.217877
508    2.145833
139    2.144330
442    1.275000
Name: rating, Length: 610, dtype: float64

In [7]:
# User 53 tends to rate the movies they have watched generously.
rating.query('userId==53')

Unnamed: 0,userId,movieId,rating
7911,53,203,5.0
7912,53,249,5.0
7913,53,381,5.0
7914,53,413,5.0
7915,53,481,5.0
7916,53,748,5.0
7917,53,880,5.0
7918,53,916,5.0
7919,53,922,5.0
7920,53,1049,5.0


In [8]:
# Conversely, user 442 does not hand out high ratings.
rating.query('userId==442')

Unnamed: 0,userId,movieId,rating
68317,442,362,2.5
68318,442,468,1.5
68319,442,524,2.0
68320,442,610,1.0
68321,442,616,1.5
68322,442,1186,1.0
68323,442,1231,1.0
68324,442,1272,0.5
68325,442,1644,0.5
68326,442,2020,1.0


## Modeling

참고: https://techblog-history-younghunjo1.tistory.com/117

In [9]:
from surprise.model_selection import train_test_split
from surprise import Reader
from surprise import Dataset

In [10]:
# When a Dataset is built from a DataFrame, only the Reader's rating_scale is used;
# the file-parsing options (line_format, sep) do not apply, so they are omitted.
# The scale matches the observed min/max of the rating column (0.5 to 5).
reader = Reader(rating_scale=(0.5, 5))
# Columns are passed in (user, item, rating) order.
data = Dataset.load_from_df(rating[['userId', 'movieId', 'rating']], reader=reader)
# Hold out 25% of the ratings for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(data, test_size=0.25, random_state=2020)

In [11]:
from surprise import KNNBasic
from surprise import KNNWithZScore # k-NN variant that takes each user's z-score normalization into account (per the Surprise docs)
from surprise import KNNWithMeans # k-NN variant that takes each user's mean rating into account (per the Surprise docs)
from surprise import SVD # Model-based
from surprise import accuracy
from surprise import SlopeOne
# NMF is also available

## KNNBasic

In [12]:
# Memory-based CF can be run either user-based or item-based;
# here users are compared to each other (user-based CF) with cosine similarity.
sim_options = dict(name='cosine', user_based=True)

model = KNNBasic(sim_options=sim_options)
model.fit(train)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f4fcde154b0>

In [13]:
# Predict a rating for every entry of the held-out test set with the fitted model.
predictions = model.test(test)

In [14]:
# Preview only the first few Prediction records; the full list holds one entry
# per test rating and would flood the cell output.
predictions[:10]

[Prediction(uid=105, iid=116797, r_ui=4.0, est=4.006820812272593, details={'actual_k': 39, 'was_impossible': False}),
 Prediction(uid=608, iid=4643, r_ui=3.0, est=2.7230794284668827, details={'actual_k': 34, 'was_impossible': False}),
 Prediction(uid=590, iid=903, r_ui=4.5, est=4.075706915243922, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=202, iid=2194, r_ui=4.0, est=3.9884137429821966, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=122, iid=63131, r_ui=4.0, est=3.5014636076491388, details={'actual_k': 14, 'was_impossible': False}),
 Prediction(uid=186, iid=2716, r_ui=3.0, est=3.9996100880748426, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=470, iid=150, r_ui=3.0, est=3.937709271725271, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=530, iid=356, r_ui=5.0, est=4.05, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=425, iid=2716, r_ui=2.5, est=3.8506630592146385, details={'actual

* uid : 사용자 ID
* iid : 아이템 ID
* r_ui : 실제 평점
* est : 모델이 예측한 평점
* details : 예측에 대한 추가 정보
    - actual_k : 해당 예측에 사용된 최근접 이웃 수
    - was_impossible : 예측이 불가능한지 여부

In [15]:
# Root-mean-squared error of the current predictions (printed and returned).
accuracy.rmse(predictions)

RMSE: 0.9722


0.9721767067163586

In [16]:
# Mean absolute error of the current predictions (printed and returned).
accuracy.mae(predictions)

MAE:  0.7495


0.7494680411064215

## SlopeOne

In [17]:
# Rebind `model` to a SlopeOne instance and fit it on the same training split.
model = SlopeOne()
model.fit(train)

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x7f4fcdb058d0>

In [18]:
# Overwrite `predictions` with this model's predictions on the held-out test set.
predictions = model.test(test)

In [19]:
# Root-mean-squared error of the current predictions (printed and returned).
accuracy.rmse(predictions)

RMSE: 0.9006


0.9005845749099356

In [20]:
# Mean absolute error of the current predictions (printed and returned).
accuracy.mae(predictions)

MAE:  0.6881


0.6881327247490217

## SVD

In [21]:
# Model-based CF via matrix factorization. SVD training is stochastic, so the seed
# is fixed (same value as the train/test split) to make the reported metrics reproducible.
model = SVD(random_state=2020)
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f4fcdb06080>

In [22]:
# Overwrite `predictions` with this model's predictions on the held-out test set.
predictions = model.test(test)

In [23]:
# Root-mean-squared error of the current predictions (printed and returned).
accuracy.rmse(predictions)

RMSE: 0.8772


0.8772036167450219

In [24]:
# Mean absolute error of the current predictions (printed and returned).
accuracy.mae(predictions)

MAE:  0.6758


0.6758270081699843

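세 모델 중 SVD의 RMSE와 MAE가 가장 낮아 성능이 가장 좋다 (위 실행 결과 기준 RMSE / MAE — KNNBasic: 0.9722 / 0.7495, SlopeOne: 0.9006 / 0.6881, SVD: 0.8772 / 0.6758).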