In [2]:
import pandas as pd
ratings = pd.read_csv('data/ratings_small.csv')
ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [3]:
ratings['rating'].min(), ratings['rating'].max()


(0.5, 5.0)

In [4]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
reader = Reader(rating_scale=(0.5, 5.0))

In [5]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader=reader)
data #Surprise 는 사용자 아이디, item 아이디, 평점 데이터가 로우 레벨로 된 데이터 세트만 적용할 수 있다.


<surprise.dataset.DatasetAutoFolds at 0x1fe62ade540>

In [6]:
svd = SVD(random_state=0)


In [7]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8929  0.9004  0.8943  0.8953  0.8964  0.8958  0.0026  
MAE (testset)     0.6825  0.6947  0.6880  0.6890  0.6908  0.6890  0.0040  
Fit time          0.83    0.82    0.85    0.84    0.81    0.83    0.01    
Test time         0.12    0.08    0.06    0.13    0.08    0.10    0.03    


{'test_rmse': array([0.89286294, 0.9003716 , 0.89425534, 0.89525292, 0.89644252]),
 'test_mae': array([0.6825276 , 0.6947394 , 0.68798013, 0.68896429, 0.69076528]),
 'fit_time': (0.8263168334960938,
  0.8242568969726562,
  0.8545236587524414,
  0.8363237380981445,
  0.809870719909668),
 'test_time': (0.12302470207214355,
  0.08339667320251465,
  0.062381744384765625,
  0.13314175605773926,
  0.08293676376342773)}

In [8]:
trainset = data.build_full_trainset()
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1fe79360c20>

In [9]:
user_id=9
movie_id=42
movies = ratings[ratings['userId']==user_id]['movieId']
movies.count(), movies


(45,
 699       1
 700      17
 701      26
 702      36
 703      47
 704     318
 705     497
 706     515
 707     527
 708     534
 709     593
 710     608
 711     733
 712    1059
 713    1177
 714    1357
 715    1358
 716    1411
 717    1541
 718    1584
 719    1680
 720    1682
 721    1704
 722    1721
 723    1784
 724    2028
 725    2125
 726    2140
 727    2249
 728    2268
 729    2273
 730    2278
 731    2291
 732    2294
 733    2302
 734    2391
 735    2396
 736    2427
 737    2490
 738    2501
 739    2539
 740    2571
 741    2628
 742    2762
 743    2857
 Name: movieId, dtype: int64)

In [10]:
if movies[movies==movie_id].count() == 0: print(f'사용자 아이디 {user_id}는(은) 영화 아이디 {movie_id}의 평점 없음')


사용자 아이디 9는(은) 영화 아이디 42의 평점 없음


In [11]:
pred = svd.predict('uid', 'mid')
pred


Prediction(uid='uid', iid='mid', r_ui=None, est=3.543608255669773, details={'was_impossible': False})

In [12]:
print(f'사용자 아이디 {user_id}의 예측 평점은 {pred.est}점 입니다.')


사용자 아이디 9의 예측 평점은 3.543608255669773점 입니다.


In [13]:
seen_movies = ratings[ratings['userId']==user_id]['movieId'].tolist()
len(seen_movies)


45

In [14]:
total_movies = ratings['movieId'].drop_duplicates().sort_values().tolist()
len(total_movies)

9066

In [15]:
import numpy as np
unseen_movies = np.setdiff1d(total_movies, seen_movies)
len(unseen_movies)


9021

In [16]:
def get_unseen_movies(ratings, user_id):
    seen_movies = ratings[ratings['userId']==user_id]['movieId'].tolist()
    total_movies = ratings['movieId'].drop_duplicates().sort_values().tolist()
    unseen_movies = np.setdiff1d(total_movies, seen_movies)
    print(f'평점 매긴 영화 수:{len(seen_movies)}')
    print(f'추천 대상 영화 수:{len(unseen_movies)}')
    print(f'모든 영화 수:{len(total_movies)}')
    return unseen_movies

In [17]:
unseen_movies = get_unseen_movies(ratings, 9)

평점 매긴 영화 수:45
추천 대상 영화 수:9021
모든 영화 수:9066


In [18]:
predictions = [svd.predict(user_id, movie_id) for movie_id in unseen_movies]
predictions

[Prediction(uid=9, iid=2, r_ui=None, est=3.3022661202336026, details={'was_impossible': False}),
 Prediction(uid=9, iid=3, r_ui=None, est=3.1564257268610603, details={'was_impossible': False}),
 Prediction(uid=9, iid=4, r_ui=None, est=2.700195831006973, details={'was_impossible': False}),
 Prediction(uid=9, iid=5, r_ui=None, est=3.114698903163602, details={'was_impossible': False}),
 Prediction(uid=9, iid=6, r_ui=None, est=3.904805942864381, details={'was_impossible': False}),
 Prediction(uid=9, iid=7, r_ui=None, est=3.059621203539996, details={'was_impossible': False}),
 Prediction(uid=9, iid=8, r_ui=None, est=3.644808013555, details={'was_impossible': False}),
 Prediction(uid=9, iid=9, r_ui=None, est=2.842665999241858, details={'was_impossible': False}),
 Prediction(uid=9, iid=10, r_ui=None, est=3.399962356260424, details={'was_impossible': False}),
 Prediction(uid=9, iid=11, r_ui=None, est=3.479578700970142, details={'was_impossible': False}),
 Prediction(uid=9, iid=12, r_ui=None, e

In [19]:
predictions.sort(key=lambda pred:pred.est, reverse=True)
predictions


[Prediction(uid=9, iid=858, r_ui=None, est=4.542866877335705, details={'was_impossible': False}),
 Prediction(uid=9, iid=912, r_ui=None, est=4.484090707192216, details={'was_impossible': False}),
 Prediction(uid=9, iid=4993, r_ui=None, est=4.471004680156093, details={'was_impossible': False}),
 Prediction(uid=9, iid=926, r_ui=None, est=4.427937145395248, details={'was_impossible': False}),
 Prediction(uid=9, iid=745, r_ui=None, est=4.41983077978538, details={'was_impossible': False}),
 Prediction(uid=9, iid=904, r_ui=None, est=4.4091355113400486, details={'was_impossible': False}),
 Prediction(uid=9, iid=908, r_ui=None, est=4.399526575526584, details={'was_impossible': False}),
 Prediction(uid=9, iid=969, r_ui=None, est=4.389459379084835, details={'was_impossible': False}),
 Prediction(uid=9, iid=1278, r_ui=None, est=4.375797151947231, details={'was_impossible': False}),
 Prediction(uid=9, iid=8132, r_ui=None, est=4.361136646233039, details={'was_impossible': False}),
 Prediction(uid=9

In [20]:
top_predictions = predictions[:10]
top_predictions

[Prediction(uid=9, iid=858, r_ui=None, est=4.542866877335705, details={'was_impossible': False}),
 Prediction(uid=9, iid=912, r_ui=None, est=4.484090707192216, details={'was_impossible': False}),
 Prediction(uid=9, iid=4993, r_ui=None, est=4.471004680156093, details={'was_impossible': False}),
 Prediction(uid=9, iid=926, r_ui=None, est=4.427937145395248, details={'was_impossible': False}),
 Prediction(uid=9, iid=745, r_ui=None, est=4.41983077978538, details={'was_impossible': False}),
 Prediction(uid=9, iid=904, r_ui=None, est=4.4091355113400486, details={'was_impossible': False}),
 Prediction(uid=9, iid=908, r_ui=None, est=4.399526575526584, details={'was_impossible': False}),
 Prediction(uid=9, iid=969, r_ui=None, est=4.389459379084835, details={'was_impossible': False}),
 Prediction(uid=9, iid=1278, r_ui=None, est=4.375797151947231, details={'was_impossible': False}),
 Prediction(uid=9, iid=8132, r_ui=None, est=4.361136646233039, details={'was_impossible': False})]

In [21]:
top_movies = [(pred.iid, pred.est) for pred in top_predictions]
top_movies

[(858, 4.542866877335705),
 (912, 4.484090707192216),
 (4993, 4.471004680156093),
 (926, 4.427937145395248),
 (745, 4.41983077978538),
 (904, 4.4091355113400486),
 (908, 4.399526575526584),
 (969, 4.389459379084835),
 (1278, 4.375797151947231),
 (8132, 4.361136646233039)]

In [34]:
def recomm_movies(predictions):
    predictions = [svd.predict(user_id, movie_id) for movie_id in unseen_movies]
    predictions.sort(key=lambda pred:pred.est, reverse=True)
    top_predictions = predictions[:5]
    top_movies = [(pred.iid, pred.est) for pred in top_predictions]
    return top_movies

In [44]:
movies = recomm_movies(predictions)
movies
print('추천영화')
for movie in movies:
    print(f'영화 아이디:{movie[0]} (예상평점:{movie[1]})')
   

추천영화
영화 아이디:858 (예상평점:4.542866877335705)
영화 아이디:912 (예상평점:4.484090707192216)
영화 아이디:318 (예상평점:4.481885652778941)
영화 아이디:4993 (예상평점:4.471004680156093)
영화 아이디:926 (예상평점:4.427937145395248)
