In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 2.7MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1675372 sha256=2ca3f6c17d1607e02e7540ab0a650775c49532eab80ac0c48325efeb9a14d18d
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1


In [0]:
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
import surprise
from surprise import SVD
import time
from datetime import datetime
import pandas as pd
import os

# Data Load

- 1682개 영화를 943 명의 사용자가 평점을 매김(1-5)
- 각 사용자는 최소 20 개의 영화를 평가

In [3]:
data = surprise.Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [4]:
data

<surprise.dataset.DatasetAutoFolds at 0x7f576878eb00>

# KNN

- 특정 사용자의 평점을 예측하기 위해 사용하는 것이 아니라 해당 사용자와 유사한(similar) 사용자에 대해 가중치를 줌.
- 사용자 기반 (User-based) CF : 특정 사용자와 유사한 사용자를 찾는 방법
- 상품 기반 (Item-based) CF : 특정 상품에 대해 사용자가 준 점수 즉, 평점 행렬의 상품 열 벡터의 유사성을 찾고 특정 상품과 유사한 평점 정보를 가지는 상품들로 해당 상품의 빈 데이터를 예측하는 방법

- KNNBasic : 평점들을 단순 가중 평균
- KNNWithMeans : 평점들을 평균값 기준으로 가중 평균
- KNNBaseline : 평점들을 베이스 모형의 값 기준으로 가중 평균

## User-Based Collaborative Filter

- surprise 패키지의 유사도 설정 옵션은 다음과 같다.
- name: 사용할 유사도의 종류를 나타내는 문자열. 디폴트는 'MSD'.
- (cosine, msd, pearson, pearson_baseline)
- user_based: True면 사용자 기반, False면 상품 기반.
- min_support: 두 사용자나, 상품에서 공통적으로 있는 평점 원소의 수의 최솟값. 공통 평점 원소의 수가 이 값보다 적으면 해당 벡터는 사용하지 않음.
- shrinkage: Shrinkage 가중치. 디폴트는 100.

## User-Based Collaborative Filter(cosine)

In [5]:
sim_options = {'name': 'cosine','user_based':'True'}
trainset, testset = train_test_split(data, test_size=0.25)
KNN_UBCF_algo = surprise.KNNBasic(sim_options=sim_options)
KNN_UBCF_algo.fit(trainset)
predict1 = KNN_UBCF_algo.test(testset)
accuracy.rmse(predict1)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0244


1.024364717190392

In [6]:
predict1[:10]

[Prediction(uid='566', iid='327', r_ui=3.0, est=3.3243764725492486, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='938', iid='472', r_ui=4.0, est=3.675792128854786, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='73', iid='127', r_ui=5.0, est=4.174751938194919, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='156', iid='205', r_ui=3.0, est=4.199686973065608, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='804', iid='245', r_ui=4.0, est=3.198686615418318, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='886', iid='230', r_ui=2.0, est=3.675126072950978, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='271', iid='13', r_ui=4.0, est=3.6258117704466835, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='514', iid='423', r_ui=5.0, est=3.923625669920136, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='660', iid='239', r_ui=2.0, est=

## 예상 평점 top 3 추출

In [0]:
from collections import defaultdict
 
def get_top3_recommendations(predict, topN = 3):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predict:
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

## id를 영화 제목으로 변환

In [0]:
import os, io
 
def read_item_names_poster():
    file_name = (os.path.expanduser('~') +
                 '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name_poster = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name_poster[line[0]] = line[1],line[4]
 
    return rid_to_name_poster

## 결과

In [9]:
top3_recommendations = get_top3_recommendations(predict1)
rid_to_name_poster = read_item_names_poster()
for uid, user_ratings in top3_recommendations.items():
    print(uid, [(rid_to_name_poster[iid][0], score) for (iid, score) in user_ratings])

566 [("Schindler's List (1993)", 4.550675778363733), ('North by Northwest (1959)', 4.525540326853236), ('Shawshank Redemption, The (1994)', 4.39757641962637)]
938 [('Fargo (1996)', 4.2515508065729675), ('Godfather, The (1972)', 4.177388024269094), ('Contact (1997)', 4.075247440878813)]
73 [('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)', 4.500258493027839), ('Vertigo (1958)', 4.399874356685898), ('Manchurian Candidate, The (1962)', 4.375370107822238)]
156 [('Killing Fields, The (1984)', 4.323544468778327), ('Patton (1970)', 4.199686973065608), ('Magnificent Seven, The (1954)', 4.125586537783238)]
804 [('Empire Strikes Back, The (1980)', 4.575596424085175), ("Schindler's List (1993)", 4.550217104015861), ('Third Man, The (1949)', 4.499303719615406)]
886 [('Saint of Fort Washington, The (1993)', 5), ('Casablanca (1942)', 4.598385404927801), ('Usual Suspects, The (1995)', 4.5500453155098946)]
271 [('Usual Suspects, The (1995)', 4.598952137311362), ('Shawshan

## User-Based Collaborative Filter(msd)

In [10]:
sim_options = {'name': 'msd','user_based':'True'}
KNN_msd_algo = surprise.KNNBasic(sim_options=sim_options)
KNN_msd_algo.fit(trainset)
predict2 = KNN_msd_algo.test(testset)
accuracy.rmse(predict2)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9870


0.9869736360225833

# Item-Based Collaborative Filter

In [11]:
sim_options = {'name': 'cosine','user_based':'False'}
# user_based에 False를 입력한다는 것이 차이점입니다.
KNN_IBCF_algo = surprise.KNNBasic(sim_options=sim_options)
KNN_IBCF_algo.fit(trainset)
predict3 = KNN_IBCF_algo.test(testset)
accuracy.rmse(predict3)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0244


1.024364717190392

## 결과

In [12]:
top3_recommendations = get_top3_recommendations(predict3)
rid_to_name_poster = read_item_names_poster()
for uid, user_ratings in top3_recommendations.items():
    print(uid, [(rid_to_name_poster[iid][0], score) for (iid, score) in user_ratings])

566 [("Schindler's List (1993)", 4.550675778363733), ('North by Northwest (1959)', 4.525540326853236), ('Shawshank Redemption, The (1994)', 4.39757641962637)]
938 [('Fargo (1996)', 4.2515508065729675), ('Godfather, The (1972)', 4.177388024269094), ('Contact (1997)', 4.075247440878813)]
73 [('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)', 4.500258493027839), ('Vertigo (1958)', 4.399874356685898), ('Manchurian Candidate, The (1962)', 4.375370107822238)]
156 [('Killing Fields, The (1984)', 4.323544468778327), ('Patton (1970)', 4.199686973065608), ('Magnificent Seven, The (1954)', 4.125586537783238)]
804 [('Empire Strikes Back, The (1980)', 4.575596424085175), ("Schindler's List (1993)", 4.550217104015861), ('Third Man, The (1949)', 4.499303719615406)]
886 [('Saint of Fort Washington, The (1993)', 5), ('Casablanca (1942)', 4.598385404927801), ('Usual Suspects, The (1995)', 4.5500453155098946)]
271 [('Usual Suspects, The (1995)', 4.598952137311362), ('Shawshan

## SVD

In [13]:
SVD_algo = surprise.SVD()
SVD_algo.fit(trainset)
predict4 = SVD_algo.test(testset)
accuracy.rmse(predict4)

RMSE: 0.9433


0.9433387907025614

In [0]:
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import KNNBasic
from surprise import KNNBaseline
from surprise import KNNWithMeans
from surprise import SVD
from surprise import NMF
from surprise import CoClustering

## 모델별 성능 비교
- SVD : 특이값 분해(행렬 분해 기반 모델)
- NMF : SVD와 매우 유사하며, 음수 미포함 행렬 분해 기반 모델.
- Co-clustering : 협업 필터링 기반 클러스터링


- KNNBaseline이 RMSE = 0.936283로 모델 중 가장 높은 성능을 보임.
- 단, 테스트 시간이 5.829331초로 가장 느린 속도를 보임.
- 성능, 훈련 시간, 테스트 시간 등을 종합 평가하여 가장 적합한 모델을 선택.

In [22]:
benchmark = []
for algorithm in [SVD(), NMF(), KNNBasic(), KNNBaseline(), KNNWithMeans(), CoClustering()]:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

# 모델별로 RMSE값을 계산하고 성능 비교.

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,0.936283,0.292811,5.829331
SVD,0.944427,3.863995,0.288752
KNNWithMeans,0.955834,0.228325,5.074423
NMF,0.972652,4.088976,0.234832
CoClustering,0.976197,1.254771,0.218879
KNNBasic,0.989284,0.201626,4.74007
