## 15장 협업 필터링

### (1) 사용자 기반 협업 필터링 실습

- 데이터 살펴보기

In [1]:
#1. 모듈 및 함수 불러오기
from surprise import Dataset

#2. 데이터 로딩
data=Dataset.load_builtin('ml-100k')

In [2]:
import pandas as pd
ratings = pd.DataFrame(data.raw_ratings, columns=['user', 'item', 'rate', 'timestamp'])
ratings.head()

Unnamed: 0,user,item,rate,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [3]:
#사용자 정보 출력
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols)
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
#영화 정보 출력
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('u.item', sep='|', names=m_cols, usecols=range(5), encoding = "latin1")
print(movies.shape)
movies.head()


(1682, 5)


Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


- 모형 학습 및 예측

In [5]:
#1. 모듈 및 함수 불러오기
from surprise.model_selection import train_test_split
from surprise import KNNBasic

#2. 데이터 분할(학습용/평가용 데이터 세트)
trainset, testset = train_test_split(data, test_size=0.3, random_state=0)

#3. 모형 학습 및 예측
model = KNNBasic(name = 'cosine', user_base=True)
predictions = model.fit(trainset).test(testset)
predictions[:3]

Computing the msd similarity matrix...
Done computing similarity matrix.


[Prediction(uid='468', iid='160', r_ui=3.0, est=3.516481083643673, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='215', iid='195', r_ui=5.0, est=4.1433505276160325, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='790', iid='1185', r_ui=2.0, est=2.4311249843348057, details={'actual_k': 12, 'was_impossible': False})]

- 모형 평가

In [6]:
from surprise import accuracy
rmse = accuracy.rmse(predictions)

RMSE: 0.9922


- 고객별 추천 영화 리스트 출력

In [7]:
def recommend(predictions, n, k):
    print("-----고객별 추천 영화 리스트-----")

    #2. n명의 고객 ID 추출
    uids = [pred.uid for pred in predictions][:n]

#3. 고객별 추천 리스트 출력
    for uid in uids:

#3-1. 고객이 관람하지 않은 영화 추출
        seen_movies = ratings[ratings.user== uid]['item'].tolist()
        total_movies=movies['movie_id'].tolist()
        unseen_movies=[movie for movie in total_movies if movie not in seen_movies]

#3-2. k개의 미관람 영화에 대한 평점 예측
        pred=[model.predict(str(uid), str(item)) for item in unseen_movies]
        pred.sort(key=lambda pred: pred[3], reverse=True)
        top_pred=pred[:k]

#3-3. 예측 결과로부터 영화 ID, 제목, 예측 평점 추출
        top_ids = [int(pred.iid) for pred in top_pred]
        top_titles = movies[movies.movie_id.isin(top_ids)]['title']
        top_rating = [pred.est for pred in top_pred]
        top_preds=[(id, title, rating) for id, title, rating in zip(top_ids, top_titles, top_rating)]

#3-4. 추천 리스트 출력
        print("#고객 ID:", uid)
        for top_movie in top_preds:
            print(top_movie[1], ":", top_movie[2])

#4. 함수 호출
recommend(predictions, 10, 3)

-----고객별 추천 영화 리스트-----
#고객 ID: 468
Prefontaine (1997) : 5
Star Kid (1997) : 5
The Deadly Cure (1996) : 5
#고객 ID: 215
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 790
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
The Deadly Cure (1996) : 5
#고객 ID: 498
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 199
Two or Three Things I Know About Her (1966) : 5
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
#고객 ID: 314
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 533
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 541
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
The Deadly Cure (1996) : 5
#고객 ID: 710
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 26
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5


### (2) 아이템 기반 협업 필터링 실습

- 모형 학습 및 예측

In [8]:
model = KNNBasic(name = 'cosine', user_base=False)
predictions = model.fit(trainset).test(testset)
predictions[:3]

Computing the msd similarity matrix...
Done computing similarity matrix.


[Prediction(uid='468', iid='160', r_ui=3.0, est=3.516481083643673, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='215', iid='195', r_ui=5.0, est=4.1433505276160325, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='790', iid='1185', r_ui=2.0, est=2.4311249843348057, details={'actual_k': 12, 'was_impossible': False})]

- 모형 평가

In [9]:
rmse = accuracy.rmse(predictions)

RMSE: 0.9922


- 고객별 추천 영화 리스트 출력

In [10]:
recommend(predictions, 10, 3)

-----고객별 추천 영화 리스트-----
#고객 ID: 468
Prefontaine (1997) : 5
Star Kid (1997) : 5
The Deadly Cure (1996) : 5
#고객 ID: 215
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 790
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
The Deadly Cure (1996) : 5
#고객 ID: 498
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 199
Two or Three Things I Know About Her (1966) : 5
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
#고객 ID: 314
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 533
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 541
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
The Deadly Cure (1996) : 5
#고객 ID: 710
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5
#고객 ID: 26
They Made Me a Criminal (1939) : 5
Prefontaine (1997) : 5
Star Kid (1997) : 5


### (3) 잠재요인 모델 기반 협업 필터링

- 모형 학습 및 예측

In [11]:
#1. 모듈 및 함수 임포트
from surprise import SVD

#2. 모형 학습 및 예측
model = SVD(n_factors=100, random_state=0)
predictions=model.fit(trainset).test(testset)
predictions[:3]


[Prediction(uid='468', iid='160', r_ui=3.0, est=3.7421366112970325, details={'was_impossible': False}),
 Prediction(uid='215', iid='195', r_ui=5.0, est=3.4982494019950483, details={'was_impossible': False}),
 Prediction(uid='790', iid='1185', r_ui=2.0, est=2.5845592338149297, details={'was_impossible': False})]

- 모형 평가

In [12]:
rmse = accuracy.rmse(predictions)

RMSE: 0.9484


- 고객별 추천 영화 리스트 출력

In [13]:
recommend(predictions, 10, 3)

-----고객별 추천 영화 리스트-----
#고객 ID: 468
12 Angry Men (1957) : 5
Schindler's List (1993) : 4.875501997795706
Close Shave, A (1995) : 4.839917736964194
#고객 ID: 215
Silence of the Lambs, The (1991) : 4.672580913926415
Good Will Hunting (1997) : 4.632480177519761
It's a Wonderful Life (1946) : 4.625783972240234
#고객 ID: 790
Shawshank Redemption, The (1994) : 4.348261881599786
Raiders of the Lost Ark (1981) : 4.346463189971051
Close Shave, A (1995) : 4.237583320802747
#고객 ID: 498
Schindler's List (1993) : 4.103138110201718
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) : 4.071671843603244
Dead Man (1995) : 4.021191044582526
#고객 ID: 199
Citizen Kane (1941) : 4.204582105333165
Wrong Trousers, The (1993) : 4.0337041968338845
Casablanca (1942) : 4.0058216284341945
#고객 ID: 314
Jurassic Park (1993) : 5
Top Gun (1986) : 5
Back to the Future (1985) : 5
#고객 ID: 533
Forrest Gump (1994) : 4.562032263019287
Silence of the Lambs, The (1991) : 4.5204146126743066
Raiders of the Los