# 영화 추천

- Surprise 패키지의 예측 모델 SVD() 사용
- MovieLens 데이터 사용

# import

In [96]:
import pandas as pd
from scipy import spatial
from surprise import Dataset, accuracy
from surprise import Reader
from surprise import KNNWithMeans,SVD
from surprise.model_selection import train_test_split

In [5]:
# 한번만 설치하면 됨

!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 10.4 MB/s eta 0:00:01
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-macosx_10_9_x86_64.whl size=1155753 sha256=77cd52c23f1b120c7b594eefdc6b3568d40f57c56d24e27f0058e81da760a033
  Stored in directory: /Users/hwa-kim/Library/Caches/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


# 데이터 읽기

In [217]:
ratings = pd.read_csv('https://raw.githubusercontent.com/StillWork/data/master/ratings.csv')
movies = pd.read_csv('https://raw.githubusercontent.com/StillWork/data/master/movies.csv')

In [218]:
# 평가 기록에 영화명 컬럼 추가

ratings_movies = pd.merge(ratings, movies, on = 'movieId')
print(ratings_movies.shape)
ratings_movies[:3]

(100004, 6)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama


In [219]:
# 영화명-이용자 평가표
# 결측치는 0으로 채운다

movie_user_rating = ratings_movies.pivot_table('rating', index = 'title', columns='userId')
movie_user_rating.fillna(0, inplace = True)
print(movie_user_rating.shape)
movie_user_rating.head(3)

(9064, 671)


userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
# 이용자별 평가 수

(movie_user_rating!=0).sum()

userId
1       20
2       76
3       51
4      204
5      100
      ... 
667     68
668     20
669     37
670     31
671    115
Length: 671, dtype: int64

In [150]:
# Surprise를 사용하기 위한 데이터셋 만들기
# 평가표 데이터 프레임으로부터 데이터셋을 만드는 함수 load_from_df()를 사용

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_movies[["userId", "title", "rating"]], reader)

In [222]:
# 모델 성능 평가하기
# 추천 예측 모델로 Surprise가 제공하는 SVD() 사용
# 평가 지표는 1~5 점을 예측하는 오차 RMSE를 측정 (작을수록 성능이 좋은 것임)

trainset, testset = train_test_split(data, test_size=0.2)

model = SVD()
# model = SVD(n_factors=100, n_epochs=20, random_state=123)

model.fit(trainset)
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 0.9058


0.9057574567037519

In [223]:
# 성능개선을 위해서 100% 전체 데이터로 학습시키는 방법
# 위에서는 80% 데이터로 학습시키고 20% 데이터로 성능을 확인 했다

trainingSet = data.build_full_trainset() # 전체 데이터 사용하여 모델 재학습
model.fit(trainingSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb17a1f5c40>

## 모델 사용하기

In [275]:
# 특정 사용자의 특정 영화에 대한 평점 예측
# 예측 값은 네번째 인자에 들어있다

uid = str(2)  # 문자열이여야 함
iid = 'Zulu (1964)'  # 영화명 (문자열)

model.predict(uid, iid)

Prediction(uid='2', iid='Zulu (1964)', r_ui=None, est=3.5859840148377162, details={'was_impossible': False})

# 개인별 영화 추천하기

In [235]:
# 영화-이용자 평가 매트릭스

movie_user_rating[:3]

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [253]:
# 특정 사용자에 대해서 모든 영화의 평점 예측값 얻기

def get_scores(uid):
    uid = str(uid)
    scores = []
    for title in movie_user_rating.index:
        pred = model.predict(uid, title)
        scores.append(pred[3])
    return scores

In [281]:
# 임의 이용자에 대한 모든 영화 평점 예측치 보기

get_scores(4)[:10]

[3.4060270873251506,
 3.5900895170928235,
 3.3755368488666684,
 3.411443929032968,
 3.4848438618450017,
 3.5021340407687083,
 3.3262934852093737,
 3.28381885025158,
 3.8461617119602964,
 3.8906772133291874]

## 영화 추천하기

In [261]:
# 특정 사용자 uid에 대한 모든 평점 예측 테이블 만들기
# 기존의 평가 점수를 같이 보기

def get_personal(uid):
    df = movie_user_rating[[uid]].copy()
    df['scores'] = get_scores(uid)
    df = df.sort_values('scores', ascending=False)
    return df

In [282]:
get_personal(4)[:10]

userId,4,scores
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Shawshank Redemption, The (1994)",0.0,4.463844
"Godfather, The (1972)",5.0,4.42916
All About Eve (1950),0.0,4.386051
Roger & Me (1989),5.0,4.38382
"Godfather: Part II, The (1974)",0.0,4.377371
"African Queen, The (1951)",0.0,4.363389
Modern Times (1936),0.0,4.358236
"Usual Suspects, The (1995)",0.0,4.337249
Raging Bull (1980),0.0,4.33144
It Happened One Night (1934),0.0,4.326392


## 추천하기

- 이용자 uid에게 N개의 영화를 추천하는 경우
- 아직 보지 않은 영화를 추천해야 하므로 기존 평점이 0인 영화중에서 상위 N개를 추천

In [283]:
uid, N = 4, 10
recomend = get_personal(uid)
recomend[recomend[uid] == 0][:N]

userId,4,scores
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Shawshank Redemption, The (1994)",0.0,4.463844
All About Eve (1950),0.0,4.386051
"Godfather: Part II, The (1974)",0.0,4.377371
"African Queen, The (1951)",0.0,4.363389
Modern Times (1936),0.0,4.358236
"Usual Suspects, The (1995)",0.0,4.337249
Raging Bull (1980),0.0,4.33144
It Happened One Night (1934),0.0,4.326392
Band of Brothers (2001),0.0,4.316556
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),0.0,4.313433


## 실제 평점과 비교

In [287]:
# 이미 평점을 준 경우, 실제값과 예측치를 비교하기
# 평점을 준 경우는 실제값 rec[uid]는 1 이상의 양수이다

uid, N = 4, 50
recomend = get_personal(uid)
recomend[recomend[uid] > 0][:N]

userId,4,scores
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Godfather, The (1972)",5.0,4.42916
Roger & Me (1989),5.0,4.38382
"Maltese Falcon, The (1941)",5.0,4.27913
Psycho (1960),5.0,4.221948
Pulp Fiction (1994),5.0,4.20926
Monty Python and the Holy Grail (1975),5.0,4.196599
Reservoir Dogs (1992),5.0,4.169923
Star Wars: Episode V - The Empire Strikes Back (1980),5.0,4.163269
"Princess Bride, The (1987)",5.0,4.159288
Vertigo (1958),5.0,4.150801
