# 가짜연구소 4기 - 추천시스템 톺아보기
## Contents-based recommendation using TF-IDF
- 작성자: 김민수(kimminsu.ds@gmail.com)
- 출처: https://grouplens.org/datasets/movielens/latest/

## 00. 환경설정

### 00-01. 패키지

In [1]:
import pandas as pd
import numpy  as np

from math import log
from tqdm import tqdm

## 00-02. MovieLens 데이터

In [2]:
path = "../data/MovieLens/"

#### `ratings.csv`: 평점이 존재하는 모든 영화 정보

In [3]:
ratings_df = pd.read_csv(path + 'ratings.csv', encoding="utf-8")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


#### `movies.csv`: 영화에 대한 정보

In [4]:
movies_df = pd.read_csv(path + 'movies.csv', encoding="utf-8", index_col = 'movieId')
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


## 01. tf-idf를 이용한 영화 추천
- 개별 영화에 대한 아이템 프로파일을 구축하기 위해 영화 정보를 Feature로 사용
- 영화에 대한 Feature를 TF-IDF로 만들기 위해서 영화의 정보를 담고 있는 Document와 Word가 필요
- **`개별 영화를 Document, 영화를 표현하는 장르들을 Word`**로 설정하고 TF-IDF 사용

### 01-01. 사용자 정의 함수

In [5]:
def tf(word, document):
    '''
    [Description]
        Input으로 주어진 document에서 word의 TF(term-frequency)값을 반환하는 함수
    [Input]
        word: 단어
        document: 문서
    [Output]
        document에서 word가 등장한 횟수
    '''    
    
    return document.count(word)

In [6]:
def idf(word):
    '''
    [Description]
        Input으로 주어진 word의 IDF(Inverse Document Frequency)값을 반환하는 함수
    [Input]
        word: 단어
    [Output]
        word의 IDF
    '''
    
    df = 0
    for document in documnet_list:
        df += 1 if word in document else 0
        
    return log(N / (df+1))

In [7]:
def tfidf(word, document):
    '''
    [Description]
        Input으로 주어진 document에서 word의 tf-idf값을 반환하는 함수
    [Input]
        word: 단어
        documment: 문서
    [Output]
        document에서 word의 tf-idf값ㅂ    
    '''
    
    return tf(word, document) * idf(word)

### 01-02. tf-idf 값 구하기
- 개별 영화를 **`document`**, 영화를 표현하는 장르들을 **`word`**로 설정

#### 전체 영화(문서)의 개수

In [8]:
N = len(movies_df)
N

9742

#### 전체 영화 장르(단어)의 개수

In [9]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [10]:
genres = []

for g in movies_df.genres:
    genres.extend(g.split("|"))
    
genres[:6]

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Adventure']

- (no genres listed)도 하나의 genre로 보고 하나의 word로 취급 (NLP UNK 토큰 개념)

In [11]:
genres = list(set(genres))
genres

['Horror',
 'Children',
 'Comedy',
 '(no genres listed)',
 'Crime',
 'Adventure',
 'Film-Noir',
 'Animation',
 'Mystery',
 'Thriller',
 'IMAX',
 'Fantasy',
 'Romance',
 'War',
 'Western',
 'Sci-Fi',
 'Musical',
 'Documentary',
 'Action',
 'Drama']

In [12]:
print("전체 단어 개수: {}".format(len(genres)))

전체 단어 개수: 20


In [13]:
movies_df[movies_df['genres']=='(no genres listed)'].head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
114335,La cravate (1957),(no genres listed)
122888,Ben-hur (2016),(no genres listed)
122896,Pirates of the Caribbean: Dead Men Tell No Tal...,(no genres listed)
129250,Superfast! (2015),(no genres listed)
132084,Let It Be Me (1995),(no genres listed)


#### 전체 단어에 대한 IDF 구하기
- DF(Document Frequency): 각 단어가 문서에 등장하는 횟수

In [14]:
# genres를 key로, 기본값이 0인 dictionary 생성
df_dict = dict.fromkeys(genres, 0)

for genre_list in tqdm(movies_df['genres']):
    for genre in genres:
        df_dict[genre] += genre in genre_list.split("|")
        
df_dict

100%|██████████████████████████████████████████████████████████████████████████| 9742/9742 [00:00<00:00, 143712.99it/s]


{'Horror': 978,
 'Children': 664,
 'Comedy': 3756,
 '(no genres listed)': 34,
 'Crime': 1199,
 'Adventure': 1263,
 'Film-Noir': 87,
 'Animation': 611,
 'Mystery': 573,
 'Thriller': 1894,
 'IMAX': 158,
 'Fantasy': 779,
 'Romance': 1596,
 'War': 382,
 'Western': 167,
 'Sci-Fi': 980,
 'Musical': 334,
 'Documentary': 440,
 'Action': 1828,
 'Drama': 4361}

- 단어별 IDF(Inverse-Document Frequency) 구하기

In [15]:
idf_dict = dict.fromkeys(df_dict.keys())
for key, value in df_dict.items():
    idf_dict[key] = log(N / (value+1))
    
idf_dict

{'Horror': 2.2976700718359777,
 'Children': 2.684414673710634,
 'Comedy': 0.9528256687925191,
 '(no genres listed)': 5.6288536528770745,
 'Crime': 2.0941248785903963,
 'Adventure': 2.0421651396596854,
 'Film-Noir': 4.706864899888282,
 'Animation': 2.767469431854162,
 'Mystery': 2.8315723180469217,
 'Thriller': 1.6372275968499612,
 'IMAX': 4.115297512146256,
 'Fantasy': 2.5249077946828504,
 'Romance': 1.8083195661514755,
 'War': 3.236166725185842,
 'Western': 4.060237734963229,
 'Sci-Fi': 2.295629254801125,
 'Musical': 3.3700711825414214,
 'Documentary': 3.095156838919642,
 'Action': 1.6726770659756223,
 'Drama': 0.8035157676049136}

- 전체 단어에 대한 TF-IDF 구하기

In [16]:
result = []
index_list = []

for movie_id, row in tqdm(movies_df.iterrows()):
    doc_tf = []
    document = row['genres']
    
    for genre in genres:
        doc_tf.append(tf(genre, document) * idf_dict[genre])
        
    result.append(doc_tf)
    index_list.append(movie_id)
    
tfidf_df = pd.DataFrame(result, columns=genres, index=index_list).sort_index()

9742it [00:00, 12350.25it/s]


#### 개별 영화에 대한 TF-IDF 벡터 데이터프레임
- 각 영화(문서)는 차원 수가 단어의 개수인 벡터로 표현

In [17]:
tfidf_df

Unnamed: 0,Horror,Children,Comedy,(no genres listed),Crime,Adventure,Film-Noir,Animation,Mystery,Thriller,IMAX,Fantasy,Romance,War,Western,Sci-Fi,Musical,Documentary,Action,Drama
1,0.0,2.684415,0.952826,0.0,0.0,2.042165,0.0,2.767469,0.0,0.0,0.0,2.524908,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
2,0.0,2.684415,0.000000,0.0,0.0,2.042165,0.0,0.000000,0.0,0.0,0.0,2.524908,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
3,0.0,0.000000,0.952826,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,1.80832,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
4,0.0,0.000000,0.952826,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,1.80832,0.0,0.0,0.0,0.0,0.0,0.000000,0.803516
5,0.0,0.000000,0.952826,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.000000,0.952826,0.0,0.0,0.000000,0.0,2.767469,0.0,0.0,0.0,2.524908,0.00000,0.0,0.0,0.0,0.0,0.0,1.672677,0.000000
193583,0.0,0.000000,0.952826,0.0,0.0,0.000000,0.0,2.767469,0.0,0.0,0.0,2.524908,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
193585,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.803516
193587,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,2.767469,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,1.672677,0.000000


## 02. 아이템 유사도 기반 추천
- 어떤 영화가 주어졌을 때 그 영화와 코사인 유사도가 가장 높은 영화를 추천

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index], columns=b.index)
    
    return result_df

#### 각 영화들 사이의 유사도 계산

In [19]:
movie_sim_df = cos_sim_matrix(tfidf_df, tfidf_df)
movie_sim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,1.000000,0.821277,0.086580,0.080578,0.185731,0.000000,0.086580,0.657468,0.000000,0.261707,...,0.409432,0.518058,0.141984,0.539452,0.0,0.691516,0.753482,0.000000,0.461676,0.185731
2,0.821277,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.800544,0.000000,0.318658,...,0.000000,0.000000,0.000000,0.000000,0.0,0.359250,0.391443,0.000000,0.000000,0.000000
3,0.086580,0.000000,1.000000,0.930671,0.466160,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.108904,0.000000,0.356361,0.000000,0.0,0.105457,0.114907,0.000000,0.000000,0.466160
4,0.080578,0.000000,0.930671,1.000000,0.433841,0.000000,0.930671,0.000000,0.000000,0.000000,...,0.101354,0.102011,0.567512,0.000000,0.0,0.098145,0.106940,0.365857,0.000000,0.433841
5,0.185731,0.000000,0.466160,0.433841,1.000000,0.000000,0.466160,0.000000,0.000000,0.000000,...,0.233619,0.000000,0.764462,0.000000,0.0,0.226224,0.246496,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.691516,0.359250,0.105457,0.098145,0.226224,0.211509,0.105457,0.000000,0.397135,0.213852,...,0.661569,0.631007,0.172940,0.657066,0.0,1.000000,0.917760,0.000000,0.767757,0.226224
193583,0.753482,0.391443,0.114907,0.106940,0.246496,0.000000,0.114907,0.000000,0.000000,0.000000,...,0.543386,0.687551,0.188437,0.715945,0.0,0.917760,1.000000,0.000000,0.612723,0.246496
193585,0.000000,0.000000,0.000000,0.365857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.278828,0.644669,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.461676,0.000000,0.000000,0.000000,0.000000,0.275490,0.000000,0.000000,0.517266,0.278541,...,0.792853,0.821883,0.000000,0.855825,0.0,0.767757,0.612723,0.000000,1.000000,0.000000


#### 아이템 유사도 기반 Top K Recommendation
- (예시) Black Butler: Book of the Atlantic (2017)

In [20]:
k = 10

given_movie = 'Black Butler: Book of the Atlantic (2017)'
movie_id = movies_df[movies_df.title == given_movie].index[0]

for index, value in movie_sim_df[movie_id].sort_values(ascending=False)[:k].items():
    recommend_movie_id = index[0]
    print(movies_df.loc[recommend_movie_id]['title'], value)

Black Butler: Book of the Atlantic (2017) 1.0
Superman/Batman: Public Enemies (2009) 0.9740752144041195
Dante's Inferno: An Animated Epic (2010) 0.9740752144041195
Justice League: Doom (2012)  0.9740752144041195
Daddy, I'm A Zombie (2012) 0.9177602335851786
Monkeybone (2001) 0.9177602335851786
Mickey's Once Upon a Christmas (1999) 0.9177602335851786
Anomalisa (2015) 0.9177602335851786
South Park: Imaginationland (2008) 0.9177602335851786
Cool World (1992) 0.9177602335851786


## 03. 유저 유사도 기반 추천시스템 구현
- 데이터를 train/test 셋으로 분리
- 아이템 feature 활용을 위해 TF-IDF 벡터 생성
- 아이템 벡터를 사용하여 **`유저 프로파일 벡터`** 생성
- 유저 프로파일 벡터와 추천 후보 아이템 벡터의 유사도를 계싼하여 사용자에게 Top K개 아이템 추천

- Top K 아이템 추천의 경우, 추천된 아이템 K개 중 사용자가 실제 선호한 아이템이 얼마나 있는지를 평가
- 평점이 존재하는 영화는 모두 선호한다고 가정

In [21]:
user_id_list = ratings_df['userId'].unique()
movie_id_list = movies_df.index.tolist()
movie_id_set = set(movie_id_list)

In [22]:
len(user_id_list)

610

### 03-01. Hit-ratio로 성능 평가
- 사용자가 선호도를 표시한 아이템 가운데 한 개만 제거하여 test셋에 삽입
- 남은 데이터를 학습 데이터로 사용하여 추천시스템 모델을 만들고 결과를 생성하여 테스트 데이터에 있는 영화와 일치하는지 평가

#### 사용자 1명(10번 유저)에 대해 추천을 수행하고 Hit-ratio 성능 평가

In [23]:
user_id = 10
_df = ratings_df[ratings_df.userId == user_id]
_df

Unnamed: 0,userId,movieId,rating,timestamp
1119,10,296,1.0,1455303387
1120,10,356,3.5,1455301685
1121,10,588,4.0,1455306173
1122,10,597,3.5,1455357645
1123,10,912,4.0,1455302254
...,...,...,...,...
1254,10,119145,1.0,1455302650
1255,10,129428,3.5,1455357384
1256,10,136020,5.0,1455302192
1257,10,137595,4.0,1455356898


- 랜덤하게 하나의 영화를 제거하여 test 데이터로 사용

In [24]:
drop_indices = np.random.choice(_df.index, 1, replace=False)
_df_train = _df.drop(drop_indices)
_df_train

Unnamed: 0,userId,movieId,rating,timestamp
1119,10,296,1.0,1455303387
1120,10,356,3.5,1455301685
1121,10,588,4.0,1455306173
1122,10,597,3.5,1455357645
1123,10,912,4.0,1455302254
...,...,...,...,...
1254,10,119145,1.0,1455302650
1255,10,129428,3.5,1455357384
1256,10,136020,5.0,1455302192
1257,10,137595,4.0,1455356898


#### 유저 벡터 생성
- 유저가 봤던 영화들의 평균이 가장 간단한 유저 벡터를 생성하는 방법

In [25]:
rated_movie_list = _df_train['movieId'].tolist()
user_profile_df = tfidf_df.loc[rated_movie_list, :].mean().to_frame().T
user_profile_df

Unnamed: 0,Horror,Children,Comedy,(no genres listed),Crime,Adventure,Film-Noir,Animation,Mystery,Thriller,IMAX,Fantasy,Romance,War,Western,Sci-Fi,Musical,Documentary,Action,Drama
0,0.03306,0.270373,0.534679,0.0,0.195853,0.455447,0.0,0.298648,0.061113,0.153122,0.532916,0.308802,1.001731,0.093127,0.0,0.082577,0.218206,0.0,0.312875,0.41621


- train 데이터에 있는 영화 ID를 추천 대상에서 제거
- 이미 선호했던 영화를 제외한 나머지 영화 가운데에서 추천 수행

In [26]:
rated_movie_set = set(rated_movie_list)
recommend_movie_list = list(movie_id_set - rated_movie_set)
recommend_movie_tfidf_df = tfidf_df.loc[recommend_movie_list, :]
recommend_movie_tfidf_df

Unnamed: 0,Horror,Children,Comedy,(no genres listed),Crime,Adventure,Film-Noir,Animation,Mystery,Thriller,IMAX,Fantasy,Romance,War,Western,Sci-Fi,Musical,Documentary,Action,Drama
1,0.00000,2.684415,0.952826,0.0,0.000000,2.042165,0.0,2.767469,0.0,0.000000,0.0,2.524908,0.00000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
2,0.00000,2.684415,0.000000,0.0,0.000000,2.042165,0.0,0.000000,0.0,0.000000,0.0,2.524908,0.00000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
3,0.00000,0.000000,0.952826,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.80832,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
4,0.00000,0.000000,0.952826,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.80832,0.000000,0.0,0.0,0.0,0.0,0.000000,0.803516
5,0.00000,0.000000,0.952826,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163809,0.00000,0.000000,0.000000,0.0,0.000000,2.042165,0.0,2.767469,0.0,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.803516
32743,2.29767,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,1.637228,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.803516
98279,0.00000,0.000000,0.952826,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
65514,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.00000,3.236167,0.0,0.0,0.0,0.0,1.672677,0.803516


- 유저 벡터와 나머지 아이템 벡터 간의 유사도 계산

In [27]:
top_k_result_df = cos_sim_matrix(recommend_movie_tfidf_df, user_profile_df)
top_k_result_df

Unnamed: 0,0
1,0.470714
2,0.370128
3,0.727021
4,0.774115
5,0.342343
...,...
163809,0.379061
32743,0.144292
98279,0.342343
65514,0.198950


#### Top K 추천 결과

In [28]:
k = 10

top_k_result_df = top_k_result_df.sort_values(by=0, ascending=False)
top_k_list = top_k_result_df[:k].reset_index()['level_0'].values.tolist()
top_k_list

[852, 55451, 5969, 5902, 55261, 55267, 5856, 5829, 33815, 136840]

- Test 데이터 확인

In [29]:
_df_test = _df.loc[drop_indices.tolist(), :]
test_movie_id = _df_test['movieId'].values[0]
test_movie_id

59333

In [30]:
print("test 선호 영화: ", test_movie_id)
print("top k 추천: ", top_k_list)
print("hit 여부: ", test_movie_id in top_k_list)

test 선호 영화:  59333
top k 추천:  [852, 55451, 5969, 5902, 55261, 55267, 5856, 5829, 33815, 136840]
hit 여부:  False


#### 전체 유저에 대해 추천시스템 구현(Simple user vector 기반 average)

In [31]:
k = 20
hit = 0

for user_id in tqdm(user_id_list):
    
    # 개별 유저 데이터 로드
    _df = ratings_df[ratings_df.userId == user_id]
    
    # train / test 데이터 나누기
    drop_indices = np.random.choice(_df.index, 1, replace=False)
    _df_train = _df.drop(drop_indices)
    
    # user vector를 생성
    rated_movie_list = _df_train['movieId'].tolist()
    user_profile_df = tfidf_df.loc[rated_movie_list, :].mean().to_frame().T
    
    # 이미 유저가 평가를 한 영화는 추천 대상에서 제거
    rated_movie_set = set(rated_movie_list)
    recommend_movie_list = list(movie_id_set - rated_movie_set)
    recommend_movie_tfidf_df = tfidf_df.loc[recommend_movie_list, :]
    
    # 추천 대상 영화들의 item vector와에 user vector의 유사도를 구한 뒤, top k 영화를 뽑는다.
    top_k_result_df = cos_sim_matrix(recommend_movie_tfidf_df, user_profile_df)
    top_k_result_df = top_k_result_df.sort_values(by=0, ascending=False)
    top_k_list = top_k_result_df[:k].reset_index()['level_0'].values.tolist()
    
    # 유저 u에 대한 평가 (hit 여부)
    _df_test = _df.loc[drop_indices.tolist(), :]
    test_movie_id = _df_test['movieId'].values[0]
    
    hit += test_movie_id in top_k_list
    
hit_ratio = hit / len(user_id_list)

100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [00:10<00:00, 59.98it/s]


In [32]:
print('hit_ratio: ', hit_ratio)

hit_ratio:  0.009836065573770493


#### Weighted average user vector 기반 추천시스템 구현 (variant)

In [33]:
k = 20
hit = 0

for user_id in tqdm(user_id_list):
    
    # 개별 유저 데이터 로드
    _df = ratings_df[ratings_df.userId == user_id]
    
    # train / test 데이터 나누기
    drop_indices = np.random.choice(_df.index, 1, replace=False)
    _df_train = _df.drop(drop_indices)
    
    # user vector를 생성
    # 유저가 봤던 영화들의 벡터의 평점을 이용한 가중 평균이 user vector (variant)
    rated_movie_list = _df_train['movieId'].tolist()
    numerator = np.matmul(tfidf_df.loc[rated_movie_list].to_numpy().T, _df_train['rating'].to_numpy())
    denominator = _df_train['rating'].sum()
    user_profile_df = pd.DataFrame([numerator], columns = tfidf_df.columns) / denominator
    
    # 이미 유저가 평가를 한 영화는 추천 대상에서 제거
    rated_movie_set = set(rated_movie_list)
    recommend_movie_list = list(movie_id_set - rated_movie_set)
    recommend_movie_tfidf_df = tfidf_df.loc[recommend_movie_list, :]
    
    # 추천 대상 영화들의 item vector와에 user vector의 유사도를 구한 뒤, top k 영화를 뽑는다.
    top_k_result_df = cos_sim_matrix(recommend_movie_tfidf_df, user_profile_df)
    top_k_result_df = top_k_result_df.sort_values(by=0, ascending=False)
    top_k_list = top_k_result_df[:k].reset_index()['level_0'].values.tolist()
    
    # 유저 u에 대한 평가 (hit 여부)
    _df_test = _df.loc[drop_indices.tolist(), :]
    test_movie_id = _df_test['movieId'].values[0]
    
    hit += test_movie_id in top_k_list
    
hit_ratio = hit / len(user_id_list)

100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [00:14<00:00, 41.58it/s]


In [34]:
print('hit_ratio: ', hit_ratio)

hit_ratio:  0.011475409836065573


### 04. rating prediction
- 평점 예측 문제의 경우 추천 결과를 생성하지 않고 테스트 데이터의 평점을 직접 예측
- RMSE, MAE 같은 지표로 추천시스템의 성능 평가

In [35]:
movie_sim_df.shape

(9742, 9742)

In [36]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=10)

In [37]:
print('train 데이터 개수:', len(train_df))
print('test  데이터 개수:', len(test_df))

train 데이터 개수: 80668
test  데이터 개수: 20168


In [38]:
test_user_list = test_df.userId.unique()
len(test_user_list)

610

#### RMSE/MAE

- 사용자 1명(10번 유저)에 대해서 추천을 수행하고 성능 평가

In [39]:
user_id = 10
user_train_df = train_df[train_df.userId == user_id]
user_train_df.index = user_train_df['movieId']
user_train_df = user_train_df.sort_index()
user_train_df

Unnamed: 0_level_0,userId,movieId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
296,10,296,1.0,1455303387
356,10,356,3.5,1455301685
588,10,588,4.0,1455306173
597,10,597,3.5,1455357645
912,10,912,4.0,1455302254
...,...,...,...,...
113275,10,113275,4.5,1455357698
113394,10,113394,4.0,1455398275
136020,10,136020,5.0,1455302192
137595,10,137595,4.0,1455356898


- train 데이터에 있는 115개의 영화 vector 확인

In [40]:
user_sim_df = movie_sim_df.loc[user_train_df['movieId']]
user_sim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
296,0.060278,0.000000,0.151290,0.240932,0.324545,0.766315,0.151290,0.000000,0.000000,0.293930,...,0.075820,0.076312,0.424541,0.000000,0.0,0.073420,0.079999,0.273688,0.000000,0.324545
356,0.045248,0.000000,0.522619,0.561551,0.243624,0.000000,0.522619,0.000000,0.000000,0.000000,...,0.056915,0.057285,0.318687,0.000000,0.0,0.055114,0.060052,0.205448,0.000000,0.243624
588,0.694848,0.482628,0.079391,0.073887,0.170309,0.000000,0.079391,0.602876,0.000000,0.239976,...,0.375435,0.475041,0.130194,0.494659,0.0,0.363552,0.396129,0.000000,0.423341,0.170309
597,0.086580,0.000000,1.000000,0.930671,0.466160,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.108904,0.000000,0.356361,0.000000,0.0,0.105457,0.114907,0.000000,0.000000,0.466160
912,0.000000,0.000000,0.808480,0.900989,0.000000,0.000000,0.808480,0.000000,0.000000,0.000000,...,0.000000,0.113222,0.261775,0.000000,0.0,0.000000,0.000000,0.406062,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113275,0.141984,0.000000,0.356361,0.567512,0.764462,0.000000,0.356361,0.000000,0.000000,0.000000,...,0.178593,0.179752,1.000000,0.000000,0.0,0.172940,0.188437,0.644669,0.000000,0.764462
113394,0.141984,0.000000,0.356361,0.567512,0.764462,0.000000,0.356361,0.000000,0.000000,0.000000,...,0.178593,0.179752,1.000000,0.000000,0.0,0.172940,0.188437,0.644669,0.000000,0.764462
136020,0.241259,0.293761,0.000000,0.000000,0.000000,0.678782,0.000000,0.366952,0.496414,0.665766,...,0.203588,0.000000,0.000000,0.000000,0.0,0.197144,0.000000,0.000000,0.256778,0.000000
137595,0.141984,0.000000,0.356361,0.567512,0.764462,0.000000,0.356361,0.000000,0.000000,0.000000,...,0.178593,0.179752,1.000000,0.000000,0.0,0.172940,0.188437,0.644669,0.000000,0.764462


- 사용자가 평가한 115개 영화에 대한 평점

In [41]:
user_rating_df = user_train_df[['rating']]
user_rating_df

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
296,1.0
356,3.5
588,4.0
597,3.5
912,4.0
...,...
113275,4.5
113394,4.0
136020,5.0
137595,4.0


- 유사도를 가중치로 사용하여 평점 예측

In [42]:
user_sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)
predict_rating = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (user_sim_sum + 1)
prediction_df = pd.DataFrame(predict_rating, index=movie_sim_df.index).reset_index()
prediction_df = prediction_df.rename(columns={'level_0': 'movieId', 0: 'predict_rating'})
prediction_df

Unnamed: 0,movieId,predict_rating
0,1,3.379647
1,2,3.332194
2,3,3.215757
3,4,3.200339
4,5,3.146149
...,...,...
9737,193581,3.327307
9738,193583,3.299837
9739,193585,2.999658
9740,193587,3.416861


- Test 데이터의 true rating과 비교

In [43]:
result_df = prediction_df.merge(test_df[test_df.userId == user_id], on = 'movieId')
result_df

Unnamed: 0,movieId,predict_rating,userId,rating,timestamp
0,1907,3.284974,10,4.0,1455306183
1,4993,3.301527,10,4.0,1455356385
2,5066,3.194188,10,3.0,1455399329
3,5377,3.200339,10,3.5,1455301898
4,6535,3.146149,10,4.0,1455398379
5,8533,3.194188,10,5.0,1455301847
6,8970,2.999658,10,1.0,1455398160
7,31685,3.215757,10,4.5,1455357602
8,33145,3.200339,10,3.0,1455398153
9,33679,3.282119,10,3.0,1455357626


- 성능 지표(RMSE, MAE) 확인

In [44]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# rmse 구하기
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['predict_rating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true=result_df['rating'].values, y_pred=result_df['predict_rating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 1.134288279167042
mae 0.8684869962217949


#### 전체 유저에 대해 평점 예측

In [45]:
result_df = pd.DataFrame()

for user_id in tqdm(user_id_list):
        
    user_train_df = train_df[train_df.userId == user_id]
    user_train_df.index = user_train_df['movieId']
    user_train_df = user_train_df.sort_index()
    user_sim_df = movie_sim_df.loc[user_train_df['movieId']]
    
    user_rating_df = user_train_df[['rating']]
    
    user_sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)
    predict_rating = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (user_sim_sum + 1)
    prediction_df = pd.DataFrame(predict_rating, index=movie_sim_df.index).reset_index()
    prediction_df = prediction_df.rename(columns={'level_0': 'movieId', 0: 'predict_rating'})
    
    # test
    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [00:20<00:00, 30.19it/s]


In [46]:
print(len(test_df))
print(len(result_df))

20168
20168


- 성능 지표(RMSE, MAE) 확인

In [47]:
# rmse
mse = mean_squared_error(y_true = result_df['rating'].values, y_pred = result_df['predict_rating'].values)
rmse = np.sqrt(mse)

# mae
mae = mean_absolute_error(y_true = result_df['rating'].values, y_pred = result_df['predict_rating'].values)

print("mae", mae)
print("rmse", rmse)

mae 0.7485065244915766
rmse 0.9551243628312982
