# 가짜연구소 4기 - 추천시스템 톺아보기
## Collaborative Filtering
- 작성자: 김민수(kimminsu.ds@gmail.com)
- 출처: https://grouplens.org/datasets/movielens/latest/

## 00. 환경설정

### 00-01. 패키지

In [1]:
import os
import pandas as pd
import seaborn as sns
import scipy
import numpy as np
import random
from matplotlib import pyplot as plt
from datetime import datetime
from tqdm.auto import tqdm
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

### 00-02. MovieLens 데이터

In [2]:
path = "../data/MovieLens/"
ratings_df = pd.read_csv(path + 'ratings.csv', encoding="utf-8")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
user_id_list = sorted(list(set(ratings_df['userId'].values)))
movie_id_list = sorted(list(set(ratings_df['movieId'].values)))
print("유저 수: ", len(user_id_list), "\t영화 수: ", len(movie_id_list))

유저 수:  610 	영화 수:  9724


#### 유저-아이템 행렬 생성

In [4]:
user_item_matrix = ratings_df.pivot_table('rating', 'userId', 'movieId')
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


- null값에 대해 0으로 imputation
    - imputation 방법에 따라 성능이 달라지므로 주의

In [5]:
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### train/test셋 분리

In [6]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=10)
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


#### train 데이터로 다시 유저-아이템 행렬 생성
- test 데이터에만 포함되고 train 데이터에는 포함되지 않는 영화도 존재

In [7]:
user_item_matrix = train_df.pivot('userId', 'movieId', 'rating').fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193573,193579,193581,193583,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 01. User-based CF
- 유저 간 유사도 행렬 생성
- 행렬의 차원은 유저 수 X 유저 수

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_item_matrix, user_item_matrix)
user_similarity_df = pd.DataFrame(data=user_similarity, columns=user_item_matrix.index, index=user_item_matrix.index)
user_similarity_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.016301,0.002372,0.159681,0.077843,0.086387,0.129031,0.142718,0.076267,0.011988,...,0.072078,0.134184,0.188766,0.057567,0.134504,0.152000,0.216784,0.238994,0.085660,0.126228
2,0.016301,1.000000,0.000000,0.004447,0.021457,0.000000,0.033068,0.030285,0.000000,0.077491,...,0.203789,0.000000,0.014237,0.000000,0.000000,0.017086,0.015708,0.026126,0.035100,0.062231
3,0.002372,0.000000,1.000000,0.002876,0.006938,0.001567,0.000000,0.005876,0.000000,0.000000,...,0.003541,0.006292,0.017135,0.000000,0.000000,0.008137,0.021943,0.013369,0.000000,0.018756
4,0.159681,0.004447,0.002876,1.000000,0.095364,0.062998,0.072072,0.054389,0.015945,0.032033,...,0.081442,0.086043,0.246229,0.041505,0.083509,0.147993,0.105575,0.116951,0.038205,0.093053
5,0.077843,0.021457,0.006938,0.095364,1.000000,0.232372,0.050577,0.457225,0.000000,0.020130,...,0.029949,0.360573,0.087434,0.136533,0.133002,0.074504,0.097953,0.111661,0.181444,0.041444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.152000,0.017086,0.008137,0.147993,0.074504,0.058634,0.166350,0.095844,0.065156,0.076395,...,0.132795,0.093870,0.235955,0.044477,0.108305,1.000000,0.115738,0.217882,0.051468,0.157066
607,0.216784,0.015708,0.021943,0.105575,0.097953,0.107113,0.122464,0.166779,0.016895,0.009983,...,0.062090,0.147275,0.161968,0.089290,0.112441,0.115738,1.000000,0.210962,0.156866,0.105044
608,0.238994,0.026126,0.013369,0.116951,0.111661,0.128532,0.272866,0.153214,0.061451,0.061855,...,0.149167,0.143084,0.181972,0.108152,0.151388,0.217882,0.210962,1.000000,0.097806,0.265188
609,0.085660,0.035100,0.000000,0.038205,0.181444,0.164772,0.048867,0.387803,0.000000,0.022308,...,0.000000,0.287308,0.022737,0.154484,0.071690,0.051468,0.156866,0.097806,1.000000,0.047442


### 01-01. Average rating

#### 15번 유저에 대해서 test 데이터의 평점 예측

In [9]:
user_id = 15
user_test_df = test_df[test_df.userId == user_id]
user_test_df

Unnamed: 0,userId,movieId,rating,timestamp
1557,15,122904,2.0,1510571949
1484,15,3535,3.5,1510572486
1561,15,134853,4.5,1510572481
1500,15,5445,4.0,1510571793
1553,15,115713,2.0,1510572009
1497,15,4886,3.5,1510577956
1547,15,109487,4.0,1510571878
1504,15,5989,5.0,1510571938
1467,15,2011,5.0,1510572060
1558,15,122922,2.0,1510572670


In [10]:
# average rating

result = []

for _, row in user_test_df.iterrows():
    
    user_id  = row['userId']
    movie_id = row['movieId']
    rating   = row['rating']
    
    if movie_id in user_item_matrix.columns:
        
        # 영화에 평점을 매긴 다른 유저들을 가져옵니다
        user_movie_matrix = user_item_matrix[user_item_matrix[movie_id] > 0][[movie_id]]
        
        # 다른 유저들의 해당 영화의 평점 평균을 구합니다
        numerator = user_movie_matrix[movie_id].sum()
        
        # 유저들의 숫자를 구합니다
        denominator = len(user_movie_matrix)
        
        predicted_rating = numerator / denominator
        
        result.append([user_id, movie_id, rating, predicted_rating])
    
result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
result_df

Unnamed: 0,userId,movieId,rating,predictedRating
0,15.0,122904.0,2.0,3.965116
1,15.0,3535.0,3.5,3.704545
2,15.0,134853.0,4.5,3.771429
3,15.0,5445.0,4.0,3.666667
4,15.0,115713.0,2.0,4.047619
5,15.0,4886.0,3.5,3.93
6,15.0,109487.0,4.0,4.042373
7,15.0,5989.0,5.0,3.912088
8,15.0,2011.0,5.0,3.5
9,15.0,122922.0,2.0,3.75


#### 성능 지표(RMSE/MAE) 확인

In [11]:
# rmse 구하기
mse = mean_squared_error(y_true = result_df['rating'].values, y_pred = result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = result_df['rating'].values, y_pred = result_df['predictedRating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 1.2809291818972557
mae 1.0537538379393914


#### train data의 전체 평균 평점

In [12]:
global_average = train_df['rating'].mean()
global_average

3.503954480091238

#### 전체 test 데이터에 대해서 average rating

In [13]:
result = []

for _, row in tqdm(test_df.iterrows()):
    
    user_id  = row['userId']
    movie_id = row['movieId']
    rating   = row['rating']
    
    if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:
        
        user_movie_matrix = user_item_matrix[user_item_matrix[movie_id] > 0][[movie_id]]
        numerator = user_movie_matrix[movie_id].sum()
        denominator = len(user_movie_matrix)
        predicted_rating = numerator / denominator
        
        result.append([user_id, movie_id, rating, predicted_rating])
    
    else:
        result.append([user_id, movie_id, rating, global_average])
        
    
result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
result_df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,3.959770
1,384.0,2763.0,3.0,3.742857
2,52.0,58559.0,5.0,4.270492
3,600.0,719.0,2.5,2.833333
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.142857
20164,177.0,6787.0,3.0,4.083333
20165,103.0,969.0,4.0,3.857143
20166,42.0,2875.0,3.0,3.083333


#### 성능 지표(RMSE/MAE) 확인

In [14]:
# rmse 구하기
mse = mean_squared_error(y_true = result_df['rating'].values, y_pred = result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = result_df['rating'].values, y_pred = result_df['predictedRating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 0.9792027101742625
mae 0.7553096185597311


### 01-02. Weighted Average Rating

In [15]:
# 전체 test 데이터에 대해서 similarity 활용한 weighted average rating

result = []

for _, row in tqdm(test_df.iterrows()):
    
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    
    if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:
        
        # 영화에 평점을 매긴 다른 유저들을 가져옵니다
        movie_ratings = user_item_matrix[user_item_matrix[movie_id] > 0][movie_id]
        user_ids = movie_ratings.index
        
        # 예측하려는 유저와 해당 영화에 평점을 매긴 다른 유저들간의 유사도를 구합니다
        simliaritys = user_similarity_df[user_ids].loc[user_id]
        
        # 위에서 구한 유사도를 가중치로 사용하여 다른 유저들의 평점의 평균을 구합니다
        numerator = np.inner(movie_ratings.values, simliaritys)
        denominator = simliaritys.sum()
        
        if denominator == 0:
            continue
        predicted_rating = numerator / denominator
        
        result.append([user_id, movie_id, rating, predicted_rating])
    else:
        result.append([user_id, movie_id, rating, global_average])
    
weighted_average_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
weighted_average_result_df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,3.908533
1,384.0,2763.0,3.0,3.765776
2,52.0,58559.0,5.0,4.288992
3,600.0,719.0,2.5,2.555142
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20160,20.0,5015.0,4.0,3.237979
20161,177.0,6787.0,3.0,4.039030
20162,103.0,969.0,4.0,3.725923
20163,42.0,2875.0,3.0,3.061219


#### 성능 지표(RMSE/MAE) 확인

In [16]:
# rmse 구하기
mse = mean_squared_error(y_true = weighted_average_result_df['rating'].values,
                         y_pred = weighted_average_result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = weighted_average_result_df['rating'].values,
                          y_pred = weighted_average_result_df['predictedRating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 0.9713599605151955
mae 0.7488152459430106


### 01-02. k-Nearest Neighborhood CF(user-based)

In [17]:
# 전체 test 데이터에 대해서 similarity가 높은 유저 k명에 대해서만 weighted average rating

k = 20

result = []

for _, row in tqdm(test_df.iterrows()):
    
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    
    if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:
        
        # 영화에 평점을 매긴 다른 유저들을 가져옵니다
        movie_ratings = user_item_matrix[user_item_matrix[movie_id] > 0][movie_id]
        user_ids = movie_ratings.index
        
        # 예측하려는 유저와 해당 영화에 평점을 매긴 다른 유저들 가운데 유사도가 높은 k명의 유저만을 사용합니다.
        candidate_simliaritys = user_similarity_df[user_ids].loc[user_id].sort_values(ascending=False)[:k]
        candidate_movie_ratings = movie_ratings[candidate_simliaritys.index]
        
        # 유사도가 높은 k명의 유저의 유사도와 평점을 사용하여 예측 유저의 평점을 예측합니다.
        numerator = np.inner(candidate_movie_ratings.values, candidate_simliaritys)
        denominator = candidate_simliaritys.sum()
        
        if denominator == 0:
            result.append([user_id, movie_id, rating, global_average])
            continue
        
        predicted_rating = numerator / denominator
        
        result.append([user_id, movie_id, rating, predicted_rating])
    
    else:
        result.append([user_id, movie_id, rating, global_average])
    
k_weighted_average_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
k_weighted_average_result_df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,3.463541
1,384.0,2763.0,3.0,3.819279
2,52.0,58559.0,5.0,4.648799
3,600.0,719.0,2.5,2.464960
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.237979
20164,177.0,6787.0,3.0,4.020931
20165,103.0,969.0,4.0,3.569079
20166,42.0,2875.0,3.0,3.061219


#### 성능 지표(RMSE/MAE) 확인

In [18]:
# rmse 구하기
mse = mean_squared_error(y_true = k_weighted_average_result_df['rating'].values,
                         y_pred = k_weighted_average_result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = k_weighted_average_result_df['rating'].values,
                          y_pred = k_weighted_average_result_df['predictedRating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 0.9705218301732899
mae 0.7473291624002141


## 02. Item based CF
- 아이템끼리의 유사도를 사용하여 평점 예측

#### train 데이터로 아이템-유저 매트릭스 생성

In [19]:
item_user_matrix = train_df.pivot_table('rating', 'movieId', 'userId').fillna(0)
item_user_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- 아이템끼리의 유사도 산출
- 행렬의 차원은 아이템 수 X 아이템 수

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

item_similarity = cosine_similarity(item_user_matrix, item_user_matrix)
item_similarity_df = pd.DataFrame(data=item_similarity,
                                  columns = item_user_matrix.index, index=item_user_matrix.index)
item_similarity_df

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193573,193579,193581,193583,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.338970,0.315604,0.030125,0.261765,0.295128,0.235605,0.093550,0.170233,0.280456,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.338970,1.000000,0.249558,0.094187,0.238732,0.208502,0.224759,0.120127,0.010906,0.274381,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.315604,0.249558,1.000000,0.000000,0.341006,0.240170,0.307270,0.269336,0.273639,0.194403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.030125,0.094187,0.000000,1.000000,0.181493,0.051524,0.251309,0.162301,0.000000,0.104297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.261765,0.238732,0.341006,0.181493,1.000000,0.250941,0.473463,0.117417,0.303564,0.180550,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193581,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193583,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193587,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### 02-01. Weighted Average Rating

In [21]:
# 유저가 본 영화들을 모두 활용해 예측한다.

k = 20

result = []

for _, row in tqdm(test_df.iterrows()):
    
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    
    if user_id in item_user_matrix.columns and movie_id in item_user_matrix.index:
        
        # 해당 유저가 봤던 다른 아이템 리스트를 가져옵니다
        item_ratings = item_user_matrix[item_user_matrix[user_id] > 0][user_id]
        movie_ids = item_ratings.index
        
        # 예측 아이템과 유저가 봤던 다른 아이템 간의 유사도를 구합니다
        item_simliaritys = item_similarity_df[movie_ids].loc[movie_id]
        
        # 다른 아이템들과 예측 아이템과의 유사도를 가중치로 사용하여 평점을 예측합니다
        numerator = np.inner(item_ratings.values, item_simliaritys)
        denominator = item_simliaritys.sum()
        
        if denominator == 0:
            result.append([user_id, movie_id, rating, global_average])
            continue
        
        predicted_rating = numerator / denominator
        
        result.append([user_id, movie_id, rating, predicted_rating])
        
    else:
        result.append([user_id, movie_id, rating, global_average])
    
item_based_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [22]:
item_based_result_df

Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,4.079030
1,384.0,2763.0,3.0,2.963573
2,52.0,58559.0,5.0,4.571206
3,600.0,719.0,2.5,2.959931
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.563257
20164,177.0,6787.0,3.0,3.388352
20165,103.0,969.0,4.0,4.082881
20166,42.0,2875.0,3.0,3.678894


#### 성능 지표(RMSE/MAE) 확인

In [23]:
# rmse 구하기
mse = mean_squared_error(y_true = item_based_result_df['rating'].values,
                         y_pred = item_based_result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = item_based_result_df['rating'].values,
                          y_pred = item_based_result_df['predictedRating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 0.9255275896882756
mae 0.7132160614874838


### 02-02. k-Nearest Neighborhood CF(item-based)

In [24]:
# 유저가 본 영화들 가운데 예측 대상 movie와 비슷한 k개의 영화를 이용해 예측한다.

k = 20

result = []

for _, row in tqdm(test_df.iterrows()):
    
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']
    
    if user_id in item_user_matrix.columns and movie_id in item_user_matrix.index:
        
        # 해당 유저가 봤던 다른 아이템 리스트를 가져옵니다
        item_ratings = item_user_matrix[item_user_matrix[user_id] > 0][user_id]
        movie_ids = item_ratings.index
        
        # 예측 아이템과 유저가 봤던 다른 아이템 간의 유사도를 구하고, 유사도가 가장 높은 아이템 k개를 선택합니다
        candidate_simliaritys = item_similarity_df[movie_ids].loc[movie_id].sort_values(ascending=False)[:k]
        candidate_item_ratings = item_ratings[candidate_simliaritys.index]
        
        # 선택된 k개의 아이템의 평점과 예측 아이템과의 유사도를 가중치로 사용하여 평점을 예측합니다
        numerator = np.inner(candidate_item_ratings.values, candidate_simliaritys)
        denominator = candidate_simliaritys.sum()
        
        if denominator == 0:
            result.append([user_id, movie_id, rating, global_average])
            continue
        
        predicted_rating = numerator / denominator
        
        result.append([user_id, movie_id, rating, predicted_rating])
        
    else:
        result.append([user_id, movie_id, rating, global_average])
    
knn_item_based_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [25]:
knn_item_based_result_df

Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,4.552831
1,384.0,2763.0,3.0,2.969577
2,52.0,58559.0,5.0,4.627186
3,600.0,719.0,2.5,2.782147
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.383256
20164,177.0,6787.0,3.0,3.218865
20165,103.0,969.0,4.0,4.216606
20166,42.0,2875.0,3.0,3.381899


#### 성능 지표(RMSE/MAE) 확인

In [26]:
# rmse 구하기
mse = mean_squared_error(y_true = knn_item_based_result_df['rating'].values,
                         y_pred = knn_item_based_result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = knn_item_based_result_df['rating'].values,
                          y_pred = knn_item_based_result_df['predictedRating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 0.8737220918753847
mae 0.6635483136467064
