In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

#### movieLens 100K 데이터 불러오기
* u.user : 사용자 데이터
* u.item : 영화에 대한 데이터
    * movie_id : 영화 id
    * title : 제목 
    * release date : 개봉날짜
    * 'unknown' 부터 'Western'까지 19가지의 변수로 0또는 1로 장르를 표현
        * 예를들어 액션 영화는 'Action'에 1로 표시되며 나머지 18개 장르 변수는 0으로 표시
* u.data : 영화평가(rating) 데이터
    * user_id : 사용자 id
    * movie_id : 영화 id
    * rating : 평점 (1~5)
    * timestamp : 평가한 연도/날짜/시간

In [2]:
# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('./Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

In [3]:
# train, test set 분리
x = ratings.copy()
y = ratings['user_id']
# 계층적 데이터 추출 옵션 (분류 모델에서 추천!)
# 여러 층으로 분할후 각 층별로 렌덤 데이터 추출, 원래 데이터의 분포와 유사하게 데이터 추출
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

In [4]:
x_train

Unnamed: 0,user_id,movie_id,rating
1878,279,79,3
10040,270,747,5
1173,6,517,4
85397,197,802,4
48133,655,70,2
...,...,...,...
75855,279,946,3
44980,318,722,4
49622,650,608,4
39315,405,379,1


In [5]:
# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수 
def score(model, neighbor_size=0):
    # 예측 대상인 test set에 있는 사용자(user_id)와 영화(movie_id)를 pair로 짝을 맞춰 데이터를 만든다.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # 모든 사용자-영화 짝에 대해서 주어진 예측 모델에 의한 예측값을 계산해 y_pred에 저장한다.
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    # 얻어진 예측값 배열과 실제값(rating)에 대한 RMSE를 계산하여 반환한다.
    return RMSE(y_true, y_pred)

In [6]:
# train 데이터로 Full matrix 구하기 
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1674,1675,1677,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,,,1.0,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,,,,,...,,,,,,,,,,
941,,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# train set의 모든 가능한 사용자 pair의 Cosine similarities 계산
# 코사인 유사도를 계산하기 위해 rating값을 matrix_dummy에 복사한다.
# 코사인 유사도를 계산할때 NaN값이 있으면 에러가 발생하므로 NaN값을 0으로 바꿔준다.
matrix_dummy = rating_matrix.copy().fillna(0)
# 모든 사용자 간의 코사인 유사도를 구한다.
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity

array([[1.        , 0.11503833, 0.03379491, ..., 0.09201245, 0.13365915,
        0.28667303],
       [0.11503833, 1.        , 0.11065229, ..., 0.13938804, 0.15647346,
        0.10313831],
       [0.03379491, 0.11065229, 1.        , ..., 0.1326662 , 0.07782386,
        0.0342218 ],
       ...,
       [0.09201245, 0.13938804, 0.1326662 , ..., 1.        , 0.06386232,
        0.04457529],
       [0.13365915, 0.15647346, 0.07782386, ..., 0.06386232, 1.        ,
        0.09923354],
       [0.28667303, 0.10313831, 0.0342218 , ..., 0.04457529, 0.09923354,
        1.        ]])

In [8]:
# user_similarity에 index를 지정해준다.
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.115038,0.033795,0.060518,0.291783,0.335600,0.342459,0.184404,0.036107,0.304313,...,0.267410,0.069875,0.189716,0.144729,0.127010,0.098832,0.253685,0.092012,0.133659,0.286673
2,0.115038,1.000000,0.110652,0.129119,0.083546,0.197608,0.061168,0.004504,0.128000,0.155999,...,0.107501,0.245472,0.298646,0.301185,0.185990,0.138093,0.156902,0.139388,0.156473,0.103138
3,0.033795,0.110652,1.000000,0.295071,0.000000,0.064631,0.055333,0.085930,0.023258,0.039216,...,0.009376,0.056940,0.099851,0.046466,0.077066,0.000000,0.127313,0.132666,0.077824,0.034222
4,0.060518,0.129119,0.295071,1.000000,0.041711,0.050703,0.065445,0.157018,0.059498,0.059137,...,0.029982,0.048554,0.135802,0.198113,0.054326,0.000000,0.193745,0.185119,0.179976,0.060031
5,0.291783,0.083546,0.000000,0.041711,1.000000,0.198597,0.277194,0.157623,0.044796,0.174914,...,0.250950,0.031514,0.065141,0.104238,0.090993,0.071992,0.200330,0.110538,0.129010,0.244168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.098832,0.138093,0.000000,0.000000,0.071992,0.106509,0.063387,0.065493,0.000000,0.066070,...,0.036683,0.291969,0.205895,0.174449,0.395931,1.000000,0.052249,0.160862,0.038752,0.083464
940,0.253685,0.156902,0.127313,0.193745,0.200330,0.257199,0.262917,0.147691,0.100448,0.270607,...,0.236453,0.081972,0.126331,0.122637,0.092983,0.052249,1.000000,0.094874,0.191103,0.184749
941,0.092012,0.139388,0.132666,0.185119,0.110538,0.087165,0.046684,0.146905,0.106031,0.076452,...,0.000000,0.173055,0.244891,0.255963,0.191283,0.160862,0.094874,1.000000,0.063862,0.044575
942,0.133659,0.156473,0.077824,0.179976,0.129010,0.231954,0.204631,0.092687,0.082103,0.151186,...,0.164111,0.050251,0.066183,0.092645,0.073554,0.038752,0.191103,0.063862,1.000000,0.099234


#### 평가 경향 고려
CF의 정확도를 개선시키는 방법 중의 하나는 사용자의 평가경향(user bias)을 고려해서 예측치를 조정하는 것이다.  
사용자에 따라서 평가를 전체적으로 높게 하는 사람이 있는 반면에 평가를 전체적으로 낮게 하는 사람이 있는 등, 사람에 따라 평가경향이 다르다.  
예를 들어 현재 추천의 대상이 되는 사용자의 평점평균이 3.0인 데 비해 예측치를 구하는 데 사용된 사용자(이웃)들의 평점평균은 4.0이라고 가정하자. 만일 이웃의 원래 평점으로 계산한 예측값이 3.5라면 현재 사용자의 평가경향을 고려하면 3.5-1(이웃과 현 사용자와 평점평균 차이)=2.5가 합리적인 예측치일 것이다. 사용자들의 평가 경향을 고려하는 경우 예측치를 계산하는 방법을 수식으로 표시하면 다음과 같다.  
  
  
$$p_{a,i} = \overline{r_a} + \frac{\sum_{u=1}^{n}w_{a,u}\ast (r_{u,i}-\overline{r_u})}{\sum_{u=1}^{n}w_{a,u}}$$  
* $a$ : 사용자, $u$ : 이웃 사용자, $n$ : 이웃 사용자의 수  
* $p_{a,i}$ : 아이템 $i$에 대한 사용자 $a$의 예상 평점
* $w_{a,u}$ : 사용자 $a$와 $u$의 유사도
* $r_{u,i}$ : 아이템 $i$에 대한 사용자 $u$의 평점
* $\overline{r_a}$ : 사용자 $a$의 전체 평점평균
* $\overline{r_u}$ : 사용자 $u$의 전체 평점평균  
  
위 식을 바탕으로 예측치를 구하는 알고리즘은 아래와 같다.
* 각 사용자의 평점평균을 구한다.
* 각 아이템의 평점을 각 사용자의 평균에서의 차이(평점-해당 사용자의 평점평균)로 변환한다.
* 평점편차(평점과 평균의 차이)를 사용해서 해당 사용자의 해당 아이템의 편차 예측값(평점편차의 예측값)을 구한다. 구체적으로는 해당 사용자의 이웃을 구하고 이들 이웃의 해당 아이템에 대한 평점편차와 유사도를 가중평균한다.
* 이렇게 구한 평차 예측값은 평균에서의 차이를 의미하기 때문에 현 사용자의 평균에 이 편차 예측값을 더해준다.
* 예측값을 구할 수 없는 경우에 해당 사용자의 평점평균으로 대체한다.

In [9]:
# train 데이터의 user의 rating 평균과 영화의 평점편차 계산 
rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T

In [10]:
rating_bias

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1674,1675,1677,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.372549,-0.627451,0.372549,-0.627451,-0.627451,,,-2.627451,1.372549,,...,,,,,,,,,,
2,0.255319,,,,,,,,,-1.744681,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.122137,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,0.594595,,...,,,,,,,,,,
940,,,,-1.387500,,,,,,,...,,,,,,,,,,
941,,,,,,,-0.058824,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


#### 신뢰도 가중(significance weighting)
* 유사도를 신뢰도에 따라서 가중
* 공통으로 평가한 아이템이 많은 사용자와의 유사도에 공통 아이템이 적은 사용자와의 유사도보다 더 큰 가중치를 준다.
* 신뢰도(공통으로 평가한 영화의 수)가 일정 이산인 사용자만을 예측치 계산에 사용

In [11]:
# 사용자별 공통 평가 수 계산
rating_binary1 = np.array((rating_matrix > 0).astype(float))
rating_binary2 = rating_binary1.T
counts = np.dot(rating_binary1, rating_binary2)
counts = pd.DataFrame(counts, index=rating_matrix.index, columns=rating_matrix.index).fillna(0)

In [12]:
counts

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,204.0,8.0,5.0,6.0,45.0,54.0,86.0,16.0,2.0,44.0,...,43.0,7.0,25.0,11.0,19.0,9.0,30.0,5.0,16.0,44.0
2,8.0,47.0,7.0,4.0,4.0,19.0,8.0,1.0,4.0,12.0,...,7.0,9.0,22.0,11.0,11.0,5.0,8.0,5.0,8.0,7.0
3,5.0,7.0,40.0,9.0,0.0,6.0,8.0,4.0,1.0,4.0,...,1.0,2.0,7.0,3.0,5.0,0.0,9.0,4.0,4.0,2.0
4,6.0,4.0,9.0,18.0,2.0,3.0,6.0,6.0,1.0,3.0,...,1.0,1.0,6.0,5.0,2.0,0.0,7.0,3.0,6.0,3.0
5,45.0,4.0,0.0,2.0,131.0,27.0,55.0,14.0,3.0,22.0,...,33.0,2.0,7.0,5.0,12.0,6.0,17.0,4.0,11.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,9.0,5.0,0.0,0.0,6.0,9.0,7.0,2.0,0.0,4.0,...,3.0,10.0,13.0,5.0,19.0,37.0,2.0,4.0,2.0,5.0
940,30.0,8.0,9.0,7.0,17.0,29.0,38.0,9.0,4.0,30.0,...,24.0,4.0,13.0,6.0,8.0,2.0,80.0,3.0,12.0,16.0
941,5.0,5.0,4.0,3.0,4.0,6.0,4.0,5.0,2.0,4.0,...,0.0,4.0,11.0,6.0,7.0,4.0,3.0,17.0,2.0,2.0
942,16.0,8.0,4.0,6.0,11.0,23.0,26.0,5.0,3.0,13.0,...,14.0,2.0,7.0,3.0,6.0,2.0,12.0,2.0,59.0,8.0


In [13]:
def CF_knn_bias_sig(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias:
        # 현 user와 다른 사용자 간의 유사도 가져오기
        sim_scores = user_similarity[user_id]
        # 현 movie의 평점편차 가져오기
        movie_ratings = rating_bias[movie_id]
        # 현 movie에 대한 rating이 없는 사용자 표시
        no_rating = movie_ratings.isnull()
        # 현 사용자와 다른 사용자간 공통 평가 아이템 수 가져오기 
        common_counts = counts[user_id]
        # 공통으로 평가한 영화의 수가 SIG_LEVEL보다 낮은 사용자 표시
        low_significance = common_counts < SIG_LEVEL
        # 평가를 안 하였거나, SIG_LEVEL이 기준 이하인 user 제거
        none_rating_idx = movie_ratings[no_rating | low_significance].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        # Neighbor size가 지정되지 않은 경우        
        if neighbor_size == 0:
            # 편차로 예측값(편차 예측값) 계산
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            # 편차 예측값에 현 사용자의 평균 더하기
            prediction = prediction + rating_mean[user_id]
        # Neighbor size가 지정된 경우            
        else:
            # 해당 영화를 평가한 사용자가 최소 MIN_RATINGS 이상인 경우에만 계산            
            if len(sim_scores) > MIN_RATINGS:
                # 지정된 neighbor size 값과 해당 영화를 평가한 총사용자 수 중 작은 것으로 결정
                neighbor_size = min(neighbor_size, len(sim_scores))
                # array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                # 유사도를 순서대로 정렬
                user_idx = np.argsort(sim_scores)
                # 유사도와 rating을 neighbor size만큼 받기
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                # 편차로 예측치 계산
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                # 예측값에 현 사용자의 평균 더하기
                prediction = prediction + rating_mean[user_id]
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    # 경우에 따라서 예측값이 1보다 작거나 5보다 큰 경우에 대한 처리
    if prediction < 1:
        prediction = 1
    elif prediction > 5:
        prediction = 5
    return prediction

SIG_LEVEL = 3
MIN_RATINGS = 2

#### 최적의 neighbor size 구하기

In [14]:
best_error = 1e9
best_nighbor_size = 0
for neighbor_size in [10, 20, 30, 40, 50, 60]:
    error = score(CF_knn_bias_sig, neighbor_size)
    print("Neighbor size = %d : RMSE = %.4f" % (neighbor_size, error))
    if error < best_error :
        best_error = error
        best_nighbor_size = neighbor_size
print(f"Best neighbor size = {best_nighbor_size}")

Neighbor size = 10 : RMSE = 0.9505
Neighbor size = 20 : RMSE = 0.9401
Neighbor size = 30 : RMSE = 0.9383
Neighbor size = 40 : RMSE = 0.9392
Neighbor size = 50 : RMSE = 0.9403
Neighbor size = 60 : RMSE = 0.9409
Best neighbor size = 30


#### 주어진 사용자에 대해 추천받기

In [15]:
def recom_movie(user_id, n_items, neighbor_size=30):
    # 현 사용자가 평가한 영화 가져오기
    user_movie = rating_matrix.loc[user_id].copy()
    for movie in rating_matrix:
        # 현 사용자가 이미 평가한 영화는 제외 (평점을 0으로)        
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0
        # 현 사용자가 평가하지 않은 영화의 예상 평점 계산
        else:
            user_movie.loc[movie] = CF_knn_bias_sig(user_id, movie, neighbor_size)
    # 영화를 예상 평점에 따라 정렬해서 제목을 뽑아서 돌려 줌
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=2, n_items=5, neighbor_size=best_nighbor_size)

movie_id
515                      Local Hero (1983)
64      What's Eating Gilbert Grape (1993)
1189               That Old Feeling (1997)
119                      Striptease (1996)
1449                Golden Earrings (1947)
Name: title, dtype: object