In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

#### movieLens 100K 데이터 불러오기
* u.user : 사용자 데이터
* u.item : 영화에 대한 데이터
    * movie_id : 영화 id
    * title : 제목 
    * release date : 개봉날짜
    * 'unknown' 부터 'Western'까지 19가지의 변수로 0또는 1로 장르를 표현
        * 예를들어 액션 영화는 'Action'에 1로 표시되며 나머지 18개 장르 변수는 0으로 표시
* u.data : 영화평가(rating) 데이터
    * user_id : 사용자 id
    * movie_id : 영화 id
    * rating : 평점 (1~5)
    * timestamp : 평가한 연도/날짜/시간

In [2]:
# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('./Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

In [3]:
# train, test set 분리
x = ratings.copy()
y = ratings['user_id']
# 계층적 데이터 추출 옵션 (분류 모델에서 추천!)
# 여러 층으로 분할후 각 층별로 렌덤 데이터 추출, 원래 데이터의 분포와 유사하게 데이터 추출
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

In [4]:
x_train

Unnamed: 0,user_id,movie_id,rating
90325,711,200,4
48930,632,98,4
29206,449,127,5
94430,893,358,2
97310,189,582,5
...,...,...,...
22163,141,235,1
63513,343,410,3
71115,502,343,5
36125,234,524,3


#### 정확도 계산
예측이 정확할 수록 실제값과 예측값 사이의 차이가 줄어들기 때문에 RMSE(Root Mean Squared Error)가 작을수록 정확한 추천 시스템이라고 할 수 있다.  
RMSE를 수식으로 표시하면 아래와 같다.
$$RMSE = \sqrt{\frac{1}{N}\sum_{i=1}^{N}(y_i-\widehat{y_i})^2}$$

In [5]:
# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수 
def score(model):
    # 예측 대상인 test set에 있는 사용자(user_id)와 영화(movie_id)를 pair로 짝을 맞춰 데이터를 만든다.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # 모든 사용자-영화 짝에 대해서 주어진 예측 모델에 의한 예측값을 계산해 y_pred에 저장한다.
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    # 얻어진 예측값 배열과 실제값(rating)에 대한 RMSE를 계산하여 반환한다.
    return RMSE(y_true, y_pred)

In [6]:
# train 데이터로 Full matrix 구하기 
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1675,1676,1677,1678,1680,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# train set의 모든 가능한 사용자 pair의 Cosine similarities 계산
# 코사인 유사도를 계산하기 위해 rating값을 matrix_dummy에 복사한다.
# 코사인 유사도를 계산할때 NaN값이 있으면 에러가 발생하므로 NaN값을 0으로 바꿔준다.
matrix_dummy = rating_matrix.copy().fillna(0)
# 모든 사용자 간의 코사인 유사도를 구한다.
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity

array([[1.        , 0.12446153, 0.02326615, ..., 0.04590327, 0.11002048,
        0.28958966],
       [0.12446153, 1.        , 0.09905845, ..., 0.09346662, 0.1710437 ,
        0.05346925],
       [0.02326615, 0.09905845, 1.        , ..., 0.10881647, 0.09699063,
        0.01976203],
       ...,
       [0.04590327, 0.09346662, 0.10881647, ..., 1.        , 0.10209701,
        0.12927249],
       [0.11002048, 0.1710437 , 0.09699063, ..., 0.10209701, 1.        ,
        0.09163252],
       [0.28958966, 0.05346925, 0.01976203, ..., 0.12927249, 0.09163252,
        1.        ]])

In [8]:
# user_similarity에 index를 지정해준다.
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.124462,0.023266,0.031639,0.307268,0.313678,0.332559,0.221192,0.025947,0.271694,...,0.344198,0.051883,0.230327,0.091475,0.112379,0.081647,0.209811,0.045903,0.110020,0.289590
2,0.124462,1.000000,0.099058,0.234117,0.034259,0.150826,0.063842,0.116503,0.119138,0.091762,...,0.073610,0.218372,0.213756,0.266062,0.260603,0.161739,0.157003,0.093467,0.171044,0.053469
3,0.023266,0.099058,1.000000,0.313205,0.027698,0.074879,0.050116,0.043328,0.056042,0.060873,...,0.018356,0.056030,0.105099,0.086701,0.125213,0.014945,0.120589,0.108816,0.096991,0.019762
4,0.031639,0.234117,0.313205,1.000000,0.041376,0.055411,0.068355,0.149437,0.057735,0.062712,...,0.014183,0.048102,0.096674,0.189466,0.125689,0.038490,0.202694,0.124560,0.159249,0.025449
5,0.307268,0.034259,0.027698,0.041376,1.000000,0.153465,0.285572,0.194173,0.013901,0.125265,...,0.257369,0.056620,0.078620,0.075304,0.107065,0.024712,0.160048,0.143284,0.113604,0.283211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.081647,0.161739,0.014945,0.038490,0.024712,0.116048,0.098661,0.056024,0.000000,0.061099,...,0.059707,0.362769,0.215816,0.209010,0.273317,1.000000,0.076680,0.116861,0.034929,0.131318
940,0.209811,0.157003,0.120589,0.202694,0.160048,0.247087,0.267895,0.209378,0.054856,0.312018,...,0.238357,0.069292,0.161976,0.099549,0.121650,0.076680,1.000000,0.131710,0.159824,0.206697
941,0.045903,0.093467,0.108816,0.124560,0.143284,0.143369,0.050950,0.092918,0.000000,0.043939,...,0.027603,0.255579,0.176109,0.243374,0.252990,0.116861,0.131710,1.000000,0.102097,0.129272
942,0.110020,0.171044,0.096991,0.159249,0.113604,0.209281,0.163485,0.206798,0.020281,0.108925,...,0.120403,0.050692,0.048299,0.076064,0.088111,0.034929,0.159824,0.102097,1.000000,0.091633


#### 예측치 계산
가장 기본적인 CF 알고리즘은 이웃을 전체 사용자로 한다.  
평점 예측치를 구하는 방법을 수식으로 표현하면 아래와 같다.  
$$p_{a,i} = \frac{\sum_{u=1}^{n}w_{a,u}\ast r_{u,i}}{\sum_{u=1}^{n}w_{a,u}}$$
* $a$ : 사용자, $u$ : 이웃 사용자, $n$ : 이웃 사용자의 수  
* $p_{a,i}$ : 아이템 $i$에 대한 사용자 $a$의 예상 평점
* $w_{a,u}$ : 사용자 $a$와 $u$의 유사도
* $r_{u,i}$ : 아이템 $i$에 대한 사용자 $u$의 평점  

위 식을 바탕으로 예측치를 구하는 알고리즘은 아래와 같다.
* 모든 사용자간의 코사인 유사도를 계산한다.
* 현재 추천 대상이 되는 사람과 다른 사용자의 유사도를 추출한다.
* 현재 사용자가 평가하지 않은 모든 아이템에 대해서 현재 사용자의 예상 평가 값을 구한다. 예상 평가값은 다른 사용자의 해당 아이템에 대한 평가(평점)를 현재 사용자와 그 사용자와의 유사도를 활용하여 가중합하여 구한다.
* 아이템 중에서 예상 평가값이 가장 높은 N개의 아이템을 추천한다.

In [9]:
# 주어진 영화의 (movie_id) 가중평균 rating을 계산하는 함수, 
# 가중치는 주어진 사용자와 다른 사용자 간의 유사도(user_similarity)
def CF_simple(user_id, movie_id):
    if movie_id in rating_matrix:
        # 현재 사용자와 다른 사용자 간의 similarity 가져오기
        sim_scores = user_similarity[user_id].copy()
        # 현재 영화에 대한 모든 사용자의 rating값 가져오기
        movie_ratings = rating_matrix[movie_id].copy()
        # 현재 영화를 평가하지 않은 사용자의 index 가져오기
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        # 현재 영화를 평가하지 않은 사용자의 rating (null) 제거
        movie_ratings = movie_ratings.dropna()
        # 현재 영화를 평가하지 않은 사용자의 similarity값 제거
        sim_scores = sim_scores.drop(none_rating_idx)
        # 현재 영화를 평가한 모든 사용자의 가중평균값 구하기
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
    else:
        mean_rating = 3.0
    return mean_rating

# 정확도 계산
score(CF_simple)

1.019843360152776

In [10]:
def recom_movie(user_id, n_items):
    # 현 사용자가 평가한 영화 가져오기
    user_movie = rating_matrix.loc[user_id].copy()
    for movie in rating_matrix:
        # 현 사용자가 이미 평가한 영화는 제외 (평점을 0으로)        
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0
        # 현 사용자가 평가하지 않은 영화의 예상 평점 계산
        else:
            user_movie.loc[movie] = CF_simple(user_id, movie)
    # 영화를 예상 평점에 따라 정렬해서 제목을 뽑아서 돌려 줌
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=2, n_items=5)

movie_id
1293                     Ayn Rand: A Sense of Life (1997)
1642                                    Angel Baby (1995)
1201          Maybe, Maybe Not (Bewegte Mann, Der) (1994)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
1629    Silence of the Palace, The (Saimt el Qusur) (1...
Name: title, dtype: object