In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

#### movieLens 100K 데이터 불러오기
* u.user : 사용자 데이터
* u.item : 영화에 대한 데이터
    * movie_id : 영화 id
    * title : 제목 
    * release date : 개봉날짜
    * 'unknown' 부터 'Western'까지 19가지의 변수로 0또는 1로 장르를 표현
        * 예를들어 액션 영화는 'Action'에 1로 표시되며 나머지 18개 장르 변수는 0으로 표시
* u.data : 영화평가(rating) 데이터
    * user_id : 사용자 id
    * movie_id : 영화 id
    * rating : 평점 (1~5)
    * timestamp : 평가한 연도/날짜/시간

In [2]:
# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('./Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

In [3]:
# train, test set 분리
x = ratings.copy()
y = ratings['user_id']
# 계층적 데이터 추출 옵션 (분류 모델에서 추천!)
# 여러 층으로 분할후 각 층별로 렌덤 데이터 추출, 원래 데이터의 분포와 유사하게 데이터 추출
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

In [4]:
x_train

Unnamed: 0,user_id,movie_id,rating
32541,473,25,4
27156,533,477,4
76116,791,754,4
63425,629,475,4
19010,303,809,2
...,...,...,...
52318,694,174,5
91649,650,520,4
43393,436,974,5
26741,488,491,4


In [5]:
# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수 
def score(model, neighbor_size=0):
    # 예측 대상인 test set에 있는 사용자(user_id)와 영화(movie_id)를 pair로 짝을 맞춰 데이터를 만든다.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # 모든 사용자-영화 짝에 대해서 주어진 예측 모델에 의한 예측값을 계산해 y_pred에 저장한다.
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    # 얻어진 예측값 배열과 실제값(rating)에 대한 RMSE를 계산하여 반환한다.
    return RMSE(y_true, y_pred)

In [6]:
# train 데이터로 Full matrix 구하기 
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1669,1670,1671,1672,1675,1676,1678,1679,1680
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# train set의 모든 가능한 사용자 pair의 Cosine similarities 계산
# 코사인 유사도를 계산하기 위해 rating값을 matrix_dummy에 복사한다.
# 코사인 유사도를 계산할때 NaN값이 있으면 에러가 발생하므로 NaN값을 0으로 바꿔준다.
matrix_dummy = rating_matrix.copy().fillna(0)
# 모든 사용자 간의 코사인 유사도를 구한다.
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity

array([[1.        , 0.11414548, 0.04952549, ..., 0.14612766, 0.1316413 ,
        0.32869675],
       [0.11414548, 1.        , 0.11151175, ..., 0.21181361, 0.13145893,
        0.09418667],
       [0.04952549, 0.11151175, 1.        , ..., 0.10994071, 0.12539853,
        0.02106246],
       ...,
       [0.14612766, 0.21181361, 0.10994071, ..., 1.        , 0.10221969,
        0.12136512],
       [0.1316413 , 0.13145893, 0.12539853, ..., 0.10221969, 1.        ,
        0.13145269],
       [0.32869675, 0.09418667, 0.02106246, ..., 0.12136512, 0.13145269,
        1.        ]])

In [8]:
# user_similarity에 index를 지정해준다.
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.114145,0.049525,0.064122,0.254955,0.328206,0.311416,0.280731,0.057258,0.298708,...,0.261380,0.091288,0.219663,0.160962,0.137928,0.085403,0.238942,0.146128,0.131641,0.328697
2,0.114145,1.000000,0.111512,0.198179,0.062104,0.160387,0.091312,0.075848,0.132919,0.069202,...,0.099780,0.208777,0.238940,0.383016,0.305171,0.158392,0.149127,0.211814,0.131459,0.094187
3,0.049525,0.111512,1.000000,0.338608,0.000000,0.047075,0.056169,0.055940,0.060687,0.012925,...,0.044325,0.020067,0.097515,0.091118,0.055501,0.016221,0.122888,0.109941,0.125399,0.021062
4,0.064122,0.198179,0.338608,1.000000,0.042025,0.048977,0.092360,0.120282,0.132592,0.015689,...,0.038259,0.048716,0.116743,0.241309,0.119578,0.039377,0.203184,0.177930,0.165480,0.015340
5,0.254955,0.062104,0.000000,0.042025,1.000000,0.189214,0.270702,0.164498,0.076994,0.135539,...,0.205304,0.055856,0.067791,0.045755,0.090713,0.057748,0.264006,0.025303,0.088247,0.242682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.085403,0.158392,0.016221,0.039377,0.057748,0.086744,0.072188,0.091366,0.053371,0.061383,...,0.040714,0.333557,0.253757,0.147642,0.278223,1.000000,0.040893,0.131066,0.000000,0.084282
940,0.238942,0.149127,0.122888,0.203184,0.264006,0.284829,0.198088,0.189958,0.106659,0.259443,...,0.229877,0.127201,0.113155,0.135243,0.185897,0.040893,1.000000,0.156622,0.200105,0.206331
941,0.146128,0.211814,0.109941,0.177930,0.025303,0.135403,0.064285,0.137615,0.183284,0.115283,...,0.060019,0.116961,0.314259,0.237006,0.196663,0.131066,0.156622,1.000000,0.102220,0.121365
942,0.131641,0.131459,0.125399,0.165480,0.088247,0.287623,0.213494,0.090277,0.123570,0.166443,...,0.161413,0.071367,0.056636,0.149413,0.067729,0.000000,0.200105,0.102220,1.000000,0.131453


In [9]:
# Neighbor size를 정해서 예측치를 계산하는 함수 
def cf_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix:
        # 현재 사용자와 다른 사용자 간의 similarity 가져오기
        sim_scores = user_similarity[user_id].copy()
        # 현재 영화에 대한 모든 사용자의 rating값 가져오기
        movie_ratings = rating_matrix[movie_id].copy()
        # 현재 영화를 평가하지 않은 사용자의 index 가져오기
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        # 현재 영화를 평가하지 않은 사용자의 rating (null) 제거
        movie_ratings = movie_ratings.drop(none_rating_idx)
        # 현재 영화를 평가하지 않은 사용자의 similarity값 제거
        sim_scores = sim_scores.drop(none_rating_idx)
        # Neighbor size가 지정되지 않은 경우        
        if neighbor_size == 0:          
            # 현재 영화를 평가한 모든 사용자의 가중평균값 구하기
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        # Neighbor size가 지정된 경우
        else:                       
            # 해당 영화를 평가한 사용자가 최소 2명이 되는 경우에만 계산
            if len(sim_scores) > 1: 
                # 지정된 neighbor size 값과 해당 영화를 평가한 총사용자 수 중 작은 것으로 결정
                neighbor_size = min(neighbor_size, len(sim_scores))
                # array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                # 유사도를 순서대로 정렬
                user_idx = np.argsort(sim_scores)
                # 유사도를 neighbor size만큼 받기
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                # 영화 rating을 neighbor size만큼 받기
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                # 최종 예측값 계산 
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0
    return mean_rating

#### 최적의 neighbor size 구하기

In [10]:
best_error = 1e9
best_nighbor_size = 0
for neighbor_size in [10, 20, 30, 40, 50, 60]:
    error = score(cf_knn, neighbor_size)
    print("Neighbor size = %d : RMSE = %.4f" % (neighbor_size, error))
    if error < best_error :
        best_error = error
        best_nighbor_size = neighbor_size
print(f"Best neighbor size = {best_nighbor_size}")

Neighbor size = 10 : RMSE = 1.0289
Neighbor size = 20 : RMSE = 1.0155
Neighbor size = 30 : RMSE = 1.0118
Neighbor size = 40 : RMSE = 1.0112
Neighbor size = 50 : RMSE = 1.0115
Neighbor size = 60 : RMSE = 1.0118
Best neighbor size = 40


#### 주어진 사용자에 대해 추천받기

In [11]:
def recom_movie(user_id, n_items, neighbor_size=30):
    # 현 사용자가 평가한 영화 가져오기
    user_movie = rating_matrix.loc[user_id].copy()
    for movie in rating_matrix:
        # 현 사용자가 이미 평가한 영화는 제외 (평점을 0으로)        
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0
        # 현 사용자가 평가하지 않은 영화의 예상 평점 계산
        else:
            user_movie.loc[movie] = cf_knn(user_id, movie, neighbor_size)
    # 영화를 예상 평점에 따라 정렬해서 제목을 뽑아서 돌려 줌
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=2, n_items=5, neighbor_size=best_nighbor_size)

movie_id
1189                   That Old Feeling (1997)
1594                           Shopping (1994)
98      Snow White and the Seven Dwarfs (1937)
169                     Cinema Paradiso (1988)
515                          Local Hero (1983)
Name: title, dtype: object