In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

#### MovieLens 100K 데이터 불러오기
* u.user : 사용자 데이터
* u.item : 영화에 대한 데이터
    * movie_id : 영화 id
    * title : 제목 
    * release date : 개봉날짜
    * 'unknown'부터 'Western'까지 19개의 변수가 0 또는 1로 장르를 표현
        * 예를 들어 액션 영화는 'Action'에 1로 표시되며 나머지 18개 장르 변수는 0으로 표시
* u.data : 영화평가(rating) 데이터
    * user_id : 사용자 id
    * movie_id : 영화 id
    * rating : 평점 (1~5)
    * timestamp : 평가한 연도/날짜/시간

In [2]:
# Column names for the three header-less MovieLens 100K files.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    # One 0/1 indicator column per genre.
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

# Read the user, movie and rating tables.
users = pd.read_csv('./Data/u.user', sep='|', names=u_cols, encoding='latin-1')
movies = pd.read_csv('./Data/u.item', sep='|', names=i_cols, encoding='latin-1')
ratings = pd.read_csv('./Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# The timestamp is not used, so drop it from the ratings.
ratings = ratings.drop(columns='timestamp')
# Keep only the movie id and title; every other movie column is discarded.
movies = movies[['movie_id', 'title']]

In [3]:
# Split the ratings into a train set (75%) and a test set (25%).
x = ratings.copy()
y = ratings['user_id']
# Stratified sampling (an option usually recommended for classification models):
# the data is partitioned by user_id and sampled within each partition, so every
# user appears in both splits in roughly the same proportion as in the full data.
# random_state pins the shuffle so the split, and every metric computed from it,
# is reproducible after Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=42
)

In [4]:
# Display the training split produced by the cell above.
x_train

Unnamed: 0,user_id,movie_id,rating
87694,868,69,2
4777,216,274,3
6342,178,1012,4
69441,834,50,5
58598,303,271,2
...,...,...,...
36192,533,151,3
88565,189,568,4
1997,95,1229,2
13494,387,393,2


In [5]:
# Accuracy metric: root mean squared error.
def RMSE(y_true, y_pred):
    """Return the root mean squared error between two equally shaped sequences.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, element-wise aligned with ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

# Evaluate a prediction model by its RMSE on the test set.
def score(model):
    """Return the RMSE of ``model`` over every (user, movie) pair in ``x_test``.

    Parameters
    ----------
    model : callable
        Called as ``model(user_id, movie_id)`` and expected to return the
        predicted rating for that pair.

    Returns
    -------
    float
        RMSE between the predictions and the ``rating`` column of ``x_test``.
    """
    # One prediction per test row, in the same order as the rows of x_test.
    predictions = [
        model(user, movie)
        for user, movie in zip(x_test['user_id'], x_test['movie_id'])
    ]
    actual = np.array(x_test['rating'])
    return RMSE(actual, np.array(predictions))

In [6]:
# Build the full user x movie rating matrix from the train set.
# Cells for (user, movie) pairs that have no rating in the train set are NaN.
rating_matrix = x_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1674,1675,1676,1678,1679,1680
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,3.0,,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Cosine similarity between every pair of users in the train set.
# The similarity computation cannot handle NaN, so the unrated cells are
# replaced with 0 in a separate frame; rating_matrix itself keeps its NaNs.
matrix_dummy = rating_matrix.fillna(0)
# With a single argument the similarities are computed between all rows of it.
user_similarity = cosine_similarity(matrix_dummy)
user_similarity

array([[1.        , 0.1411888 , 0.0211074 , ..., 0.07324342, 0.14591351,
        0.28520551],
       [0.1411888 , 1.        , 0.06928426, ..., 0.15645496, 0.11841501,
        0.0962417 ],
       [0.0211074 , 0.06928426, 1.        , ..., 0.        , 0.06403786,
        0.01945672],
       ...,
       [0.07324342, 0.15645496, 0.        , ..., 1.        , 0.07605525,
        0.04094445],
       [0.14591351, 0.11841501, 0.06403786, ..., 0.07605525, 1.        ,
        0.15746087],
       [0.28520551, 0.0962417 , 0.01945672, ..., 0.04094445, 0.15746087,
        1.        ]])

In [9]:
# Label both axes of the similarity array with the user ids of rating_matrix
# so that similarities can be looked up by user_id.
user_similarity = pd.DataFrame(
    user_similarity,
    index=rating_matrix.index,
    columns=rating_matrix.index,
)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.141189,0.021107,0.049439,0.204076,0.289488,0.350537,0.305402,0.025411,0.275900,...,0.244625,0.080766,0.197821,0.121015,0.135283,0.091063,0.199964,0.073243,0.145914,0.285206
2,0.141189,1.000000,0.069284,0.122637,0.077151,0.260118,0.106583,0.087864,0.199102,0.135912,...,0.153650,0.235181,0.351799,0.429832,0.270287,0.206924,0.220518,0.156455,0.118415,0.096242
3,0.021107,0.069284,1.000000,0.197894,0.028577,0.043959,0.053683,0.038378,0.056079,0.060246,...,0.032861,0.037134,0.093873,0.037093,0.113514,0.019031,0.114790,0.000000,0.064038,0.019457
4,0.049439,0.122637,0.197894,1.000000,0.013584,0.067442,0.079837,0.079149,0.074048,0.063639,...,0.070664,0.000000,0.116348,0.117547,0.087056,0.000000,0.158306,0.013340,0.091804,0.041105
5,0.204076,0.077151,0.028577,0.013584,1.000000,0.161860,0.281063,0.241363,0.014243,0.170015,...,0.238462,0.090384,0.084065,0.072576,0.141300,0.074114,0.231256,0.114039,0.161783,0.218255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.091063,0.206924,0.019031,0.000000,0.074114,0.090481,0.080710,0.123643,0.052695,0.046043,...,0.030879,0.293104,0.213583,0.169162,0.306259,1.000000,0.077901,0.130529,0.040116,0.120666
940,0.199964,0.220518,0.114790,0.158306,0.231256,0.274198,0.250885,0.146164,0.090056,0.254831,...,0.286767,0.086233,0.176562,0.172859,0.145909,0.077901,1.000000,0.075550,0.226604,0.210596
941,0.073243,0.156455,0.000000,0.013340,0.114039,0.104417,0.053733,0.059255,0.087417,0.040069,...,0.023417,0.170760,0.196668,0.178859,0.242393,0.130529,0.075550,1.000000,0.076055,0.040944
942,0.145914,0.118415,0.064038,0.091804,0.161783,0.182803,0.227171,0.151890,0.103012,0.186862,...,0.185189,0.050320,0.051259,0.114677,0.059000,0.040116,0.226604,0.076055,1.000000,0.157461


In [11]:
# Predict a rating for (user_id, movie_id) as the weighted mean of the ratings
# other users gave that movie; the weights are the user-to-user similarities.
def CF_simple(user_id, movie_id, ratings=None, similarity=None):
    """Predict a rating with simple user-based collaborative filtering.

    Parameters
    ----------
    user_id : hashable
        Label of the target user in ``similarity``.
    movie_id : hashable
        Column label of the movie in ``ratings``.
    ratings : pandas.DataFrame, optional
        User x movie rating matrix with NaN for missing ratings. Defaults to
        the notebook-level ``rating_matrix``.
    similarity : pandas.DataFrame, optional
        User x user similarity matrix labelled with the same user ids as
        ``ratings``. Defaults to the notebook-level ``user_similarity``.

    Returns
    -------
    float
        The similarity-weighted mean rating, or the fallback ``3.0`` when no
        prediction can be made: the movie or the user is unknown, or the
        similarities of all users who rated the movie sum to zero.
    """
    if ratings is None:
        ratings = rating_matrix
    if similarity is None:
        similarity = user_similarity

    default_rating = 3.0
    # Unknown movie or unknown user: nothing to base a prediction on.
    if movie_id not in ratings or user_id not in similarity:
        return default_rating

    # Ratings of the users who actually rated this movie.
    movie_ratings = ratings[movie_id].dropna()
    # Similarities between the target user and exactly those raters,
    # aligned by user id with movie_ratings.
    sim_scores = similarity[user_id].loc[movie_ratings.index]

    weight_sum = sim_scores.sum()
    # All weights zero (or no raters): the weighted mean would be 0/0 = NaN
    # and would turn the whole RMSE into NaN, so use the fallback instead.
    if weight_sum == 0:
        return default_rating
    return np.dot(sim_scores, movie_ratings) / weight_sum

# Compute the accuracy of CF_simple (RMSE over the test set)
score(CF_simple)

1.0178187673065584