In [5]:
import numpy as np 
import pandas as pd
import operator
from scipy.sparse import coo_matrix
from numpy.linalg import norm
from sklearn.metrics import mean_squared_error
import random

In [6]:
ls

[0m[01;34msample_data[0m/


In [7]:
import os, sys 
from google.colab import drive 

### Running this cell connects /content/drive/My Drive (under the folder Colab is running in) to Google Drive

drive.mount('/content/drive')


Mounted at /content/drive


In [8]:
#### Load the movie dataset and check its layout ####
movies = pd.read_csv("drive/MyDrive/data/others/movies.csv")

# One print call: the header, a blank line, then the first rows of the frame.
print("### Movie Dataset Format ###", movies.head(), sep='\n\n')

### Movie Dataset Format ###

   movieId  ...                                       genres
0        1  ...  Adventure|Animation|Children|Comedy|Fantasy
1        2  ...                   Adventure|Children|Fantasy
2        3  ...                               Comedy|Romance
3        4  ...                         Comedy|Drama|Romance
4        5  ...                                       Comedy

[5 rows x 3 columns]


In [9]:
################### Pre-processing for the synthetic users: collect movie ids per genre #################

# This cell exists so that synthetic users who like one particular genre can be created later
# and the recommendations produced for them can be inspected.

movie_dict = {}                     # {movie_id : (movie_title, movie_genre)}
musical_list = []                   # ids of movies whose genre string contains 'Musical'
horror_list = []                    # ids of movies whose genre string contains 'Horror'
documentary_list = []               # ids of movies whose genre string contains 'Documentary'
comedy_list = []                    # ids of movies whose genre string contains 'Comedy'
animation_list = []                 # ids of movies whose genre string contains 'Animation'

# (genre keyword, list that receives matching movie ids)
genre_buckets = (
    ('Musical', musical_list),
    ('Horror', horror_list),
    ('Documentary', documentary_list),
    ('Comedy', comedy_list),
    ('Animation', animation_list),
)

for movie_id, movie_title, movie_genre in movies.itertuples(index=False):
    movie_dict[movie_id] = (movie_title, movie_genre)
    # A movie can carry several genres, so it may land in more than one list.
    for genre_name, bucket in genre_buckets:
        if genre_name in movie_genre:
            bucket.append(movie_id)

In [10]:
# Load the ratings and show their layout before and after removing the timestamp column.
ratings = pd.read_csv("drive/MyDrive/data/others/ratings.csv")

print("### Rating Dataset Format ###", end='\n\n')
print(ratings.head(), end='\n\n\n')

# Rebind instead of mutating in place; only the timestamp column is removed.
ratings = ratings.drop(columns=['timestamp'])
print(ratings.head())

### Rating Dataset Format ###

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0


In [11]:
# Users rated roughly 130 movies on average, so the synthetic users get a similar number.
# The candidate movies are shuffled first, and a liked genre receives a random 4 or 5
# instead of a constant 5; disliked genres get random low scores in the same way.
#
# Add a user with uid == 800. This user is a huge fan of musicals.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)

print(ratings)
new_uid = 800
rows = list()
for movie_id in musical_list[:100]:
    rows.append([new_uid, movie_id, random.randint(4,5)])
for movie_id in horror_list[:50]:
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in documentary_list[:20]:
    rows.append([new_uid, movie_id, random.randint(2,3)])

# Build the new rows as one frame and concatenate once. DataFrame.append no longer exists in
# pandas 2.x, and appending row by row copies the whole frame on every iteration.
new_rows = pd.DataFrame(rows, columns=ratings.columns)
# Dropping any earlier rows of this uid keeps the cell idempotent when it is re-run.
ratings = pd.concat([ratings[ratings['userId'] != new_uid], new_rows], ignore_index=True)
print(ratings)


        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
100831     610   166534     4.0
100832     610   168248     5.0
100833     610   168250     5.0
100834     610   168252     5.0
100835     610   170875     3.0

[100836 rows x 3 columns]
        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101001     800    94122     2.0
101002     800     4350     2.0
101003     800    56379     2.0
101004     800   134524     3.0
101005     800    34072     2.0

[101006 rows x 3 columns]


In [12]:
# Add a user with uid == 850. This user is a huge fan of documentaries.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)
random.shuffle(animation_list)


print(ratings)
new_uid = 850
rows = list()
for movie_id in documentary_list[:100]:
    rows.append([new_uid, movie_id, random.randint(4,5)])
for movie_id in horror_list[:50]:
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in animation_list[:10]:
    rows.append([new_uid, movie_id, random.randint(1,2)])

# Build the new rows as one frame and concatenate once. DataFrame.append no longer exists in
# pandas 2.x, and appending row by row copies the whole frame on every iteration.
new_rows = pd.DataFrame(rows, columns=ratings.columns)
# Dropping any earlier rows of this uid keeps the cell idempotent when it is re-run.
ratings = pd.concat([ratings[ratings['userId'] != new_uid], new_rows], ignore_index=True)
print(ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101001     800    94122     2.0
101002     800     4350     2.0
101003     800    56379     2.0
101004     800   134524     3.0
101005     800    34072     2.0

[101006 rows x 3 columns]
        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101161     850     2566     2.0
101162     850   128968     2.0
101163     850    33090     2.0
101164     850   139640     1.0
101165     850    61160     1.0

[101166 rows x 3 columns]


In [13]:
# Add a user with uid == 900. This user is a huge fan of horror.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)
random.shuffle(animation_list)

new_uid = 900
rows = list()
for movie_id in horror_list[:120]:
    rows.append([new_uid, movie_id, random.randint(4,5)])
for movie_id in documentary_list[:10]:
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in comedy_list[:30]:
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in animation_list[:20]:
    rows.append([new_uid, movie_id, random.randint(1,2)])

# Build the new rows as one frame and concatenate once. DataFrame.append no longer exists in
# pandas 2.x, and appending row by row copies the whole frame on every iteration.
new_rows = pd.DataFrame(rows, columns=ratings.columns)
# Dropping any earlier rows of this uid keeps the cell idempotent when it is re-run.
ratings = pd.concat([ratings[ratings['userId'] != new_uid], new_rows], ignore_index=True)
print(ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101341     900    79008     1.0
101342     900    46948     1.0
101343     900    42734     2.0
101344     900    27369     2.0
101345     900     4241     2.0

[101346 rows x 3 columns]


In [14]:
# Add a user with uid == 950. This user is a huge fan of animation.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)
random.shuffle(animation_list)

new_uid = 950
rows = list()
for movie_id in horror_list[:20]:
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in documentary_list[:10]:
    rows.append([new_uid, movie_id, random.randint(1,3)])
for movie_id in comedy_list[:30]:
    rows.append([new_uid, movie_id, random.randint(2,4)])
for movie_id in animation_list[:150]:
    rows.append([new_uid, movie_id, random.randint(3,5)])

# Build the new rows as one frame and concatenate once. DataFrame.append no longer exists in
# pandas 2.x, and appending row by row copies the whole frame on every iteration.
new_rows = pd.DataFrame(rows, columns=ratings.columns)
# Dropping any earlier rows of this uid keeps the cell idempotent when it is re-run.
ratings = pd.concat([ratings[ratings['userId'] != new_uid], new_rows], ignore_index=True)
print(ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101551     950    44020     5.0
101552     950    26776     3.0
101553     950    95965     4.0
101554     950   161354     5.0
101555     950    95475     3.0

[101556 rows x 3 columns]


In [15]:
########################################################################
######### Mapping user id, movie id to user index, movie index #########
########################################################################

uid_2_idx = {}    # user id --> user idx
mid_2_idx = {}    # movie id --> movie idx

# Indices are handed out in order of first appearance: a new id receives the current
# size of its map, an id that is already present keeps the index it has.
for user_id, movie_id, _ in ratings.itertuples(index=False):
    uid_2_idx.setdefault(user_id, len(uid_2_idx))
    mid_2_idx.setdefault(movie_id, len(mid_2_idx))

num_user = len(uid_2_idx)
num_movie = len(mid_2_idx)
print(num_user) # number of users
print(num_movie) # number of movies

# Reverse lookups.
uidx_2_id = dict((idx, uid) for uid, idx in uid_2_idx.items())    # user idx --> user id
midx_2_id = dict((idx, mid) for mid, idx in mid_2_idx.items())    # movie idx --> movie id


614
9726


In [16]:
# Initialise the rating matrix: a num_user x num_movie NumPy array in which each entry is the
# rating a user gave a movie; cells without a rating stay 0.
rating_matrix = np.zeros(shape=(num_user, num_movie))

for user_id, movie_id, r in ratings.itertuples(index=False):
    rating_matrix[uid_2_idx[user_id], mid_2_idx[movie_id]] = r

rating_matrix

array([[4., 4., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 5., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 4.]])

In [17]:
######################################################################################################################################
# Function that splits a rating matrix into a training set and a test set
######################################################################################################################################
def train_test_split(ratings, test_fraction=0.2, rng=None):
    """Move a share of every row's non-zero ratings into a separate test matrix.

    Parameters
    ----------
    ratings : 2-D NumPy array in which 0 marks "no rating". It is not modified.
    test_fraction : share of each row's non-zero entries to hold out; the count is
        truncated towards zero, so rows with few ratings may contribute none.
        The default 0.2 matches the previous hard-coded one fifth.
    rng : optional object with a ``choice`` method (for example
        ``np.random.default_rng(seed)``) used for sampling. When omitted the
        global ``np.random`` state is used, as before.

    Returns
    -------
    (train, test) : arrays shaped like ``ratings``. ``test`` holds the held-out
        ratings and zeros elsewhere; ``train`` holds the rest.

    Raises
    ------
    ValueError : if ``test_fraction`` is outside [0, 1].
    """
    if not 0.0 <= test_fraction <= 1.0:
        raise ValueError("test_fraction must be between 0 and 1, got {}".format(test_fraction))

    sampler = np.random if rng is None else rng
    test = np.zeros_like(ratings)
    train = ratings.copy()
    for x in range(ratings.shape[0]):
        nonzero_idx = ratings[x, :].nonzero()[0]
        held_out = sampler.choice(nonzero_idx,
                                  size=int(len(nonzero_idx) * test_fraction),
                                  replace=False)
        train[x, held_out] = 0.
        test[x, held_out] = ratings[x, held_out]

    assert(np.all((train * test) == 0))     # check that the train and test sets do not overlap

    return train, test

######################################################################################
############ Subtract each user's mean rating from that user's ratings ###############
######################################################################################
####### The mean is taken over existing ratings only and subtracted only there #######
def subtract_mean_ratings(ratings):
    """Mean-centre every row of a rating matrix over its non-zero entries.

    Parameters
    ----------
    ratings : 2-D NumPy array in which 0 marks "no rating".

    Returns
    -------
    (mean_subtracted_ratings, avg_ratings) :
        ``mean_subtracted_ratings`` is a float array shaped like ``ratings``; non-zero
        cells hold ``rating - row mean`` and all other cells stay 0.
        ``avg_ratings`` is a 1-D array with one mean per row; rows without any
        rating get a mean of 0.
    """
    # Always float, so integer input does not truncate the centred values.
    mean_subtracted_ratings = np.zeros_like(ratings, dtype=float)
    avg_ratings = np.zeros(ratings.shape[0])
    for i in range(ratings.shape[0]):
        nonzero_idx = ratings[i].nonzero()[0]                       # indices where a rating exists (non-zero)
        num_nonzero = len(nonzero_idx)
        if num_nonzero == 0:
            # Nothing rated: the mean stays 0 and the row stays all zeros. Checking
            # before dividing avoids a 0/0 division and its RuntimeWarning.
            continue
        avg_rating = np.sum(ratings[i]) / num_nonzero               # mean of the existing ratings
        avg_ratings[i] = avg_rating
        mean_subtracted_ratings[i, nonzero_idx] = ratings[i, nonzero_idx] - avg_rating
    return mean_subtracted_ratings, avg_ratings


In [18]:
# Hold out part of every user's ratings for testing, then mean-centre the remaining
# training ratings per user.
train_ratings, test_ratings = train_test_split(rating_matrix)
mean_subtracted_ratings, avg_ratings = subtract_mean_ratings(train_ratings)

In [53]:
def calculate_rmse(R, U, V, lambda_u, lambda_v):
    """Return the square root of the regularised squared error per stored rating.

    Despite the name, the value includes the L2 penalty terms:

        sqrt( (sum over stored (u, i) of (r_ui - U[u] . V[:, i])**2
               + lambda_u * ||U||^2 + lambda_v * ||V||^2) / number of stored ratings )

    Parameters
    ----------
    R : sparse matrix exposing ``row``, ``col`` and ``data`` (e.g. ``scipy.sparse.coo_matrix``).
    U : array indexed as ``U[u, :]`` (one factor vector per user).
    V : array indexed as ``V[:, i]`` (one factor vector per item).
    lambda_u, lambda_v : weights of the squared norms of ``U`` and ``V``.
    """
    # Row-wise dot product of the matching user and item vectors for every stored rating,
    # computed in one vectorised call instead of a Python loop over the ratings.
    predictions = np.einsum('nk,kn->n', U[R.row, :], V[:, R.col])
    squared_error = np.sum((R.data - predictions) ** 2)
    # Summing the squared norms of all rows (columns) equals the sum of all squared entries.
    penalty = lambda_u * np.sum(U ** 2) + lambda_v * np.sum(V ** 2)
    return np.sqrt((squared_error + penalty) / len(R.data))

def SGD(U, V, R, lr, lambda_u, lambda_v):
    """Run one pass of stochastic gradient descent over the given ratings.

    ``R`` is an iterable of ``(u, i, r_ui)`` triples. For each triple the user vector
    ``U[u, :]`` and the item vector ``V[:, i]`` are moved against the gradient of
    ``(r_ui - U[u] . V[:, i])**2 + lambda_u * |U[u]|^2 + lambda_v * |V[:, i]|^2``.
    ``U`` and ``V`` are updated in place and also returned.
    """
    for user_idx, item_idx, rating in R:
        user_vec = U[user_idx, :]
        item_vec = V[:, item_idx]
        residual = rating - np.dot(user_vec, item_vec)
        # Both steps are evaluated from the current vectors before either one is changed.
        user_step = lr * (2 * lambda_u * user_vec - 2 * residual * item_vec)
        item_step = lr * (2 * lambda_v * item_vec - 2 * residual * user_vec)
        U[user_idx, :] -= user_step
        V[:, item_idx] -= item_step
    return U, V


def train(ratings, dim=10, max_epoch=50, lambda_u=0.1, lambda_v=0.1, lr=0.01):
    """Factorise ``ratings`` into ``U`` (users x dim) and ``V`` (dim x items) with SGD.

    Parameters
    ----------
    ratings : 2-D array; only its non-zero entries are used as training samples.
    dim : number of latent factors.
    max_epoch : number of passes over the training samples.
    lambda_u, lambda_v : regularisation coefficients of the U and V matrices.
    lr : initial learning rate; it is halved at epochs 10, 20, 30, ...

    Prints the value of ``calculate_rmse`` before training and after every epoch,
    and returns ``(U, V)``.
    """
    n_users, n_items = ratings.shape

    U = np.random.rand(n_users, dim)
    V = np.random.rand(dim, n_items)
    R = coo_matrix(ratings)
    print("Initial RMSE: " + str(calculate_rmse(R, U, V, lambda_u, lambda_v)))

    for epoch in range(max_epoch):
        at_decay_step = epoch > 0 and epoch % 10 == 0
        if at_decay_step:
            lr /= 2
        # Visit the stored ratings in a fresh random order every epoch.
        samples = list(zip(R.row, R.col, R.data))
        random.shuffle(samples)
        U, V = SGD(U, V, samples, lr, lambda_u, lambda_v)
        rmse = calculate_rmse(R, U, V, lambda_u, lambda_v)
        print('Epoch: {:5}, RMSE: {:15}, Learning Rate:{}'.format(epoch, rmse, lr))

    return U, V

def predict(U, V, user_id=None):
    """Return predicted ratings computed as the product of ``U`` and ``V``.

    Without ``user_id`` the full predicted rating matrix is returned. With a
    ``user_id`` only that user's row of predictions is returned; the id is
    translated to a row index through the notebook-level ``uid_2_idx`` mapping.
    """
    if user_id is not None:
        return np.dot(U[uid_2_idx[user_id]], V)
    return np.dot(U, V)


#######################################################################################
##### Recommend N movies to the user with user id = uid, skipping movies already seen ##
#######################################################################################
def recommend_for_uid(uid, ori_rating_matrix, U, V, top_N):
    """Print the ``top_N`` highest-scoring movies the user has not rated yet.

    Parameters
    ----------
    uid : user id (translated to a row index through ``uid_2_idx``).
    ori_rating_matrix : rating matrix whose non-zero cells in the user's row mark
        movies the user has already rated; those are never recommended.
    U, V : factor matrices passed on to ``predict``.
    top_N : number of movies to print.

    Each printed line is the ``(title, genres)`` tuple from ``movie_dict`` followed by
    the score. The score is the raw ``predict`` output, so when the factors were trained
    on mean-centred ratings it is an offset from the user's mean, not a rating; the
    ordering is the same either way. Nothing is returned.
    """
    u_predicted_rating = predict(U, V, user_id=uid)
    uidx = uid_2_idx[uid]

    # A set gives constant-time membership tests; `in` on the NumPy index array would
    # rescan the whole array for every candidate movie.
    already_seen_movie_idxs = set(np.nonzero(ori_rating_matrix[uidx])[0].tolist())

    # All movies ordered by predicted score, best first.
    ranked = sorted(enumerate(u_predicted_rating), key=operator.itemgetter(1), reverse=True)

    print_cnt = 0
    for idx, pred_rating in ranked:
        if print_cnt == top_N:
            break
        if idx in already_seen_movie_idxs:
            continue
        print(str(movie_dict[midx_2_id[idx]])+"    "+str(pred_rating))
        print_cnt = print_cnt+1


In [40]:
# Fit the factor matrices on the mean-centred training ratings (default hyper-parameters).
U, V = train(mean_subtracted_ratings)

Initial RMSE: 2.7389661595361483
Epoch:     0, RMSE: 0.9763203950454862, Learning Rate:0.01
Epoch:     1, RMSE: 0.9514558786078686, Learning Rate:0.01
Epoch:     2, RMSE: 0.9403761535696558, Learning Rate:0.01
Epoch:     3, RMSE: 0.9294676338189434, Learning Rate:0.01
Epoch:     4, RMSE: 0.9158462748841169, Learning Rate:0.01
Epoch:     5, RMSE: 0.8972222117640486, Learning Rate:0.01
Epoch:     6, RMSE: 0.8795187665614561, Learning Rate:0.01
Epoch:     7, RMSE: 0.8633979299190392, Learning Rate:0.01
Epoch:     8, RMSE: 0.848787763300607, Learning Rate:0.01
Epoch:     9, RMSE: 0.8365186595704289, Learning Rate:0.01
Epoch:    10, RMSE: 0.8272997194436066, Learning Rate:0.005
Epoch:    11, RMSE: 0.8211153393017896, Learning Rate:0.005
Epoch:    12, RMSE: 0.8159870177814491, Learning Rate:0.005
Epoch:    13, RMSE: 0.8106301468829887, Learning Rate:0.005
Epoch:    14, RMSE: 0.8054993756983668, Learning Rate:0.005
Epoch:    15, RMSE: 0.8001373434983837, Learning Rate:0.005
Epoch:    16, RMSE

In [41]:
# The model was fitted on mean-centred ratings, so add every user's mean back to that
# user's row of predictions.
predicted_ratings = predict(U, V) + avg_ratings.reshape([-1,1])
print(predicted_ratings)

[[4.55735374 4.1495546  4.59921715 ... 4.33083892 4.5198685  4.23945239]
 [4.14761425 4.11034535 3.72871786 ... 4.67665807 4.38882521 4.06034808]
 [1.58430539 1.49067663 2.16461207 ... 3.47014124 1.73212693 2.70850481]
 ...
 [3.86989882 2.99678228 3.92313203 ... 3.6187934  4.64270758 3.16405686]
 [3.34741761 2.72844163 3.95599202 ... 2.89841735 3.21773264 2.87075648]
 [3.50724282 3.79529723 3.92730174 ... 2.85520816 3.08820576 3.63217628]]


In [42]:
# Pick a target user and extract the genre distribution of the movies that user rated.
# This helper exists so the recommendation output can be interpreted next to the user's history.
def print_user_preference(ori_rating_matrix, u_idx, type):
    '''
        Print per-genre statistics of the movies rated in row u_idx of ori_rating_matrix.

        u_idx is a row index into ori_rating_matrix (not a raw user id).
        type : ['avg', 'sum', 'cnt']
        avg is the mean rating, sum is the sum of ratings, cnt is the number of ratings;
        any other value prints genre, mean, count and sum together, ordered by mean.
        Output is always sorted in descending order of the selected statistic.
    '''
    def ranked(mapping):
        # (key, value) pairs ordered by value, largest first
        return sorted(mapping.items(), key=operator.itemgetter(1), reverse=True)

    genre_counts = {}
    genre_rating_sums = {}
    for movie_idx in np.nonzero(ori_rating_matrix[u_idx])[0]:
        _, genre = movie_dict[midx_2_id[movie_idx]]
        # A movie contributes to every genre listed in its '|'-separated genre string.
        for g in genre.split("|"):
            genre_counts[g] = genre_counts.get(g, 0) + 1
            genre_rating_sums[g] = genre_rating_sums.get(g, 0) + ori_rating_matrix[u_idx][movie_idx]

    # By number of rated movies
    if type == 'cnt':
        for k, v in ranked(genre_counts):
            print(k,v)
        return

    # By sum of ratings
    if type == 'sum':
        for k, v in ranked(genre_rating_sums):
            print(k,v)
        return

    # By mean rating ('avg'), or the detailed listing for any other value
    genre_avgs = {g: total/genre_counts[g] for g, total in genre_rating_sums.items()}
    detailed = type != "avg"
    for k, v in ranked(genre_avgs):
        if detailed:
            print(k,v,genre_counts[k],genre_rating_sums[k])
        else:
            print(k,v)

In [43]:
##### Check the RMSE of the test set #######
# Compare predictions with the true ratings at every non-zero cell of the test matrix.
held_out_cells = test_ratings.nonzero()
pred = predicted_ratings[held_out_cells].flatten()
actual = test_ratings[held_out_cells].flatten()

print("### Test RMSE ###")
print(np.sqrt(mean_squared_error(pred,actual)))

### Test RMSE ###
0.8807934811993829


In [44]:
# User ids of the synthetic genre fans added to the ratings above.
musical_fan = 800
docu_fan = 850
horror_fan = 900
animation_fan =  950

In [45]:
# To see the genre distribution of the movies this user rated, try running print_user_preference.
# It indexes the rating matrix by row, so pass the user's row index rather than the raw user id:
# print_user_preference(rating_matrix, uid_2_idx[musical_fan], 'sum')
recommend_for_uid(musical_fan, rating_matrix, U, V, top_N=30)

("River's Edge (1986)", 'Crime|Drama')    1.9230889396773079
('Dead Meat (2004)', 'Horror')    1.8531047933217124
('Last Exit to Brooklyn (1989)', 'Drama')    1.8419151757176473
("Emperor's New Groove 2: Kronk's New Groove, The (2005)", 'Animation|Children|Comedy|Romance')    1.7786326762762474
('All Dogs Christmas Carol, An (1998)', 'Animation|Children|Comedy|Musical')    1.7585610639608151
('Missing (1982)', 'Drama|Mystery|Thriller')    1.7521776078496938
('Ex Machina (2015)', 'Drama|Sci-Fi|Thriller')    1.7506093437882737
('Lucky One, The (2012)', 'Drama')    1.7488185605144906
('Angel Baby (1995)', 'Drama')    1.7465180403711615
('Girl with the Dragon Tattoo, The (2011)', 'Drama|Thriller')    1.7462355795553557
('Ricky Gervais Live: Animals (2003)', 'Comedy')    1.7366572820032147
('War for the Planet of the Apes (2017)', 'Action|Adventure|Drama|Sci-Fi')    1.7359209061549041
('Ink (2009)', 'Action|Fantasy|Sci-Fi')    1.7150875141136637
('Airborne (1993)', 'Adventure|Comedy')    1.

In [46]:
# To see the genre distribution of the movies this user rated, try running print_user_preference.
# It indexes the rating matrix by row, so pass the user's row index rather than the raw user id:
# print_user_preference(rating_matrix, uid_2_idx[docu_fan], 'sum')
recommend_for_uid(docu_fan, rating_matrix, U, V, top_N=30)

('Three Billboards Outside Ebbing, Missouri (2017)', 'Crime|Drama')    2.1295223062728503
("Schindler's List (1993)", 'Drama|War')    1.9770077711895298
('Hamlet (1996)', 'Crime|Drama|Romance')    1.9753074217372137
('Five Easy Pieces (1970)', 'Drama')    1.9037836485861948
('Grand Day Out with Wallace and Gromit, A (1989)', 'Adventure|Animation|Children|Comedy|Sci-Fi')    1.8999616956970233
('Shawshank Redemption, The (1994)', 'Crime|Drama')    1.8809783751005824
('Cinema Paradiso (Nuovo cinema Paradiso) (1989)', 'Drama')    1.8694281518446962
('Broadway Danny Rose (1984)', 'Comedy')    1.8377272643954274
('American Pie Presents: The Book of Love (American Pie 7: The Book of Love) (2009)', 'Comedy')    1.797888479718386
('Mister Roberts (1955)', 'Comedy|Drama|War')    1.7713435758460423
('Sound of Music, The (1965)', 'Musical|Romance')    1.761346274070356
('Yojimbo (1961)', 'Action|Adventure')    1.7468777999430167
('Great Escape, The (1963)', 'Action|Adventure|Drama|War')    1.73370

In [47]:
# To see the genre distribution of the movies this user rated, try running print_user_preference.
# It indexes the rating matrix by row, so pass the user's row index rather than the raw user id:
# print_user_preference(rating_matrix, uid_2_idx[horror_fan], 'sum')
recommend_for_uid(horror_fan, rating_matrix, U, V, top_N=30)

('Strange Brew (1983)', 'Comedy')    1.9928681732492552
('Happiness (1998)', 'Comedy|Drama')    1.9569375276476877
('Girl with the Dragon Tattoo, The (2011)', 'Drama|Thriller')    1.9529344071807366
('Everybody Wants Some (2016)', 'Comedy')    1.9460801011169082
("Widows' Peak (1994)", 'Drama')    1.8820679643052016
('Lady Eve, The (1941)', 'Comedy|Romance')    1.8534695634722764
('American Grindhouse (2010)', 'Documentary')    1.8302346748639036
('Cyrano de Bergerac (1990)', 'Comedy|Drama|Romance')    1.7782174182943622
('Incredibles 2 (2018)', 'Action|Adventure|Animation|Children')    1.770026782840625
('Lionheart (1990)', 'Action')    1.7671800377435773
('Alien Contamination (1980)', 'Action|Horror|Sci-Fi')    1.7610064388767772
('Moonrise Kingdom (2012)', 'Comedy|Drama|Romance')    1.7472101087926648
('Sympathy for the Underdog (1971)', 'Action|Crime|Drama')    1.7419841534689149
('Eddie Murphy Delirious (1983)', 'Comedy|Documentary')    1.7389760826470464
('Drive (2011)', 'Crime|D

In [48]:
# To see the genre distribution of the movies this user rated, try running print_user_preference.
# It indexes the rating matrix by row, so pass the user's row index rather than the raw user id:
# print_user_preference(rating_matrix, uid_2_idx[animation_fan], 'sum')
recommend_for_uid(animation_fan, rating_matrix, U, V, top_N=30)

('Chaser, The (Chugyeogja) (2008)', 'Crime|Drama|Thriller')    1.4766317640362536
('Long Goodbye, The (1973)', 'Crime|Film-Noir')    1.4633975412163487
('Forbidden Kingdom, The (2008)', 'Action|Adventure|Comedy|Fantasy')    1.4034638274773945
('Shallow Grave (1994)', 'Comedy|Drama|Thriller')    1.393882494040517
('Henry: Portrait of a Serial Killer (1986)', 'Crime|Horror|Thriller')    1.3925751790827423
('Blade Runner 2049 (2017)', 'Sci-Fi')    1.3741158607599726
('Idiocracy (2006)', 'Adventure|Comedy|Sci-Fi|Thriller')    1.3626379068651775
('Body of Evidence (1993)', 'Drama|Thriller')    1.36005247710488
('Silent Movie (1976)', 'Comedy')    1.3572032846019273
('Irrational Man (2015)', 'Crime|Drama')    1.3470932105014537
('Legend of Rita, The (Stille nach dem Schuß, Die) (1999)', 'Drama')    1.3174388024876702
('Mary Reilly (1996)', 'Drama|Horror|Thriller')    1.3010827734560224
('War for the Planet of the Apes (2017)', 'Action|Adventure|Drama|Sci-Fi')    1.3006143672887198
('Bad Word

# Test Code

In [74]:
################ Generating Synthetic Data #######################
# Small 5 x 10 rating matrix plus seeded 3-factor matrices, used as fixed inputs for the checks below.
synthetic_rating = np.zeros((5,10))

for i in range(10):
    random.seed(i)                      # re-seed on every iteration so the three draws below are reproducible
    u_idx = random.randint(0,4)         # row index
    i_idx = random.randint(0,9)         # column index
    r_ui = random.randint(1,5)          # rating value
    synthetic_rating[u_idx ,i_idx] = r_ui   # a repeated (row, column) pair overwrites the earlier value

synthetic_R = coo_matrix(synthetic_rating)
# (row, col, value) triples of the stored ratings, the format SGD iterates over
synthetic_R_zipped = list(zip(synthetic_R.row, synthetic_R.col, synthetic_R.data))
np.random.seed(7)                       # fixes the initial factor matrices
synthetic_U = np.random.rand(5, 3)
synthetic_V = np.random.rand(3, 10)

In [75]:
# Reference results stored on Drive; test_code compares the notebook's results against them.
answer_U = np.load('./drive/MyDrive/data/others/answer_U.npy')
answer_V = np.load('./drive/MyDrive/data/others/answer_V.npy')
answer_rmse = np.load('./drive/MyDrive/data/others/answer_rmse.npy')

In [76]:
def test_code(sgd, rmse):
    """Check an SGD step function and an RMSE function against the stored answers.

    Parameters
    ----------
    sgd : callable with the signature of ``SGD``; run once on the synthetic data.
    rmse : callable with the signature of ``calculate_rmse``; evaluated on the
        factors returned by ``sgd``.

    Raises
    ------
    AssertionError : if the returned U or V deviates from ``answer_U`` / ``answer_V``
        (mean squared error of 1e-2 or more), or if the RMSE differs from
        ``answer_rmse`` by 1e-05 or more.
    """
    # Pass copies: an sgd that updates its arguments in place would otherwise change the
    # shared synthetic factors, and a second run of this check would start from different inputs.
    U, V = sgd(synthetic_U.copy(), synthetic_V.copy(), synthetic_R_zipped, lr=0.01, lambda_u=0.1, lambda_v=0.1)
    mse_u = mean_squared_error(U, answer_U)
    mse_v = mean_squared_error(V, answer_V)
    # Both factor matrices must match; with `or`, one wrong matrix would go unnoticed.
    assert(mse_u <1e-2 and mse_v < 1e-2), 'calculated U, V is different with the answer : SGD 함수 오류'   ## assertion error unless within tolerance of the answer

    # Use the function under test that was passed in, not the notebook-level calculate_rmse.
    rmse_value = rmse(synthetic_R, U, V, lambda_u=0.1, lambda_v=0.1)
    assert(abs(answer_rmse-rmse_value)<1e-05), 'calculated rmse is diferent with the answer : RMSE 함수 오류'   ## assertion error unless within tolerance of the answer

    print("모든 함수 알맞게 구현됨")


In [77]:
# Validate the notebook's SGD and calculate_rmse implementations against the stored answers.
test_code(SGD, calculate_rmse)

모든 함수 알맞게 구현됨
