## Data Load

In [4]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [7]:
data = Dataset.load_builtin('ml-100k', prompt=False)
data.raw_ratings[:10] #어떤 사용자가 어떤 영화에 대해 몇점을 줬는지 

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [8]:
model = SVD()

In [9]:
cross_validate(model, data, measures = ['rmse','mae'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9311  0.9319  0.9362  0.9414  0.9391  0.9359  0.0040  
MAE (testset)     0.7324  0.7353  0.7386  0.7432  0.7394  0.7378  0.0037  
Fit time          3.08    3.09    3.09    3.18    3.09    3.11    0.04    
Test time         0.12    0.12    0.09    0.12    0.12    0.11    0.01    


{'test_rmse': array([0.93107117, 0.93185384, 0.93617103, 0.94144728, 0.93907713]),
 'test_mae': array([0.73244553, 0.73526215, 0.73858333, 0.74320116, 0.73940241]),
 'fit_time': (3.080501079559326,
  3.089439630508423,
  3.0866944789886475,
  3.1779470443725586,
  3.0941660404205322),
 'test_time': (0.1211545467376709,
  0.11902713775634766,
  0.09002017974853516,
  0.11602592468261719,
  0.1180264949798584)}

# Content-based Filtering

In [10]:
#이전 행동과 유사한 항목 추천
#나의 이전 경험과 비슷한 경험을 가진 다른 사용자의 항목을 추천
# 유사도 기반, 

In [11]:
import numpy as np
from surprise import Dataset

In [13]:
raw_data = np.array(data.raw_ratings, dtype = int)
raw_data[:5]

array([[      196,       242,         3, 881250949],
       [      186,       302,         3, 891717742],
       [       22,       377,         1, 878887116],
       [      244,        51,         2, 880606923],
       [      166,       346,         1, 886397596]])

In [14]:
raw_data[:, 0] -=1
raw_data[:, 1] -=1
raw_data[:5]

array([[      195,       241,         3, 881250949],
       [      185,       301,         3, 891717742],
       [       21,       376,         1, 878887116],
       [      243,        50,         2, 880606923],
       [      165,       345,         1, 886397596]])

In [15]:
n_users = np.max(raw_data[:, 0])
n_movies = np.max(raw_data[:, 1])
shape = (n_users+1, n_movies +1)
shape

(943, 1682)

In [16]:
adj_matrix = np.ndarray(shape, dtype = int)
for user_id, movie_id, rating, time in raw_data:
    adj_matrix[user_id][movie_id] = 1.
adj_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [19]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if my_id !=user_id:
        similarity = np.dot(my_vector, user_vector)
        if similarity > best_match:
            best_match = similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 183, Best Match ID: 275


In [20]:
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
    log1, log2 = log
    if log1 < 1. and log2 >0.:
        recommend_list.append(i)
print(recommend_list)

[272, 273, 275, 280, 281, 283, 287, 288, 289, 290, 292, 293, 297, 299, 300, 301, 302, 306, 312, 314, 315, 316, 317, 321, 322, 323, 324, 327, 330, 331, 332, 333, 339, 342, 345, 346, 353, 354, 355, 356, 357, 363, 364, 365, 366, 372, 374, 378, 379, 381, 382, 383, 384, 385, 386, 387, 390, 391, 392, 394, 395, 396, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 412, 414, 416, 417, 418, 419, 420, 422, 424, 425, 426, 427, 428, 430, 431, 432, 435, 442, 446, 447, 448, 449, 450, 451, 452, 454, 455, 457, 460, 461, 462, 468, 469, 470, 471, 472, 473, 474, 478, 495, 500, 507, 517, 522, 525, 530, 539, 540, 543, 545, 546, 548, 549, 550, 551, 553, 557, 558, 560, 561, 562, 563, 565, 566, 567, 568, 570, 571, 574, 575, 576, 577, 580, 581, 582, 585, 587, 589, 590, 594, 596, 602, 623, 626, 627, 630, 633, 635, 639, 646, 648, 651, 652, 654, 657, 664, 668, 671, 677, 678, 681, 683, 684, 685, 690, 691, 692, 695, 696, 708, 709, 714, 718, 719, 720, 724, 726, 727, 731, 733, 734, 736, 738, 741, 742, 745,

In [28]:
# uclidean_dist
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = 9999, -1, []
for user_id, user_vector in enumerate(adj_matrix):
    if my_id !=user_id:
        euclidean_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
        if similarity > euclidean_dist:
            best_match = euclidean_dist
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))


Best Match: 61.89507250177513, Best Match ID: 942


In [22]:
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
    log1, log2 = log
    if log1 < 1. and log2 >0.:
        recommend_list.append(i)
print(recommend_list)

[273, 280, 281, 283, 317, 355, 366, 372, 384, 385, 390, 392, 398, 400, 401, 402, 404, 405, 411, 414, 418, 420, 422, 425, 426, 430, 442, 448, 449, 467, 469, 470, 474, 484, 507, 525, 540, 545, 548, 558, 565, 567, 568, 569, 575, 580, 584, 594, 608, 613, 624, 654, 671, 684, 716, 719, 720, 721, 723, 731, 738, 755, 762, 764, 784, 793, 795, 807, 815, 823, 824, 830, 839, 927, 940, 942, 1010, 1027, 1043, 1046, 1066, 1073, 1187, 1227, 1329]


In [23]:
# cosign_similarity
def compute_cos_similarity(v1, v2):
    norm1 = np.sqrt(np.sum(np.square(v1)))
    norm2 = np.sqrt(np.sum(np.square(v2)))
    dot = np.dot(v1,v2)
    return dot / (norm1*norm2)

In [29]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []
for user_id, user_vector in enumerate(adj_matrix):
    if my_id !=user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if similarity > cos_similarity:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))


Best Match: 0.3981747390558696, Best Match ID: 942


In [32]:
# 인접행렬을 rating 값으로 
adj_matrix = np.ndarray(shape, dtype = int)
for user_id, movie_id, rating, time in raw_data:
    adj_matrix[user_id][movie_id] = rating
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [33]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = 9999, -1, []
for user_id, user_vector in enumerate(adj_matrix):
    if my_id !=user_id:
        euclidean_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
        if similarity > euclidean_dist:
            best_match = euclidean_dist
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))


Best Match: 61.89507250177513, Best Match ID: 942


In [34]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []
for user_id, user_vector in enumerate(adj_matrix):
    if my_id !=user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if similarity > cos_similarity:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))


Best Match: 0.3981747390558696, Best Match ID: 942


## Collaborative Filtering

In [35]:
# 사용자와 항목의 유사성을 동시에 고려하여 추천
# 기존 내 경험과 달라도 추천
# 자동으로 임베딩 학습

In [36]:
from surprise import KNNBasic, SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate

In [37]:
model = KNNBasic()
cross_validate(model, data, measures = ['rmse', 'mae'], cv=5, n_jobs=4, verbose=True)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9738  0.9758  0.9898  0.9759  0.9757  0.9782  0.0059  
MAE (testset)     0.7711  0.7688  0.7827  0.7707  0.7704  0.7728  0.0050  
Fit time          0.27    0.29    0.44    0.36    0.33    0.34    0.06    
Test time         2.17    2.31    2.20    2.03    1.99    2.14    0.12    


{'test_rmse': array([0.97381263, 0.97579504, 0.98984296, 0.97589985, 0.97569975]),
 'test_mae': array([0.77112841, 0.76884324, 0.78271622, 0.77073722, 0.77035232]),
 'fit_time': (0.2670605182647705,
  0.28806447982788086,
  0.43634748458862305,
  0.3553805351257324,
  0.33307600021362305),
 'test_time': (2.1739447116851807,
  2.3120408058166504,
  2.2035608291625977,
  2.025456428527832,
  1.9874458312988281)}

In [38]:
model = NMF()
cross_validate(model, data, measures = ['rmse', 'mae'], cv=5, n_jobs=4, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9541  0.9657  0.9598  0.9754  0.9763  0.9662  0.0086  
MAE (testset)     0.7499  0.7572  0.7567  0.7657  0.7674  0.7594  0.0064  
Fit time          3.24    3.38    3.51    3.31    3.12    3.31    0.13    
Test time         0.11    0.12    0.09    0.09    0.09    0.10    0.01    


{'test_rmse': array([0.95411376, 0.96566743, 0.95979197, 0.9753708 , 0.97626902]),
 'test_mae': array([0.74986306, 0.75721441, 0.75665401, 0.76567596, 0.76739147]),
 'fit_time': (3.2446415424346924,
  3.3792128562927246,
  3.5137147903442383,
  3.305305004119873,
  3.117706537246704),
 'test_time': (0.10602426528930664,
  0.11602616310119629,
  0.09202075004577637,
  0.09402132034301758,
  0.0890192985534668)}

In [39]:
model = SVDpp()
cross_validate(model, data, measures = ['rmse', 'mae'], cv=5, n_jobs=4, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9143  0.9283  0.9231  0.9144  0.9171  0.9195  0.0055  
MAE (testset)     0.7170  0.7270  0.7244  0.7168  0.7209  0.7212  0.0040  
Fit time          132.51  131.61  131.17  135.50  112.34  128.62  8.28    
Test time         2.22    2.27    2.27    2.13    1.99    2.18    0.11    


{'test_rmse': array([0.91426857, 0.9283277 , 0.92311802, 0.9144376 , 0.91711805]),
 'test_mae': array([0.71696266, 0.72699778, 0.72439005, 0.71678452, 0.72087452]),
 'fit_time': (132.50747227668762,
  131.61227107048035,
  131.16543292999268,
  135.50321125984192,
  112.33640646934509),
 'test_time': (2.2245001792907715,
  2.2745110988616943,
  2.26951003074646,
  2.1334803104400635,
  1.9869821071624756)}

## Hybrid

In [40]:
## collaborative_filtering으로 임베딩 학습 후 content_based_filtering으로 유사도 기반 추천

In [41]:
from sklearn.decomposition import randomized_svd, non_negative_factorization

In [42]:
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [43]:
# S가 특이값
U, S, V = randomized_svd(adj_matrix, n_components = 2)
S = np.diag(S)
print(U.shape, S.shape, V.shape)

(943, 2) (2, 2) (2, 1682)




In [44]:
np.matmul(np.matmul(U,S),V)

array([[ 3.91732663e+00,  1.47276644e+00,  7.98261988e-01, ...,
         6.24907189e-04,  1.41100852e-02,  1.36545878e-02],
       [ 1.85777226e+00,  3.96191175e-01,  5.05705740e-01, ...,
         5.38862978e-03,  1.77237914e-03,  5.26968095e-04],
       [ 8.94989517e-01,  1.71578497e-01,  2.51738682e-01, ...,
         2.92094923e-03,  5.39937171e-04, -1.25733753e-04],
       ...,
       [ 9.92051955e-01,  2.10814957e-01,  2.70363365e-01, ...,
         2.89019297e-03,  9.34221962e-04,  2.66612193e-04],
       [ 1.30425401e+00,  5.27669941e-01,  2.50080165e-01, ...,
        -4.20677765e-04,  5.30525683e-03,  5.28069948e-03],
       [ 2.82999397e+00,  9.70812247e-01,  6.15871694e-01, ...,
         2.02091492e-03,  8.67740813e-03,  8.03107892e-03]])

In [45]:
#사용자 기반 추천
my_id, my_vector = 0, U[0]
best_match, best_match_id, best_match_vector = -1, -1, []
for user_id, user_vector in enumerate(U):
    if my_id !=user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if similarity > cos_similarity:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))


Best Match: 0.9415201705645377, Best Match ID: 942


In [46]:
recommend_list = []
for i, log in enumerate(zip(adj_matrix[my_id], adj_matrix[best_match_id])):
    log1, log2 = log
    if log1 < 1. and log2 >0.:
        recommend_list.append(i)
print(recommend_list)

[273, 280, 281, 283, 317, 355, 366, 372, 384, 385, 390, 392, 398, 400, 401, 402, 404, 405, 411, 414, 418, 420, 422, 425, 426, 430, 442, 448, 449, 467, 469, 470, 474, 484, 507, 525, 540, 545, 548, 558, 565, 567, 568, 569, 575, 580, 584, 594, 608, 613, 624, 654, 671, 684, 716, 719, 720, 721, 723, 731, 738, 755, 762, 764, 784, 793, 795, 807, 815, 823, 824, 830, 839, 927, 940, 942, 1010, 1027, 1043, 1046, 1066, 1073, 1187, 1227, 1329]


In [57]:
# 항목 기반 추천 (내가 본 항목과 비슷한 항목)
my_id, my_vector = 0, V.T[0]
best_match, best_match_id, best_match_vector = -1, -1, []
for user_id, user_vector in enumerate(V.T):
    if my_id !=user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if similarity > cos_similarity:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))


Best Match: 0.25977083374828575, Best Match ID: 1681


In [52]:
recommend_list = []
for i, user_vector in enumerate(adj_matrix):
    if adj_matrix[i][my_id] > 0.9:
        recommend_list.append(i)
print(recommend_list)

[0, 1, 4, 5, 9, 12, 14, 15, 16, 17, 19, 20, 22, 24, 25, 37, 40, 41, 42, 43, 44, 48, 53, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 69, 71, 72, 74, 76, 78, 80, 81, 82, 83, 88, 91, 92, 93, 94, 95, 96, 98, 100, 101, 105, 107, 108, 116, 119, 120, 123, 124, 127, 129, 130, 133, 136, 137, 140, 143, 144, 147, 149, 150, 156, 157, 159, 161, 167, 173, 176, 177, 180, 181, 183, 188, 192, 193, 197, 198, 199, 200, 201, 202, 203, 208, 209, 212, 215, 221, 222, 229, 230, 231, 233, 234, 241, 242, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 261, 262, 264, 267, 270, 273, 274, 275, 276, 278, 279, 285, 286, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 302, 304, 306, 307, 310, 311, 312, 313, 319, 321, 323, 324, 325, 326, 329, 330, 331, 335, 337, 338, 339, 342, 343, 344, 346, 347, 349, 356, 358, 359, 362, 364, 370, 373, 377, 378, 379, 380, 386, 387, 388, 389, 392, 393, 394, 395, 397, 398, 400, 401, 402, 405, 406, 410, 411, 415, 416, 418, 421, 423, 424, 428, 431, 433, 434, 437, 440, 444, 

In [53]:
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [54]:
# 비음수 행렬 분해 사용
A, B, iter = non_negative_factorization(adj_matrix, n_components = 2)



In [55]:
np.matmul(A,B)

array([[3.71108858e+00, 1.48454102e+00, 7.39535363e-01, ...,
        3.64490531e-03, 1.45506510e-02, 1.44110916e-02],
       [2.11724416e+00, 2.37338725e-01, 5.51650703e-01, ...,
        4.76092332e-03, 3.03585165e-05, 0.00000000e+00],
       [9.85272360e-01, 1.10447010e-01, 2.56713988e-01, ...,
        2.21552443e-03, 1.41275191e-05, 0.00000000e+00],
       ...,
       [1.04485667e+00, 1.17126289e-01, 2.72238757e-01, ...,
        2.34950819e-03, 1.49818803e-05, 0.00000000e+00],
       [1.45758857e+00, 5.42112787e-01, 2.99181695e-01, ...,
        1.61177659e-03, 5.15921394e-03, 5.10771005e-03],
       [2.44733103e+00, 9.41208708e-01, 4.95740038e-01, ...,
        2.56992125e-03, 9.08287324e-03, 8.99389282e-03]])

In [58]:
#사용자 기반 추천
my_id, my_vector = 0, A[0]
best_match, best_match_id, best_match_vector = -1, -1, []
for user_id, user_vector in enumerate(A):
    if my_id !=user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if similarity > cos_similarity:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))


Best Match: 0.9987492019399694, Best Match ID: 942


In [59]:
# 항목 기반 추천 (내가 본 항목과 비슷한 항목)
my_id, my_vector = 0, B.T[0]
best_match, best_match_id, best_match_vector = -1, -1, []
for user_id, user_vector in enumerate(B.T):
    if my_id !=user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if similarity > cos_similarity:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))


Best Match: 0.5348293339902824, Best Match ID: 1681
