In [1]:
# 행렬분해, 경사하강법(딥러닝) 을 이용한 알고리즘

# 행렬분해 - matrix를 두 개의 서로 다른 행렬로 분해하는 것. 
#            분해된 행렬은 원래 행렬의 내재된 다른 의미를 갖는 행렬로 변환됨.
# 
# 경사하강법 - 함수의 값을 최소화하는 파라미터들을 찾는 방식
#                f(x) = w1x1 + w2x2 + ... => f(x)를 최소화하는 w1, w2 ...를 찾는 방법
#                f(x)를 w1으로 미분한 값을 w1에 더해서 업데이트..
#                f(x)를 w2로 미분한 값을 w2에 더해서 업데이트..

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# 경사하강법을 이용한 행렬분해

In [4]:
# 사용자 영화 평점 매트릭스

R = np.array(
[
    [4, np.nan, np.nan, 2, np.nan],
     [np.nan, 5, np.nan, 3, 1],
     [np.nan, np.nan, 3, 4, 4],
     [5, 2, 1, 2, np.nan],
]
)
R

array([[ 4., nan, nan,  2., nan],
       [nan,  5., nan,  3.,  1.],
       [nan, nan,  3.,  4.,  4.],
       [ 5.,  2.,  1.,  2., nan]])

In [9]:
num_users, num_items = R.shape
print(num_users, num_items)  # 이용자수, 영화수

4 5


In [7]:
k = 3

In [None]:
# a*b = c 
# a의 row원소수, b의 칼럼 수가 같아야..
# c의 shape은 (a row num, b col num)

In [None]:
# R  = P * Q.T

In [11]:
np.random.seed(1)

P = np.random.normal(scale=1/k, size=(num_users, k))
Q = np.random.normal(scale=1/k, size=(num_items, k))

In [12]:
P

array([[ 0.54144845, -0.2039188 , -0.17605725],
       [-0.35765621,  0.28846921, -0.76717957],
       [ 0.58160392, -0.25373563,  0.10634637],
       [-0.08312346,  0.48736931, -0.68671357]])

In [13]:
Q

array([[-0.1074724 , -0.12801812,  0.37792315],
       [-0.36663042, -0.05747607, -0.29261947],
       [ 0.01407125,  0.19427174, -0.36687306],
       [ 0.38157457,  0.30053024,  0.16749811],
       [ 0.30028532, -0.22790929, -0.04096341]])

In [43]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error = 0
    full_pred_matrix = np.dot(P, Q.T)  # 예측행렬
    
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]  # non zero data 행 인덱스값
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]  # non zero data 열 인덱스값
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind] # 실제 행렬의 non zero data 실제값
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind] # 예상 행렬의 non zero data 실제값
      
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros) # 실제 행렬값과 예상행렬값의 차이 계산
    rmse = np.sqrt(mse)
    
    return rmse  

In [48]:
non_zeros = [(i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0]
non_zeros

[(0, 0, 4.0),
 (0, 3, 2.0),
 (1, 1, 5.0),
 (1, 3, 3.0),
 (1, 4, 1.0),
 (2, 2, 3.0),
 (2, 3, 4.0),
 (2, 4, 4.0),
 (3, 0, 5.0),
 (3, 1, 2.0),
 (3, 2, 1.0),
 (3, 3, 2.0)]

In [45]:
# 경사하강법

In [46]:
steps = 1000
learning_rate = 0.01
r_lambda = 0.01

In [47]:
steps=1000
learning_rate=0.01
r_lambda=0.01  # 규제값 크기

for step in range(steps):
    for i, j, r in non_zeros:  # 실제 행렬에서 값이 있는 인덱스, 칼럽, 값을 가져온다
        eij = r - np.dot(P[i, :], Q[j, :].T)    # P : 사용자행 벡터, Q : 아이템행 벡터, 실제값과 예상행렬의 값과의 차이
        P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
        Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])

    rmse = get_rmse(R, P, Q, non_zeros)  # non_zeros - 원본행렬에서 0값이 아닌 위치인덱스와 값
    
    if (step % 50) == 0 :
        print("### iteration step : ", step," rmse : ", rmse)

### iteration step :  0  rmse :  3.1087272149360428
### iteration step :  50  rmse :  0.37194908472247334
### iteration step :  100  rmse :  0.17490026004137008
### iteration step :  150  rmse :  0.1098084020518687
### iteration step :  200  rmse :  0.07949713491686256
### iteration step :  250  rmse :  0.06161504064517699
### iteration step :  300  rmse :  0.04924231665945879
### iteration step :  350  rmse :  0.04017336747107848
### iteration step :  400  rmse :  0.03351400043052365
### iteration step :  450  rmse :  0.028721731864412722
### iteration step :  500  rmse :  0.025360722716168745
### iteration step :  550  rmse :  0.023055789122931013
### iteration step :  600  rmse :  0.0214967357885098
### iteration step :  650  rmse :  0.020445038507496214
### iteration step :  700  rmse :  0.019729654328785883
### iteration step :  750  rmse :  0.019234308660665035
### iteration step :  800  rmse :  0.01888260053161404
### iteration step :  850  rmse :  0.01862516738322388
### iterat

In [49]:
pred_matrix = np.dot(P, Q.T)  # P 와 Q 가 업데이트 되었다.
print('예측 행렬:\n', np.round(pred_matrix, 3))

예측 행렬:
 [[3.993 0.206 1.529 1.998 2.098]
 [6.373 4.978 0.458 2.98  1.003]
 [7.546 0.338 2.985 3.976 3.986]
 [4.964 2.005 1.011 2.024 1.735]]


In [None]:
rray([[ 4., nan, nan,  2., nan],
       [nan,  5., nan,  3.,  1.],
       [nan, nan,  3.,  4.,  4.],
       [ 5.,  2.,  1.,  2., nan]])

In [50]:
P

array([[ 1.27260011, -0.05637675, -0.58887639],
       [ 0.1583115 ,  0.97585931, -2.41062949],
       [ 2.48992852,  0.15398693, -1.02716039],
       [ 0.75348919, -0.19312815, -1.41932602]])

In [51]:
Q

array([[ 2.0102305 ,  0.15047143, -2.4508715 ],
       [-0.66762534,  0.6315032 , -1.85318495],
       [ 1.15467149,  0.02785577, -0.10296908],
       [ 1.1897163 ,  0.67208697, -0.88584624],
       [ 1.44219812, -0.25061738, -0.4226378 ]])

In [52]:
import pickle

In [53]:
with open('ratings_matrix.pickle', 'rb')  as f:
    ratings_matrix = pickle.load(f)

In [54]:
ratings_matrix

title,'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),10 (1979),10 Things I Hate About You (1999),100 Girls (2000),...,Zero de conduite (Zero for Conduct) (Zéro de conduite: Jeunes diables au collège) (1933),Zeus and Roxanne (1997),Ziggy Stardust and the Spiders from Mars (1973),Zoolander (2001),"Zorro, the Gay Blade (1981)",Zulu (1964),Zus & Zo (2001),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,,,3.0,,,,,2.0,,,...,,,,2.0,,,,,,
703,,,,,,,,,,,...,,,,,,,,,,
704,,,,,,,,,,,...,,,,,,,,,,
705,,,,,,,,,,,...,,,,,,,,,,


In [55]:
def get_rmse(R, P, Q, non_zeros):
    error = 0
    full_pred_matrix = np.dot(P, Q.T)
    
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]  # 실제행렬값
    
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]  # 예축행렬값
      
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    return rmse

In [56]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    
    num_users, num_items = R.shape
    
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    break_count = 0
       
    non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]
   
    for step in range(steps):
        for i, j, r in non_zeros:
            eij = r - np.dot(P[i, :], Q[j, :].T)
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])
       
        rmse = get_rmse(R, P, Q, non_zeros)
        if (step % 10) == 0 :
            print("### iteration step : ", step," rmse : ", rmse)
            
    return P, Q

In [57]:
P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda = 0.01)
pred_matrix = np.dot(P, Q.T)

### iteration step :  0  rmse :  2.8729590523415185
### iteration step :  10  rmse :  0.7751689243422856
### iteration step :  20  rmse :  0.5537474877497137
### iteration step :  30  rmse :  0.40368881085111313
### iteration step :  40  rmse :  0.3189991155493651
### iteration step :  50  rmse :  0.2701146687360125
### iteration step :  60  rmse :  0.24005226968391416
### iteration step :  70  rmse :  0.22024721939838302
### iteration step :  80  rmse :  0.2063715181024185
### iteration step :  90  rmse :  0.19614894903723173
### iteration step :  100  rmse :  0.18831132373049958
### iteration step :  110  rmse :  0.18210906530821652
### iteration step :  120  rmse :  0.17707517982732157
### iteration step :  130  rmse :  0.17290508730011464
### iteration step :  140  rmse :  0.16939202628333708
### iteration step :  150  rmse :  0.1663906839118822
### iteration step :  160  rmse :  0.1637958563994298
### iteration step :  170  rmse :  0.16152943025595382
### iteration step :  180  rm

In [61]:
ratings_matrix.shape

(706, 6033)

In [59]:
pred_matrix.shape

(706, 6033)

In [63]:
ratings_pred_matrix = pd.DataFrame(pred_matrix, index = ratings_matrix.index, columns = ratings_matrix.columns)
ratings_pred_matrix.head()

title,'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),10 (1979),10 Things I Hate About You (1999),100 Girls (2000),...,Zero de conduite (Zero for Conduct) (Zéro de conduite: Jeunes diables au collège) (1933),Zeus and Roxanne (1997),Ziggy Stardust and the Spiders from Mars (1973),Zoolander (2001),"Zorro, the Gay Blade (1981)",Zulu (1964),Zus & Zo (2001),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.555666,2.159842,3.028817,3.901788,3.377757,3.911236,2.153347,2.639678,3.759682,3.224469,...,2.865701,1.822462,1.626562,3.271546,1.284278,3.817903,2.149276,4.196964,3.278201,2.759743
2,4.042561,2.42729,4.286435,4.240135,4.008166,4.753625,2.325696,3.739063,3.167588,3.470914,...,2.985829,1.873201,1.608128,3.619765,1.297612,4.36512,2.367471,3.348176,3.569174,4.005358
3,3.711459,2.154449,4.460065,3.929028,3.237729,3.84647,2.130991,3.004557,2.808339,3.354645,...,2.694811,1.708401,1.480801,3.123194,1.194588,4.739826,2.120213,3.110852,2.866829,4.085646
4,3.064267,1.797329,1.885467,3.001823,2.940077,2.717085,1.226566,2.254631,2.153997,2.63327,...,2.318292,1.42937,1.119984,3.523544,1.044003,3.137157,1.231094,3.140168,3.891372,3.129351
5,3.880001,2.251895,3.142835,4.033303,3.671936,3.695018,1.592028,3.269441,3.431756,3.490227,...,2.442626,1.843348,1.459751,3.400892,1.294543,4.127748,1.645492,2.75549,3.650279,4.121481


In [64]:
ratings_matrix.head()

title,'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),10 (1979),10 Things I Hate About You (1999),100 Girls (2000),...,Zero de conduite (Zero for Conduct) (Zéro de conduite: Jeunes diables au collège) (1933),Zeus and Roxanne (1997),Ziggy Stardust and the Spiders from Mars (1973),Zoolander (2001),"Zorro, the Gay Blade (1981)",Zulu (1964),Zus & Zo (2001),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [69]:
user_id = 9
top_n = 50

ratings_pred_matrix.loc[user_id].sort_values(ascending=False)[:top_n]

title
Monty Python and the Holy Grail (1975)                                            5.868125
Network (1976)                                                                    5.573849
Raising Arizona (1987)                                                            5.562702
Braveheart (1995)                                                                 5.481068
Unforgiven (1992)                                                                 5.480685
Maverick (1994)                                                                   5.462828
Shawshank Redemption, The (1994)                                                  5.453794
Boot, Das (Boat, The) (1981)                                                      5.420071
Matrix, The (1999)                                                                5.303379
Airplane! (1980)                                                                  5.231960
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.