### 희소행렬을 밀집행렬로 분해하여 비어있는 공간까지 채워 넣는 기법. User2,3이 선택하지 않은 아이템에 대해서 다른 유저가 선택을 했기 때문에 해당 아이템에 대한 factor가 정보로서 남아 있다는 것이다.

In [16]:
import numpy as np
R = np.array([[4, np.NaN, np.NaN, 2, np.NaN],
              [np.NaN, 5, np.NaN, 3, 1],
              [np.NaN, np.NaN, 3, 4, 4],
              [5, 2, 1, 2, np.NaN]])
num_users, num_items = R.shape
K=3

In [17]:
np.random.seed(1)
P = np.random.normal(scale=1./K, size=(num_users, K))

In [18]:
Q = np.random.normal(scale=1./K, size=(num_users, K))

In [19]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error = 0
    full_pred_matrix = np.dot(P, Q.T)
    
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
    mse = mean_squared_error(R_non_zero_ind)
    
    return rmse

### SVD (Singular Value Decomposition)방식을 사용하려면 NaN값이 없어야 하므로, SGD나 Alternating Least Suqares 방식을 이용한다.

In [32]:
non_zeros = [ (i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] > 0]
non_zeros

[(0, 0, 4.0),
 (0, 3, 2.0),
 (1, 1, 5.0),
 (1, 3, 3.0),
 (1, 4, 1.0),
 (2, 2, 3.0),
 (2, 3, 4.0),
 (2, 4, 4.0),
 (3, 0, 5.0),
 (3, 1, 2.0),
 (3, 2, 1.0),
 (3, 3, 2.0)]

In [None]:
steps=1000
learning_rate = 0.01
r_lambda=0.01

for step in range(steps):
    for i, j, r in non_zeros:
        eij = r - np.dot(P[i,:], Q[j,:])
        P[i, :] = P[i, :] + learning_rate*(eij * Q[j, :] - r_lambda*P[i, :])
        Q[j, :] = Q[j, :] + learning_rate*(eij * P[i, :] - r_lambda*Q[j, :])
        
        rmse = get_rmse(R, P, Q, non_zeros)
        if (step % 50) ==0:
            print("### iteration step: ", step, "rmse: ", rmse)

In [27]:
pred_matrix = np.dot(P, Q.T)
print('predict matirx:\n', np.round(pred_matrix,3))

predict matirx:
 [[-0.035 -0.155  0.013  0.121]
 [-0.31   0.381  0.337 -0.156]
 [ 0.053 -0.247 -0.08   0.158]
 [-0.319  0.236  0.345  0.022]]


## Contents based filerting

In [34]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

movies = pd.read_csv('tmdb_5000_movies.csv')
print(movies.shape)
movies.head(1)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [38]:
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']]

In [39]:
pd.set_option('max_colwidth', 100)
movies_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."
