<div style = "height: 50px;"></div>

# 필요한 모듈 불러오기

In [68]:
import os

import pandas            as pd
import matplotlib.pyplot as plt
import numpy             as np
import tensorflow        as tf

import warnings
warnings.simplefilter("ignore")

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

<div style = "height: 50px;"></div>

# <span style = "font-weight:bold;color:#0172d4;">STEP 1</span> : 데이터 전처리

## 1. 데이터 불러오기 (1) ratings
<hr/>

In [69]:
# Load the raw ratings file; it is '::'-delimited and has no header row.
rating_file_path='data/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(
    rating_file_path,
    names=ratings_cols,
    sep='::',
    encoding="ISO-8859-1",
    engine='python',
)
# Remember the unfiltered row count so the share of rows kept can be reported later.
orginal_data_size = ratings.shape[0]
ratings.head(2)

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109


<div style = "height: 50px;"></div>

## 2. 데이터 전처리하기 (1) ratings
<hr/>

### ratings 3점 미만은 삭제하기

In [70]:
# Keep only the rows whose rating is 3 or higher.
ratings = ratings.loc[ratings['ratings'].ge(3)]
filtered_data_size = ratings.shape[0]

In [71]:
# Report how many rows survived the ">= 3" filter, as absolute counts and as a share of the original size.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


<div style = "height: 25px;"></div>

### ratings 컬럼의 이름을 counts로 변경

In [72]:
# Rename the 'ratings' column to 'counts'.
ratings = ratings.rename(columns={'ratings': 'counts'})

In [73]:
ratings.head(2)

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109


<div style = "height: 50px;"></div>

## 3. 데이터 불러오기 (2) movies
<hr/>

In [74]:
# Load the movie metadata so that titles can be shown next to movie ids.
movie_file_path = 'data/movies.dat'
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    movie_file_path,
    names=cols,
    sep='::',
    encoding='ISO-8859-1',
    engine='python',
)
movies.head(2)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


<div style = "height: 50px;"></div>

## 4. 데이터 전처리하기 (2) movies
<hr/>

### 문자형 데이터 소문자로 변환하기

In [75]:
# Lower-case both text columns in one pass.
movies[['title', 'genre']] = movies[['title', 'genre']].apply(lambda column: column.str.lower())

In [76]:
movies.head(10)

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),animation|children's|comedy
1,2,jumanji (1995),adventure|children's|fantasy
2,3,grumpier old men (1995),comedy|romance
3,4,waiting to exhale (1995),comedy|drama
4,5,father of the bride part ii (1995),comedy
5,6,heat (1995),action|crime|thriller
6,7,sabrina (1995),comedy|romance
7,8,tom and huck (1995),adventure|children's
8,9,sudden death (1995),action
9,10,goldeneye (1995),action|adventure|thriller


<div style = "height: 50px;"></div>

## 5. 데이터 합치기 
<hr/>

<p style= "font-size:15px; line-height:24px;">데이터를 분석하거나 모델을 돌릴 때에는 ratings 데이터를 사용하는데, <br/>
이를 출력했을 때 movie_id만 나오는 것보다는 제목과 장르가 같이 나오는 것이 보기 편할 것이라고 판단되어 데이터를 합치고자 한다.</p>

In [77]:
movies.head(2)

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),animation|children's|comedy
1,2,jumanji (1995),adventure|children's|fantasy


In [78]:
# Order the rating rows by movie_id.
ratings = ratings.sort_values('movie_id')

In [79]:
def data_add(dic_columns, dic_columns2):
    """Add one column of the global ``movies`` frame to the global ``ratings`` frame.

    Builds a ``{key: value}`` lookup from ``movies[dic_columns]`` to
    ``movies[dic_columns2]`` and, for every row of ``ratings``, stores the value
    whose key equals that row's ``ratings[dic_columns]`` in the new column
    ``ratings[dic_columns2]``. ``ratings`` is modified in place.

    The lookup is applied per row, so the result does not depend on the row
    order of ``ratings`` and the work is linear in ``len(ratings)`` rather than
    proportional to ``len(movies) * len(ratings)``.

    Args:
        dic_columns: Name of the key column present in both ``movies`` and
            ``ratings`` (e.g. ``'movie_id'``).
        dic_columns2: Name of the ``movies`` column to copy into ``ratings``
            (e.g. ``'title'``).

    Returns:
        None.
    """
    # If a key occurs more than once in `movies`, the last occurrence wins.
    lookup = dict(zip(movies[dic_columns], movies[dic_columns2]))

    # Keys that are missing from `movies` yield NaN instead of a length-mismatch error.
    ratings[dic_columns2] = ratings[dic_columns].map(lookup)

<p style= "font-size:15px; line-height:24px;">위 함수는 dic_columns(movie_id)를 key값으로, dic_columns2(ratings에 추가할 열 값)을 values값으로 하는 딕셔너리를 생성한 후,<br/> 
ratings의 movie_id와 생성한 딕셔너리의 키값(movie_id)과 같다면, dic_columns2값을 추가하는 함수이다.</p>

In [80]:
# Add the movie title to ratings, keyed by movie_id.
data_add('movie_id', 'title')

In [81]:
# Add the movie genre to ratings, keyed by movie_id.
data_add('movie_id', 'genre')

In [82]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column.
ratings = ratings.reset_index(drop = True)

In [83]:
ratings.tail()

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
836473,4939,3952,3,987878275,"contender, the (2000)",drama|thriller
836474,1748,3952,3,992337202,"contender, the (2000)",drama|thriller
836475,1965,3952,3,974688627,"contender, the (2000)",drama|thriller
836476,1589,3952,4,974734727,"contender, the (2000)",drama|thriller
836477,4816,3952,4,985653887,"contender, the (2000)",drama|thriller


<div style = "height: 50px;"></div>

# <span style = "font-weight:bold;color:#0172d4;">STEP 2</span> : 데이터 분석하기

## 1. ratings에 있는 유니크한 영화 개수
<hr/>

In [84]:
# Number of distinct movies present in ratings.
ratings['movie_id'].nunique()

3628

<div style = "height: 50px;"></div>

## 2. ratings에 있는 유니크한 사용자 수
<hr/>

In [85]:
# Number of distinct users present in ratings.
ratings['user_id'].nunique()

6039

<div style = "height: 50px;"></div>

## 3. 가장 인기 있는 영화 30개(인기순)
<hr/>

<p style= "font-size:15px; line-height:24px;">인기의 기준은, 영화의 user_id 개수이다.<br/>
user_id 개수가 가장 많은 movie_id를 찾은 후, 이를 통해 영화 제목을 출력하고자 한다.</p>

In [86]:
# Popularity per movie: number of rating rows (user_id entries) for each title.
movie_id_count = ratings.groupby('title')['user_id'].count()

In [87]:
# Keep the 30 titles with the highest counts, largest first.
movie_id_count = movie_id_count.sort_values(ascending=False).head(30)

In [88]:
movie_id_count

title
american beauty (1999)                                   3211
star wars: episode iv - a new hope (1977)                2910
star wars: episode v - the empire strikes back (1980)    2885
star wars: episode vi - return of the jedi (1983)        2716
saving private ryan (1998)                               2561
terminator 2: judgment day (1991)                        2509
silence of the lambs, the (1991)                         2498
raiders of the lost ark (1981)                           2473
back to the future (1985)                                2460
matrix, the (1999)                                       2434
jurassic park (1993)                                     2413
sixth sense, the (1999)                                  2385
fargo (1996)                                             2371
braveheart (1995)                                        2314
men in black (1997)                                      2297
schindler's list (1993)                                  2257
pr

<div style = "height: 50px;"></div>

# <span style = "font-weight:bold;color:#0172d4;">STEP 3</span> : 선호하는 영화 5가지를 ratings에 추가하기 

<p style= "font-size:15px; line-height:24px;">애니메이션 장르의 영화만 출력하여 좋아하는 영화가 있는지 확인해보고자 한다.</p>

In [89]:
# 영화 검색을 통해 movie_id 찾기

In [90]:
# Movie search: rows whose genre string contains 'animation'.
favorite_search = movies.loc[movies['genre'].str.contains('animation')]
favorite_search.tail(20)

Unnamed: 0,movie_id,title,genre
2985,3054,pokémon: the first movie (1998),animation|children's
3045,3114,toy story 2 (1999),animation|children's|comedy
3090,3159,fantasia 2000 (1999),animation|children's|musical
3144,3213,batman: mask of the phantasm (1993),animation|children's
3218,3287,"tigger movie, the (2000)",animation|children's
3331,3400,we're back! a dinosaur's story (1993),animation|children's
3360,3429,creature comforts (1990),animation|comedy
3414,3483,"road to el dorado, the (2000)",animation|children's
3523,3592,time masters (les maîtres du temps) (1982),animation|sci-fi
3542,3611,saludos amigos (1943),animation|children's|comedy


In [91]:
# Favorite movies: titles, ids and genres are parallel lists (same position = same movie).
my_favorite = [
    'lion king, the (1994)',
    'pocahontas (1995)',
    'toy story (1995)',
    'toy story 2 (1999)',
    'road to el dorado, the (2000)',
]
my_favorite_id = [
    364,
    48,
    1,
    3114,
    3483,
]
my_favorite_genre = [
    "animation|children's|musical",
    "animation|children's|musical|romance",
    "animation|children's|comedy",
    "animation|children's|comedy",
    "animation|children's",
]

In [92]:
# Assume a user with id 6041 has counts=3 for each favorite movie, all at timestamp 956704887.
my_playlist = pd.DataFrame(
    {
        'user_id': [6041]*5,
        'movie_id': my_favorite_id,
        'counts': [3]*5,
        'timestamp': [956704887]*5,
        'title': my_favorite,
        'genre': my_favorite_genre,
    }
)

In [93]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 836478 entries, 0 to 836477
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    836478 non-null  int64 
 1   movie_id   836478 non-null  int64 
 2   counts     836478 non-null  int64 
 3   timestamp  836478 non-null  int64 
 4   title      836478 non-null  object
 5   genre      836478 non-null  object
dtypes: int64(4), object(2)
memory usage: 38.3+ MB


In [94]:
my_playlist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    5 non-null      int64 
 1   movie_id   5 non-null      int64 
 2   counts     5 non-null      int64 
 3   timestamp  5 non-null      int64 
 4   title      5 non-null      object
 5   genre      5 non-null      object
dtypes: int64(4), object(2)
memory usage: 368.0+ bytes


In [95]:
# Add the hand-made favorites only when user 6041 is not in ratings yet,
# so re-running this cell does not duplicate the rows.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# The index labels of both frames are kept as they are (no ignore_index).
if not (ratings['user_id'] == 6041).any():
    ratings = pd.concat([ratings, my_playlist])

In [96]:
# 확인 
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
836473,4939,3952,3,987878275,"contender, the (2000)",drama|thriller
836474,1748,3952,3,992337202,"contender, the (2000)",drama|thriller
836475,1965,3952,3,974688627,"contender, the (2000)",drama|thriller
836476,1589,3952,4,974734727,"contender, the (2000)",drama|thriller
836477,4816,3952,4,985653887,"contender, the (2000)",drama|thriller
0,6041,364,3,956704887,"lion king, the (1994)",animation|children's|musical
1,6041,48,3,956704887,pocahontas (1995),animation|children's|musical|romance
2,6041,1,3,956704887,toy story (1995),animation|children's|comedy
3,6041,3114,3,956704887,toy story 2 (1999),animation|children's|comedy
4,6041,3483,3,956704887,"road to el dorado, the (2000)",animation|children's


In [97]:
# Renumber the rows from 0 so that index labels are unique again; the old index is discarded.
ratings = ratings.reset_index(drop = True)

In [98]:
# Put the columns into the display order used from here on.
ratings = ratings.loc[:, ['user_id', 'counts', 'movie_id', 'title', 'genre', 'timestamp']]

In [99]:
ratings

Unnamed: 0,user_id,counts,movie_id,title,genre,timestamp
0,5636,3,1,toy story (1995),animation|children's|comedy,958967064
1,4458,4,1,toy story (1995),animation|children's|comedy,965096620
2,4546,4,1,toy story (1995),animation|children's|comedy,964656523
3,5037,5,1,toy story (1995),animation|children's|comedy,962547743
4,5575,4,1,toy story (1995),animation|children's|comedy,959291679
...,...,...,...,...,...,...
836478,6041,3,364,"lion king, the (1994)",animation|children's|musical,956704887
836479,6041,3,48,pocahontas (1995),animation|children's|musical|romance,956704887
836480,6041,3,1,toy story (1995),animation|children's|comedy,956704887
836481,6041,3,3114,toy story 2 (1999),animation|children's|comedy,956704887


<div style = "height: 50px;"></div>

# <span style = "font-weight:bold;color:#0172d4;">STEP 4</span> : 데이터 인덱싱

In [100]:
# Collect the distinct user ids and movie ids, in order of first appearance.
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

In [101]:
# Map each original user id / movie id to a contiguous 0-based index.
user_to_idx = {user_id: idx for idx, user_id in enumerate(user_unique)}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_unique)}
# Reverse mapping (index -> original movie id); later cells use it to turn
# model output back into movie ids, so it has to be defined before them.
idx_to_movie = {idx: movie_id for movie_id, idx in movie_to_idx.items()}

In [104]:
# Spot-check the mappings: index of user 6041 and index of movie 1.
print(user_to_idx[6041])    
print(movie_to_idx[1])

6039
0


In [38]:
# Keep a copy of ratings before the id columns are overwritten with indices.
ratings_test = ratings.copy()
# To restore the pre-indexing state, run: ratings = ratings_test.copy()

In [105]:
# Translate every user_id into its index with user_to_idx.get.
# A row that could not be translated would end up NaN and be removed by dropna(),
# so a length mismatch below means the indexing failed for some row.
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()

if len(temp_user_data) != len(ratings):
    print('user_id column indexing Fail!!')
else:   # every row was translated
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # replace ratings['user_id'] with the indexed Series

user_id column indexing OK!!


In [106]:
# Translate the movie_id column the same way, using movie_to_idx.
temp_movie_id_data = ratings['movie_id'].map(movie_to_idx.get).dropna()

if len(temp_movie_id_data) != len(ratings):
    print('movie_id column indexing Fail!!')
else:   # every row was translated
    print('movie column indexing OK!!')
    ratings['movie_id'] = temp_movie_id_data

ratings

movie column indexing OK!!


Unnamed: 0,user_id,counts,movie_id,title,genre,timestamp
0,0,3,0,toy story (1995),animation|children's|comedy,958967064
1,1,4,0,toy story (1995),animation|children's|comedy,965096620
2,2,4,0,toy story (1995),animation|children's|comedy,964656523
3,3,5,0,toy story (1995),animation|children's|comedy,962547743
4,4,4,0,toy story (1995),animation|children's|comedy,959291679
...,...,...,...,...,...,...
836478,6039,3,350,"lion king, the (1994)",animation|children's|musical,956704887
836479,6039,3,47,pocahontas (1995),animation|children's|musical|romance,956704887
836480,6039,3,0,toy story (1995),animation|children's|comedy,956704887
836481,6039,3,2845,toy story 2 (1999),animation|children's|comedy,956704887


<div style = "height: 50px;"></div>

# <span style = "font-weight:bold;color:#0172d4;">STEP 5</span> : CSR matrix 만들기

In [107]:
# Dimensions for the CSR matrix: number of distinct users and of distinct movies.
# NOTE(review): this presumably equals max index + 1 because both columns were replaced
# by enumerate()-based indices in the cells above — confirm if the indexing step changes.
num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

In [108]:
# Build the user x movie matrix: rows are user indices, columns are movie indices, values are counts.
csr_data = csr_matrix(
    (ratings['counts'], (ratings['user_id'], ratings['movie_id'])),
    shape=(num_user, num_movie),
)

<div style = "height: 50px;"></div>

# <span style = "font-weight:bold;color:#0172d4;">STEP 6</span> : AlternatingLeastSquares 모델 설계 및 훈련하기

In [109]:
# Settings recommended by the implicit library (unrelated to what the model learns).
# NOTE(review): these are set after numpy and implicit were imported in the first cell;
# thread-count variables are typically read when the BLAS library loads — confirm they take effect here.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [196]:
# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=700,
    regularization=0.01,
    iterations=50,
    dtype=np.float32,
    use_gpu=False,
)

In [197]:
# Train the model on the user x movie matrix.
als_model.fit(csr_data)

  0%|          | 0/50 [00:00<?, ?it/s]

<div style = "height: 50px;"></div>

# <span style = "font-weight:bold;color:#0172d4;">STEP 7</span> : AlternatingLeastSquares 모델 평가하기

## 1. 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악하기
<hr/>

In [198]:
user_to_idx[6041]

6039

In [199]:
# Internal indices of user 6041 and of movie 1 (toy story), then their learned factor vectors.
test_user = user_to_idx[6041]
toy_story1 = movie_to_idx[1]
test_user_vec = als_model.user_factors[test_user]
toy_story1_vec = als_model.item_factors[toy_story1]

In [200]:
test_user_vec

array([-1.20597191e-01, -1.42932191e-01, -3.42828110e-02, -1.01214819e-01,
        1.32168710e-01, -2.88038999e-01,  1.48793057e-01,  3.01586598e-01,
        1.65445074e-01,  1.22846439e-01, -1.73211396e-01, -1.35136947e-01,
        1.05831623e-01, -2.28009641e-01, -2.69394189e-01,  1.47827700e-01,
        1.70879021e-01,  2.19084546e-01,  5.59382476e-02, -1.62197515e-01,
       -1.42817453e-01,  2.98008591e-01, -1.86207935e-01, -2.97203541e-01,
        1.24151133e-01,  3.80978137e-02, -2.91550130e-01,  8.00252408e-02,
       -2.45823920e-01,  2.91575491e-01, -7.37768412e-02,  1.72070131e-01,
       -1.82047952e-02,  1.21951699e-01,  5.23321442e-02,  7.88353384e-02,
       -1.96543913e-02,  1.95830837e-01,  7.36032352e-02, -2.28725597e-01,
       -2.06321210e-01, -6.03963099e-02,  2.97019444e-02,  2.62824327e-01,
       -9.90527347e-02, -1.35031424e-03, -3.13997805e-01, -7.08134919e-02,
       -1.92414597e-01,  3.47495191e-02, -2.01249737e-02, -2.95412093e-01,
        1.34008946e-02, -

In [201]:
toy_story1_vec

array([-1.11733275e-02,  2.02332484e-03,  5.46153449e-03, -5.50984126e-03,
        5.95646724e-03, -2.09215954e-02,  2.94126347e-02, -2.77331471e-02,
        3.96656506e-02,  2.75880322e-02,  1.88987181e-02,  1.05378414e-02,
        4.28187586e-02, -1.53473765e-02,  2.99978815e-03,  1.16213504e-02,
       -6.02362119e-03, -6.03786809e-03,  2.35972069e-02, -1.45992041e-02,
       -1.33595187e-02,  2.62241904e-02,  5.55036915e-03,  1.16651412e-02,
        2.96097528e-02, -2.04302669e-02, -1.65991224e-02,  1.83865279e-02,
        2.28166697e-03,  1.65198129e-02,  2.14585904e-02, -2.43959408e-02,
        1.41429603e-02, -1.94566250e-02,  2.25478783e-02,  2.67442316e-02,
        6.24932407e-04,  1.92349795e-02,  1.80827957e-02, -4.74170335e-02,
        1.40443696e-02,  5.76709118e-03,  1.02263149e-02,  2.60467622e-02,
       -1.71668245e-03, -2.45246645e-02,  8.73882044e-03,  3.21199447e-02,
       -5.12653636e-03,  1.00812875e-02, -7.76473805e-03,  2.71340292e-02,
        1.75255872e-02,  

In [202]:
# Dot product of the user vector and the toy story vector, i.e. the model's preference score for this pair.
np.dot(test_user_vec, toy_story1_vec)

0.95071226

factors=700으로 지정하였더니, 사용자와 아이템 벡터의 내적 수치가 의미있게 잘 형성되었다.<br/>
factors를 100으로 주게 되면, 내적 값이 약 0.57로 나온 것을 확인하였다.

<div style = "height: 50px;"></div>

## 2. 내가 좋아하는 영화와 비슷한 영화를 추천받아 보기
<hr/>

In [284]:
# Look up the movies most similar to a given movie.
def get_similar_movie(movie_name: int, N: int = 10):
    """Return the original movie ids of the movies most similar to ``movie_name``.

    Args:
        movie_name: Original ``movie_id`` of the query movie, i.e. a key of
            ``movie_to_idx`` (despite the parameter name, this is not a title).
        N: Number of similar items to request from the model.

    Returns:
        List of original movie ids, in the order returned by the model.

    Raises:
        KeyError: If ``movie_name`` is not a key of ``movie_to_idx``.
    """
    item_idx = movie_to_idx[movie_name]
    # similar_items returns an (item indices, scores) pair here; only the indices are needed.
    similar_indices = als_model.similar_items(item_idx, N=N)[0]
    # Invert movie_to_idx locally so the function does not rely on a separately defined reverse map.
    movie_by_idx = {idx: movie_id for movie_id, idx in movie_to_idx.items()}
    return [movie_by_idx[idx] for idx in similar_indices]

In [295]:
# Find a movie's internal index by title: keep the first rating row of each title, then search the titles.
movie_id_toy = ratings.drop_duplicates(subset=['title'])
movie_id_toy.loc[movie_id_toy['title'].str.contains('pocahontas')]

Unnamed: 0,user_id,counts,movie_id,title,genre,timestamp
19773,252,4,47,pocahontas (1995),animation|children's|musical|romance,974999474


toy story를 검색해서 toy story2가 나온다면 성공으로 판단하고자 한다.

In [296]:
movies[movies['title'].str.contains('pocahontas')]

Unnamed: 0,movie_id,title,genre
47,48,pocahontas (1995),animation|children's|musical|romance


In [297]:
# 48 is the original movie_id of pocahontas (1995).
similar_movie = get_similar_movie(48)

In [298]:
# Show the metadata of the similar movies; rows come back in `movies` order, not in similarity order.
similar_data = movies[movies['movie_id'].isin(similar_movie)]
similar_data

Unnamed: 0,movie_id,title,genre
47,48,pocahontas (1995),animation|children's|musical|romance
307,310,rent-a-kid (1995),comedy
773,783,"hunchback of notre dame, the (1996)",animation|children's|musical
805,815,power 98 (1995),action|mystery|thriller
876,888,land before time iii: the time of the great gi...,animation|children's
1774,1843,slappy and the stinkers (1998),children's|comedy
2182,2251,"cabinet of dr. ramirez, the (1991)",comedy
2490,2559,"king and i, the (1999)",animation|children's
2497,2566,doug's 1st movie (1999),animation|children's
3067,3136,"james dean story, the (1957)",documentary


왕과 나, 노틀담의 꼽추, 공룡시대 3 등 많은 애니메이션 영화가 출력된 것을 확인할 수 있었다. <br/> 유사도가 의미있게 측정이 되었다.

<div style = "height: 50px;"></div>

## 3. 내가 가장 좋아할 만한 영화들을 추천받아보기
<hr/>

In [251]:
user = user_to_idx[6041]
# recommend takes a user x item CSR matrix.
# NOTE: passing the whole matrix for a single user raises the ValueError shown in this cell's output
# on the implicit version used here; the next code cell passes only this user's row instead.
movies_recommended = als_model.recommend(user, csr_data, N=10, filter_already_liked_items=True)
movies_recommended

ValueError: user_items must contain 1 row for every user in userids

오류를 찾아본 결과, csr_data[userids]로 변경해야한다는 것을 확인하였다. <br/>
[참고자료](https://bytemeta.vip/repo/benfred/implicit/issues/535)

In [258]:
user = user_to_idx[6041]
# recommend takes a user x item CSR matrix; csr_data[user] selects only this user's row.
movies_recommend = als_model.recommend(user, csr_data[user], N=10, filter_already_liked_items=True)
movies_recommend

(array([ 725, 1409,  568,  575, 2434, 3304, 1689, 1520, 2506, 2153],
       dtype=int32),
 array([0.21304816, 0.18543401, 0.16891176, 0.14465469, 0.14410135,
        0.1351954 , 0.12553616, 0.11215486, 0.11122189, 0.1099642 ],
       dtype=float32))

In [259]:
# First element of the result: the array of recommended item indices (the second holds the scores).
recommend_movie = movies_recommend[0]

In [260]:
# Convert item indices back to original movie ids.
# NOTE: this rebinds movies_recommend from the (indices, scores) result to a plain list of ids.
movies_recommend = [idx_to_movie[i] for i in recommend_movie]

In [320]:
movies_recommend

[783, 1566, 588, 595, 2687, 3615, 1907, 1688, 2761, 2394]

In [321]:
movies[movies['movie_id'].isin(movies_recommend)]

Unnamed: 0,movie_id,title,genre
584,588,aladdin (1992),animation|children's|comedy|musical
591,595,beauty and the beast (1991),animation|children's|musical
773,783,"hunchback of notre dame, the (1996)",animation|children's|musical
1526,1566,hercules (1997),adventure|animation|children's|comedy|musical
1642,1688,anastasia (1997),animation|children's|musical
1838,1907,mulan (1998),animation|children's
2325,2394,"prince of egypt, the (1998)",animation|musical
2618,2687,tarzan (1999),animation|children's
2692,2761,"iron giant, the (1999)",animation|children's
3546,3615,dinosaur (2000),animation|children's


미녀와 야수, 노틀담의 꼽추, 헤라클레스, 아나스타샤, 뮬란, 이집트 왕자, 타잔 등 각종 동화 애니메이션 영화들을 추천받았다. <br/> 그 중, 노틀담의 꼽추가 0.21304816으로 가장 높으므로, 노틀담의 꼽추를 추천할 때 추가했던 각각 my_favorite들의 기여도를 확인하고자 한다.

<div style = "height: 50px;"></div>

## 4. 기여도 확인하기
<hr/>

In [330]:
# 783 is the original movie_id of "hunchback of notre dame, the (1996)", the top recommendation;
# translate it to the model's item index before asking for the explanation.
hunchback_idx = movie_to_idx[783]
explain = als_model.explain(user, csr_data, itemid = hunchback_idx)

In [331]:
explain

(0.21131858974165724,
 [(47, 0.15035395579520142),
  (350, 0.03454991607283462),
  (3182, 0.029928047362040813),
  (0, -0.0008852820369958259),
  (2845, -0.0026280474514237713)],
 (array([[0.59014716, 0.15278273, 0.16235044, ..., 0.10929095, 0.10622961,
          0.12207671],
         [0.09016429, 0.61502478, 0.14821994, ..., 0.1047266 , 0.10400323,
          0.11683836],
         [0.09581065, 0.11596328, 0.6139697 , ..., 0.09324496, 0.0957614 ,
          0.09955473],
         ...,
         [0.06449774, 0.08110722, 0.09051558, ..., 0.52140977, 0.00386288,
          0.00853695],
         [0.06269111, 0.08019461, 0.09145637, ..., 0.06588863, 0.52365138,
          0.00139759],
         [0.07204323, 0.0905097 , 0.09826057, ..., 0.07610832, 0.07113377,
          0.51762511]]),
  False))

In [332]:
# Second element of the explanation: (item index, contribution) pairs; translate the indices to movie ids.
explain_data = [(idx_to_movie[item_idx], contribution) for item_idx, contribution in explain[1]]

In [341]:
# Movie ids of the first five contributing movies, in contribution order.
explain_list = [explain_data[position][0] for position in range(5)]

In [345]:
explain_list

[48, 364, 3483, 1, 3114]

In [344]:
movies[movies['movie_id'].isin(explain_list)]

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),animation|children's|comedy
47,48,pocahontas (1995),animation|children's|musical|romance
360,364,"lion king, the (1994)",animation|children's|musical
3045,3114,toy story 2 (1999),animation|children's|comedy
3414,3483,"road to el dorado, the (2000)",animation|children's


노틀담의 꼽추를 추천하는데 가장 큰 영향을 미친 영화는 pocahontas이며,  약 0.1503540의 영향을 미친 것을 확인하였다. 

### 기여도 순위

<ol>
    <li style = "margin-bottom:3px;"><span style = "font-weight:bold;color:#0172d4;">pocahontas</span> : 0.1503540</li>
    <li style = "margin-bottom:3px;"><span style = "font-weight:bold;color:#0172d4;">lion king, the</span> : 0.034550</li>
    <li style = "margin-bottom:3px;"><span style = "font-weight:bold;color:#0172d4;">road to el dorado, the</span> : 0.029928</li>
    <li style = "margin-bottom:3px;"><span style = "font-weight:bold;color:#0172d4;">toy story</span> : -0.000885</li>
    <li style = "margin-bottom:3px;"><span style = "font-weight:bold;color:#0172d4;">toy story 2</span> : -0.002628</li>
</ol>

<div style = "height: 50px;"></div>

# <span style = "font-weight:bold;color:#0172d4;">STEP 8</span> : 회고하기

<ol style= "font-size:16px; line-height:30px;">
	<li style = "margin-bottom:14px;">영화 데이터가 2000년도까지라서 많이 아쉽다.</li>
	<li style = "margin-bottom:14px;">버전 문제로 인해 lms에서는 recommend가 잘 실행되었으나, 로컬에서는 작동하지 않았다.</li>
	<li style = "margin-bottom:14px;">2개의 데이터를 합치는 과정을 진행하였으나, 모델을 돌리는데에 있어 큰 작용을 하지 않아 시간을 낭비한 것 같다.</li>
	<li style = "margin-bottom:14px;">모델의 출력 결과를 시각화하면 좀 더 좋았을 것 같다. (시간 부족 이슈로 인한 한계)</li>
    <li style = "margin-bottom:14px;">출력된 인덱스를 movies의 인덱스로 변환하는 과정에 있어 오류가 뜨는데 버전 오류로 인해 생긴 것 같다.<br/>이는 [idx_to_movie[i] for i in 모델이 출력한 결과값들]에서 [idx_to_movie[i] for i in 모델이 출력한 결과값[0]]으로 처리하면 해결 가능하다.</li>
</ol>

<div style = "height: 50px;"></div>

## 2. 참고자료
<hr/>

<ul style= "font-size:16px; line-height:30px;">
    <li style = "margin-bottom:14px;"><a href = "https://bytemeta.vip/repo/benfred/implicit/issues/535">recommend 오류 해결법</a></li>
    <li style = "margin-bottom:14px;"><a href = "https://colab.research.google.com/drive/1wdFPQXfV8QV5EvQG-STGCJC0gObF62Mu#scrollTo=4B8J5xk06nsA">코랩 사용 시 오류 해결법</a></li>
</ul>

<div style = "height: 50px;"></div>