## 데이터 준비

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

import os

In [2]:
# Load the MovieLens ratings table and the movie metadata table (in that order).
ratings, movies = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/movie_lens/ratings.csv', './data/movie_lens/movies.csv')
)

In [3]:
# Preview the first rows of the ratings table to check its columns.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Preview the first rows of the movies table to check its columns.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## 데이터 분석하기

In [5]:
# Count the distinct movies and users that appear in the ratings table.
ratings_movie = ratings.movieId.nunique()
ratings_user = ratings.userId.nunique()
print(
    f'ratings에 있는 유니크한 영화 개수: {ratings_movie}',
    f'ratings에 있는 유니크한 사용자 수: {ratings_user}',
    sep='\n',
)

ratings에 있는 유니크한 영화 개수: 9066
ratings에 있는 유니크한 사용자 수: 671


In [6]:
# The timestamp column is not used anywhere later in this notebook, so drop it.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
ratings = ratings.drop(columns='timestamp', errors='ignore')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [7]:
# Genres are not used anywhere later in this notebook, so drop the column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
movies = movies.drop(columns='genres', errors='ignore')
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [8]:
# Join ratings with movie titles on movieId, then drop the id so that each
# row is described by userId, rating and title only.
data = ratings.merge(movies, on='movieId').drop(columns='movieId')
data.head(5)

Unnamed: 0,userId,rating,title
0,1,2.5,Dangerous Minds (1995)
1,7,3.0,Dangerous Minds (1995)
2,31,4.0,Dangerous Minds (1995)
3,32,4.0,Dangerous Minds (1995)
4,36,3.0,Dangerous Minds (1995)


In [9]:
# Most-rated movies, most popular first (only the top 10 are displayed).
# Titles are lower-cased first, so later title-based lookups in this notebook
# use lower-case strings.
data['title'] = data['title'].str.lower()
# Number of rating rows per title.
movies_count = data.groupby('title')['userId'].count()
movies_count.sort_values(ascending=False).head(10)

title
forrest gump (1994)                          341
pulp fiction (1994)                          324
shawshank redemption, the (1994)             311
silence of the lambs, the (1991)             304
star wars: episode iv - a new hope (1977)    291
jurassic park (1993)                         274
matrix, the (1999)                           259
toy story (1995)                             247
schindler's list (1993)                      244
terminator 2: judgment day (1991)            237
Name: userId, dtype: int64

In [10]:
# Summary statistics of how many titles each user has rated.
user_count = data['title'].groupby(data['userId']).count()
user_count.describe()

count     671.000000
mean      149.037258
std       231.226948
min        20.000000
25%        37.000000
50%        71.000000
75%       161.000000
max      2391.000000
Name: title, dtype: float64

## 내가 선호하는 영화 추가하기

In [11]:
# Favourite movies for the extra user '0'. Each string must match a (lower-cased)
# catalogue title exactly; otherwise it becomes a brand-new item that only this
# user has rated.
# NOTE(review): the recommendation output further down lists
# 'men in black (a.k.a. mib) (1997)' as a not-yet-seen movie, which suggests that
# 'men in black (1997)' is not a catalogue title -- confirm and correct the spelling.
my_favor_movie = [
    'men in black (1997)',
    'fugitive, the (1993)',
    'terminator, the (1984)',
    'star wars: episode vi - return of the jedi (1983)',
    'terminator 2: judgment day (1991)',
]

# Rows that do not belong to the extra user. On the first run this is the whole
# table; on a re-run it strips the previously added playlist so the favourites
# are not appended twice.
catalogue_rows = data[data['userId'] != '0']

# Make title mismatches visible instead of silently creating phantom items.
unknown_titles = sorted(set(my_favor_movie) - set(catalogue_rows['title']))
if unknown_titles:
    print(f'WARNING: favourites not found in the catalogue (added as new items): {unknown_titles}')

# Every favourite gets the top rating of 5 from user '0'.
my_playlist = pd.DataFrame({
    'userId': ['0'] * len(my_favor_movie),
    'rating': [5] * len(my_favor_movie),
    'title': my_favor_movie,
})

# Put the new user's rows first; sort=True orders the columns alphabetically.
data = pd.concat([my_playlist, catalogue_rows], sort=True).reset_index(drop=True)
data.head(10)

Unnamed: 0,rating,title,userId
0,5.0,men in black (1997),0
1,5.0,"fugitive, the (1993)",0
2,5.0,"terminator, the (1984)",0
3,5.0,star wars: episode vi - return of the jedi (1983),0
4,5.0,terminator 2: judgment day (1991),0
5,2.5,dangerous minds (1995),1
6,3.0,dangerous minds (1995),7
7,4.0,dangerous minds (1995),31
8,4.0,dangerous minds (1995),32
9,3.0,dangerous minds (1995),36


선호 데이터가 추가된 모습을 볼 수 있습니다.

## 전처리하기

In [12]:
# Distinct raw user ids and titles, in order of first appearance.
user_unique = data['userId'].unique()
title_unique = data['title'].unique()

# Lookup tables: raw value -> dense integer index (0, 1, 2, ...).
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
title_to_idx = dict(zip(title_unique, range(len(title_unique))))

# Spot-check one user and one title.
print('kimin의 index 번호:', user_to_idx['0'])
print('forrest gump 의 index 번호:', title_to_idx['forrest gump (1994)'])

kimin의 index 번호: 0
forrest gump 의 index 번호: 62


내가 선호하는 영화가 잘 추가 됐는지 확인합니다.

In [13]:
# Translate raw user ids into their dense integer indices. A value that is not
# a key of user_to_idx maps to a missing value and is removed by dropna().
temp_user_data = data['userId'].map(user_to_idx.get).dropna()

# Same translation for titles.
temp_title_data = data['title'].map(title_to_idx.get).dropna()

# A column is overwritten only when no row was lost in the mapping; otherwise
# it is silently left unchanged.
if len(temp_user_data) == len(data):   # run only if every row was indexed successfully
    data['userId'] = temp_user_data 

if len(temp_title_data) == len(data):
    data['title'] = temp_title_data

data

Unnamed: 0,rating,title,userId
0,5.0,0,0
1,5.0,1,0
2,5.0,2,0
3,5.0,3,0
4,5.0,4,0
...,...,...,...
100004,2.5,1460,213
100005,3.5,9061,213
100006,3.0,9062,78
100007,1.0,9063,78


CSR matrix를 만들기 전에, user id와 title을 인덱스로 변경해줍니다. 

## CSR matrix 만들기

In [14]:
# Show the rating column that supplies the values of the sparse matrix.
# A bare expression is only rendered when it is the last statement of a cell,
# so a single expression replaces the earlier no-op line plus print().
data['rating']

0         5.0
1         5.0
2         5.0
3         5.0
4         5.0
         ... 
100004    2.5
100005    3.5
100006    3.0
100007    1.0
100008    1.0
Name: rating, Length: 100009, dtype: float64


In [15]:
# Matrix dimensions: number of distinct user indices x number of distinct title indices.
num_user = data.userId.nunique()
num_movie = data.title.nunique()

# Sparse user x title matrix with the rating as the stored value.
csr_data = csr_matrix(
    (data['rating'], (data['userId'], data['title'])),
    shape=(num_user, num_movie),
)
csr_data

<672x9065 sparse matrix of type '<class 'numpy.float64'>'
	with 100008 stored elements in Compressed Sparse Row format>

##  AlternatingLeastSquares 모델

In [16]:
# Settings recommended by the implicit library; unrelated to the learning content.
# NOTE(review): these are set after numpy/implicit were imported -- confirm that
# they still take effect at this point.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [17]:
# ALS matrix-factorisation model with 100 latent factors, trained for 30 iterations.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    iterations=30,
    dtype=np.float32,
)

# fit() is given the transposed (title x user) matrix here.
csr_data_transpose = csr_data.transpose()
als_model.fit(csr_data_transpose)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [18]:
# Dense indices of the extra user '0' and of the movie "die hard".
kimin = user_to_idx['0']
die_hard = title_to_idx['die hard (1988)']

# Learned latent vectors for that user and that movie.
kimin_vector = als_model.user_factors[kimin]
die_hard_vector = als_model.item_factors[die_hard]

In [19]:
# Inspect the latent vector learned for user '0'.
kimin_vector

array([-7.49779493e-02,  1.78338170e-01,  9.81864929e-02,  2.46803194e-01,
       -2.62321949e-01, -1.36777952e-01,  1.45235509e-01, -8.76451358e-02,
       -8.69406164e-02,  4.28637236e-01,  2.27460623e-01,  4.67960328e-01,
       -2.82364994e-01, -4.68048722e-01,  1.14705488e-01,  1.86903462e-01,
       -2.36493886e-01, -2.42668718e-01,  4.48053814e-02,  7.50174047e-03,
       -5.27423024e-02, -1.66138083e-01, -2.74246167e-02,  2.77099043e-01,
        3.89506621e-03,  1.56279907e-01, -1.96906060e-01,  2.38507584e-01,
        2.25634038e-01,  4.17619795e-01, -1.39897645e-01,  2.19494924e-01,
       -2.11697549e-01, -2.53737479e-01, -2.01079458e-01,  1.15843847e-01,
       -1.81851014e-01, -7.17751443e-01,  1.04431741e-01, -2.15486288e-02,
        3.18315029e-01,  3.55931674e-03,  2.82633543e-01,  1.38362527e-01,
       -2.59710588e-02, -3.68795216e-01, -1.96005806e-01, -2.51703143e-01,
       -4.27017512e-04, -2.94784643e-02,  2.81476676e-01, -3.90900135e-01,
       -6.08405247e-02,  

내가 선호하는 영화에 따라 user의 특징을 나타내는 벡터가 생성되었습니다. 

In [20]:
# Inspect the latent vector learned for "die hard (1988)".
die_hard_vector

array([ 0.00361545,  0.0376028 ,  0.01031397,  0.04850543, -0.01299863,
        0.01347434,  0.00166487, -0.0228728 , -0.02781936,  0.03329022,
        0.01451709,  0.02552956,  0.02384741, -0.00893693,  0.03731588,
       -0.03966041, -0.05437039, -0.0247719 , -0.03278617, -0.00826715,
        0.02049851,  0.00877084, -0.01022084,  0.03746725, -0.00570133,
        0.03435934,  0.02821262,  0.00874103, -0.01561256,  0.03459564,
       -0.00781782, -0.01273238, -0.01081162, -0.0219006 , -0.01813494,
        0.02310316, -0.03148777,  0.00365834, -0.03520845,  0.02139181,
        0.0160508 , -0.00556455,  0.00780175,  0.02663171,  0.00018574,
       -0.00542944,  0.00040563, -0.04733204,  0.02365248,  0.04402654,
        0.04754265, -0.04117234, -0.03610666, -0.00617777,  0.023834  ,
        0.0232125 , -0.02084619,  0.02945786, -0.02073666, -0.06728745,
        0.00581193,  0.00641899,  0.0016074 , -0.00251132, -0.0236092 ,
       -0.02495179,  0.01650328,  0.01236798,  0.03303682, -0.00

마찬가지로 각 영화의 특성을 나타내는 벡터가 생성되었습니다. 

In [22]:
# Preference score = inner product of the user vector and the movie vector.
kimin_vector.dot(die_hard_vector)

0.33416337

kimin이 다이 하드를 선호하는 점수는 0.33이 됩니다.

In [23]:
# Same preference score for "toy story (1995)", a title outside the favourites list.
toy = title_to_idx['toy story (1995)']
toy_vector = als_model.item_factors[toy]
kimin_vector.dot(toy_vector)

0.0942103

토이스토리의 선호점수는 0.09로 다이 하드에 비해서는 현저히 낮은 점수입니다.

## 영화 추천받기

In [24]:
# Reverse lookup: dense title index -> title string.
idx_to_title = {idx: title for title, idx in title_to_idx.items()}

def get_similar_title(title_name: str, n: int = 10):
    """Return the titles the ALS model ranks as most similar to ``title_name``.

    Args:
        title_name: A key of ``title_to_idx`` (titles are stored lower-cased).
        n: Number of neighbours requested from the model. The first hit is
            discarded, so up to ``n - 1`` titles are returned. The default of
            10 keeps the behaviour of calling ``similar_items`` without ``N``.

    Returns:
        A list of title strings in the order returned by
        ``als_model.similar_items``, without the first entry.

    Raises:
        KeyError: If ``title_name`` is not a key of ``title_to_idx``.
    """
    title_id = title_to_idx[title_name]
    neighbours = als_model.similar_items(title_id, N=n)
    # The first hit is expected to be the query title itself, so skip it.
    return [idx_to_title[hit[0]] for hit in neighbours[1:]]

선호 점수를 기반으로 선호 영화와 비슷한 영화를 추천받는 함수를 만들어 보겠습니다. 여기서 선호 영화는 제외합니다.

In [25]:
# Titles similar to one of the favourites that were added for user '0'.
get_similar_title('men in black (1997)')

['terminator, the (1984)',
 'terminator 2: judgment day (1991)',
 'fugitive, the (1993)',
 'star wars: episode vi - return of the jedi (1983)',
 'jurassic park (1993)',
 'die hard (1988)',
 'rare birds (2001)',
 'four days in september (o que é isso, companheiro?) (1997)',
 'heaven (2002)']

In [26]:
# Titles similar to a movie that is not in the favourites list.
get_similar_title('toy story (1995)')

['independence day (a.k.a. id4) (1996)',
 'monsters, inc. (2001)',
 'toy story 2 (1999)',
 'magic mike xxl (2015)',
 'aloha (2015)',
 'survivor (2015)',
 "i'll see you in my dreams (2015)",
 '7 days in hell (2015)',
 'entourage (2015)']

In [27]:
# Another similarity query for a movie that is not in the favourites list.
get_similar_title('back to the future (1985)')

['forrest gump (1994)',
 'star wars: episode v - the empire strikes back (1980)',
 'raiders of the lost ark (indiana jones and the raiders of the lost ark) (1981)',
 'ghostbusters (a.k.a. ghost busters) (1984)',
 'matrix, the (1999)',
 'terminator, the (1984)',
 'star wars: episode vi - return of the jedi (1983)',
 'star wars: episode iv - a new hope (1977)',
 'magic mike xxl (2015)']

In [28]:
user = user_to_idx['0']
# recommend() is given the user x item CSR matrix.
title_recommended = als_model.recommend(
    user,
    csr_data,
    N=20,
    filter_already_liked_items=True,
)
# Print each recommended title with its score, three decimals.
for item_id, score in title_recommended:
    print("{} {:0.3f}".format(idx_to_title[item_id], score))

jurassic park (1993) 0.437
star wars: episode v - the empire strikes back (1980) 0.340
die hard (1988) 0.334
braveheart (1995) 0.324
forrest gump (1994) 0.317
raiders of the lost ark (indiana jones and the raiders of the lost ark) (1981) 0.316
star wars: episode iv - a new hope (1977) 0.315
true lies (1994) 0.302
back to the future (1985) 0.297
indiana jones and the last crusade (1989) 0.220
matrix, the (1999) 0.200
e.t. the extra-terrestrial (1982) 0.199
silence of the lambs, the (1991) 0.193
men in black (a.k.a. mib) (1997) 0.193
dances with wolves (1990) 0.177
aliens (1986) 0.176
independence day (a.k.a. id4) (1996) 0.160
speed (1994) 0.150
alien (1979) 0.147
saving private ryan (1998) 0.146


## Summary

위에서 평점을 5점 준 데이터들은 아래와 같았습니다. 
- 'men in black (1997)' 
- 'fugitive, the (1993)'
- 'terminator, the (1984)'
- 'star wars: episode vi - return of the jedi (1983)'
- 'terminator 2: judgment day (1991)'

추천된 영화에서 액션과 관련된 영화가 많이 보이는 것 같습니다. 