## 데이터 준비

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

import os

In [3]:
# Load the ml-1m ratings file. Fields are separated by '::' and the column
# names are supplied explicitly.
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',
)
# Remember the unfiltered row count so the filtering cell can report a ratio.
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Keep only the rows whose rating is 3 or higher.
keep_mask = ratings['rating'] >= 3
ratings = ratings[keep_mask]
filtered_data_size = len(ratings)
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')
ratings.head()

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# Load the movie metadata so that titles can be shown instead of raw movie ids.
cols = ['movie_id', 'title', 'genre']
movie_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
)
movies.head(5)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 데이터 분석하기

In [6]:
# Count the distinct movies and users that remain in the filtered ratings.
ratings_movie, ratings_user = (
    ratings[column].nunique() for column in ('movie_id', 'user_id')
)
print(f'ratings에 있는 유니크한 영화 개수: {ratings_movie}')
print(f'ratings에 있는 유니크한 사용자 수: {ratings_user}')

ratings에 있는 유니크한 영화 개수: 3628
ratings에 있는 유니크한 사용자 수: 6039


In [7]:
# Join the movie metadata onto the ratings.
# The join key is spelled out instead of relying on pandas picking up whichever
# column names the two frames happen to share; 'movie_id' is the only shared
# column, so the result is the same inner join as before.
data = pd.merge(movies, ratings, on='movie_id', how='inner')

# Only the user, the title and the rating are needed from here on.
using_cols = ['user_id', 'title', 'rating']
data = data[using_cols]
data.head(5)

Unnamed: 0,user_id,title,rating
0,1,Toy Story (1995),5
1,6,Toy Story (1995),4
2,8,Toy Story (1995),4
3,9,Toy Story (1995),5
4,10,Toy Story (1995),5


In [8]:
# Top 10 most popular movies, ranked by the number of rating rows per title.
# The titles are lower-cased in place first, so every later cell sees
# lower-case titles in `data`.
data['title'] = data['title'].str.lower()
movies_count = data.groupby('title')['user_id'].count()
movies_count.sort_values(ascending=False).head(10)

title
american beauty (1999)                                   3211
star wars: episode iv - a new hope (1977)                2910
star wars: episode v - the empire strikes back (1980)    2885
star wars: episode vi - return of the jedi (1983)        2716
saving private ryan (1998)                               2561
terminator 2: judgment day (1991)                        2509
silence of the lambs, the (1991)                         2498
raiders of the lost ark (1981)                           2473
back to the future (1985)                                2460
matrix, the (1999)                                       2434
Name: user_id, dtype: int64

In [9]:
# Summary statistics of how many movies each user has rated.
user_count = data.groupby('user_id')['title'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: title, dtype: float64

## 내가 선호하는 영화 추가하기

In [10]:
# Add a synthetic user ('kimin') who gives the top rating of 5 to each favorite.
# Titles are written in lower case because the 'title' column of `data` was
# lower-cased in an earlier cell.
my_favor_movie = [
    'men in black (1997)',
    'fugitive, the (1993)',
    'terminator, the (1984)',
    'back to the future (1985)',
    'forrest gump (1994)',
]

# Fail fast on a misspelled title: an unknown title would otherwise silently
# become a brand-new item that only this user has rated.
unknown_titles = set(my_favor_movie) - set(data['title'])
assert not unknown_titles, f'titles not found in data: {sorted(unknown_titles)}'

# Size the constant columns from the list itself, so editing the list can never
# produce a length mismatch.
num_favorites = len(my_favor_movie)
my_playlist = pd.DataFrame({
    'user_id': ['kimin'] * num_favorites,
    'title': my_favor_movie,
    'rating': [5] * num_favorites,
})

# The recorded output shows the columns ordered rating, title, user_id after
# this concat; later cells access columns by name only.
data = pd.concat([data, my_playlist], sort=True)

data.tail(10)

Unnamed: 0,rating,title,user_id
836473,3,"contender, the (2000)",5682
836474,4,"contender, the (2000)",5812
836475,3,"contender, the (2000)",5831
836476,4,"contender, the (2000)",5837
836477,4,"contender, the (2000)",5998
0,5,men in black (1997),kimin
1,5,"fugitive, the (1993)",kimin
2,5,"terminator, the (1984)",kimin
3,5,back to the future (1985),kimin
4,5,forrest gump (1994),kimin


## 전처리하기

In [11]:
# Build lookup tables that map every raw user id / title to a dense integer
# index (0, 1, 2, ...) in the order produced by .unique().
user_unique = data['user_id'].unique()
title_unique = data['title'].unique()

user_to_idx = dict(zip(user_unique, range(len(user_unique))))
title_to_idx = dict(zip(title_unique, range(len(title_unique))))

print('kimin의 index 번호:', user_to_idx['kimin'])
print('forrest gump 의 index 번호:', title_to_idx['forrest gump (1994)'])

kimin의 index 번호: 6039
forrest gump 의 index 번호: 342


In [12]:
# Replace the raw user ids with their dense integer indices. Rows that fail to
# map are dropped, so a shorter result means some id was missing from the table
# and the column is then left untouched.
user_idx_column = data['user_id'].map(user_to_idx.get).dropna()
if len(user_idx_column) != len(data):
    print('user_id column indexing Fail!!')
else:
    print('user_id column indexing OK!!')
    data['user_id'] = user_idx_column

# Same procedure for the titles.
title_idx_column = data['title'].map(title_to_idx.get).dropna()
if len(title_idx_column) != len(data):
    print('title column indexing Fail!!')
else:
    print('title column indexing OK!!')
    data['title'] = title_idx_column

data

user_id column indexing OK!!
title column indexing OK!!


Unnamed: 0,rating,title,user_id
0,5,0,0
1,4,0,1
2,4,0,2
3,5,0,3
4,5,0,4
...,...,...,...
0,5,1419,6039
1,5,439,6039
2,5,1122,6039
3,5,1152,6039


## CSR matrix 만들기

In [13]:
# Show the rating column that will become the stored values of the sparse matrix.
print(data['rating'])

0    5
1    4
2    4
3    5
4    5
    ..
0    5
1    5
2    5
3    5
4    5
Name: rating, Length: 836483, dtype: int64


In [14]:
# Matrix dimensions: the number of distinct user indices and title indices.
num_user = data['user_id'].nunique()
num_movie = data['title'].nunique()

# Sparse matrix with users as rows, titles as columns and ratings as values.
row_idx, col_idx = data['user_id'], data['title']
csr_data = csr_matrix(
    (data['rating'], (row_idx, col_idx)),
    shape=(num_user, num_movie),
)
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

##  AlternatingLeastSquares 모델

In [15]:
# Settings recommended by the implicit library; they are unrelated to the
# modelling itself.
# NOTE(review): numpy and implicit are already imported in the first cell, before
# these variables are set -- confirm the thread limits still take effect when
# they are set this late.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [17]:
# Configure the ALS model: 100 latent factors, 30 iterations, float32 factors.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    iterations=30,
    dtype=np.float32,
)

# The model is fitted on the transpose of the user x item matrix.
# NOTE(review): which orientation fit() expects depends on the installed implicit
# version -- confirm against the library documentation.
csr_data_transpose = csr_data.T
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [20]:
# Look up the dense indices first, then pull the matching latent vectors out of
# the trained model.
kimin = user_to_idx['kimin']
back_future = title_to_idx['back to the future (1985)']
kimin_vector = als_model.user_factors[kimin]
back_future_vector = als_model.item_factors[back_future]

In [19]:
# Display the latent factor vector the model holds for user 'kimin'.
kimin_vector

array([ 0.32725337,  0.46704096,  0.3601165 ,  0.08476339, -0.62813365,
       -0.16452338,  0.12036967, -0.07353739,  0.17817178,  0.6810431 ,
       -0.6617621 ,  0.7172151 , -0.22007641, -0.06624185,  0.04734982,
        0.7880284 ,  0.5529376 , -0.5837886 , -0.5205703 , -0.03517917,
        0.7480841 , -0.74441826, -0.15154919,  0.07642394, -0.45291698,
       -0.383498  ,  0.5433273 , -0.46065077, -0.36616284,  0.1430307 ,
        0.3898874 ,  0.49043894,  0.3882472 ,  0.3260817 , -0.7980312 ,
        0.5651583 ,  0.14214188, -0.94524336, -0.61093086,  0.63193226,
        0.04822527,  0.1572215 , -0.18570226,  0.6433806 , -0.25215244,
        0.40647346,  1.0391612 ,  0.55173177,  0.44879565, -0.10444115,
       -0.3531667 ,  0.02185513, -0.6492503 , -0.6074436 ,  0.0275471 ,
       -0.47011158,  0.5340451 , -0.54522806,  0.08179111, -0.47947153,
        0.20876805,  0.1945253 ,  0.4928832 , -0.13175339, -0.25978595,
        0.41144344,  0.22034125,  0.21988942, -0.5413961 , -0.43

In [21]:
# Display the latent factor vector the model holds for 'back to the future (1985)'.
back_future_vector

array([-7.40955444e-03,  5.84189892e-02, -1.46386703e-03,  9.84407612e-04,
       -8.95474479e-03,  2.50771847e-02,  1.47968449e-03,  2.72427406e-03,
       -4.20604227e-03,  1.31540559e-02, -2.93520074e-02,  3.00687551e-03,
       -6.77027914e-04,  1.93653461e-02,  6.23673154e-03,  2.67176535e-02,
        1.69822462e-02,  1.18038068e-02, -2.72943769e-02,  1.05355354e-02,
        3.41869071e-02, -4.33712918e-03,  3.20674218e-02, -3.32206138e-03,
        2.10608449e-03, -2.57191854e-03,  4.60849665e-02, -4.39683832e-02,
       -2.35103089e-02,  4.57744393e-03,  1.66120995e-02,  1.02578187e-02,
        2.28685085e-02, -1.37284473e-02, -1.10420296e-02,  1.15731936e-02,
        1.81657262e-02, -3.74217518e-02,  1.93092488e-02,  5.42104580e-02,
       -1.26651255e-02, -6.27271691e-03,  8.51532910e-03,  1.52287018e-02,
        1.61138941e-02,  2.38668174e-02,  4.08922993e-02,  5.86544126e-02,
        2.04588473e-02, -2.42098365e-02, -2.54511461e-02,  3.28882076e-02,
       -2.41123829e-02, -

In [22]:
# Predicted preference of user 'kimin' for 'back to the future (1985)': the dot
# product of the user's and the movie's latent vectors.
# The movie vector is named `back_future_vector` where it is defined; the
# previous `back_to_the_future_vector` was never assigned and raised NameError
# on a fresh kernel.
np.dot(kimin_vector, back_future_vector)

0.58377063

In [23]:
# Predicted preference of user 'kimin' for 'toy story (1995)', a movie that is
# not in the favorites list: dot product of the user and item latent vectors.
toy = title_to_idx['toy story (1995)']
toy_vector = als_model.item_factors[toy]
np.dot(kimin_vector, toy_vector)

0.20463058

## 영화 추천받기

In [24]:
# Reverse lookup: dense item index -> movie title.
idx_to_title = {idx: title for title, idx in title_to_idx.items()}


def get_similar_title(title_name: str, n: int = 10):
    """Return the titles the ALS model considers most similar to ``title_name``.

    Args:
        title_name: Movie title including the year, e.g. ``'toy story (1995)'``.
            Surrounding whitespace and letter case are ignored, because the
            titles stored in ``title_to_idx`` were lower-cased during
            preprocessing.
        n: How many titles to return. Defaults to 10.

    Returns:
        A list of title strings in the order returned by
        ``als_model.similar_items``. In the outputs recorded in this notebook
        the queried movie itself comes first.

    Raises:
        KeyError: If the normalized title is not present in ``title_to_idx``.
    """
    normalized = title_name.strip().lower()
    if normalized not in title_to_idx:
        raise KeyError(f'unknown movie title: {title_name!r}')
    title_id = title_to_idx[normalized]
    similar_title = als_model.similar_items(title_id, N=n)
    # Each entry carries the item index in position 0; only that is needed here.
    return [idx_to_title[entry[0]] for entry in similar_title]

In [25]:
# Titles most similar to one of the movies in the favorites list.
get_similar_title('men in black (1997)')

['men in black (1997)',
 'jurassic park (1993)',
 'terminator 2: judgment day (1991)',
 'total recall (1990)',
 'independence day (id4) (1996)',
 'matrix, the (1999)',
 'fifth element, the (1997)',
 'lost world: jurassic park, the (1997)',
 'galaxy quest (1999)',
 'face/off (1997)']

In [26]:
# Titles most similar to a movie that is not in the favorites list.
get_similar_title('toy story (1995)')

['toy story (1995)',
 'toy story 2 (1999)',
 "bug's life, a (1998)",
 'aladdin (1992)',
 'babe (1995)',
 'groundhog day (1993)',
 'lion king, the (1994)',
 'beauty and the beast (1991)',
 "there's something about mary (1998)",
 'pleasantville (1998)']

In [27]:
# Titles most similar to 'back to the future (1985)'.
get_similar_title('back to the future (1985)')

['back to the future (1985)',
 "ferris bueller's day off (1986)",
 'when harry met sally... (1989)',
 'back to the future part ii (1989)',
 'big (1988)',
 'ghostbusters (1984)',
 'bull durham (1988)',
 'fish called wanda, a (1988)',
 'airplane! (1980)',
 'cocoon (1985)']

In [28]:
user = user_to_idx['kimin']
# recommend() is given the user x item CSR matrix (not the transposed one that
# was passed to fit()).
title_recommended = als_model.recommend(
    user,
    csr_data,
    N=20,
    filter_already_liked_items=True,
)
# Print each recommended title next to its score.
for item_idx, score in title_recommended:
    print("{} {:08.3f}".format(idx_to_title[item_idx], score))

terminator 2: judgment day (1991) 0000.636
jurassic park (1993) 0000.564
matrix, the (1999) 0000.537
star wars: episode vi - return of the jedi (1983) 0000.442
total recall (1990) 0000.401
braveheart (1995) 0000.365
groundhog day (1993) 0000.338
sixth sense, the (1999) 0000.333
die hard (1988) 0000.327
hunt for red october, the (1990) 0000.316
saving private ryan (1998) 0000.314
star wars: episode v - the empire strikes back (1980) 0000.313
speed (1994) 0000.271
aliens (1986) 0000.251
star wars: episode iv - a new hope (1977) 0000.250
silence of the lambs, the (1991) 0000.233
airplane! (1980) 0000.232
independence day (id4) (1996) 0000.227
toy story (1995) 0000.205
galaxy quest (1999) 0000.203


In [29]:
# Ask the model how each movie the user already rated contributes to the score
# of 'die hard (1988)'.
die_hard = title_to_idx['die hard (1988)']
explain = als_model.explain(user, csr_data, itemid=die_hard)
# explain[1] is iterated as (item index, contribution) pairs.
[(idx_to_title[pair[0]], pair[1]) for pair in explain[1]]

[('terminator, the (1984)', 0.21670287769375524),
 ('fugitive, the (1993)', 0.1549119046321933),
 ('back to the future (1985)', 0.03818649718753686),
 ('forrest gump (1994)', -0.021093030831221167),
 ('men in black (1997)', -0.06425605290505357)]