# Load dataset

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/dataset/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# 2. Data preprocessing

In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)

In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/dataset/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# 3. add my watch history

In [6]:
id_to_movie = {idx+1:val for idx, val in enumerate(movies['title'])}
movie_to_id = {val:idx+1 for idx, val in enumerate(movies['title'])}

In [7]:
ratings.nunique()

user_id        6039
movie_id       3628
count             3
timestamp    412911
dtype: int64

In [8]:
ratings.groupby('movie_id').count()['count'].sort_values(ascending=False).rename(index=id_to_movie).head()

movie_id
Hairspray (1988)                      3211
Little Princess, A (1995)             2910
GoodFellas (1990)                     2885
Once Upon a Time in America (1984)    2716
Sleeping Beauty (1959)                2561
Name: count, dtype: int64

In [9]:
len(ratings.groupby('movie_id').count())

3628

In [10]:
my_watch_history = ['Beauty and the Beast (1991)',
                    'Star Wars: Episode VI - Return of the Jedi (1983)',
                    'Batman (1989)',
                    'Toy Story (1995)'
                   ]
my_watch_history_id = list(map(lambda x: movie_to_id.get(x),my_watch_history))
my_watch_history_id

[592, 1193, 589, 1]

In [11]:
my_favorite = ['black eyed peas' , 'maroon5' ,'jason mraz' ,'coldplay' ,'beyoncé']

my_playlist = pd.DataFrame({'user_id': ['6041']*len(my_watch_history_id),
                            'movie_id': my_watch_history_id,
                            'count':[np.random.randint(3,6)]*len(my_watch_history_id),
                            'timestamp': 0*len(my_watch_history_id)})

if not ratings.isin({'user_id':['6041']})['user_id'].any():
    ratings = ratings.append(my_playlist)                   

ratings.tail(10)

Unnamed: 0,user_id,movie_id,count,timestamp
1000202,6040,1089,4,956704996
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569
0,6041,592,5,0
1,6041,1193,5,0
2,6041,589,5,0
3,6041,1,5,0


# 4. Create CSR matrix

In [12]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

print(num_user, num_movie)
print(ratings['count'].shape)

csr_data = csr_matrix((ratings['count'], (ratings['user_id'], ratings['movie_id'])),
                      shape = (6042, 3953))
csr_data

6040 3628
(836482,)


<6042x3953 sparse matrix of type '<class 'numpy.longlong'>'
	with 836482 stored elements in Compressed Sparse Row format>

# 5. Train model

In [13]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [14]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.longlong'>'
	with 836482 stored elements in Compressed Sparse Column format>

In [15]:
als_model = AlternatingLeastSquares(factors=100,
                                    regularization=0.01,
                                    use_gpu=False,
                                    iterations=15,
                                    dtype=np.float32)

In [16]:
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




# 6. Get result

## 훈련된 모델이 예측한 나의 선호도

In [17]:
my_user_vector = als_model.user_factors[6041]
movie_vector = als_model.user_factors[1193]

np.dot(my_user_vector, movie_vector)

20.269604

## 내가 좋아하는 영화와 비슷한 영화

In [18]:
movie_id = 1193
print(id_to_movie[1193])

similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[id_to_movie[i[0]] for i in similar_movie]

Star Wars: Episode VI - Return of the Jedi (1983)


['Star Wars: Episode VI - Return of the Jedi (1983)',
 'Dangerous Liaisons (1988)',
 'Unforgiven (1992)',
 'Amadeus (1984)',
 'Manhattan (1979)',
 'Sleeper (1973)',
 'Suture (1993)',
 'Henry V (1989)',
 "Greaser's Palace (1972)",
 'Prom Night IV: Deliver Us From Evil (1992)',
 'Death Wish (1974)',
 'Death Wish II (1982)',
 'Hard 8 (a.k.a. Sydney, a.k.a. Hard Eight) (1996)',
 'Hillbillys in a Haunted House (1967)',
 'Rumble in the Bronx (1995)']

## 내가 가장 좋아할 만한 영화

In [19]:
user = 6041
# recommend에서는 user*item CSR Matrix를 받습니다.
artist_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
[(id_to_movie[idx], val) for (idx, val) in artist_recommended]

[('Mr. Death: The Rise and Fall of Fred A. Leuchter Jr. (1999)', 0.3833844),
 ('King of the Hill (1993)', 0.3739538),
 ('Christmas Vacation (1989)', 0.36799496),
 ('Kicked in the Head (1997)', 0.32168952),
 ('Mommie Dearest (1981)', 0.28558925),
 ('On Any Sunday (1971)', 0.27969372),
 ('Taxi Driver (1976)', 0.26931062),
 ('Tough and Deadly (1995)', 0.23545238),
 ('Babe (1995)', 0.22934343),
 ('Stand by Me (1986)', 0.2009564),
 ('Getting Even with Dad (1994)', 0.19948888),
 ('Mask, The (1994)', 0.19248101),
 ('Once Upon a Time in America (1984)', 0.18972036),
 ('Trial by Jury (1994)', 0.18551832),
 ('GoodFellas (1990)', 0.1776644),
 ('Dragonheart (1996)', 0.17136332),
 ('Window to Paris (1994)', 0.16833936),
 ("Felicia's Journey (1999)", 0.16660877),
 ('Bastard Out of Carolina (1996)', 0.16343415),
 ('Hype! (1996)', 0.1629472)]