In [1]:
import os
import pandas as pd

rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'counts'}, inplace=True)
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [4]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
print("movies  id:",movies['movie_id'].nunique())
print("movies title :",movies['title'].nunique())
print("movies genre:",movies['genre'].nunique())

movies  id: 3883
movies title : 3883
movies genre: 301


In [6]:
print("movies :",ratings['movie_id'].nunique())
print("users :",ratings['user_id'].nunique())
ratings_count = ratings.groupby('movie_id')['user_id'].count()
ratings_count_sorted = ratings_count.sort_values(ascending=False)
ratings_count_sorted.head(30)

movies : 3628
users : 6039


movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

In [7]:
temp_pd = pd.merge(ratings_count_sorted, movies, on='movie_id', how='inner')
temp_pd.rename(columns = {'user_id':'ratings'}).head(30)

Unnamed: 0,movie_id,ratings,title,genre
0,2858,3211,American Beauty (1999),Comedy|Drama
1,260,2910,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
2,1196,2885,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
3,1210,2716,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
4,2028,2561,Saving Private Ryan (1998),Action|Drama|War
5,589,2509,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
6,593,2498,"Silence of the Lambs, The (1991)",Drama|Thriller
7,1198,2473,Raiders of the Lost Ark (1981),Action|Adventure
8,1270,2460,Back to the Future (1985),Comedy|Sci-Fi
9,2571,2434,"Matrix, The (1999)",Action|Sci-Fi|Thriller


가장 리뷰가 많은(인기있는) 영화 30개를 나열해보았습니다.

In [8]:
my_favorite = [1097, 1,1580 ,2028 , 1240]
#['E.T. the Extra-Terrestrial (1982)' , 'Toy Story (1995)' ,'Men in Black (1997)' ,'Saving Private Ryan (1998)' ,'Terminator, The (1984)']
# '9999라는 user_id가 위의 영화에 5점씩 줬다고 칩시다
my_ratings = pd.DataFrame({'user_id': [9999]*5, 'movie_id': my_favorite, 'counts':[5]*5})

if not ratings.isin({'user_id':[9999]})['user_id'].any():  
    ratings = ratings.append(my_ratings)                            

ratings.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,9999,1097,5,
1,9999,1,5,
2,9999,1580,5,
3,9999,2028,5,
4,9999,1240,5,


제가 좋아하는 영화들을 ratings에 추가해줬습니다.

In [9]:
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}
len(user_unique)

6040

In [10]:
len(movie_unique)

3628

In [11]:
print(user_to_idx[9999])
print(movie_to_idx[1])

6039
40


In [12]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 movie_id 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie_id column indexing Fail!!')

ratings

user_id column indexing OK!!
movie_id column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp
0,0,0,5,978300760.0
1,0,1,3,978302109.0
2,0,2,3,978301968.0
3,0,3,4,978300275.0
4,0,4,5,978824291.0
...,...,...,...,...
0,6039,26,5,
1,6039,40,5,
2,6039,175,5,
3,6039,48,5,


In [13]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

print(type(ratings))
print(type(ratings.counts))
print(len(ratings.counts))
csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
836483


<6040x3628 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [14]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [15]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [16]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [17]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [18]:
len(movie_to_idx)

3628

In [21]:
kwon, toy_story = user_to_idx[9999], movie_to_idx[1]
kwon_vector, toy_story_vector = als_model.user_factors[kwon], als_model.item_factors[toy_story]

movie = movie_to_idx[1]
movie_vector = als_model.item_factors[movie]
np.dot(kwon_vector, movie_vector)

0.48821157

제가 좋아하는 영화 5개 목록에 있었던 토이스토리로 선호도를 측정한 결과 0.48이 나왔습니다 높은 수치일까요?

In [23]:
kwon, american_beauty = user_to_idx[9999], movie_to_idx[2858]
kwon_vector = als_model.user_factors[kwon]

movie = american_beauty
movie_vector = als_model.item_factors[movie]
np.dot(kwon_vector, movie_vector)

0.20578079

이번엔 목록에도 없고 처음들어보는 영화인 아메리칸 뷰티입니다, 선호도가 0.2로 매우 낮게 나옵니다.

In [26]:
kwon, matrix = user_to_idx[9999], movie_to_idx[2571]
kwon_vector = als_model.user_factors[kwon]

movie = matrix
movie_vector = als_model.item_factors[movie]
np.dot(kwon_vector, movie_vector)

0.5159556

매트릭스는 목록에는 없지만 제가 매우 좋아하는 영화입니다.  
선호도가 0.51로 목록에 있던 토이스토리보다도 높게 나왔네요 놀라운 결과입니다.  
제가 액션영화 위주로 봤기 때문일까요?  

In [27]:
favorite_movie = 1
movie_idx = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_idx, N=15)
similar_movie

[(40, 1.0),
 (50, 0.78857625),
 (4, 0.6181353),
 (322, 0.5752076),
 (33, 0.56513417),
 (110, 0.51629287),
 (20, 0.4824374),
 (330, 0.45818603),
 (10, 0.4494091),
 (255, 0.3817873),
 (160, 0.37688178),
 (16, 0.36012277),
 (34, 0.3571413),
 (32, 0.3463335),
 (458, 0.3434017)]

In [28]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

[1,
 3114,
 2355,
 34,
 588,
 1265,
 2321,
 364,
 595,
 1923,
 356,
 2687,
 1907,
 1566,
 367]

In [29]:
def id_to_title(data, movie_id) :
    condition = data.movie_id == movie_id
    a = data[condition]['title']
    a = pd.DataFrame(a)    
    return a.iloc[0]['title']

In [31]:
for a in [idx_to_movie[i[0]] for i in similar_movie] :
    print(id_to_title(movies, a))

Toy Story (1995)
Toy Story 2 (1999)
Bug's Life, A (1998)
Babe (1995)
Aladdin (1992)
Groundhog Day (1993)
Pleasantville (1998)
Lion King, The (1994)
Beauty and the Beast (1991)
There's Something About Mary (1998)
Forrest Gump (1994)
Tarzan (1999)
Mulan (1998)
Hercules (1997)
Mask, The (1994)


movie_id를 타이틀명으로 바꿔주었습니다. 토이스토리와 비슷한 작품으로 그 후속작이랑 애니메이션류의 작품을 찾아줬네요  
매우 정확한것같습니다!

In [32]:
def similar_movie(movie_id) :
    movie_idx = movie_to_idx[movie_id]
    similar_movie = als_model.similar_items(movie_idx, N=15)
    idx_to_movie = {v:k for k,v in movie_to_idx.items()}
    for a in [idx_to_movie[i[0]] for i in similar_movie] :
        print(id_to_title(movies, a))

In [33]:
similar_movie(2028)

Saving Private Ryan (1998)
Braveheart (1995)
Schindler's List (1993)
Boat, The (Das Boot) (1981)
Fugitive, The (1993)
Shawshank Redemption, The (1994)
Matrix, The (1999)
Good Will Hunting (1997)
American Beauty (1999)
Simon Sez (1999)
Hunt for Red October, The (1990)
Terminator 2: Judgment Day (1991)
Thelma & Louise (1991)
Silence of the Lambs, The (1991)
Glory (1989)


위 과정을 함수로 만들고 라이언일병 구하기와 유사한 영화들을 찾아봤습니다,

In [34]:
user = user_to_idx[9999]
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(92, 0.58370656),
 (124, 0.5159556),
 (87, 0.46482077),
 (22, 0.4533485),
 (117, 0.43948776),
 (107, 0.4131611),
 (44, 0.39698884),
 (64, 0.3668791),
 (141, 0.3285604),
 (62, 0.3141867),
 (50, 0.3014591),
 (4, 0.27450174),
 (651, 0.2705989),
 (322, 0.2509431),
 (160, 0.24733168),
 (120, 0.22921509),
 (121, 0.22895035),
 (317, 0.2283702),
 (60, 0.21938023),
 (23, 0.21936946)]

In [35]:
[idx_to_movie[i[0]] for i in movie_recommended]

[589,
 2571,
 110,
 1270,
 1196,
 480,
 260,
 1210,
 457,
 2916,
 3114,
 2355,
 1200,
 34,
 356,
 1198,
 593,
 32,
 2628,
 527]

In [36]:
def recommend_movie(user_id) :
    user = user_to_idx[user_id]
    movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
    for a in [idx_to_movie[i[0]] for i in movie_recommended] :
        print(id_to_title(movies, a)) 

In [37]:
recommend_movie(9999)

Terminator 2: Judgment Day (1991)
Matrix, The (1999)
Braveheart (1995)
Back to the Future (1985)
Star Wars: Episode V - The Empire Strikes Back (1980)
Jurassic Park (1993)
Star Wars: Episode IV - A New Hope (1977)
Star Wars: Episode VI - Return of the Jedi (1983)
Fugitive, The (1993)
Total Recall (1990)
Toy Story 2 (1999)
Bug's Life, A (1998)
Aliens (1986)
Babe (1995)
Forrest Gump (1994)
Raiders of the Lost Ark (1981)
Silence of the Lambs, The (1991)
Twelve Monkeys (1995)
Star Wars: Episode I - The Phantom Menace (1999)
Schindler's List (1993)


마지막으로 유저 추천입니다, 어쩜 제취향만 딱딱 맞춰서 추천해주는게 너무 신기합니다!  
제가 많이 사용하는 유튜브 영상 추천도 비슷한 알고리즘으로 작동하는건지 궁금하네요  
사족으로 이번 익스는 다른것보다 데이터프레임 다루는게 제일 어려웠던것 같습니다 ㅠㅠ  
그래도 이번에 하루종일 헤매보면서 조금이라도 늘은 것 같아서 다행입니다