In [1]:
import pandas as pd
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# rating 컬럼의 이름을 count로 바꿉니다.

ratings.rename(columns={'rating':'count'}, inplace=True)

In [4]:
ratings.head(10)

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# 영화제목 전처리. 영화제목을 소문자로 변환

movies['title'] = movies['title'].str.lower()

In [7]:
# 영화제목 전처리2. 영화제목에서 ()안의 연도는 삭제하기
import re

pattern = re.compile('\(.+?\)')


movies['title'] = movies['title'].str.replace(pattern, '')
movies['title'] = movies['title'].str.strip()

movies['title'].head(10)


0                      toy story
1                        jumanji
2               grumpier old men
3              waiting to exhale
4    father of the bride part ii
5                           heat
6                        sabrina
7                   tom and huck
8                   sudden death
9                      goldeneye
Name: title, dtype: object

In [8]:
ratings['movie_id'].nunique()

3628

In [9]:
ratings['user_id'].nunique()

6039

In [10]:
ratings['user_id'].unique()

array([   1,    2,    3, ..., 6038, 6039, 6040])

In [11]:
ratings['movie_id'].value_counts()

2858    3211
260     2910
1196    2885
1210    2716
2028    2561
        ... 
138        1
3828       1
2909       1
758        1
872        1
Name: movie_id, Length: 3628, dtype: int64

In [12]:
# 가장 인기있는 영화 Top10찾기

movie_count = ratings.groupby('movie_id')['count'].count().sort_values(ascending = False).reset_index()

In [13]:
movie_count.head(10)

Unnamed: 0,movie_id,count
0,2858,3211
1,260,2910
2,1196,2885
3,1210,2716
4,2028,2561
5,589,2509
6,593,2498
7,1198,2473
8,1270,2460
9,2571,2434


In [14]:
for i in movie_count['movie_id'][:11]:
    print(movies[movies['movie_id'] == i]['title'])

2789    american beauty
Name: title, dtype: object
257    star wars: episode iv - a new hope
Name: title, dtype: object
1178    star wars: episode v - the empire strikes back
Name: title, dtype: object
1192    star wars: episode vi - return of the jedi
Name: title, dtype: object
1959    saving private ryan
Name: title, dtype: object
585    terminator 2: judgment day
Name: title, dtype: object
589    silence of the lambs, the
Name: title, dtype: object
1180    raiders of the lost ark
Name: title, dtype: object
1250    back to the future
Name: title, dtype: object
2502    matrix, the
Name: title, dtype: object
476    jurassic park
Name: title, dtype: object


# 내가 선호하는 영화 5가지 rating에 추가하기

In [15]:
# jinwoo라는 이름의 아이디가 ratings데이터에 있는지 확인

ratings.isin({'user_id' : ['jinwoo']})['user_id'].any()

False

In [16]:
my_favorite = [257, 1178, 1192, 1959, 476]

my_playlist = pd.DataFrame({'user_id' : [6041] * 5, 'movie_id' : my_favorite, 'count' : [5] * 5})

if not ratings.isin({'user_id' : ['jinwoo']})['user_id'].any():
    ratings = ratings.append(my_playlist)

In [17]:
ratings.tail()

Unnamed: 0,user_id,movie_id,count,timestamp
0,6041,257,5,
1,6041,1178,5,
2,6041,1192,5,
3,6041,1959,5,
4,6041,476,5,


In [18]:
# 필요없는 컬럼 제거하기. timestamp

ratings = ratings.drop(['timestamp'], axis = 1)
ratings.tail()

Unnamed: 0,user_id,movie_id,count
0,6041,257,5
1,6041,1178,5
2,6041,1192,5
3,6041,1959,5
4,6041,476,5


In [19]:
ratings.rename(columns = {'count' : 'rating'}, inplace = True)

In [20]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
0,6041,257,5
1,6041,1178,5
2,6041,1192,5
3,6041,1959,5


In [21]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836483 entries, 0 to 4
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   836483 non-null  int64
 1   movie_id  836483 non-null  int64
 2   rating    836483 non-null  int64
dtypes: int64(3)
memory usage: 25.5 MB


# CSR matrix 만들기

In [22]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

print(num_user, num_movie)

# csr_data = csr_matrix((ratings.rating, (ratings.user_id, ratings.movie_id)), shape= (max(ratings.user_id) + 1, max(ratings.movie_id) + 1))
csr_data = csr_matrix((ratings['rating'], (ratings['user_id'], ratings['movie_id'])), shape= (max(ratings.user_id) + 1, max(ratings.movie_id) + 1))
csr_data



6040 3628


<6042x3953 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

# 모델 학습시키기

In [23]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [34]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=30, dtype=np.float32)

In [25]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [35]:
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]

In [36]:
jin_vector, jurassic_vector = als_model.user_factors[6041], als_model.item_factors[476]

print('슝=3')

슝=3


In [37]:
jin_vector

array([ 0.0125584 , -0.05139833,  0.3423189 , -0.04922902, -0.41205585,
        0.46662682,  0.10651635,  0.08743443, -0.2348634 , -0.12504847,
        0.20084485,  0.05797926,  0.16785796,  0.48587865, -0.3333503 ,
       -0.08809332,  0.28063083, -0.46169594, -0.05565002,  0.10566279,
        0.42293772,  0.05386944, -0.51209515, -0.37707326,  0.3029513 ,
        0.47982723, -0.08070127, -0.21194822, -0.08402086, -0.04643084,
       -0.09779337,  0.1095699 , -0.34343123,  0.375914  , -0.23503624,
        0.43436068, -0.03138721,  0.31124908, -0.42810032, -0.0076006 ,
        0.16454965,  0.20969425, -0.24202704,  0.14763954,  0.17507009,
        0.10660507, -0.0986859 ,  0.00994746,  0.12355671, -0.33065468,
        0.2837323 ,  0.11399012, -0.23140617, -0.12714113,  0.35312065,
       -0.4074183 , -0.0753928 , -0.08191906, -0.31635886,  0.03807835,
       -0.1380323 ,  0.05004336, -0.20945841,  0.06282005, -0.3072139 ,
        0.09033222,  0.07279763,  0.09282459,  0.35490504,  0.12

In [38]:
jurassic_vector

array([ 1.7583753e-03,  3.6146897e-03,  5.4267864e-04,  6.2168590e-03,
        4.9502710e-03,  2.5315145e-03,  5.6804107e-03, -5.1284465e-03,
       -2.8342472e-03,  3.6488585e-03,  3.8488070e-03,  4.4175558e-04,
        2.7775476e-03,  1.1844984e-03, -1.8620612e-03,  4.0440345e-03,
       -2.2897332e-03, -1.0651144e-02, -1.1907791e-03, -1.6552419e-03,
        4.1184789e-03,  6.2132510e-03, -3.3141878e-03,  1.6332462e-03,
        5.1981574e-03,  1.6705528e-03, -8.5480686e-04,  4.0346751e-04,
       -2.2484244e-04,  2.4361783e-03,  1.1503653e-02,  3.6581610e-03,
       -1.7185349e-03,  4.0486166e-03,  3.1224191e-03,  8.7015675e-03,
        7.6599959e-03,  5.6615435e-03,  1.5095751e-03,  6.0301914e-04,
        4.8132204e-03,  3.0016485e-03,  2.3572838e-03,  6.8282499e-03,
       -9.2395302e-04,  2.7560245e-03,  4.2774966e-03,  1.3210519e-02,
        1.4010115e-03, -1.1279710e-03,  3.9317906e-03,  3.2454540e-03,
        2.1031854e-04,  4.5539648e-03, -2.3080229e-03,  5.7312110e-03,
      

In [39]:
np.dot(jin_vector, jurassic_vector)

0.024725322

# 비슷한 아티스트 찾기

In [40]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
0,6041,257,5
1,6041,1178,5
2,6041,1192,5
3,6041,1959,5


In [41]:
# 고유한 유저, 아티스트를 찾아내는 코드
# user_unique = ratings['user_id'].unique()
movie_unique = movies['title'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

NameError: name 'artist_unique' is not defined

In [33]:
favorite_movie = 'jurassic park'
movie_id = movies[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

KeyError: 'jurassic park'