In [1]:
import numpy as np
import scipy
import implicit
import pandas as pd
print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.4
1.7.1
0.4.8


# 01. 데이터 준비/ 전처리

In [2]:
import os

# 평점 데이터
rating_file_path = os.getenv('HOME')+'/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
original_data_size = len(ratings)
ratings.head(10)

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [3]:
# 3점 이상만 남김
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'original_data_size: {original_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / original_data_size:.2%}')

original_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼 이름 counts로
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [5]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [6]:
# 영화제목
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    836478 non-null  int64
 1   movie_id   836478 non-null  int64
 2   counts     836478 non-null  int64
 3   timestamp  836478 non-null  int64
dtypes: int64(4)
memory usage: 31.9 MB


# 02. EDA

- ratings에 있는 유니크한 영화 개수

In [8]:
ratings['movie_id'].nunique()

3628

- ratings에 있는 유니크한 사용자 수

In [9]:
ratings['user_id'].nunique()

6039

- 가장 인기 있는 영화 30개(제일 시청 횟수가 많은)

In [10]:
temp=movies.set_index('movie_id')
top_pop30=ratings.groupby(['movie_id']).sum().sort_values(by='counts',ascending=False).sample(30).index.to_list()
temp.loc[top_pop30,:]

Unnamed: 0_level_0,title,genre
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
233,Exotica (1994),Drama
2402,Rambo: First Blood Part II (1985),Action|War
1878,Woo (1998),Comedy|Romance
2532,Conquest of the Planet of the Apes (1972),Action|Sci-Fi
1296,"Room with a View, A (1986)",Drama|Romance
847,"Big Squeeze, The (1996)",Comedy|Drama
2232,Cube (1997),Sci-Fi|Thriller
2443,Playing by Heart (1998),Drama|Romance
3616,Loser (2000),Comedy|Romance
1272,Patton (1970),Drama|War


In [11]:
ratings['timestamp'].max()

1046454590

# 03. 선호하는 영화 추가

In [12]:
favorite_mv=['Jumanji (1995)','Toy Story (1995)','Johns (1996)','Meet the Parents (2000)','Batman (1989)']
movies_idx=movies.set_index('movie_id')

my_idx=[]
for i in range(5):
    my_idx.append(movies_idx[movies_idx['title']==favorite_mv[i]].index.values[0])

my_mv = pd.DataFrame({'user_id':['son']*5, 'movie_id':my_idx,'counts':[5]*5,'timestamp':[1046454590]*5})

if not ratings.isin({'user_id':['son']})['user_id'].any():
    ratings = ratings.append(my_mv)

ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569
0,son,2,5,1046454590
1,son,1,5,1046454590
2,son,1063,5,1046454590
3,son,3948,5,1046454590
4,son,592,5,1046454590


In [13]:
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [14]:
print(user_to_idx['son'])

6039


In [15]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):
    ratings['user_id'] = temp_user_data
else:
    print('user Nope')


temp_movie_data = ratings['movie_id'].map(movie_to_idx).dropna()  
if len(temp_movie_data) == len(ratings):
    ratings['movie_id'] = temp_movie_data
else:
    print('movie Nope')

In [16]:
ratings.reset_index(drop=True,inplace=True)
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
836473,6038,1030,3,956715518
836474,6038,986,5,956704887
836475,6038,311,5,956704746
836476,6038,142,4,956715648
836477,6038,26,4,956715569
836478,6039,513,5,1046454590
836479,6039,40,5,1046454590
836480,6039,2232,5,1046454590
836481,6039,461,5,1046454590
836482,6039,527,5,1046454590


# 04. CSR matrix

In [17]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

# 05. 모델 훈련

In [18]:
from implicit.als import AlternatingLeastSquares
import os

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [19]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01,
                                    use_gpu=False, iterations=15, 
                                    dtype=np.float32)

In [20]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [21]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [22]:
son, jumanzi = user_to_idx['son'], movie_to_idx[1]
son_vector, jumanzi_vector = als_model.user_factors[son], als_model.item_factors[jumanzi]

In [23]:
son_vector

array([-0.07165094, -0.03296149,  0.60203046, -0.5663036 , -0.02866571,
       -0.83111805,  0.4297778 ,  0.2545107 , -0.4335289 ,  0.23983052,
       -0.49603674,  0.30919674,  0.59416604,  0.09444896, -0.64263964,
        0.5567203 ,  0.3380822 , -0.49345428, -0.70030636, -0.34830853,
       -0.47008196, -0.57751036, -0.75894666, -0.12553076,  0.68662083,
       -0.54275507,  0.06577144,  0.3966685 ,  0.11561286,  0.22672306,
        0.38508627,  0.501404  ,  0.75598776, -0.7376049 , -0.45625824,
        0.5872793 ,  0.05356599, -0.3378719 , -0.1293606 , -0.25590315,
       -0.03190133, -0.61608917, -0.20958298,  0.16666692,  0.0901476 ,
        0.4569527 ,  0.25170004,  0.3640158 , -0.12477922,  0.41397265,
       -0.12757225, -0.67097074, -0.38476878,  0.05535836,  0.37191036,
        0.49185413, -0.12563118,  0.82903033,  0.2769626 , -0.38354823,
        0.5355651 ,  0.15015471, -0.47437444, -0.47210297,  0.16667897,
        0.10373396, -0.20625931, -0.00175547,  0.239686  , -0.07

In [24]:
jumanzi_vector

array([ 0.00667473,  0.00780472,  0.00470632,  0.01349532, -0.02031371,
       -0.02107431,  0.0249759 ,  0.01271361, -0.00974342,  0.00748174,
       -0.00626786,  0.02899953,  0.00693814, -0.01142909, -0.01136255,
        0.04723967, -0.00910166,  0.03192261, -0.00258996, -0.01262324,
       -0.01339355, -0.00518042, -0.04728771,  0.01645847,  0.0120601 ,
       -0.01723992,  0.03159207,  0.0240054 ,  0.00738945, -0.00809108,
        0.00264922,  0.02090364,  0.02335256, -0.01392293,  0.00918804,
       -0.03042461, -0.01232472,  0.01607889,  0.01424689, -0.02163776,
        0.01256819, -0.02411241,  0.00808249, -0.00091583,  0.00255568,
        0.05115049,  0.02858164,  0.01875778, -0.00364864, -0.01081224,
        0.01086081, -0.01272353, -0.01189496,  0.00188437,  0.01639898,
        0.01339746, -0.01339734,  0.021716  , -0.00492687, -0.01814526,
        0.0048024 , -0.02471838,  0.01585431, -0.01623305,  0.00874303,
       -0.00481173, -0.00615449, -0.00663854, -0.01698514,  0.00

# 06. 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도 확인

In [25]:
np.dot(son_vector, jumanzi_vector)

0.42512885

In [26]:
idx = movie_to_idx[50]
movies.iloc[idx,:]

movie_id                   236
title       French Kiss (1995)
genre           Comedy|Romance
Name: 233, dtype: object

In [27]:
french_kiss = movie_to_idx[236]
french_vector = als_model.item_factors[french_kiss]
np.dot(son_vector, french_vector)

0.0033341916

# 07. 선호하는 영화와 비슷한 영화 추천

In [28]:
similar_movie = als_model.similar_items(40,N=15) # jumanzi
similar_movie

[(40, 1.0),
 (50, 0.7924648),
 (4, 0.5612359),
 (33, 0.55537516),
 (110, 0.5513543),
 (322, 0.5302504),
 (255, 0.4582389),
 (20, 0.45761758),
 (330, 0.45212117),
 (10, 0.4200844),
 (34, 0.3751218),
 (126, 0.3561919),
 (478, 0.3532287),
 (16, 0.35274252),
 (32, 0.34697092)]

In [29]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
idx_mv = [idx_to_movie[i[0]] for i in similar_movie]

movies_idx.iloc[idx_mv,:]

Unnamed: 0_level_0,title,genre
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Jumanji (1995),Adventure|Children's|Fantasy
3183,"Third Miracle, The (1999)",Drama
2424,You've Got Mail (1998),Comedy|Romance
592,Batman (1989),Action|Adventure|Crime|Drama
1285,Heathers (1989),Comedy
35,Carrington (1995),Drama|Romance
1992,Child's Play 2 (1990),Horror
2390,Little Voice (1998),Comedy
368,Maverick (1994),Action|Comedy|Western
599,"Wild Bunch, The (1969)",Western


In [30]:
# 함수화
def get_similar_movie(movie_name: str):
    movie_name = movies_idx[movies_idx['title']==movie_name].index[0]
    movie_idx = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_idx)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    similar_movie = movies_idx.loc[similar_movie,:]
    return similar_movie

In [31]:
get_similar_movie('Batman (1989)')

Unnamed: 0_level_0,title,genre
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
592,Batman (1989),Action|Adventure|Crime|Drama
2115,Indiana Jones and the Temple of Doom (1984),Action|Adventure
1291,Indiana Jones and the Last Crusade (1989),Action|Adventure
2194,"Untouchables, The (1987)",Action|Crime|Drama
1377,Batman Returns (1992),Action|Adventure|Comedy|Crime
2640,Superman (1978),Action|Adventure|Sci-Fi
1275,Highlander (1986),Action|Adventure
153,Batman Forever (1995),Action|Adventure|Comedy|Crime
2470,Crocodile Dundee (1986),Adventure|Comedy
2406,Romancing the Stone (1984),Action|Adventure|Comedy|Romance


# 08. 가장 선호하는 영화 추천

In [32]:
user = user_to_idx['son']
# user가 이미 평가한 아이템은 제외
movie_recommanded = als_model.recommend(user, csr_data, N=20,filter_already_liked_items=True)
movie_recommanded

[(50, 0.5020976),
 (755, 0.34014055),
 (458, 0.25791588),
 (1853, 0.25107628),
 (188, 0.2461956),
 (678, 0.242724),
 (172, 0.23739767),
 (1814, 0.22783764),
 (4, 0.22277398),
 (322, 0.21047603),
 (545, 0.20628229),
 (33, 0.19854523),
 (39, 0.19281268),
 (945, 0.1882557),
 (587, 0.17807665),
 (596, 0.17715313),
 (486, 0.17436615),
 (515, 0.17257723),
 (616, 0.17185815),
 (851, 0.16632095)]

In [33]:
movie_id_son = [idx_to_movie[i[0]] for i in movie_recommanded]
[movies_idx.loc[movie_id_son,'title']]

[movie_id
 3114                              Toy Story 2 (1999)
 3897                            Almost Famous (2000)
 367                                 Mask, The (1994)
 3911                             Best in Show (2000)
 2115     Indiana Jones and the Temple of Doom (1984)
 1073    Willy Wonka and the Chocolate Factory (1971)
 1291       Indiana Jones and the Last Crusade (1989)
 3893                              Nurse Betty (2000)
 2355                            Bug's Life, A (1998)
 34                                       Babe (1995)
 317                         Santa Clause, The (1994)
 588                                   Aladdin (1992)
 150                                 Apollo 13 (1995)
 1275                               Highlander (1986)
 1377                           Batman Returns (1992)
 3489                                     Hook (1991)
 3916                      Remember the Titans (2000)
 153                            Batman Forever (1995)
 2193             

In [34]:
toy_2 = movies_idx[movies_idx['title']=='Toy Story 2 (1999)'].index[0]
toy_2 = movie_to_idx[toy_2]
explain = als_model.explain(user, csr_data, itemid=toy_2)

In [35]:
[(movies_idx.loc[idx_to_movie[i[0]],'title'], i[1]) for i in explain[1]]

[('Toy Story (1995)', 0.40169062955280466),
 ('Jumanji (1995)', 0.04236708307417451),
 ('Meet the Parents (2000)', 0.039782017981236546),
 ('Batman (1989)', 0.022174964798917025),
 ('Johns (1996)', -0.012769375991063859)]

### 회고

 CSR matrix를 만들 때 user_id의 갯수와 movie_id의 갯수에 맞게 counts값을 할당해서 잘 만들 수 있었습니다. 
 
내가 선호하는 영화 중 jumanji를 선택했을 때 dot product 값은 0.42512885, 이 외의 관련없는 영화를 골랐을 때 0.0033341916로 낮은 수치를 얻었습니다.

그 다음 recommanded로 추천 영화를 뽑았을 때 Toy Story 2가 가장 높은 값으로 선정되었는데, 내가 선호하는 영화 중 Toy Story 1이 있다는 것을 생각하면 직관적으로도 연관있는 영화를 뽑아준 것 같습니다.

explain으로 확인 해봤을 때도 Toy Story 2에 영향을 많이 준 영화는 당연하게도 0.4016으로 Toy story였습니다.