# 목차
0. 개발환경
1. 데이터 준비와 전처리
2. 분석
3. 내가 선호하는 영화 5가지 ratings에 추가
4. CSR matrix
5. als_model = AlternatingLeastSquares 모델 구성&훈련
6. 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해보기
7. 내가 좋아하는 영화와 비슷한 영화를 추천받기
8. 내가 가장 좋아할 만한 영화들을 추천받아 보기
9. 회고

# 0. 개발환경

In [2]:
import numpy as np
import pandas as pd
import os

import scipy
import implicit
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

# 1. 데이터 준비와 전처리

In [3]:
# Load the ratings file. Column names are supplied explicitly via `names`.
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings = pd.read_csv(
    rating_file_path,
    sep='::',  # multi-character separator, which is why the python engine is requested
    names=ratings_cols,
    engine='python',
    encoding="ISO-8859-1",
)
# Row count before any filtering; the filtering cell reports against this number.
orginal_data_size = ratings.shape[0]
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Data filtering: keep only the rows whose rating is 3 or higher.
keep_mask = ratings['ratings'] >= 3
ratings = ratings.loc[keep_mask]
filtered_data_size = ratings.shape[0]

# Report how much of the original data survived the filter.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# Rename the 'ratings' column to 'counts' (in place), then display the frame.
column_renames = {'ratings': 'counts'}
ratings.rename(columns=column_renames, inplace=True)
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [6]:
# Display the renamed 'counts' column to confirm the rename took effect.
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [7]:
# Load the movie metadata so that movie ids can be shown as titles.
cols = ['movie_id', 'title', 'genre']
movie_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
movies = pd.read_csv(
    movie_file_path,
    sep='::',  # multi-character separator, which is why the python engine is requested
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Display the last rows of the movie metadata.
movies.tail()

Unnamed: 0,movie_id,title,genre
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


# 2. 분석

1. ratings에 있는 유니크한 영화 개수<br>
2. ratings에 있는 유니크한 사용자 수<br>
3. 가장 인기 있는 영화 30개(인기순)

nunique()는 특정 컬럼에 포함된 유니크한 데이터의 개수를 파악할 때 사용합니다.

In [9]:
# Number of distinct movies present in ratings.
ratings['movie_id'].nunique()

3628

In [10]:
# Number of distinct users present in ratings.
ratings['user_id'].nunique()

6039

In [11]:
# The 30 most popular movies, most popular first.
# Popularity here is the number of rating rows each movie_id has in `ratings`.
movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_top30 = movie_count.sort_values(ascending=False).iloc[:30]

In [12]:
# Print each of the 30 movies' titles followed by its rating count.
for top_id, top_count in zip(movie_top30.index, movie_top30.values):
    # Look the title up by id; the first matching row is used.
    title_matches = movies.loc[movies['movie_id'] == top_id, 'title']
    print(title_matches.values[0], top_count)

American Beauty (1999) 3211
Star Wars: Episode IV - A New Hope (1977) 2910
Star Wars: Episode V - The Empire Strikes Back (1980) 2885
Star Wars: Episode VI - Return of the Jedi (1983) 2716
Saving Private Ryan (1998) 2561
Terminator 2: Judgment Day (1991) 2509
Silence of the Lambs, The (1991) 2498
Raiders of the Lost Ark (1981) 2473
Back to the Future (1985) 2460
Matrix, The (1999) 2434
Jurassic Park (1993) 2413
Sixth Sense, The (1999) 2385
Fargo (1996) 2371
Braveheart (1995) 2314
Men in Black (1997) 2297
Schindler's List (1993) 2257
Princess Bride, The (1987) 2252
Shakespeare in Love (1998) 2213
L.A. Confidential (1997) 2210
Shawshank Redemption, The (1994) 2194
Godfather, The (1972) 2167
Groundhog Day (1993) 2121
E.T. the Extra-Terrestrial (1982) 2102
Being John Malkovich (1999) 2066
Ghostbusters (1984) 2051
Pulp Fiction (1994) 2030
Forrest Gump (1994) 2022
Terminator, The (1984) 2019
Toy Story (1995) 2000
Fugitive, The (1993) 1941


# 3. 내가 선호하는 영화 5가지 ratings에 추가

In [13]:
# Feel free to swap in your own favourites, but each title must match the
# spelling used in the dataset exactly.
my_favorite_movie = [
    'Matrix, The (1999)',
    'Forrest Gump (1994)',
    'Sixth Sense, The (1999)',
    'Jurassic Park (1993)',
    'Saving Private Ryan (1998)',
]
# Resolve every title to its movie_id; a title missing from `movies` raises IndexError.
my_favorite = [movies[movies['title'] == name]['movie_id'].values[0] for name in my_favorite_movie]

# Create a new user, `Oh`, with the id one past the largest existing user_id,
# and give every favourite movie a counts value of 5.
# Series.max() is vectorised; int() keeps Oh a plain Python int.
Oh = int(ratings['user_id'].max()) + 1
num_favorites = len(my_favorite)
my_playlist = pd.DataFrame({
    'user_id': [Oh] * num_favorites,
    'movie_id': my_favorite,
    'counts': [5] * num_favorites,
})

# Append the rows only if user `Oh` is not present yet.
# NOTE(review): Oh is recomputed as max(user_id) + 1 on every run, so this guard
# is always true; re-running the cell adds another synthetic user with a new id.
if not (ratings['user_id'] == Oh).any():
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # my_playlist has no 'timestamp' column, so those cells come out as NaN.
    ratings = pd.concat([ratings, my_playlist])
ratings.tail(10)  # check that the rows were added

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,6041,2571,5,
1,6041,356,5,
2,6041,2762,5,
3,6041,480,5,
4,6041,2028,5,


# 4. CSR matrix(협업 필터링, Collaborative Filtering)

In [14]:
# Build the user x movie interaction matrix in CSR format.

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

# Raw user/movie ids are used directly as row/column indices and no explicit
# shape is passed, so the matrix shape is inferred from the indices.
interaction_values = ratings['counts']
row_ids = ratings['user_id']
col_ids = ratings['movie_id']
csr_data = csr_matrix((interaction_values, (row_ids, col_ids)))
csr_data

<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

# 5. als_model = AlternatingLeastSquares 모델 구성&훈련

Matrix Factorization에서 쪼개진 두 Feature Matrix를 한꺼번에 훈련하는 것은 잘 수렴하지 않기 때문에, 한쪽을 고정시키고 다른 쪽을 학습하는 방식을 번갈아 수행

In [15]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Settings recommended by the implicit library; they are unrelated to the
# lesson content itself.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [16]:
# Declare the implicit AlternatingLeastSquares model (CPU, float32).
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=35,
    dtype=np.float32,
)

In [17]:
# The ALS model is fed an item x user matrix here, so transpose the
# user x item matrix.
# NOTE(review): this orientation matches older implicit releases (< 0.5);
# confirm against the installed version.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [18]:
# Train the model on the transposed (item x user) matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/35 [00:00<?, ?it/s]

# 6. 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해보기

In [19]:
# Convert a movie title into its movie_id.
def movie_name_to_id(name):
    """Return the ``movie_id`` of the movie whose title equals *name*.

    The title is looked up in the notebook-level ``movies`` DataFrame with an
    exact string comparison; the id of the first matching row is returned.

    Raises:
        IndexError: if no row of ``movies`` has exactly that title.
    """
    matching_ids = movies.loc[movies['title'] == name, 'movie_id'].values
    if matching_ids.size == 0:
        # Same exception type as the bare ``.values[0]`` lookup raised, but
        # the message now names the title that could not be found.
        raise IndexError(f'no movie titled {name!r} found in movies')
    return matching_ids[0]

In [20]:
# Latent vector of user 6041 (the id shown for the added favourites rows)
# and latent vector of the movie E.T.
my_vector = als_model.user_factors[6041]
ET_vector = als_model.item_factors[movie_name_to_id('E.T. the Extra-Terrestrial (1982)')]

In [21]:
# My user vector.
my_vector

array([ 0.4304107 ,  0.42740226,  0.4191368 ,  0.7791521 ,  0.29567102,
        0.4434607 , -0.1382024 ,  0.00379903,  0.47501287, -0.11588523,
       -0.02270818, -0.14434542, -0.22349991, -0.02968966, -0.10393265,
        0.3786952 ,  0.42185637, -0.16631505, -0.4835008 ,  0.04832952,
       -0.8657237 ,  0.33017203, -0.23046257, -0.02198965,  1.0219133 ,
       -0.11253673,  0.3303213 , -0.00488936,  0.01444006,  0.31045473,
       -0.3740274 , -0.36763337, -0.696058  , -0.87738967, -0.02206086,
        0.6665292 , -0.29452664,  0.17809868, -0.8956687 ,  0.24170956,
        0.09812008, -0.19897257, -0.85361   ,  0.40892375,  0.6575933 ,
        0.28357002,  0.18586826, -1.0220428 ,  0.03358787,  0.406159  ,
       -0.24320096, -0.30191612, -0.12434583, -0.01211422,  0.44984576,
        0.09287193,  0.7841708 , -0.6915464 ,  0.09129766, -0.7756634 ,
        0.27465293, -0.04108507,  0.07823049,  0.46388462, -0.64915365,
        0.5245542 ,  0.49160588,  0.30189142,  0.4291565 ,  0.18

In [22]:
# Item vector of the movie E.T.
ET_vector

array([-0.01059014,  0.04382207,  0.02901265, -0.03941403, -0.01329181,
       -0.02317097, -0.01692527, -0.02584586,  0.01872879,  0.01249767,
        0.02485432,  0.02253449, -0.00179327, -0.01220331,  0.03446083,
        0.02521083,  0.02757328, -0.02679376,  0.00353213,  0.00757081,
        0.00928762,  0.02758049,  0.00326486,  0.02280938,  0.03284203,
        0.01129107,  0.02327085, -0.0098851 , -0.0387328 ,  0.04201251,
        0.02450721, -0.01542079,  0.03430559, -0.01631511,  0.00735394,
        0.02046118,  0.06313691, -0.01255856,  0.06031089,  0.03619309,
        0.01871468, -0.01697059, -0.03337221,  0.01560731,  0.01165232,
       -0.01873562,  0.02979614, -0.02523451,  0.01627215,  0.02297718,
       -0.01966585, -0.01277805,  0.02213946, -0.02515272, -0.01337661,
        0.02104371,  0.01185875,  0.03759281,  0.04872097, -0.00289211,
        0.00249953,  0.0338308 , -0.02792054,  0.00747701, -0.00253694,
       -0.01645781,  0.00047657,  0.02939017,  0.04213814, -0.06

In [23]:
# Return how strongly the model recommends a movie, given its title.
def score_movie(movie_name, user_id=6041):
    """Return the model's preference score of a user for *movie_name*.

    The score is the dot product of the user's latent vector and the movie's
    latent vector, both taken from the trained ``als_model``.

    Args:
        movie_name: Exact movie title; resolved with ``movie_name_to_id``.
        user_id: Row of ``als_model.user_factors`` to score for. Defaults to
            6041, the id this function previously had hard-coded.

    Returns:
        The dot product as returned by ``np.dot``.
    """
    user_vector = als_model.user_factors[user_id]
    movie_vector = als_model.item_factors[movie_name_to_id(movie_name)]
    return np.dot(user_vector, movie_vector)

In [24]:
# Print the predicted preference score for each of the 30 most popular movies.
for top_id in movie_top30.index:
    top_title = movies.loc[movies['movie_id'] == top_id, 'title'].values[0]
    print(top_title, " :", score_movie(top_title))

American Beauty (1999)  : 0.3351394
Star Wars: Episode IV - A New Hope (1977)  : 0.20542446
Star Wars: Episode V - The Empire Strikes Back (1980)  : 0.16084772
Star Wars: Episode VI - Return of the Jedi (1983)  : 0.3533569
Saving Private Ryan (1998)  : 0.6363975
Terminator 2: Judgment Day (1991)  : 0.5661981
Silence of the Lambs, The (1991)  : 0.37173173
Raiders of the Lost Ark (1981)  : 0.043198004
Back to the Future (1985)  : 0.31523883
Matrix, The (1999)  : 0.5897531
Jurassic Park (1993)  : 0.61383975
Sixth Sense, The (1999)  : 0.6496403
Fargo (1996)  : 0.14998648
Braveheart (1995)  : 0.50626993
Men in Black (1997)  : 0.438801
Schindler's List (1993)  : 0.413649
Princess Bride, The (1987)  : 0.08877482
Shakespeare in Love (1998)  : 0.12904958
L.A. Confidential (1997)  : 0.18533793
Shawshank Redemption, The (1994)  : 0.28123033
Godfather, The (1972)  : -0.02416908
Groundhog Day (1993)  : 0.27786258
E.T. the Extra-Terrestrial (1982)  : 0.07935512
Being John Malkovich (1999)  : 0.11991

# 7. 내가 좋아하는 영화와 비슷한 영화를 추천받기

In [25]:
# Ask the model for the 15 movies most similar to Forrest Gump.
favorite_movie = 'Forrest Gump (1994)'
# similar_items is queried by movie_id, so resolve the title first.
movie_id = movie_name_to_id(favorite_movie)
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(356, 0.9999999),
 (1265, 0.5209715),
 (597, 0.4532042),
 (587, 0.4165719),
 (539, 0.41556898),
 (1784, 0.39189842),
 (1777, 0.3841398),
 (985, 0.36806342),
 (1408, 0.3591835),
 (2424, 0.35531276),
 (1569, 0.35283527),
 (39, 0.3402329),
 (1, 0.33937418),
 (2671, 0.3329482),
 (357, 0.3289029)]

In [26]:
# Print the title and similarity score of each similar movie.
for similar_id, similarity in similar_movie:
    similar_title = movies.loc[movies['movie_id'] == similar_id, 'title'].values[0]
    print(similar_title, ' :', similarity)

Forrest Gump (1994)  : 0.9999999
Groundhog Day (1993)  : 0.5209715
Pretty Woman (1990)  : 0.4532042
Ghost (1990)  : 0.4165719
Sleepless in Seattle (1993)  : 0.41556898
As Good As It Gets (1997)  : 0.39189842
Wedding Singer, The (1998)  : 0.3841398
Small Wonders (1996)  : 0.36806342
Last of the Mohicans, The (1992)  : 0.3591835
You've Got Mail (1998)  : 0.35531276
My Best Friend's Wedding (1997)  : 0.35283527
Clueless (1995)  : 0.3402329
Toy Story (1995)  : 0.33937418
Notting Hill (1999)  : 0.3329482
Four Weddings and a Funeral (1994)  : 0.3289029


In [27]:
# Show title and genre for every movie id returned by similar_items.
similar_ids = [entry[0] for entry in similar_movie]
movies.loc[movies['movie_id'].isin(similar_ids), ['title', 'genre']]

Unnamed: 0,title,genre
0,Toy Story (1995),Animation|Children's|Comedy
38,Clueless (1995),Comedy|Romance
352,Forrest Gump (1994),Comedy|Romance|War
353,Four Weddings and a Funeral (1994),Comedy|Romance
535,Sleepless in Seattle (1993),Comedy|Romance
583,Ghost (1990),Comedy|Romance|Thriller
593,Pretty Woman (1990),Comedy|Romance
973,Small Wonders (1996),Documentary
1245,Groundhog Day (1993),Comedy|Romance
1385,"Last of the Mohicans, The (1992)",Action|Romance|War


# 8. 내가 가장 좋아할 만한 영화들을 추천받아 보기

In [28]:
# Recommend movies that user 6041 (Oh) is likely to enjoy.
# NOTE(review): passing the user id together with the user x item matrix
# matches older implicit releases — confirm against the installed version.
user = 6041
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(589, 0.56619817),
 (110, 0.50626993),
 (457, 0.4504527),
 (1580, 0.438801),
 (527, 0.413649),
 (593, 0.37173173),
 (1210, 0.35335687),
 (2858, 0.33513936),
 (2628, 0.33079985),
 (1270, 0.3152388),
 (2916, 0.30750784),
 (318, 0.28123036),
 (1610, 0.27915716),
 (1265, 0.2778626),
 (1704, 0.24910298),
 (3147, 0.23827149),
 (377, 0.23324946),
 (780, 0.23085177),
 (3753, 0.22459593),
 (260, 0.20542446)]

In [29]:
# Print the title and score of each recommended movie.
for recommended_id, recommended_score in movie_recommended:
    recommended_title = movies.loc[movies['movie_id'] == recommended_id, 'title'].values[0]
    print(recommended_title, ' :', recommended_score)

Terminator 2: Judgment Day (1991)  : 0.56619817
Braveheart (1995)  : 0.50626993
Fugitive, The (1993)  : 0.4504527
Men in Black (1997)  : 0.438801
Schindler's List (1993)  : 0.413649
Silence of the Lambs, The (1991)  : 0.37173173
Star Wars: Episode VI - Return of the Jedi (1983)  : 0.35335687
American Beauty (1999)  : 0.33513936
Star Wars: Episode I - The Phantom Menace (1999)  : 0.33079985
Back to the Future (1985)  : 0.3152388
Total Recall (1990)  : 0.30750784
Shawshank Redemption, The (1994)  : 0.28123036
Hunt for Red October, The (1990)  : 0.27915716
Groundhog Day (1993)  : 0.2778626
Good Will Hunting (1997)  : 0.24910298
Green Mile, The (1999)  : 0.23827149
Speed (1994)  : 0.23324946
Independence Day (ID4) (1996)  : 0.23085177
Patriot, The (2000)  : 0.22459593
Star Wars: Episode IV - A New Hope (1977)  : 0.20542446


In [30]:
# Show title and genre for every recommended movie id.
recommended_ids = [entry[0] for entry in movie_recommended]
movies.loc[movies['movie_id'].isin(recommended_ids), ['title', 'genre']]

Unnamed: 0,title,genre
108,Braveheart (1995),Action|Drama|War
257,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
315,"Shawshank Redemption, The (1994)",Drama
373,Speed (1994),Action|Romance|Thriller
453,"Fugitive, The (1993)",Action|Thriller
523,Schindler's List (1993),Drama|War
585,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
589,"Silence of the Lambs, The (1991)",Drama|Thriller
770,Independence Day (ID4) (1996),Action|Sci-Fi|War
1192,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War


# 9. 회고

명시적 판단 지표로 누적 카운트 수가 가장 높은 30개의 영화를 추렸다.<br>
데이터셋 중 좋아하는 영화 5가지를 선택해 ratings에 포함시켰다.<br>
ET 영화를 기준으로 나의 선호도를 분석한 결과 포레스트 검프, 쥐라기 공원, 매트릭스 등이 높게 나왔는데, 정확히 무슨 기준으로 도출된 건지는 모르겠다. <br>
하지만 휴머니즘이 느껴진다는 나름의 추상적 기준을 세우고 영화 5가지를 골랐는데, 그에 따라 추천받은 영화들(내 선호도가 높을 것이라 예측된 영화들)은 단순한 액션 영화보다는 휴머니즘이 느껴지는 영화들인 것 같아 결과에 만족한다. (브레이브하트, 쇼생크 탈출, 쉰들러 리스트 등) <br>
만약 데이터 풀이 더 크면 더 reliable한 예측결과가 나올 것 같다.