# 내가 좋아할 만한 영화 추천받기 (MovieLens 1M)

### 1) 데이터 준비와 전처리

In [1]:
import os
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import numpy as np

# Load the MovieLens 1M ratings file ('::'-separated, no header row).
# expanduser('~') replaces os.getenv('HOME'), which returns None when HOME is
# unset and would make the string concatenation raise TypeError.
rating_file_path = os.path.join(os.path.expanduser('~'), 'data', 'ml-1m', 'ratings.dat')
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# A multi-character separator needs the python parsing engine.
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)  # row count before filtering; printed by the filtering cell
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
# Keep only the rows rated 3 or higher, then report how much data survived.
keep_mask = ratings['rating'].ge(3)
ratings = ratings[keep_mask]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# Rename the 'rating' column to 'count'.
# The result is rebound instead of using inplace=True: `ratings` was produced by
# boolean-mask filtering, and mutating such a frame in place may trigger
# pandas' SettingWithCopyWarning.
ratings = ratings.rename(columns={'rating': 'count'})

In [40]:
# Keep only the three columns the recommender uses.
keep_cols = ['user_id', 'movie_id', 'count']
ratings = ratings[keep_cols]

In [41]:
# Display the ratings frame after column selection (user_id, movie_id, count).
ratings

Unnamed: 0,user_id,movie_id,count
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
476,6041,480,5
847,6041,858,5
1250,6041,1270,5
1539,6041,1580,5


In [44]:
# Load the movie metadata so titles can be shown next to movie ids.
# expanduser('~') replaces os.getenv('HOME'), which returns None when HOME is
# unset and would make the string concatenation raise TypeError.
movie_file_path = os.path.join(os.path.expanduser('~'), 'data', 'ml-1m', 'movies.dat')
cols = ['movie_id', 'title', 'genre']
# movies.dat fails to decode as UTF-8, so it is read as ISO-8859-1.
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### 2) 분석해 봅시다.
- ratings에 있는 유니크한 영화 개수
- ratings에 있는 유니크한 사용자 수
- 가장 인기 있는 영화 30개(인기순)

In [45]:
# Number of distinct movies that appear in ratings.
ratings.movie_id.nunique()

3628

In [46]:
# Number of distinct users that appear in ratings.
ratings.user_id.nunique()

6040

In [47]:
# The 30 most popular movies, ranked by how many ratings each title has.
# The join key is spelled out so this merge agrees with the later
# pd.merge(ratings, movies, on='movie_id') cell instead of relying on pandas
# inferring the shared column.
movie_data = pd.merge(ratings, movies, on='movie_id')
movie_count = movie_data.groupby('title')['count'].count()
movie_count.sort_values(ascending=False).head(30)


title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2461
Matrix, The (1999)                                       2435
Jurassic Park (1993)                                     2414
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2298
Schindler's List (1993)                                  2257
Pr

### 3) 내가 선호하는 영화를 5가지 골라서 ratings에 추가해 줍시다.

In [70]:
# Add my own five favorite movies to ratings as a new user (id 6041).
my_user_id = 6041
my_favorite = [
    'Godfather, The (1972)',
    'Back to the Future (1985)',
    'Matrix, The (1999)',
    'Men in Black (1997)',
    'Jurassic Park (1993)',
]
favorite_movie_id = movies[movies['title'].isin(my_favorite)]

# Scalars broadcast over the matched rows, so the frame is built correctly even
# if a title is missing from movies (hard-coded lists of length 5 would raise).
# The row index is inherited from `movies`, exactly as before.
my_movie = pd.DataFrame({
    'user_id': my_user_id,
    'movie_id': favorite_movie_id['movie_id'],
    'count': 5,
})

# Guard against adding the same rows twice when the cell is re-run.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
if not (ratings['user_id'] == my_user_id).any():
    ratings = pd.concat([ratings, my_movie])
ratings.tail(10)

Unnamed: 0,user_id,movie_id,count
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4
476,6041,480,5
847,6041,858,5
1250,6041,1270,5
1539,6041,1580,5
2502,6041,2571,5


In [71]:
# Show the rows of `movies` whose titles matched my favorites list.
favorite_movie_id

Unnamed: 0,movie_id,title,genre
476,480,Jurassic Park (1993),Action|Adventure|Sci-Fi
847,858,"Godfather, The (1972)",Action|Crime|Drama
1250,1270,Back to the Future (1985),Comedy|Sci-Fi
1539,1580,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi
2502,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller


- 좋아하는 영화의 주 장르는 Action, Sci-Fi이다.

In [72]:
# Attach title and genre to every rating row via the shared movie_id key.
movie_data = ratings.merge(movies, on='movie_id')
movie_data

Unnamed: 0,user_id,movie_id,count,title,genre
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...
836478,5851,3607,5,One Little Indian (1973),Comedy|Drama|Western
836479,5854,3026,4,Slaughterhouse (1987),Horror
836480,5854,690,3,"Promise, The (Versprechen, Das) (1994)",Romance
836481,5938,2909,4,"Five Wives, Three Secretaries and Me (1998)",Documentary


### 4) CSR matrix를 직접 만들어 봅시다.

In [73]:
# Inspect ratings once more before turning it into a sparse matrix.
ratings

Unnamed: 0,user_id,movie_id,count
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
476,6041,480,5
847,6041,858,5
1250,6041,1270,5
1539,6041,1580,5


In [74]:
# Distinct user and movie counts; both are displayed in the following cells.
num_user = ratings.user_id.nunique()
num_movie = ratings.movie_id.nunique()


In [75]:
# Display the number of distinct users.
num_user

6040

In [76]:
# Display the number of distinct movies.
num_movie

3628

In [77]:
# Raw user ids; these are used as the row indices of the sparse matrix.
ratings.user_id

0          1
1          1
2          1
3          1
4          1
        ... 
476     6041
847     6041
1250    6041
1539    6041
2502    6041
Name: user_id, Length: 836483, dtype: int64

In [78]:
# Raw movie ids; these are used as the column indices of the sparse matrix.
ratings.movie_id


0       1193
1        661
2        914
3       3408
4       2355
        ... 
476      480
847      858
1250    1270
1539    1580
2502    2571
Name: movie_id, Length: 836483, dtype: int64

In [79]:
# Build the user x movie matrix of counts. Rows are raw user_id values and
# columns are raw movie_id values; no shape is given, so scipy infers each
# dimension from the largest index present.
row_idx = ratings['user_id']
col_idx = ratings['movie_id']
csr_data = csr_matrix((ratings['count'], (row_idx, col_idx)))
csr_data

<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### 5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

In [80]:
# Pin OpenBLAS and MKL to a single thread and set KMP_DUPLICATE_LIB_OK.
# NOTE(review): numpy/implicit are imported in the first cell, before these
# variables are set; confirm the settings still take effect at this point.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [81]:
# ALS model: 100 latent factors, regularization 0.01, 15 iterations,
# CPU only, float32 factors.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    iterations=15,
    use_gpu=False,
    dtype=np.float32,
)


In [82]:
# Flip the matrix to movie x user orientation for training.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [83]:
# Train ALS on the transposed (movie x user) matrix.
# NOTE(review): passing the item-user matrix matches the pre-0.5 `implicit`
# API; newer releases expect user-item input to fit() - confirm the installed version.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [84]:
# Learned latent vectors: mine (user 6041) and The Godfather's (movie 858).
my_vector = als_model.user_factors[6041]
godfather_vector = als_model.item_factors[858]

In [85]:
# Display my user factor vector.
my_vector

array([-0.47711724, -0.5709188 , -0.17779039,  0.6365697 , -0.25560144,
        0.12834622,  0.70700586,  0.6796128 ,  0.19057603,  0.41553238,
       -0.6882422 , -1.0778173 , -0.5507904 ,  0.60660183, -0.22938178,
       -0.30474967,  0.5673226 ,  0.2551196 ,  0.17907523,  0.07052222,
       -0.8061222 , -0.62796587,  0.46938762, -0.18832237,  0.9937145 ,
       -0.79075575, -0.26133192,  0.09312088,  0.31053242, -0.7461754 ,
       -0.16874628,  0.27706397, -0.12799375,  0.53006464,  0.8559288 ,
        0.83928865, -0.9657429 ,  0.07198054,  0.13236646,  0.27363747,
       -0.37442493, -0.3040087 ,  0.88407683,  0.32664973,  0.62372786,
        0.11682911,  0.15734725, -0.08780102,  0.00788062,  0.20863749,
        0.04099862,  0.402304  , -0.73674875, -1.0388225 ,  0.42168432,
        0.06140669,  0.17183174, -1.8513082 ,  0.65970874, -0.3327651 ,
        0.28573766, -0.73480994, -0.330438  , -0.4553385 , -0.05664244,
        0.5341387 , -0.601994  ,  0.01371689,  0.10448699, -0.55

In [86]:
# Display the item factor vector of The Godfather.
godfather_vector

array([-0.02271352, -0.00083873, -0.02477639, -0.02633006,  0.02377704,
        0.01297906,  0.02403478,  0.03942993,  0.0064184 , -0.02580241,
       -0.01274949, -0.02225804,  0.01199127,  0.03819625, -0.01327494,
       -0.0236119 ,  0.0184725 ,  0.03090974,  0.01084277, -0.02278167,
        0.00392531, -0.00315903,  0.02408221, -0.01170417,  0.05818903,
       -0.01192905, -0.02130689, -0.00850737, -0.02089489, -0.00298833,
       -0.0202863 , -0.02476284, -0.00771081, -0.00409594,  0.03846562,
        0.01629635, -0.01122376,  0.0206511 ,  0.03247233,  0.03739889,
       -0.02598314,  0.00734444,  0.03478904,  0.00075188,  0.03735644,
       -0.01122444,  0.02138529, -0.00754858,  0.01560471,  0.00574927,
       -0.0375027 ,  0.01573855, -0.01830413,  0.01390817,  0.03074495,
        0.02245529,  0.03246072, -0.03886778,  0.03512993, -0.01384993,
       -0.01129589,  0.01026849,  0.03286959,  0.02061088, -0.03331752,
       -0.02086016,  0.02221641,  0.00793265, -0.01015964,  0.02

In [93]:
# Predicted preference = dot product of my vector and the movie's vector.
my_vector.dot(godfather_vector)

0.6201039

In [94]:
# Same score for movie_id 1 (Toy Story), which is not one of my five favorites.
toystory_vector = als_model.item_factors[1]
my_vector.dot(toystory_vector)

0.0694607

- 내가 좋아하는 영화(Godfather)에 대한 선호도는 약 0.62, 평가하지 않은 영화(Toy Story)에 대한 선호도는 약 0.07로 측정되었다.

### 7) 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.


In [89]:
# Find the 15 movies most similar to one of my favorites.
favorite_movie = 'Godfather, The (1972)'
# Filter by the variable above rather than repeating the title literal, so
# changing favorite_movie is enough to query a different movie.
movie_id = movies[movies['title'] == favorite_movie]['movie_id']
# movie_id is a Series; its first value is the id of the matching title.
similar_movie = als_model.similar_items(movie_id.values[0], N=15)
similar_movie

[(858, 1.0),
 (1221, 0.97049356),
 (2023, 0.55753404),
 (1953, 0.4377683),
 (1787, 0.37369812),
 (111, 0.36274192),
 (2695, 0.35294864),
 (872, 0.3504672),
 (923, 0.34473372),
 (2887, 0.34434208),
 (624, 0.33724657),
 (1213, 0.33012876),
 (1387, 0.30076006),
 (3595, 0.2900147),
 (912, 0.28796014)]

In [90]:
# Look up the metadata of the similar movies (rows come back in `movies` order,
# not in similarity order).
similar_ids = [pair[0] for pair in similar_movie]
movies[movies['movie_id'].isin(similar_ids)]

Unnamed: 0,movie_id,title,genre
109,111,Taxi Driver (1976),Drama|Thriller
619,624,Condition Red (1995),Action|Drama|Thriller
847,858,"Godfather, The (1972)",Action|Crime|Drama
861,872,Aiqing wansui (1994),Drama
900,912,Casablanca (1942),Drama|Romance|War
911,923,Citizen Kane (1941),Drama
1195,1213,GoodFellas (1990),Crime|Drama
1203,1221,"Godfather: Part II, The (1974)",Action|Crime|Drama
1366,1387,Jaws (1975),Action|Horror
1728,1787,Paralyzing Fear: The Story of Polio in America...,Documentary


- Godfather, The (1972)	Action|Crime|Drama
- 장르가 비슷한 영화들을 추천 받았다.

### 8) 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.


In [91]:
# Top-20 recommendations for my user id, leaving out movies I already rated.
user = 6041
movie_recommended = als_model.recommend(
    user,
    csr_data,
    N=20,
    filter_already_liked_items=True,
)
movie_recommended


[(589, 0.61389935),
 (1221, 0.54517543),
 (2916, 0.41598353),
 (260, 0.398189),
 (1210, 0.39472914),
 (1196, 0.33069795),
 (110, 0.32651216),
 (1573, 0.3254816),
 (1527, 0.2965849),
 (2023, 0.28315195),
 (780, 0.28159785),
 (2028, 0.26901788),
 (3175, 0.26813224),
 (2628, 0.2576806),
 (2858, 0.25567833),
 (912, 0.25377256),
 (457, 0.24460317),
 (1584, 0.23951553),
 (1097, 0.23545824),
 (1240, 0.23208684)]

In [38]:
# Look up the metadata of the recommended movies (rows come back in `movies`
# order, not in score order).
recommended_ids = [pair[0] for pair in movie_recommended]
movies[movies['movie_id'].isin(recommended_ids)]


Unnamed: 0,movie_id,title,genre
108,110,Braveheart (1995),Action|Drama|War
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
352,356,Forrest Gump (1994),Comedy|Romance|War
453,457,"Fugitive, The (1993)",Action|Thriller
585,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
770,780,Independence Day (ID4) (1996),Action|Sci-Fi|War
1081,1097,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War
1192,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
1203,1221,"Godfather: Part II, The (1974)",Action|Crime|Drama


- 내가 좋아하는 영화와 장르가 비슷한 영화들을 추천 받았다.

# 회고 
- movies.dat을 불러오는 과정에서 `UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 3114: invalid continuation byte` 오류가 발생했습니다. 검색해 보니 `pd.read_csv`에 `encoding='ISO-8859-1'` 인자를 추가하면 해결된다고 하여, 이를 추가해 해결했습니다.
https://julee23.tistory.com/15