In [2]:
import os
import pandas as pd
import numpy as np
import scipy
import implicit

### 데이터 준비 및 전처리

In [10]:
ratings = pd.read_csv(rating_file_path, sep='::', engine='python', encoding = "ISO-8859-1")
ratings
"""
"ISO-8859-1"은 Latin-1이라고도 불리는 문자 인코딩 방식 중 하나입니다.
이 인코딩은 대부분의 서유럽 언어에 사용되는 문자들을 포함하고 있습니다. 
이 방식은 1바이트로 모든 문자를 표현하며, ASCII 문자 집합과 호환되도록 설계
"""

Unnamed: 0,1,1193,5,978300760
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1197,3,978302268
...,...,...,...,...
1000203,6040,1091,1,956716541
1000204,6040,1094,5,956704887
1000205,6040,562,5,956704746
1000206,6040,1096,4,956715648


"::" 나뉘어져있다는 것을 확인

In [12]:
rating_file_path = os.getenv('HOME') + '/aiffel/recommend_system/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
original_data_size = len(ratings)
ratings.head()



Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [15]:
# 3점 이상만 남김
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {original_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / original_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [16]:
# ratings 컬럼의 이름을 counts로 변경
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [17]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [27]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옴
movie_file_path=os.getenv('HOME') + '/aiffel/recommend_system/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [35]:
ratings['movie_id'].nunique()

3628

In [36]:
movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [40]:
movie_ratings = pd.merge(ratings, movies, on='movie_id', how='left')
movie_ratings

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy
...,...,...,...,...,...,...
836473,6040,1090,3,956715518,Platoon (1986),Drama|War
836474,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War
836475,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama
836476,6040,1096,4,956715648,Sophie's Choice (1982),Drama


In [51]:
# 사용하는 컬럼만 남김
using_cols = ['user_id', 'counts', 'title', 'genre']
movie_ratings = movie_ratings[using_cols]
movie_ratings.head(10)

Unnamed: 0,user_id,counts,title,genre
0,1,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,3,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,3,My Fair Lady (1964),Musical|Romance
3,1,4,Erin Brockovich (2000),Drama
4,1,5,"Bug's Life, A (1998)",Animation|Children's|Comedy
5,1,3,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
6,1,5,Ben-Hur (1959),Action|Adventure|Drama
7,1,5,"Christmas Story, A (1983)",Comedy|Drama
8,1,4,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical
9,1,4,"Wizard of Oz, The (1939)",Adventure|Children's|Drama|Musical


### 분석
- ratings에 있는 유니크한 영화 개수
- ratings에 있는 유니크한 사용자 수
- 가장 인기 있는 영화 30개(인기순)

In [53]:
movie_ratings['title'].nunique()

3628

In [54]:
movie_ratings['user_id'].nunique()

6039

In [77]:
popular_movies = movie_ratings.groupby('title')['user_id'].count()
popular_movies.sort_values(ascending=False).head(100)

title
American Beauty (1999)                                                         3211
Star Wars: Episode IV - A New Hope (1977)                                      2910
Star Wars: Episode V - The Empire Strikes Back (1980)                          2885
Star Wars: Episode VI - Return of the Jedi (1983)                              2716
Saving Private Ryan (1998)                                                     2561
Terminator 2: Judgment Day (1991)                                              2509
Silence of the Lambs, The (1991)                                               2498
Raiders of the Lost Ark (1981)                                                 2473
Back to the Future (1985)                                                      2460
Matrix, The (1999)                                                             2434
Jurassic Park (1993)                                                           2413
Sixth Sense, The (1999)                                               

In [56]:
# 유저별 좋아하는 영화 통계
user_count = movie_ratings.groupby('user_id')['title'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: title, dtype: float64

In [57]:
# 유저별 별점 중앙값에 대한 통계, 3점 이상만 뽑았기 때문에 최저는 3
user_median = movie_ratings.groupby('user_id')['counts'].median()
user_median.describe()

count    6039.000000
mean        4.055970
std         0.432143
min         3.000000
25%         4.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: counts, dtype: float64

### 내가 선호하는 영화 5가지를 골라서 ratings에 추가

In [75]:
pd.set_option('display.max_rows', None)
print(movie_ratings['title'].value_counts())

American Beauty (1999)                                                                3211
Star Wars: Episode IV - A New Hope (1977)                                             2910
Star Wars: Episode V - The Empire Strikes Back (1980)                                 2885
Star Wars: Episode VI - Return of the Jedi (1983)                                     2716
Saving Private Ryan (1998)                                                            2561
Terminator 2: Judgment Day (1991)                                                     2509
Silence of the Lambs, The (1991)                                                      2498
Raiders of the Lost Ark (1981)                                                        2473
Back to the Future (1985)                                                             2460
Matrix, The (1999)                                                                    2434
Jurassic Park (1993)                                                                  2413

In [76]:
movie_ratings['title'].value_counts()

American Beauty (1999)                                                                3211
Star Wars: Episode IV - A New Hope (1977)                                             2910
Star Wars: Episode V - The Empire Strikes Back (1980)                                 2885
Star Wars: Episode VI - Return of the Jedi (1983)                                     2716
Saving Private Ryan (1998)                                                            2561
Terminator 2: Judgment Day (1991)                                                     2509
Silence of the Lambs, The (1991)                                                      2498
Raiders of the Lost Ark (1981)                                                        2473
Back to the Future (1985)                                                             2460
Matrix, The (1999)                                                                    2434
Jurassic Park (1993)                                                                  2413

In [69]:
movie_ratings[movie_ratings['title'] =='Avengers']

Unnamed: 0,user_id,counts,title,genre


In [None]:
my_favorite=['The Wolf of Wall Street', 'The Dark Knight', 'Avengers: Infinity War', 'Parasite', 'American Beauty']

### CSR matrix를 직접 만들기

### als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련

In [None]:
import implicit.als importAlternatingLeastSquares

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [None]:
# gpu 사용
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=True, iterations=15, dtype=np.float32)

### 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악

### 내가 좋아하는 영화와 비슷한 영화를 추천

### 내가 가장 좋아할 만한 영화들을 추천


![nn](rubric_2.png)