## 데이터 준비

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

import os

In [3]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')
ratings.head()

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head(5)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 데이터 분석하기

In [6]:
ratings_movie = ratings['movie_id'].nunique()
ratings_user = ratings['user_id'].nunique()
print(f'ratings에 있는 유니크한 영화 개수: {ratings_movie}')
print(f'ratings에 있는 유니크한 사용자 수: {ratings_user}')

ratings에 있는 유니크한 영화 개수: 3628
ratings에 있는 유니크한 사용자 수: 6039


In [7]:
# movies와 ratings를 합쳐준다.
data= pd.merge(movies, ratings)
using_cols = [ 'user_id','title', 'rating']
data = data[using_cols]
data.head(5)

Unnamed: 0,user_id,title,rating
0,1,Toy Story (1995),5
1,6,Toy Story (1995),4
2,8,Toy Story (1995),4
3,9,Toy Story (1995),5
4,10,Toy Story (1995),5


In [8]:
#가장 인기있는 영화 30개(인기순)
data['title'] = data['title'].str.lower()
movies_count = data.groupby('title')['user_id'].count()
movies_count.sort_values(ascending=False).head(10)

title
american beauty (1999)                                   3211
star wars: episode iv - a new hope (1977)                2910
star wars: episode v - the empire strikes back (1980)    2885
star wars: episode vi - return of the jedi (1983)        2716
saving private ryan (1998)                               2561
terminator 2: judgment day (1991)                        2509
silence of the lambs, the (1991)                         2498
raiders of the lost ark (1981)                           2473
back to the future (1985)                                2460
matrix, the (1999)                                       2434
Name: user_id, dtype: int64

In [9]:
# 유저별 몇 편의 영화를 보고 있는지에 대한 통계
user_count = data.groupby('user_id')['title'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: title, dtype: float64

## 내가 선호하는 영화 추가하기

In [27]:
my_favor_movie = ['men in black (1997)' , 'fugitive, the (1993)' ,'terminator, the (1984)' ,'back to the future (1985)' ,'forrest gump (1994)']
my_playlist = pd.DataFrame({'user_id': ['kimin']*5, 'title': my_favor_movie, 'rating':[5]*5})

data = pd.concat([data, my_playlist], sort=True)
    
data.tail(10)

Unnamed: 0,rating,title,user_id
836473,3,"contender, the (2000)",5682
836474,4,"contender, the (2000)",5812
836475,3,"contender, the (2000)",5831
836476,4,"contender, the (2000)",5837
836477,4,"contender, the (2000)",5998
0,5,men in black (1997),kimin
1,5,"fugitive, the (1993)",kimin
2,5,"terminator, the (1984)",kimin
3,5,back to the future (1985),kimin
4,5,forrest gump (1994),kimin


## 전처리하기

In [29]:
user_unique = data['user_id'].unique()
title_unique = data['title'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
title_to_idx = {v:k for k,v in enumerate(title_unique)}