# 1) 데이터 준비와 전처리

In [1]:
import os
import pandas as pd

rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)

In [4]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# 2) 분석해 봅시다.

* ratings에 있는 유니크한 영화 개수

In [5]:
ratings['movie_id'].nunique()

3628

* rating에 있는 유니크한 사용자 수

In [6]:
ratings['user_id'].nunique()

6039

* 가장 인기있는 영화 30개(인기순)

In [7]:
ratings.head(2)

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109


In [8]:
pd.DataFrame(ratings.groupby('movie_id')['count'].mean()).sort_values('count',ascending=False).head(30)

Unnamed: 0_level_0,count
movie_id,Unnamed: 1_level_1
1830,5.0
3607,5.0
3800,5.0
3280,5.0
989,5.0
1360,5.0
3656,5.0
687,5.0
3881,5.0
787,5.0


# 3) 내가 선호하는 영화를 5가지 골라서 rating에 추가해 줍시다.

In [36]:
five_movies = pd.DataFrame(columns=['user_id', 'movie_id', 'count','timestamp'] )

for i in range(1,6):
    make_movie_row = pd.DataFrame({'user_id': [9999], 'movie_id': [i] , 'count': [5],'timestamp': [930531170] })
    five_movies = five_movies.append(make_movie_row)

Unnamed: 0,user_id,movie_id,count,timestamp
0,9999,1,5,930531170
0,9999,2,5,930531170
0,9999,3,5,930531170
0,9999,4,5,930531170
0,9999,5,5,930531170


In [37]:
#   선호영화 5개 ratings에 추가 
ratings = ratings.append(five_movies)
ratings

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
0,9999,1,5,930531170
0,9999,2,5,930531170
0,9999,3,5,930531170
0,9999,4,5,930531170


# 4) CSR matrix를 직접 만들어 봅시다.

In [57]:
# CSR matrix만들기 전에 장르등을 추가 위해 movie랑 merge 
ratings = ratings.merge(movies,on = 'movie_id')
ratings

Unnamed: 0,user_id,movie_id,count,timestamp,title,genre
0,0,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,11,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,14,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,16,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...
836478,5849,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western
836479,5852,3026,4,958346883,Slaughterhouse (1987),Horror
836480,5852,690,3,957744257,"Promise, The (Versprechen, Das) (1994)",Romance
836481,5936,2909,4,957273353,"Five Wives, Three Secretaries and Me (1998)",Documentary


In [59]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 

temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# title_to_idx을 통해 title 컬럼도 동일한 방식으로 인덱싱해 줍니다.

temp_title_data = ratings['title'].map(title_to_idx.get).dropna()
if len(temp_title_data) == len(ratings):
    print('title column indexing OK!!')
    ratings['title'] = temp_title_data
else:
    print('title column indexing Fail!!')
    
temp_genre_data = ratings['genre'].map(genre_to_idx.get).dropna()
if len(temp_genre_data) == len(ratings):
    print('genre column indexing OK!')
    ratings['genre'] = temp_genre_data
else:
    print('genre column indexing Fail!')

ratings

user_id column indexing Fail!!


NameError: name 'title_to_idx' is not defined

In [50]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
artist_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [54]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings['count'], (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

ValueError: row index exceeds matrix dimensions

In [46]:
ratings['count'].nunique()

3

In [45]:
# ratings['user_id'].nunique()
ratings['movie_id'].nunique()

3629