# 간단한 추천시스템 만들기
  1. MovieLens 데이터셋 불러오기
  2. MovieLens 데이터셋 중 학습셋과 평가셋 나누기
  3. 간단한 추천 알고리즘 만들기(평점을 예측하고 평가는 RMSE로 한다.)

## 필요한 라이브러리 정의(Configuration)

In [35]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides pandas/sklearn warnings that may point at real problems.
warnings.filterwarnings('ignore')

## MovieLens 데이터셋 불러오기
  * ratings.csv, movies.csv, tags.csv

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# directory used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Directory on the mounted drive from which the MovieLens CSV files are read.
path = '/content/drive/MyDrive/data/movielens/'

In [4]:
# Resolve the three MovieLens CSV locations under the data directory.
ratings_csv = os.path.join(path, 'ratings.csv')
movies_csv = os.path.join(path, 'movies.csv')
tags_csv = os.path.join(path, 'tags.csv')

# Load each table as UTF-8. Only the movie table is indexed by movieId; the
# other two keep the default positional index.
ratings_df = pd.read_csv(ratings_csv, encoding='utf-8')
movies_df = pd.read_csv(movies_csv, encoding='utf-8', index_col='movieId')
tags_df = pd.read_csv(tags_csv, encoding='utf-8')

In [6]:
# Preview the first rows of the movie table (indexed by movieId).
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [7]:
# Report the movie table's dimensions and its movieId index, one per line.
print(movies_df.shape, movies_df.index, sep='\n')

(9742, 2)
Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            193565, 193567, 193571, 193573, 193579, 193581, 193583, 193585,
            193587, 193609],
           dtype='int64', name='movieId', length=9742)


In [8]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
# (rows, columns) of the ratings table.
ratings_df.shape

(100836, 4)

## ratings 데이터 정보 확인하기
   * 몇 명의 유저가 몇 개의 영화에 평점을 줬는지
   * 각 유저가 어떤 영화에 평점을 줬는지 sparse_matrix

In [11]:
# Distinct ids that appear in the ratings table. Despite the `num_` prefix,
# these hold the id arrays themselves, not the counts.
num_users = pd.unique(ratings_df['userId'])
num_movies = pd.unique(ratings_df['movieId'])

In [12]:
# Number of distinct users and movies that have at least one rating.
user_count = len(num_users)
movie_count = len(num_movies)
print(f'총 유저 수 : {user_count}')
print(f'총 영화 수 : {movie_count}')

총 유저 수 : 610
총 영화 수 : 9724


In [13]:
# Reshape the long ratings table into a dense grid with one row per movie and
# one column per user (despite the variable name, movies are the rows).
# Movie/user pairs without a rating are filled with 0.
rating_grid = ratings_df.pivot(index='movieId', columns='userId', values='rating')
user_movie_matrix = rating_grid.fillna(0)

# Compressed-sparse-row version of the same grid, built from its raw values.
sparse_mat = csr_matrix(user_movie_matrix.to_numpy())

In [14]:
# Preview the first movie rows of the movie-by-user rating grid.
user_movie_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Show the stored entries as (row position, column position) -> rating.
print(sparse_mat)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [18]:
# Number of movies each user has rated, indexed by userId.
# Unrated cells were filled with 0 when the grid was built, so counting the
# non-zero cells in every user column counts that user's ratings directly
# (assumes no genuine rating equals 0).
# The earlier approach summed value_counts() after dropping its first entry,
# which is only correct when 0 happens to be the most frequent value in the
# column; it silently discards real ratings otherwise.
user_info_df = (user_movie_matrix != 0).sum(axis=0).to_frame(name='movies_rated')

In [23]:
# Spot check: number of movies rated by user 1, counted as the non-zero cells
# in that user's column (0 is the filler for "not rated"). Counting non-zero
# cells does not depend on 0 being the most frequent value in the column.
int((user_movie_matrix[1] != 0).sum())

232

In [24]:
# Preview the per-user rating counts.
user_info_df.head()

Unnamed: 0_level_0,movies_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44


In [25]:
# Number of users who rated each movie, indexed by movieId.
# Each row of the grid is one movie and 0 marks "not rated", so the count of
# non-zero cells per row is the number of raters (assumes no genuine rating
# equals 0).
# The earlier approach dropped the first value_counts() entry, which is wrong
# for any movie where 0 is not the most frequent value (for example a movie
# rated by most users with the same score).
movie_info_df = (user_movie_matrix != 0).sum(axis=1).to_frame(name='users_rated')

In [26]:
# Preview the per-movie rater counts.
movie_info_df.head()

Unnamed: 0_level_0,users_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49


## MovieLens 데이터셋 중 학습셋과 평가셋 나누기

In [27]:
# Hold out 20% of the rating rows for evaluation. The fixed random_state keeps
# the split identical between runs.
train_df, test_df = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=1234,
)

In [28]:
# Dimensions of the training and test splits, one per line.
print(train_df.shape, test_df.shape, sep='\n')

(80668, 4)
(20168, 4)


## test set에는 존재하지만 train set에는 없는 사용자 및 영화 수

In [29]:
# Users that occur in the test split but never in the training split.
test_users = set(test_df['userId'].unique())
train_users = set(train_df['userId'].unique())
print('사용자:', len(test_users - train_users))

# Movies that occur in the test split but never in the training split,
# alongside the total number of distinct movies in the test split.
test_movies = set(test_df['movieId'].unique())
train_movies = set(train_df['movieId'].unique())
print('영화:', len(test_movies - train_movies))
print('test set의 전체 영화 수:', len(test_movies))

사용자: 0
영화: 786
test set의 전체 영화 수: 5171


In [30]:
# Movie ids present in the test split but absent from the training split.
test_movie_ids = set(test_df["movieId"].unique())
train_movie_ids = set(train_df['movieId'].unique())
movies_not_included = list(test_movie_ids - train_movie_ids)

# NOTE: the slice is taken before sorting, so this prints ten ids in whatever
# order the set yields them (then sorted), not the ten smallest ids.
sample_ids = movies_not_included[:10]
sample_ids.sort()
print(sample_ids)

[6145, 26645, 75803, 79897, 83969, 88069, 110603, 122888, 131098, 182299]


In [33]:
# Test rows whose movie never appears in the training split, ordered by movieId.
unseen_movie_mask = test_df['movieId'].isin(movies_not_included)
not_include_df = test_df.loc[unseen_movie_mask].sort_values(by='movieId')

print(not_include_df.head(10))
print('train set에 없고, test set에만 있는 영화 데이터 수 :', not_include_df.shape)

       userId  movieId  rating   timestamp
29386     202       49     3.0   974925453
97066     604      117     3.0   832080636
99501     609      137     3.0   847221054
27959     191      178     1.0   829760898
98493     607      241     4.0   964744490
96182     603      320     3.0   953925390
728         6      359     3.0   845556412
92825     599      478     2.5  1498515125
73214     474      488     3.0  1047569232
96218     603      495     5.0   953927108
train set에 없고, test set에만 있는 영화 데이터 수 : (852, 4)


## 간단한 추천알고리즘 만들기
  1. 랜덤으로 평점 예측하기
  2. 영화 평균 평점기반 예측하기
  3. 사용자 평균 평점기반 예측하기
  4. Rule기반 영화 랭킹 예측하기

  * test에 있고 train에 없는 경우

### 랜덤으로 평점 예측하기

In [36]:
# The ten rating values a prediction may take: 0.5 through 5.0 in half-point
# steps. Both endpoints are included.
ratings_range = np.linspace(0.5, 5.0, num=10)
ratings_range

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [37]:
import random

# Baseline predictor: draw one rating per test row uniformly from the allowed
# rating values. A dedicated, seeded generator makes the baseline (and the
# error computed from it) reproducible between runs, in line with the seeded
# train/test split, and leaves the global `random` state untouched.
_rng = random.Random(1234)
pred_random = _rng.choices(ratings_range.tolist(), k=len(test_df))
pred_random[:10]

[2.0, 1.5, 5.0, 4.5, 1.0, 0.5, 3.0, 1.0, 3.0, 1.0]

In [38]:
# Attach the random predictions as a new column. `assign` returns a new frame,
# which avoids the chained-assignment warning pandas can raise when a column
# is written in place on a frame that was derived from `ratings_df` by the
# train/test split.
test_df = test_df.assign(pred_ratings_random=pred_random)

In [39]:
# Preview the first ten test rows, including the random prediction column.
test_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random
99731,610,3527,5.0,1479545223,2.0
97583,606,1250,3.5,1171376891,1.5
38197,262,213,5.0,840310907,5.0
11474,68,69406,3.0,1261622505,4.5
34105,232,4728,3.0,1218166950,1.0
4767,29,170,3.0,1307905797,0.5
25019,177,1029,4.0,1435534825,3.0
95872,601,912,4.0,1521467834,1.0
83405,528,91658,4.0,1391736693,3.0
4663,28,53123,5.0,1234341270,1.0


In [40]:
# Score the random baseline: mean squared error between the true and predicted
# ratings on the test split, and its square root (RMSE).
actual_ratings = test_df['rating'].to_numpy()
random_ratings = test_df['pred_ratings_random'].to_numpy()

mse = mean_squared_error(y_true=actual_ratings, y_pred=random_ratings)
rmse = np.sqrt(mse)
print(mse, rmse)

3.7503966679888934 1.9365940896297533
