In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
MOVIE_FILE_PATH = '/content/drive/MyDrive/01_Machain Learning/5. 추천시스템/data/ml-latest-small/movies.csv'
RATING_FILE_PATH = '/content/drive/MyDrive/01_Machain Learning/5. 추천시스템/data/ml-latest-small/ratings.csv'

In [19]:
import pandas as pd

movies = pd.read_csv(MOVIE_FILE_PATH)
ratings = pd.read_csv(RATING_FILE_PATH)

In [20]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [21]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [22]:
ratings_movies = pd.merge(ratings, movies, on='movieId')
ratings_movies.head()

# 데이터 통합:
# ratings 데이터프레임과 movies 데이터프레임을 합치는 작업을 통해, 각각의 영화에 대한 평점과 영화의 메타데이터(제목 등)를 한 데이터프레임에서 함께 사용할 수 있도록 합니다.
# 이는 데이터를 분석하거나 모델을 학습시킬 때 필요한 추가적인 정보(예: 영화 제목 등)를 제공하는 데 유용합니다.

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


# 사용자-영화 희소 행렬
Pivot을 통해 사용자-아이템 행렬로 변환(희소행렬)
- 희소행렬 : 대부분의 값이 비어 있는 행렬..
- 모든 사용자가 모든 영화에 대해 평점을 부여하진 않는다.

In [23]:
ratings_matrix = ratings_movies.pivot_table('rating', index='userId', columns='title')
ratings_matrix.head()

# 피벗 테이블 생성:
# pivot_table 메서드는 데이터를 재구조화하는 기능을 제공합니다.
# 여기서는 rating 값을 기준으로 userId를 행(index)으로, title을 열(columns)로 하는 피벗 테이블을 생성합니다. 결과적으로 각 셀에는 특정 사용자가 특정 영화에 부여한 평점이 들어갑니다.

# # 왜 필요?
# 데이터 재구조화: 영화 평점 데이터를 사용자-영화 매트릭스로 재구조화하여, 사용자가 각 영화에 부여한 평점을 쉽게 확인하고 분석할 수 있도록 합니다.
# 이는 협업 필터링 알고리즘을 적용하는 데 필수적인 형태입니다.

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [24]:
# NaN값을 모두 0으로 변환 -> NaN이 있으면 유사도를 구할 수가 없다.
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix

# 왜 필요?
# 결측값 처리: 사용자-영화 매트릭스에서 결측값(즉, 사용자가 특정 영화에 평점을 부여하지 않은 경우)을 0으로 채워서, 데이터 분석이나 모델 학습에 방해가 되지 않도록 합니다. 이는 알고리즘이 결측값을 다루는 방법을 간소화하고, 계산을 쉽게 하기 위해 필요합니다.

# 무엇을 의미?
# 결측값 채우기: fillna(0) 메서드는 데이터프레임의 결측값을 0으로 채웁니다. 여기서는 ratings_matrix의 결측값을 0으로 채워서 모든 셀이 숫자로 채워지도록 합니다. 이는 모델 학습 및 예측 과정에서 결측값으로 인한 오류를 방지합니다.


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 영화-영화 유사도
사용자-영화 행렬을 전치하여 영화-사용자 행렬로 만든 다음 유사도 구하기

In [25]:
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head()

# 왜 필요?: 아이템 간 유사도를 계산하기 위해 영화-사용자 매트릭스로 변환하여, 각 행이 하나의 영화에 해당하도록 하기 위해 필요합니다.

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# 코사인 유사도 구하기
from sklearn.metrics.pairwise import cosine_similarity
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)

# 유사도 행렬을 데이터 프레임으로 만들기
item_sim_df = pd.DataFrame(data=item_sim, index=ratings_matrix.columns, columns=ratings_matrix.columns)
item_sim_df

# 왜 필요?
# 아이템 유사도 계산: 코사인 유사도를 사용하여 각 영화 간의 유사도를 계산합니다. 이 유사도는 아이템 기반 협업 필터링에서 추천 시스템을 만들기 위해 중요한 역할을 합니다.
# 유사도 행렬 생성: 유사도를 계산한 결과를 데이터프레임으로 만들어, 각 영화 쌍 간의 유사도를 쉽게 조회하고 활용할 수 있도록 합니다.

# 무엇을 의미?
  # 코사인 유사도 계산:
    # cosine_similarity 함수는 두 벡터 간의 코사인 유사도를 계산합니다.
    # 여기서는 영화 간 유사도를 계산하기 위해 ratings_matrix_T를 두 번 입력하여 각 영화 간의 유사도를 계산합니다.
  # 유사도 행렬 데이터프레임:
    # item_sim_df는 계산된 유사도를 데이터프레임 형태로 저장한 것입니다.
    # 이 데이터프레임의 행과 열은 영화 제목이며, 각 셀은 해당 영화 쌍의 유사도를 나타냅니다.

# 어떻게 작동?
# 코사인 유사도 계산:

# cosine_similarity(ratings_matrix_T, ratings_matrix_T)는 전치된 사용자-영화 매트릭스를 입력으로 받아, 각 영화 간의 코사인 유사도를 계산합니다.
# ratings_matrix_T는 영화-사용자 매트릭스이므로, 이 매트릭스의 행 벡터들 간의 유사도를 계산하게 됩니다.
# 결과는 각 영화 간의 유사도를 나타내는 행렬 item_sim에 저장됩니다.

# 유사도 행렬 데이터프레임 생성:

# pd.DataFrame(data=item_sim, index=ratings_matrix.columns, columns=ratings_matrix.columns)는 유사도 행렬을 데이터프레임으로 변환합니다.
# item_sim은 영화 간 유사도를 나타내는 행렬이며, 이 행렬을 데이터프레임으로 변환하여 각 행과 열이 영화 제목이 되도록 설정합니다.
# item_sim_df는 영화 제목을 행과 열로 하는 유사도 데이터프레임이 됩니다.


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.141653,0.000000,...,0.000000,0.342055,0.543305,0.707107,0.0,0.000000,0.139431,0.327327,0.000000,0.0
'Hellboy': The Seeds of Creation (2004),0.000000,1.000000,0.707107,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Round Midnight (1986),0.000000,0.707107,1.000000,0.000000,0.000000,0.0,0.176777,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Salem's Lot (2004),0.000000,0.000000,0.000000,1.000000,0.857493,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Til There Was You (1997),0.000000,0.000000,0.000000,0.857493,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.211467,0.216295,0.097935,0.132489,...,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.192259,0.000000,0.170341,0.0
xXx (2002),0.139431,0.000000,0.000000,0.000000,0.000000,0.0,0.089634,0.000000,0.276512,0.019862,...,0.069716,0.305535,0.173151,0.246482,0.0,0.192259,1.000000,0.270034,0.100396,0.0
xXx: State of the Union (2005),0.327327,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.156764,0.000000,...,0.000000,0.382543,0.177838,0.231455,0.0,0.000000,0.270034,1.000000,0.000000,0.0
¡Three Amigos! (1986),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.372876,0.180009,0.169385,0.249586,...,0.180009,0.000000,0.000000,0.000000,0.0,0.170341,0.100396,0.000000,1.000000,0.0


In [27]:
# 유사한 아이템 찾기: 특정 영화와 가장 유사한 영화를 찾기 위해 필요합니다. 여기서는 [REC] (2007)이라는 영화와 유사한 영화를 찾아내고자 합니다.

item_sim_df['[REC] (2007)'].sort_values(ascending=False)[:10]

# 영화 유사도 정렬: item_sim_df 데이터프레임에서 [REC] (2007) 열을 기준으로 값을 내림차순으로 정렬합니다. 이는 [REC] (2007)과 유사한 영화를 유사도 순으로 정렬하는 것입니다.
# 상위 10개 추출: 정렬된 결과에서 상위 10개의 영화를 추출합니다. 이들은 [REC] (2007)과 가장 유사한 영화 10개입니다.

Unnamed: 0_level_0,[REC] (2007)
title,Unnamed: 1_level_1
[REC] (2007),1.0
Tremors 3: Back to Perfection (2001),0.623737
Dog Soldiers (2002),0.621035
Wolf Creek (2005),0.601937
Joy Ride (2001),0.589097
Scooby-Doo 2: Monsters Unleashed (2004),0.557882
[REC]³ 3 Génesis (2012),0.544207
Buried (2010),0.540871
Evil Dead (2013),0.537966
Dead Silence (2007),0.537966


# 평점 예측
- 아이템 기반 유사도 협업 필터링으로 개인화된 영화 추천

$$
\hat{R}_{u, i}=\frac{\sum\big{(}S_{i,N} \cdot R_{u,N}\big{)}}{\sum\big{(}\big{|}S_{i,N}\big{|}\big{)}}
$$

In [31]:
# 주어진 평점 행렬과 아이템 유사도 행렬을 사용하여 사용자가 영화에 부여할 평점을 예측
import numpy as np

def predict_rating(ratings_arr, item_sim_arr):
  ratings_pred = ratings_arr @ item_sim_arr / np.array([np.abs(item_sim_arr).sum(axis=1)])

  return ratings_pred

# 왜 필요?
    # 평점 예측: 추천 시스템에서 사용자에게 아직 평가되지 않은 아이템(영화)에 대한 평점을 예측하기 위해 필요합니다. 이를 통해 사용자가 좋아할 가능성이 높은 아이템을 추천할 수 있습니다.

# 무엇을 의미?
    # 함수 입력:
    # ratings_arr: 사용자-아이템 평점 행렬입니다. 각 셀에는 특정 사용자가 특정 아이템에 대해 부여한 평점이 들어있습니다.
    # item_sim_arr: 아이템 간 유사도 행렬입니다. 각 셀에는 두 아이템 간의 유사도가 들어있습니다.
    # 함수 출력:
    # ratings_pred: 예측된 평점 행렬입니다. 각 셀에는 특정 사용자가 특정 아이템에 대해 부여할 것으로 예상되는 평점이 들어있습니다.

# 어떻게 작동?
    # 행렬 곱 계산:
    # ratings_arr @ item_sim_arr는 사용자-아이템 평점 행렬과 아이템 유사도 행렬의 행렬 곱을 계산합니다. 이는 각 사용자의 평점을 아이템 유사도로 가중합한 값입니다.
      # 각 사용자의 평점을 아이템 유사도로 가중합 한 값: 매긴 영화 점수를 그 영화와 비슷한 다른 영화의 유사도도로 가중합하여 비슷한 영화의 예상점수를 계산하는 것
      # 비슷한 정도를 모두 고려하여 공정하게 점수를 조정하기 위해, 유사도의 절댓값 합으로 나누어주면, 유사한 영화들의 평점이 더 공정하게 반영됩니다.
    # 정규화:
    # / np.array([np.abs(item_sim_arr).sum(axis=1)])는 각 아이템 유사도의 절댓값 합으로 나누어 정규화합니다. 이는 유사도의 가중치를 고려하여 평점을 조정하는 역할을 합니다.
      # 유사도의 가중치를 고려해 평점을 조정: 계산된 예상 점수를 공정하게 만들기 위해 모든 비슷한 정도로 더한 값을 나누어 조정하는 것
    # np.abs(item_sim_arr).sum(axis=1)는 각 아이템의 유사도 절댓값 합을 계산합니다.
    # np.array([...])로 배열 형태로 변환하여 나누기 연산을 수행합니다.
    # 예측 결과 반환:
    # ratings_pred는 최종적으로 예측된 평점 행렬을 반환합니다.

In [32]:
ratings_matrix.shape, item_sim.shape

((610, 9719), (9719, 9719))

In [35]:
ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)

# 평점 예측:

# predict_rating(ratings_matrix.values, item_sim_df.values) 함수는 실제 평점 행렬과 아이템 유사도 행렬을 입력받아 예측 평점을 계산합니다.
# 여기서 ratings_matrix.values와 item_sim_df.values는 각각 평점 행렬과 유사도 행렬의 값들을 넘깁니다.

ratings_pred_df = pd.DataFrame(
    data=ratings_pred,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns
)

# 데이터프레임 변환:

# pd.DataFrame(data=ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns)는 예측된 평점을 데이터프레임 형태로 변환합니다.
# data=ratings_pred는 예측된 평점을 데이터로 사용합니다.
# index=ratings_matrix.index는 원래의 사용자 인덱스를 사용합니다.
# columns=ratings_matrix.columns는 원래의 영화 제목을 열로 사용합니다.

ratings_pred_df

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.070345,0.577855,0.321696,0.227055,0.206958,0.194615,0.249883,0.102542,0.157084,0.178197,...,0.113608,0.181738,0.133962,0.128574,0.006179,0.212070,0.192921,0.136024,0.292955,0.720347
2,0.018260,0.042744,0.018861,0.000000,0.000000,0.035995,0.013413,0.002314,0.032213,0.014863,...,0.015640,0.020855,0.020119,0.015745,0.049983,0.014876,0.021616,0.024528,0.017563,0.000000
3,0.011884,0.030279,0.064437,0.003762,0.003749,0.002722,0.014625,0.002085,0.005666,0.006272,...,0.006923,0.011665,0.011800,0.012225,0.000000,0.008194,0.007017,0.009229,0.010420,0.084501
4,0.049145,0.277628,0.160448,0.206892,0.309632,0.042337,0.130048,0.116442,0.099785,0.097432,...,0.051269,0.076051,0.055563,0.054137,0.008343,0.159242,0.100941,0.062253,0.146054,0.231187
5,0.007278,0.066951,0.041879,0.013880,0.024842,0.018240,0.026405,0.018673,0.021591,0.018841,...,0.009689,0.022246,0.013360,0.012378,0.000000,0.025839,0.023712,0.018012,0.028133,0.052315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.246832,1.293006,0.752661,0.935924,1.032354,0.407038,0.516819,0.594590,0.489913,0.408661,...,0.227092,0.405114,0.291452,0.276143,0.055006,0.636525,0.510522,0.346652,0.550174,0.893777
607,0.052248,0.305255,0.180669,0.218834,0.179443,0.115288,0.165817,0.075548,0.104890,0.109500,...,0.059516,0.135763,0.093843,0.086643,0.003707,0.144222,0.134705,0.107674,0.156614,0.576894
608,0.324435,1.022541,0.598467,0.425468,0.349562,0.494081,0.529903,0.227746,0.480980,0.442384,...,0.276586,0.594918,0.457094,0.444436,0.038681,0.616733,0.717768,0.538586,0.527639,0.698871
609,0.004835,0.053593,0.026251,0.000000,0.002827,0.015528,0.017849,0.007791,0.013172,0.014981,...,0.006575,0.014368,0.010334,0.007742,0.000000,0.018070,0.015600,0.013108,0.018328,0.033377


# 예측 평가
가중치 평점 부여 후에 예측 성능 평가에 대한 MSE 구하기

In [39]:
ratings_pred.nonzero()

(array([  0,   0,   0, ..., 609, 609, 609]),
 array([   0,    1,    2, ..., 9716, 9717, 9718]))

In [40]:
# 실제 평점 데이터 중 원래 값이 들어있던 위치 구하기 -> target
# 예측 평점 데이터 중 실제 값이 들어있었던 위치의 값과 MSE를 계산
from sklearn.metrics import mean_squared_error

# 사용자가 평점을 부여한 영화에 대하서만 예측 성능 평가 MSE 구하기
def get_mse(pred, actual):
  # 실제 사용자가 평점을 부여한 위치의 데이터 (target)
  actual_y = actual[actual.nonzero()].flatten()

  # 실제 사용자가 평점을 부여한 위치의 예측 데이터 (pred)
  predict_y = pred[actual.nonzero()].flatten()

  return mean_squared_error(predict_y, actual_y)

get_mse(ratings_pred, ratings_matrix.values)

9.895354759094706

단순하게 유저-아이템 행렬과 아이템-아이템 유사도 행렬로 가중 평균 계산을 하면, **유사 하지 않은 아이템들도** 평점 예측에 참여하기 때문에 예측 평점이 낮을 수 밖에 없다.

In [41]:
# top-n 유사도를 가진 데이터들에 대해서만 예측 평점 계산
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
  # 사용자-아이템 평점 행렬 크기만큼 0으로 채운 예측 행렬 초기화
  pred = np.zeros(ratings_arr.shape)

  # 사용자-아이템 평점 행렬의 열 크기(영화의 개수)만큼 반복 수행
  for col in range(ratings_arr.shape[1]):
    # col 번째 영화와, 다른 모든 영화들 간의 유사도
    sim_items = item_sim_arr[:, col]
    top_n_items = [np.argsort(sim_items)[:-n-1:-1]]

    # 개인화된 예측 평점 계산
    for row in range(ratings_arr.shape[0]):
      # item_sim_arr[col, :][top_n_items] : col 번째 영화와 가장 유사도가 높은 top_n개 영화의 유사도
      # ratings_arr[row, :][top_n_items].T : row 번째 사람이 부여한 유사도가 가장 높은 top_n 영화에 대한 점수
      pred[row, col] = item_sim_arr[col, :][top_n_items] @ ratings_arr[row, :][top_n_items].T
      pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))

  return pred

In [42]:
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values)

In [43]:
get_mse(ratings_pred, ratings_matrix.values)

3.6949827608772314

In [44]:
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns)
ratings_pred_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.220798,0.000000,0.000000,1.677291,0.284372
2,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.220798,0.000000,0.000000,0.194828,0.000000
5,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.149633,0.0,0.418273,0.16678,0.0,0.130033,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.344930,0.268465,0.000000,0.694944,0.189602
607,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.194948,0.000000,0.000000,0.000000,0.000000
608,0.0,0.000000,0.0,0.159451,0.00000,0.0,0.243703,0.0,0.000000,0.0,...,0.0,0.129289,0.000000,0.112856,0.0,1.587302,2.988072,0.175489,0.702430,0.000000
609,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000


# 추천시스템 작동

In [45]:
target_user_id = 78

In [46]:
# taget_user_id에 대한 모든 영화 정보
user_rating_id = ratings_matrix.loc[target_user_id, :]

# target_user_id가 평점을 부여한 영화 확인
user_rating_id[ user_rating_id > 0 ].sort_values(ascending=False)[:10]

Unnamed: 0_level_0,78
title,Unnamed: 1_level_1
Die Hard (1988),5.0
Airplane! (1980),5.0
"Terminator, The (1984)",5.0
Terminator 2: Judgment Day (1991),4.5
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),4.5
"Shawshank Redemption, The (1994)",4.5
Ghostbusters (a.k.a. Ghost Busters) (1984),4.5
"Matrix, The (1999)",4.5
Dodgeball: A True Underdog Story (2004),4.5
"Naked Gun: From the Files of Police Squad!, The (1988)",4.5


사용자가 보지 않은 영화 중에서 아이템 기반의 유사도 협업 필터링 추천

In [47]:
def get_unseen_movies(ratings_matrix, userId):

  # userId로 입력 받은 사용자의 모든 영화 정보 추출.
  user_rating = ratings_matrix.loc[userId, :]

  # 이미 본 영화에 대한 인덱스 추출(영화 제목)
  already_seen = user_rating[ user_rating > 0 ].index.tolist()

  movie_list = ratings_matrix.columns.tolist()

  unseen_list = [ movie for movie in movie_list if movie not in already_seen ]

  return unseen_list

In [48]:
def recomm_movie_by_userId(pred_df, userId, unseen_list, top_n=10):
  recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
  return recomm_movies

In [49]:
# 사용자가 관람하지 않은 영화명 추출
unseen_list = get_unseen_movies(ratings_matrix, 90)

# 추천 목록 만들기
recomm_movies = recomm_movie_by_userId(ratings_pred_matrix, target_user_id, unseen_list, top_n=10)

# 평점 데이터를 DataFrame으로
recomm_movies_df = pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index, columns=["pred_score"])
recomm_movies_df

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Terminator 2: Judgment Day (1991),2.927918
Die Hard (1988),2.633424
Star Wars: Episode VI - Return of the Jedi (1983),2.226222
Indiana Jones and the Last Crusade (1989),2.221552
Mission: Impossible (1996),2.136809
Apollo 13 (1995),2.103989
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),2.033715
Jurassic Park (1993),2.011825
"Terminator, The (1984)",2.009106
Braveheart (1995),1.985336
