<a href="https://colab.research.google.com/github/WoojinJeonkr/Data-Analysis/blob/main/%EC%BD%98%ED%85%90%EC%B8%A0_%EA%B8%B0%EB%B0%98_%ED%95%84%ED%84%B0%EB%A7%81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **콘텐츠 기반 필터링**
- 사용자가 특정 아이템을 선호하는 경우, 그와 비슷한 콘텐츠를 가진 다른 아이템을 추천하는 방식   
- ex) 특정 영화에 높은 평점을 줬다면 그 영화의 장르, 배우, 감독, 키워드 등이 유사한 다른 영화 추천

## 코드 출처 : [[Python] 머신러닝 완벽가이드 - 09. 추천 시스템[콘텐츠 기반]](https://romg2.github.io/mlguide/01_%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D-%EC%99%84%EB%B2%BD%EA%B0%80%EC%9D%B4%EB%93%9C-09.-%EC%B6%94%EC%B2%9C%EC%8B%9C%EC%8A%A4%ED%85%9C-%EC%BD%98%ED%85%90%EC%B8%A0-%EA%B8%B0%EB%B0%98/)

In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

mpl.rc('font', family='NanumGothic') # font
mpl.rc('axes', unicode_minus=False) # unicode_minus

In [3]:
# chart style: seaborn darkgrid theme with the NanumGothic font, and a
# default figure size of 10x8 for matplotlib.
sns.set(font="NanumGothic", rc={"axes.unicode_minus":False}, style='darkgrid')
plt.rc("figure", figsize=(10,8))

# NOTE(review): warnings were already silenced in the import cell; this call repeats it.
warnings.filterwarnings("ignore")

## 데이터 로딩 및 가공

In [4]:
# Load the TMDB movies CSV into a DataFrame.
# NOTE(review): absolute '/content/...' path; adjust it when running outside that environment.
movie = pd.read_csv('/content/tmdb_5000_movies.csv')

In [9]:
movie[:2]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965000.0,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800.0
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000.0,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500.0


In [11]:
movie.shape

(2435, 20)

In [12]:
movie.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [13]:
column_list = ['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']

In [15]:
# Take an independent copy of the selected columns. Later cells assign new
# columns into movie_df; without .copy() those writes go to a slice of
# `movie` (chained assignment / SettingWithCopyWarning).
movie_df = movie[column_list].copy()

In [16]:
pd.set_option('max_colwidth', 80)

In [17]:
movie_df[['genres','keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id..."


In [18]:
# Reset the column-width display option to its default
pd.reset_option("max_colwidth")

In [19]:
movie_df['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [20]:
from ast import literal_eval

In [21]:
movie_df['genres'].apply(literal_eval)[0]

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [22]:
# Parse the string columns into Python objects: each cell becomes a list of dicts.
movie_df['genres'] = movie_df['genres'].map(literal_eval)
movie_df['keywords'] = movie_df['keywords'].map(literal_eval)

In [23]:
# Keep only the 'name' value of every dict in each list.
movie_df['genres'] = movie_df['genres'].map(
    lambda entries: [entry['name'] for entry in entries]
)
movie_df['keywords'] = movie_df['keywords'].map(
    lambda entries: [entry['name'] for entry in entries]
)

In [24]:
movie_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."


## 장르 콘텐츠 유사도 측정

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# Turn each genre list into a single space-separated string.
movie_df['genres_literal'] = movie_df['genres'].map(' '.join)

In [27]:
# Count-vectorize the space-joined genre strings using unigrams and bigrams.
# min_df=1 keeps every term, exactly as min_df=0 did, but an integer min_df
# below 1 is rejected by parameter validation in scikit-learn >= 1.2.
count_vect = CountVectorizer(min_df=1, ngram_range=(1,2))
genre_mat = count_vect.fit_transform(movie_df['genres_literal'])

In [28]:
genre_mat.shape

(2435, 229)

### 리스트 객체 문자열로 변경 후 Count 피처 벡터화 적용

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# Pairwise cosine similarity between all rows of the genre count matrix
# (with a single argument, the matrix is compared against itself).
genre_sim = cosine_similarity(genre_mat)
genre_sim[0]

array([1.        , 0.59628479, 0.4472136 , ..., 0.        , 0.75592895,
       0.        ])

### 특정 영화와 장르 유사도가 높은 영화의 정보를 얻기 위한 함수 생성

In [31]:
def find_sim_movie(df, sim_matrix, title_name, top_n=10):
    """Return the ``top_n`` rows of ``df`` most similar to ``title_name``.

    Parameters
    ----------
    df : DataFrame with a 'title' column. Row i of ``df`` must correspond to
        row/column i of ``sim_matrix``.
    sim_matrix : square array-like of pairwise similarities.
    title_name : title to look up in ``df['title']``. If it occurs more than
        once, the first occurrence is used.
    top_n : number of rows to return.

    Returns
    -------
    DataFrame
        Rows of ``df`` sorted by descending similarity, with an added
        'similarity' column. The queried title itself is included.

    Raises
    ------
    ValueError
        If ``title_name`` does not occur in ``df['title']``.

    Notes
    -----
    ``df`` is not modified; the 'similarity' column exists only on the
    returned frame.
    """
    # Locate the title by row position, not index label: sim_matrix is
    # positional, so labels only work by accident on a default RangeIndex.
    positions = np.flatnonzero((df['title'] == title_name).to_numpy())
    if positions.size == 0:
        raise ValueError("title {!r} not found in df['title']".format(title_name))
    title_pos = positions[0]

    # Attach the similarity row to a copy so the caller's frame is untouched.
    scored = df.assign(similarity=np.asarray(sim_matrix[title_pos]).ravel())

    # Sort by similarity (descending) and keep the top rows.
    ranked = scored.sort_values(by="similarity", ascending=False)
    return ranked.iloc[:top_n]

In [34]:
# Top 10 movies by genre similarity to Spider-Man 3
similar_movies = find_sim_movie(movie_df, genre_sim, 'Spider-Man 3', 10)
similar_movies[['title', 'vote_average', "similarity"]]

Unnamed: 0,title,vote_average,similarity
1438,Krull,5.8,1.0
5,Spider-Man 3,5.9,1.0
486,The Last Witch Hunter,5.7,1.0
241,Teenage Mutant Ninja Turtles: Out of the Shadows,5.8,0.845154
292,Eragon,4.9,0.845154
1,Pirates of the Caribbean: At World's End,6.9,0.8
38,The Amazing Spider-Man 2,6.5,0.8
30,Spider-Man 2,6.7,0.8
2390,Red Sonja,5.0,0.8
836,The Forbidden Kingdom,6.3,0.8


In [35]:
# Sort by raw average rating (descending) and show the first 10 rows
movie_df[['title','vote_average','vote_count']].sort_values('vote_average', ascending=False)[:10]

Unnamed: 0,title,vote_average,vote_count
2386,One Man's Hero,9.3,2.0
1881,The Shawshank Redemption,8.5,8205.0
1818,Schindler's List,8.3,4329.0
2294,Spirited Away,8.3,3840.0
662,Fight Club,8.3,9413.0
2170,Psycho,8.2,2320.0
1990,The Empire Strikes Back,8.2,5879.0
65,The Dark Knight,8.2,12002.0
1663,Once Upon a Time in America,8.2,1069.0
1847,GoodFellas,8.2,3128.0


### 평가 횟수와 평점 모두 고려한 가중 평균 함수 생성

In [36]:
# Module-level parameters read by weighted_vote_average.
percentile = 0.6
# m: the vote_count value at the chosen quantile (0.6) of movie_df
m = movie_df['vote_count'].quantile(percentile)
# C: mean of vote_average over all rows of movie_df
C = movie_df['vote_average'].mean()

In [37]:
def weighted_vote_average(record):
    """Blend a record's own rating with the overall mean rating.

    Reads 'vote_count' and 'vote_average' from ``record`` and the
    module-level ``m`` and ``C``. The record's rating is weighted by
    v / (v + m) and ``C`` by m / (m + v), where v is the vote count.
    """
    votes = record['vote_count']
    rating = record['vote_average']

    own_weight = votes / (votes + m)
    mean_weight = m / (m + votes)

    return (own_weight * rating) + (mean_weight * C)

# Evaluate the weighted rating row by row and store it in a new 'weighted_vote' column.
movie_df['weighted_vote'] = movie_df.apply(weighted_vote_average, axis=1)

In [38]:
# Show the top 10 movies ranked by weighted rating
temp = movie_df[['title','vote_average','vote_count','weighted_vote']]
temp.sort_values('weighted_vote', ascending=False)[:10]

Unnamed: 0,title,vote_average,vote_count,weighted_vote
1881,The Shawshank Redemption,8.5,8205.0,8.312591
662,Fight Club,8.3,9413.0,8.149092
65,The Dark Knight,8.2,12002.0,8.085447
809,Forrest Gump,8.2,7927.0,8.031442
96,Inception,8.1,13752.0,8.004256
1818,Schindler's List,8.3,4329.0,7.997043
95,Interstellar,8.1,10867.0,7.98041
1990,The Empire Strikes Back,8.2,5879.0,7.979104
2294,Spirited Away,8.3,3840.0,7.964531
329,The Lord of the Rings: The Return of the King,8.1,8064.0,7.942235


### 장르 유사도와 가중 평점을 모두 고려한 영화 추천 함수 생성

In [39]:
def find_sim_movie(df, sim_matrix, title_name, top_n=10):
    """Recommend the ``top_n`` rows of ``df`` most similar to ``title_name``.

    Candidates are ranked by similarity first and by 'weighted_vote' second
    (both descending). The queried row itself is excluded.

    Parameters
    ----------
    df : DataFrame with 'title' and 'weighted_vote' columns. Row i of ``df``
        must correspond to row/column i of ``sim_matrix``.
    sim_matrix : square array-like of pairwise similarities.
    title_name : title to look up in ``df['title']``. If it occurs more than
        once, the first occurrence is used and only that row is excluded.
    top_n : number of rows to return.

    Returns
    -------
    DataFrame
        The selected rows of ``df`` with an added 'similarity' column.

    Raises
    ------
    ValueError
        If ``title_name`` does not occur in ``df['title']``.

    Notes
    -----
    ``df`` is not modified; the 'similarity' column exists only on the
    returned frame.
    """
    # Locate the title by row position, not index label: sim_matrix is
    # positional, so labels only work by accident on a default RangeIndex.
    positions = np.flatnonzero((df['title'] == title_name).to_numpy())
    if positions.size == 0:
        raise ValueError("title {!r} not found in df['title']".format(title_name))
    title_pos = positions[0]

    # Attach the similarity row to a copy so the caller's frame is untouched.
    scored = df.assign(similarity=np.asarray(sim_matrix[title_pos]).ravel())

    # Exclude the queried row by position before ranking.
    keep = np.ones(len(scored), dtype=bool)
    keep[title_pos] = False
    candidates = scored[keep]

    # Rank by similarity, breaking ties with the weighted rating.
    ranked = candidates.sort_values(by=["similarity", "weighted_vote"], ascending=False)
    return ranked.iloc[:top_n]

In [40]:
# Top 10 for Spider-Man 3 again, now with the redefined find_sim_movie
# (ranked by similarity, then weighted_vote; the movie itself is excluded).
similar_movies = find_sim_movie(movie_df, genre_sim, 'Spider-Man 3', 10)

In [41]:
similar_movies[['title', 'vote_average', "weighted_vote", "similarity"]]

Unnamed: 0,title,vote_average,weighted_vote,similarity
1438,Krull,5.8,6.111347,1.0
486,The Last Witch Hunter,5.7,5.864806,1.0
241,Teenage Mutant Ninja Turtles: Out of the Shadows,5.8,5.956767,0.845154
292,Eragon,4.9,5.439528,0.845154
329,The Lord of the Rings: The Return of the King,8.1,7.942235,0.8
262,The Lord of the Rings: The Fellowship of the Ring,8.0,7.860576,0.8
330,The Lord of the Rings: The Two Towers,8.0,7.839876,0.8
199,Pirates of the Caribbean: The Curse of the Bla...,7.5,7.375974,0.8
19,The Hobbit: The Battle of the Five Armies,7.1,6.977939,0.8
98,The Hobbit: An Unexpected Journey,7.0,6.933788,0.8
