## 추천시스템

### 콘텐츠 기반 필터링
1. 콘텐츠에 대한 여러 텍스트 정보를 피처 벡터화 (컴퓨터가 계산할 수 있도록 수치 벡터로 변환)
2. 코사인 유사도로 콘텐츠별 유사도 계산
3. 콘텐츠 별로 가중 평점을 계산
4. 유사도가 높은 콘텐츠 중에 평점이 좋은 콘텐츠 순으로 추천

### 실습 : TMDB5000

In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

# Load the TMDB 5000 movie metadata from a CSV file in the working directory.
movies = pd.read_csv('./tmdb_5000_movies.csv')

# Print the (rows, columns) shape and preview the first three records.
print(movies.shape)
movies.head(3)

# Movie recommender system based on genre similarity

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


#### 장르 데이터 전처리

In [2]:
# literal_eval parses a string that contains a Python literal (here: the text
# of a list of dicts) into the corresponding Python object.
from ast import literal_eval

# Replace the stringified 'genres' and 'keywords' cells with real lists of dicts.
movies['genres'] = movies['genres'].apply(literal_eval)
movies['keywords'] = movies['keywords'].apply(literal_eval)
movies[['genres','keywords']]

Unnamed: 0,genres,keywords
0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 1463, 'name': 'culture clash'}, {'id':..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na..."
2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 470, 'name': 'spy'}, {'id': 818, 'name..."
3,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","[{'id': 849, 'name': 'dc comics'}, {'id': 853,..."
4,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
...,...,...
4798,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","[{'id': 5616, 'name': 'united states–mexico ba..."
4799,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",[]
4800,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 248, 'name': 'date'}, {'id': 699, 'nam..."
4801,[],[]


In [22]:
type(movies['genres'][0])

list

In [25]:
movies['genres'][0]

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [3]:
movies['genres'] = movies['genres'].apply(lambda x : [y['name'] for y in x])
movies['keywords'] = movies['keywords'].apply(lambda x : [y['name'] for y in x])

movies[['genres','keywords']]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."
...,...,...
4798,"[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms, pap..."
4799,"[Comedy, Romance]",[]
4800,"[Comedy, Drama, Romance, TV Movie]","[date, love at first sight, narration, investi..."
4801,[],[]


In [4]:
movies[['genres','keywords']].head(1)

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."


#### 장르 피처 벡터화 - 영화간 유사도 관계
- [`CountVectorizer`](https://datascienceschool.net/03%20machine%20learning/03.01.03%20Scikit-Learn%EC%9D%98%20%EB%AC%B8%EC%84%9C%20%EC%A0%84%EC%B2%98%EB%A6%AC%20%EA%B8%B0%EB%8A%A5.html "CountVectorizer")

- 콘텐츠기반필터링

In [5]:
# Vectorize the genre strings with a count-based bag of words so that movies
# can afterwards be compared with cosine similarity.

from sklearn.feature_extraction.text import CountVectorizer

# To apply CountVectorizer, turn each genre list into a single string whose
# words are separated by spaces.
movies['genres_literal'] = movies['genres'].apply(lambda x : (' ').join(x))
# min_df=1 keeps every term (each vocabulary term occurs in at least one
# document), which is what the previous min_df=0 intended. An integer min_df
# of 0 is rejected by the parameter validation of recent scikit-learn releases.
count_vect = CountVectorizer(min_df=1, ngram_range=(1,2))
genre_mat = count_vect.fit_transform(movies['genres_literal'])
print(genre_mat.shape)

# With unigrams plus bigrams the number of features grows to 276.

(4803, 276)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[:2])

#영화 간 장르 유사도를 코사인 유사도로 계산

(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]
 [0.59628479 1.         0.4        ... 0.         0.         0.        ]]


In [7]:
# Rank each movie's neighbours: argsort every row in ascending order, then
# reverse the columns so the highest similarities come first.
genre_sim_sorted_ind = np.argsort(genre_sim, axis=1)[:, ::-1]
print(genre_sim_sorted_ind[:1])

# Row positions of the movies most similar to the first movie, best match first.

[[   0 3494  813 ... 3038 3037 2401]]


#### 특정 영화와 장르 유사도가 높은 영화 추천하기 

In [8]:
# Return the movies whose genres are most similar to a given movie.

def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the rows of ``df`` ranked most similar to ``title_name``.

    Args:
        df: DataFrame with a 'title' column. Its row positions must line up
            with the rows of ``sorted_ind``.
        sorted_ind: 2-D integer array; row ``i`` lists row positions ordered
            from most to least similar to row ``i``.
        title_name: Exact title to look up in ``df['title']``.
        top_n: Number of ranked positions to take for each matching row.

    Returns:
        DataFrame holding the selected rows in ranking order. The reference
        movie itself is not filtered out. Empty when no title matches.
    """
    # Locate the matching rows by position, not by index label: both
    # ``sorted_ind`` and ``df.iloc`` are positional, so labels only work when
    # the DataFrame happens to carry a default 0..n-1 index.
    title_index = (df['title'] == title_name).values.nonzero()[0]

    # One row of ranked positions per matching title, cut to the best top_n.
    similar_indexes = sorted_ind[title_index, :top_n]

    # The selection is 2-D; print it, then flatten it to 1-D so it can be
    # used as a positional indexer.
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)

    return df.iloc[similar_indexes]

In [9]:
similar_movies = find_sim_movie(movies,genre_sim_sorted_ind,'The Godfather',10)
similar_movies[['title','vote_average']]

# [[영화 갓파더와 장르 유사도가 높은 순서]]
# 문제점: 평가횟수가 낮은 영화도 같이 나옴 -> 평가횟수도 고려한 추천시스템 필요 

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


#### 가중평점 반영한 영화 추천


`가중평점`(Weighted Rating):  
$$ \frac{v}{v+m}*R + \frac{m}{v+m}*C $$

- v : 영화별 평점을 투표한 횟수(vote_count) -> 투표횟수가 많은 영화에 가중치 부여
- m : 평점을 부여하기 위한 최소 투표 횟수 -> 여기서는 vote_count의 60번째 백분위수(quantile(0.6), 즉 투표수 상위 40% 경계값)
- R : 개별 영화에 대한 평균 평점(vote_average)
- C : 전체 영화에 대한 평균 평점(movies['vote_average'].mean())
- C, m 은 고정값
- v, R 은 영화마다 변동값

In [38]:
# C is the mean of vote_average over all movies.
# m is the minimum vote count: the 60th percentile of vote_count, i.e. the
# value below which 60% of the movies fall.
C = movies['vote_average'].mean()
m = movies['vote_count'].quantile(0.6)

In [39]:
# C: mean rating over all movies (about 6.09)
# m: minimum vote count used as the weighting threshold (about 370, the 60th
#    percentile of vote_count)
print('C:',round(C,3), 'm:',round(m,3))

C: 6.092 m: 370.2


#### 가중평점 계산하는 함수

In [40]:
def weighted_vote_average(record):
    """Compute the weighted rating of a single movie record.

    The movie's own mean rating and the global mean rating are blended in
    proportion to the movie's vote count and the minimum vote count.

    Args:
        record: Mapping or row exposing 'vote_count' and 'vote_average'.

    Returns:
        (v / (v + m)) * R + (m / (v + m)) * C, where ``m`` and ``C`` are read
        from module-level variables.
    """
    votes = record['vote_count']
    rating = record['vote_average']
    denominator = votes + m

    own_part = (votes / denominator) * rating
    global_part = (m / denominator) * C
    return own_part + global_part  # weighted average

In [41]:
movies['weighted_vote'] = movies.apply(weighted_vote_average,axis=1)

In [44]:
movies['weighted_vote'].head()

0    7.166301
1    6.838594
2    6.284091
3    7.541095
4    6.098838
Name: weighted_vote, dtype: float64

### 추천ver2 : 장르 유사성 높은 영화 20개 먼저 선정 후, 가중평점순 10개 선정
- 콘텐츠 기반 필터링 (장르 유사도 + 가중평점; 사용자별 평점 데이터는 사용하지 않으므로 협업 필터링이 아님)

In [48]:
def find_sim_movie_ver2(df, sorted_ind, title_name, top_n=10):
    """Recommend the best-rated movies among those most similar to a title.

    Takes the ``2 * top_n`` highest-ranked positions for the reference title,
    drops the reference movie itself, and returns the ``top_n`` candidates
    with the highest 'weighted_vote'.

    Args:
        df: DataFrame with 'title' and 'weighted_vote' columns. Its row
            positions must line up with the rows of ``sorted_ind``.
        sorted_ind: 2-D integer array; row ``i`` lists row positions ordered
            from most to least similar to row ``i``.
        title_name: Exact title to look up in ``df['title']``.
        top_n: Number of recommendations to return.

    Returns:
        DataFrame of at most ``top_n`` rows sorted by 'weighted_vote' in
        descending order. Empty when no title matches.
    """
    # Locate the reference rows by position, not by index label: both
    # ``sorted_ind`` and ``df.iloc`` are positional.
    title_index = (df['title'] == title_name).values.nonzero()[0]

    # Candidate pool: twice as many similar movies as will be returned.
    similar_indexes = sorted_ind[title_index, :(top_n * 2)].reshape(-1)

    # Drop the reference movie(s) and repeated candidates while keeping the
    # ranking order. A membership test is used instead of an element-wise
    # ``!=`` because the flattened candidates and the matches have different
    # lengths whenever the title matches more than one row.
    excluded = set(title_index.tolist())
    candidates = [
        idx for idx in dict.fromkeys(similar_indexes.tolist())
        if idx not in excluded
    ]

    # From the candidate pool keep the top_n rows by weighted rating.
    return df.iloc[candidates].sort_values('weighted_vote', ascending=False)[:top_n]

In [49]:
similar_movies = find_sim_movie_ver2(movies, genre_sim_sorted_ind,'The Godfather',10)
similar_movies[['title','vote_average','weighted_vote','genres','vote_count']]

Unnamed: 0,title,vote_average,weighted_vote,genres,vote_count
2731,The Godfather: Part II,8.3,8.079586,"[Drama, Crime]",3338
1847,GoodFellas,8.2,7.976937,"[Drama, Crime]",3128
3866,City of God,8.1,7.759693,"[Drama, Crime]",1814
1663,Once Upon a Time in America,8.2,7.657811,"[Drama, Crime]",1069
883,Catch Me If You Can,7.7,7.557097,"[Drama, Crime]",3795
281,American Gangster,7.4,7.141396,"[Drama, Crime]",1502
4041,This Is England,7.4,6.739664,"[Drama, Crime]",363
1149,American Hustle,6.8,6.717525,"[Drama, Crime]",2807
1243,Mean Streets,7.2,6.626569,"[Drama, Crime]",345
2839,Rounders,6.9,6.530427,"[Drama, Crime]",439


### 응용: Spider-Man 3  좋아하는 사람에게 영화 추천해주기
- 장르 : 확인해보기
- 장르 상위 20개 영화 뽑아보고
- 그 중 평가횟수를 반영한 가중평점 기준 상위 10 개 뽑아서 추천

In [50]:
similar_movies = find_sim_movie_ver2(movies,genre_sim_sorted_ind,'Spider-Man 3',10)
similar_movies[['title','vote_average','weighted_vote','genres','vote_count']]

Unnamed: 0,title,vote_average,weighted_vote,genres,vote_count
329,The Lord of the Rings: The Return of the King,8.1,8.011871,"[Adventure, Fantasy, Action]",8064
262,The Lord of the Rings: The Fellowship of the Ring,8.0,7.922175,"[Adventure, Fantasy, Action]",8705
330,The Lord of the Rings: The Two Towers,8.0,7.910111,"[Adventure, Fantasy, Action]",7487
19,The Hobbit: The Battle of the Five Armies,7.1,7.027274,"[Action, Adventure, Fantasy]",4760
98,The Hobbit: An Unexpected Journey,7.0,6.961224,"[Adventure, Fantasy, Action]",8297
126,Thor: The Dark World,6.8,6.748873,"[Action, Adventure, Fantasy]",4755
30,Spider-Man 2,6.7,6.652034,"[Action, Adventure, Fantasy]",4321
129,Thor,6.6,6.572735,"[Adventure, Fantasy, Action]",6525
20,The Amazing Spider-Man,6.5,6.478296,"[Action, Adventure, Fantasy]",6586
38,The Amazing Spider-Man 2,6.5,6.466812,"[Action, Adventure, Fantasy]",4179


In [65]:
def pirnt_recommand_movies(similar_movies_name):
    """Return the titles of the ten recommendations for the given movie title.

    Delegates to ``find_sim_movie_ver2`` using the module-level ``movies``
    DataFrame and ``genre_sim_sorted_ind`` ranking array.
    """
    recommended = find_sim_movie_ver2(
        movies, genre_sim_sorted_ind, similar_movies_name, 10
    )
    return recommended['title']

In [72]:
similar_movies_name = input('좋아하는 영화를 입력하세요:')
print('선택한 영화:',similar_movies_name)
pirnt_recommand_movies(similar_movies_name)

선택한 영화: Cars 2


66                                 Up
352                             Mulan
631                           Ice Age
181                     Kung Fu Panda
566                              Cars
514             Ice Age: The Meltdown
348    Ice Age: Dawn of the Dinosaurs
374                               Rio
179            Penguins of Madagascar
546                           Minions
Name: title, dtype: object

### 스스로 해보기
- 콘텐츠 기반 필터링

In [83]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

movie_df = pd.read_csv('./movies.csv')
rate_df = pd.read_csv('./ratings.csv')

print(movie_df.shape)
# print(rate_df.shape)

(62423, 3)


In [70]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [90]:
type(movie_df.genres[0])

list

In [77]:
movie_df['genres']

0        Adventure|Animation|Children|Comedy|Fantasy
1                         Adventure|Children|Fantasy
2                                     Comedy|Romance
3                               Comedy|Drama|Romance
4                                             Comedy
                            ...                     
62418                                          Drama
62419                                    Documentary
62420                                   Comedy|Drama
62421                             (no genres listed)
62422                         Action|Adventure|Drama
Name: genres, Length: 62423, dtype: object

In [86]:
movie_df['genres'] = movie_df['genres'].apply(lambda x : x.replace('|',','))
movie_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy"
1,2,Jumanji (1995),"Adventure,Children,Fantasy"
2,3,Grumpier Old Men (1995),"Comedy,Romance"
3,4,Waiting to Exhale (1995),"Comedy,Drama,Romance"
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),"Comedy,Drama"
62421,209169,A Girl Thing (2001),(no genres listed)


In [89]:
movie_df['genres'] = movie_df['genres'].apply(lambda x : x.split(','))
movie_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
62418,209157,We (2018),[Drama]
62419,209159,Window of the Soul (2001),[Documentary]
62420,209163,Bad Poems (2018),"[Comedy, Drama]"
62421,209169,A Girl Thing (2001),[(no genres listed)]


In [92]:
# Join each genre list into one space-separated string for CountVectorizer.
movie_df['genres_literal'] = movie_df['genres'].apply(lambda x : (' ').join(x))
# min_df=1 keeps every term (each vocabulary term occurs in at least one
# document), which is what the previous min_df=0 intended. An integer min_df
# of 0 is rejected by the parameter validation of recent scikit-learn releases.
count_vect = CountVectorizer(min_df=1, ngram_range=(1,2))
genre_mat = count_vect.fit_transform(movie_df['genres_literal'])

print(genre_mat.shape)


(62423, 193)


In [98]:
# Keep only the first 2,000 rows of the genre matrix. Anything derived from
# this slice covers row positions 0-1999 only, so movies beyond that range
# cannot be looked up in it.
# NOTE(review): presumably done to keep the pairwise similarity matrix small
# enough for memory — confirm.
genre_mat_min = genre_mat[:2000]

In [96]:
movie_df['genres_literal']

0        Adventure Animation Children Comedy Fantasy
1                         Adventure Children Fantasy
2                                     Comedy Romance
3                               Comedy Drama Romance
4                                             Comedy
                            ...                     
62418                                          Drama
62419                                    Documentary
62420                                   Comedy Drama
62421                             (no genres listed)
62422                         Action Adventure Drama
Name: genres_literal, Length: 62423, dtype: object

In [97]:
movie_df['genres']

0        [Adventure, Animation, Children, Comedy, Fantasy]
1                           [Adventure, Children, Fantasy]
2                                        [Comedy, Romance]
3                                 [Comedy, Drama, Romance]
4                                                 [Comedy]
                               ...                        
62418                                              [Drama]
62419                                        [Documentary]
62420                                      [Comedy, Drama]
62421                                 [(no genres listed)]
62422                           [Action, Adventure, Drama]
Name: genres, Length: 62423, dtype: object

In [99]:
genre_sim = cosine_similarity(genre_mat_min, genre_mat_min)
print(genre_sim.shape)
print(genre_sim[:2])

(2000, 2000)
[[1.         0.4472136  0.19245009 ... 0.19245009 0.50395263 0.2981424 ]
 [0.4472136  1.         0.         ... 0.25819889 0.50709255 0.2       ]]


In [100]:
genre_sim_sorted_ind = genre_sim.argsort()[:,::-1]
print(genre_sim_sorted_ind[:1])

[[   0  661 1510 ... 1114 1116  936]]


In [107]:
similar_movies_2 = find_sim_movie(movie_df,genre_sim_sorted_ind,'Toy Story (1995)',10)
similar_movies_2[['title','genres']]

[[   0  661 1510  694  580 1760 1944   55  240 1638]]


Unnamed: 0,title,genres
0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
661,Space Jam (1996),"[Adventure, Animation, Children, Comedy, Fanta..."
1510,Hercules (1997),"[Adventure, Animation, Children, Comedy, Musical]"
694,Oliver & Company (1988),"[Adventure, Animation, Children, Comedy, Musical]"
580,Aladdin (1992),"[Adventure, Animation, Children, Comedy, Musical]"
1760,"Borrowers, The (1997)","[Adventure, Children, Comedy, Fantasy]"
1944,"Black Cauldron, The (1985)","[Adventure, Animation, Children, Fantasy]"
55,Kids of the Round Table (1995),"[Adventure, Children, Comedy, Fantasy]"
240,Gordy (1995),"[Children, Comedy, Fantasy]"
1638,Flubber (1997),"[Children, Comedy, Fantasy]"


In [108]:
rate_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [111]:
type(rate_df.userId[0])

numpy.int64

In [113]:
rate_df.userId.describe()

count    2.500010e+07
mean     8.118928e+04
std      4.679172e+04
min      1.000000e+00
25%      4.051000e+04
50%      8.091400e+04
75%      1.215570e+05
max      1.625410e+05
Name: userId, dtype: float64

In [127]:
rate_df.userId.value_counts()

72315     32202
80974      9178
137293     8913
33844      7919
20055      7488
          ...  
52927        20
43931        20
27547        20
27546        20
39653        20
Name: userId, Length: 162541, dtype: int64

In [124]:
rate_df.userId.nunique()

162541