# 영화 추천 시스템

## 유사도를 이용한 추천 시스템

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Ask the user to upload a file through the Colab file picker.
from google.colab import files

uploaded = files.upload()
# The first key of the returned mapping is used as the CSV file name.
filename = list(uploaded)[0]

Saving movies_metadata.csv to movies_metadata.csv


In [3]:
# Load the uploaded CSV into a DataFrame and preview the first two rows.
movie = pd.read_csv(filepath_or_buffer=filename, low_memory=False)
movie.head(n=2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [5]:
# Keep only the two columns the recommender works with: title and overview.
df = movie.loc[:, ['title', 'overview']]
df.head(n=2)

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...


In [6]:
# Work with the first 20,000 rows only.
# .copy() detaches ``df`` from ``movie``: the following cells modify ``df``
# in place (dropna, adding a column), which on a slice triggers pandas'
# SettingWithCopyWarning (silenced here by warnings.filterwarnings('ignore')).
df = df.head(20000).copy()
df.shape

(20000, 2)

## 데이터 전처리

In [7]:
# Count the missing values in each column.
df.isna().sum()

title         2
overview    135
dtype: int64

In [8]:
# Remove, in place, every row that has a missing title or overview.
df.dropna(axis=0, how='any', inplace=True)
df.shape

(19863, 2)

In [9]:
# Renumber the rows 0..n-1 after dropna left gaps in the index, so that the
# index labels line up with row positions again.
df.reset_index(drop=True, inplace=True)
df.tail(5)

Unnamed: 0,title,overview
19858,Rebellion,Dissidents in a French colony attack a police ...
19859,Versailles,A young mother Nina and her son Enzo find them...
19860,Two in the Wave,An in-depth analysis of the relationship betwe...
19861,Lotte Reiniger: Homage to the Inventor of the ...,Follows the life and work of animator Lotte Re...
19862,"RKO Production 601: The Making of 'Kong, the E...","An in-depth look at the genesis, production, a..."


## 텍스트 전처리

In [10]:
# Remove punctuation: delete every character that is not an ASCII letter or
# a space.
# regex=True is required. Since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, in which case nothing would match
# and clean_doc would be an unchanged copy of overview.
df['clean_doc'] = df['overview'].str.replace(r'[^A-Za-z ]', '', regex=True)
df.head(2)

Unnamed: 0,title,overview,clean_doc
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Led by Woody Andys toys live happily in his ro...
1,Jumanji,When siblings Judy and Peter discover an encha...,When siblings Judy and Peter discover an encha...


## DTM 변환

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF document-term matrix from the raw overviews, ignoring
# scikit-learn's built-in English stop words.
tvect = TfidfVectorizer(stop_words='english')
tfidf_matrix = tvect.fit_transform(df['overview'])
tfidf_matrix.shape

(19863, 47487)

In [13]:
# Build a second TF-IDF matrix from the punctuation-stripped text.
# NOTE: this refits ``tvect``, so its learned vocabulary now describes
# clean_doc rather than overview.
tfidf_clean = tvect.fit_transform(df['clean_doc'])
tfidf_clean.shape

(19863, 54245)

## 영화의 타이틀과 인덱스를 가진 테이블 

In [14]:
# Lookup table: movie title -> row position in ``df``.
indices = pd.Series(df.index, index=df['title'])
# Keep only the first row for each title. Series.drop_duplicates() would
# compare the *values* (the row positions, which are all unique) and remove
# nothing, leaving duplicate titles that make ``indices[title]`` return a
# Series instead of a single position.
indices = indices[~indices.index.duplicated(keep='first')]
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

## 코사인 유사도를 통해 유사한 영화를 찾는 함수

In [16]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all documents, once for the raw-overview
# TF-IDF matrix and once for the cleaned-text one. TfidfVectorizer
# L2-normalises each row by default, so these dot products are the cosine
# similarities.
cosine_sim, cosine_clean = (
    linear_kernel(matrix, matrix) for matrix in (tfidf_matrix, tfidf_clean)
)

In [17]:
# Display the dimensions of the similarity matrix (one row and one column
# per document passed to linear_kernel).
cosine_sim.shape

(19863, 19863)

In [18]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to ``title``.

    Args:
        title: Movie title to look up in the module-level ``indices`` table.
        cosine_sim: Square pairwise-similarity matrix whose row ``i`` holds
            the similarity of row ``i`` of ``df`` to every other row.
            Defaults to the module-level ``cosine_sim`` as it existed when
            this function was defined.

    Returns:
        A pandas Series with up to 10 titles taken from ``df``, ordered from
        most to least similar. The queried movie itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Find the row position that belongs to the requested title.
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions;
    # fall back to the first occurrence instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Similarity of the requested movie to every movie.
    scores = np.asarray(cosine_sim[idx])

    # Rank all movies from most to least similar. The stable sort keeps ties
    # in ascending row order, matching sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')

    # Drop the requested movie explicitly rather than assuming it is ranked
    # first (another row with an identical score may precede it), then keep
    # the 10 best matches.
    movie_indices = ranked[ranked != idx][:10]

    # Return the titles of the selected movies.
    return df.title.iloc[movie_indices]

In [19]:
# Recommendations based on the raw (un-preprocessed) overview text.
get_recommendations(title='The Dark Knight Rises')

12447                            The Dark Knight
149                               Batman Forever
1314                              Batman Returns
15444                 Batman: Under the Red Hood
583                                       Batman
9203          Batman Beyond: Return of the Joker
17930                           Batman: Year One
19661    Batman: The Dark Knight Returns, Part 1
3077                Batman: Mask of the Phantasm
10092                              Batman Begins
Name: title, dtype: object

In [20]:
# Recommendations based on the preprocessed (punctuation-stripped) text.
get_recommendations(title='The Dark Knight Rises', cosine_sim=cosine_clean)

1314                              Batman Returns
12447                            The Dark Knight
15444                 Batman: Under the Red Hood
583                                       Batman
149                               Batman Forever
19661    Batman: The Dark Knight Returns, Part 1
3077                Batman: Mask of the Phantasm
9203          Batman Beyond: Return of the Joker
17930                           Batman: Year One
10092                              Batman Begins
Name: title, dtype: object