### 데이터 전처리

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### 데이터 불러오기

In [2]:
data = pd.read_csv(r'C:\Users\lg\Desktop\archive\movielens.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [4]:
data.shape

(45466, 24)

In [5]:
data = data[['id', 'genres','vote_average','vote_count','popularity','overview']]

vote_average 값의 변경이 필요함. (**불공정**하게 되어있다.)\
예를 들어, vote 수가 적은데 (3개) 3개 전부 5점이라고 하면 vote가 5점으로 되어 있기 때문이다.\
해당 이슈는  url : https://www.quora.com/How-does-IMDbs-rating-system-work 에서 확인할 수 있습니다.

**weighted rating (WR) = (v / (v+m)) x R + (m / (v+m)) x C**
- R : average for the movie (mean) = Rating / 개별 영화 평점
- v : number of votes for the movie = votes / 개별 영화에 평점을 투표한 횟수
- m : minimum votes required to be listed in the Top 250 (currently 25,000) / 250위 안에 들어야하는 최소 투표 (정하기 나름)
- C : the mean vote across the whole report / 전체 영화에 대한 평점

여기서 m은 500위로 가정하고 진행하겠습니다.

500위 정도로 들어오기 위해 vote_count가 상위 몇 %이어야 하는지 quantile을 이용해 구하겠습니다.

In [6]:
tmp_m = data['vote_count'].quantile(0.90)
tmp_m

160.0

In [7]:
tmp_data = data.copy().loc[data['vote_count'] >= tmp_m]
tmp_data.shape

(4555, 6)

상위 90% 했을 때 4800여개가 들어옵니다.\
98%로 했을 떄, 910개가 들어옵니다. 이로 가정하고 하겠습니다.

In [8]:
del tmp_data

m = data['vote_count'].quantile(0.98)
data = data.loc[data['vote_count'] >=m ]

In [9]:
data.head()

Unnamed: 0,id,genres,vote_average,vote_count,popularity,overview
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",7.7,5415.0,21.946943,"Led by Woody, Andy's toys live happily in his ..."
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0,17.015539,When siblings Judy and Peter discover an encha...
5,949,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",7.7,1886.0,17.924927,"Obsessive master thief, Neil McCauley leads a ..."
15,524,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",7.8,1343.0,10.137389,The life of the gambling paradise – Las Vegas ...
31,63,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",7.4,2470.0,12.297305,"In the year 2035, convict James Cole reluctant..."


In [10]:
C = data['vote_average'].mean()

In [11]:
print(C)
print(m)

6.821868131868129
1236.8199999999997


In [12]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    
    return ( v / (v+m)) + (m / (v+m) * C)

In [13]:
data['score'] = data.apply(weighted_rating, axis=1)

In [14]:
data.head()

Unnamed: 0,id,genres,vote_average,vote_count,popularity,overview,score
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",7.7,5415.0,21.946943,"Led by Woody, Andy's toys live happily in his ...",2.082501
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0,17.015539,When siblings Judy and Peter discover an encha...,2.972865
5,949,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",7.7,1886.0,17.924927,"Obsessive master thief, Neil McCauley leads a ...",3.305801
15,524,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",7.8,1343.0,10.137389,The life of the gambling paradise – Las Vegas ...,3.791126
31,63,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",7.4,2470.0,12.297305,"In the year 2035, convict James Cole reluctant...",2.942528


In [15]:
data.shape

(910, 7)

weighted_score이 완성되었습니다.\
또한, 지금 genres가 조금 독특한 구조를 가지고 있습니다.

In [16]:
data[['genres']].head()

Unnamed: 0,genres
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
5,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam..."
15,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name..."
31,"[{'id': 878, 'name': 'Science Fiction'}, {'id'..."


list 내부에 dictionary가 있는 구조입니다.\
이렇게 표현한 이유는 하나의 영화가 하나의 장르에만 속하지 않기 때문입니다.\
그리고 문제가 지금 내부에는 문자열로 들어가 있는 것입니다.

이를 해결하기 위해 ast 패키지를 이용하여 literal_eval을 사용하겠습니다.\
그러면 list와 dictionary 형태로 바뀌게 됩니다.

In [17]:
data['genres'] = data['genres'].apply(literal_eval)

In [18]:
data[['genres']].head()

Unnamed: 0,genres
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
5,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam..."
15,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name..."
31,"[{'id': 878, 'name': 'Science Fiction'}, {'id'..."


genres의 id를 제거한 후 name만 뽑아내면 됩니다.

In [19]:
data['genres'] = data['genres'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))

In [20]:
data.head()

Unnamed: 0,id,genres,vote_average,vote_count,popularity,overview,score
0,862,Animation Comedy Family,7.7,5415.0,21.946943,"Led by Woody, Andy's toys live happily in his ...",2.082501
1,8844,Adventure Fantasy Family,6.9,2413.0,17.015539,When siblings Judy and Peter discover an encha...,2.972865
5,949,Action Crime Drama Thriller,7.7,1886.0,17.924927,"Obsessive master thief, Neil McCauley leads a ...",3.305801
15,524,Drama Crime,7.8,1343.0,10.137389,The life of the gambling paradise – Las Vegas ...,3.791126
31,63,Science Fiction Thriller Mystery,7.4,2470.0,12.297305,"In the year 2035, convict James Cole reluctant...",2.942528


In [21]:
data.to_csv(r'C:\Users\lg\Desktop\archive\movielens.csv', index = False)

## content based filtering

In [2]:
movie_data = pd.read_csv(r'C:\Users\lg\Desktop\archive\movies_metadata.csv')
movie_data = movie_data.loc[movie_data['original_language'] == 'en', :]
movie_data = movie_data[['id', 'title', 'original_language','genres']]

print(movie_data.shape)
movie_data.head()

(32269, 4)


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,title,original_language,genres
0,862,Toy Story,en,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,Grumpier Old Men,en,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,Waiting to Exhale,en,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,Father of the Bride Part II,en,"[{'id': 35, 'name': 'Comedy'}]"


In [3]:
movie_keyword = pd.read_csv(r'C:\Users\lg\Desktop\archive\keywords.csv')
print(movie_keyword.shape)
movie_keyword.head()

(46419, 2)


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


데이터 2개를 가져와서 id를 맞춰 merge하겠습니다.

In [4]:
movie_data.id = movie_data.id.astype(int)
movie_keyword.id = movie_keyword.id.astype(int)
movie_data = pd.merge(movie_data,movie_keyword, on='id')
print(movie_data.shape)
movie_data

(32852, 5)


Unnamed: 0,id,title,original_language,genres,keywords
0,862,Toy Story,en,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,en,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,en,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,en,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...,...,...,...
32847,222848,Caged Heat 3000,en,"[{'id': 878, 'name': 'Science Fiction'}]",[]
32848,30840,Robin Hood,en,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",[]
32849,67758,Betrayal,en,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",[]
32850,227506,Satan Triumphant,en,[],[]


### 전처리
앞서 진행한 전처리 방법을 그대로 사용하겠습니다.\
데이터 내부엔 list(dict()) 형태로 구성되어 있습니다. 또한 이 안에는 **문자열** 형태로 들어가 있습니다.\
이를 처리하기 위해 ast의 literal_eval함수를 사용합니다.

In [5]:
movie_data['genres'] = movie_data['genres'].apply(literal_eval)
movie_data['genres'] = movie_data['genres'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))

In [6]:
movie_data['keywords'] = movie_data['keywords'].apply(literal_eval)
movie_data['keywords'] = movie_data['keywords'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))

In [7]:
movie_data.head()

Unnamed: 0,id,title,original_language,genres,keywords
0,862,Toy Story,en,Animation Comedy Family,jealousy toy boy friendship friends rivalry bo...
1,8844,Jumanji,en,Adventure Fantasy Family,board game disappearance based on children's b...
2,15602,Grumpier Old Men,en,Romance Comedy,fishing best friend duringcreditsstinger old men
3,31357,Waiting to Exhale,en,Comedy Drama Romance,based on novel interracial relationship single...
4,11862,Father of the Bride Part II,en,Comedy,baby midlife crisis confidence aging daughter ...


### TF-IDF vectorization
전처리한 데이터를 TF-IDF 방법을 이용해 벡터로 만들어줍니다.\
저는 genres와 keyword를 하나로 합친 후 tf-idf vector로 만들었습니다.

In [8]:
tfidf_vector = TfidfVectorizer()
#tfidf_vector = TfidfVectorizer(ngram_range=(1,2))

tfidf_matrix = tfidf_vector.fit_transform(movie_data['genres'] + " " + movie_data['keywords']).toarray()
#tfidf_matrix = tfidf_vector.fit_transform(movie_data['genres']).toarray()

tfidf_matrix_feature = tfidf_vector.get_feature_names()

In [9]:
tfidf_matrix.shape

(32852, 11437)

In [10]:
tfidf_matrix = pd.DataFrame(tfidf_matrix, columns = tfidf_matrix_feature, index = movie_data.title)
print(tfidf_matrix.shape)
tfidf_matrix.head()

(32852, 11437)


Unnamed: 0_level_0,077,10,11,13,1500s,15th,16th,17th,1812,18th,...,βάφτηκε,γη,κόκκινο,το,χώμα,миньоны,卧底肥妈,绝地奶霸,自然界大事件,超级妈妈
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 유사도 구하기
tf-idf vector를 코사인 유사도를 활용하여 유사도 값을 구합니다.\
이렇게 하면 영화 개수(n)만큼 nxn 의 matrix 형태가 나옵니다.

In [14]:
%%time
cosine_sim = cosine_similarity(tfidf_matrix)

MemoryError: Unable to allocate 8.04 GiB for an array with shape (32852, 32852) and data type float64

In [13]:
cosine_sim.shape

NameError: name 'cosine_sim' is not defined

In [None]:
cosine_sim_df = pd.DataFrame(cosine_sim, index = movie_data.title, columns = movie_data.title)
print(cosine_sim_df.shape)
cosine_sim_df.head()