In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the movie metadata CSV (path is relative to the notebook's working directory).
df = pd.read_csv('data/tmdb_5000_movies.csv')

In [5]:
# Preview the free-text 'overview' column.
df['overview']

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bond’s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4798    El Mariachi just wants to play his guitar and ...
4799    A newlywed couple's honeymoon is upended by th...
4800    "Signed, Sealed, Delivered" introduces a dedic...
4801    When ambitious New York attorney Sam is sent t...
4802    Ever since the second grade when he first saw ...
Name: overview, Length: 4803, dtype: object

In [6]:
# Is at least one overview missing? (`isna` is the pandas alias of `isnull`.)
df['overview'].isna().any()

np.True_

In [8]:
# Replace missing overviews with empty strings so the column contains no NaN values.
df['overview'] = df['overview'].fillna('')

In [9]:
# Sanity check: no missing overview should remain at this point.
df['overview'].isna().any()

np.False_

In [11]:
# TF-IDF vectoriser used to turn each overview into a weighted bag-of-words vector.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english') # drop uninformative English words such as "the" and "a"


In [13]:
# Learn the vocabulary from all overviews and encode each overview as one TF-IDF row.
tfidf_matrix = tfidf.fit_transform(df['overview'])
# (number of overviews, number of vocabulary terms)
tfidf_matrix.shape

(4803, 20978)

In [14]:
# Show the matrix repr (storage format, dtype, number of stored entries, shape).
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 125840 stored elements and shape (4803, 20978)>

In [17]:
# Pairwise similarity between every pair of overviews.
# NOTE: linear_kernel is a plain dot product; it equals cosine similarity here only
# because TfidfVectorizer L2-normalises each row by default (norm='l2' was not overridden).
from sklearn.metrics.pairwise import linear_kernel # used here to obtain cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]])

In [18]:
# One row and one column per movie: a square (n_movies, n_movies) matrix.
cosine_sim.shape


(4803, 4803)

In [36]:
# Find the index label of the first movie whose title matches exactly.
title = 'Iron Man'
idx = df[df['title']==title].index[0]  # raises IndexError when no row has this title
idx

np.int64(68)

In [37]:
# Preview the 'title' column used for lookups.
df['title']

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4798                                 El Mariachi
4799                                   Newlyweds
4800                   Signed, Sealed, Delivered
4801                            Shanghai Calling
4802                           My Date with Drew
Name: title, Length: 4803, dtype: object

In [38]:
# Pair every row position with its similarity to the selected movie.
sim_score = [(position, score) for position, score in enumerate(cosine_sim[idx])]
sim_score

[(0, np.float64(0.05604155333848265)),
 (1, np.float64(0.0)),
 (2, np.float64(0.0)),
 (3, np.float64(0.0)),
 (4, np.float64(0.0)),
 (5, np.float64(0.0)),
 (6, np.float64(0.0)),
 (7, np.float64(0.12607342868374588)),
 (8, np.float64(0.0)),
 (9, np.float64(0.0)),
 (10, np.float64(0.0)),
 (11, np.float64(0.0)),
 (12, np.float64(0.0)),
 (13, np.float64(0.0)),
 (14, np.float64(0.0)),
 (15, np.float64(0.016034505957118604)),
 (16, np.float64(0.0)),
 (17, np.float64(0.0)),
 (18, np.float64(0.0)),
 (19, np.float64(0.0)),
 (20, np.float64(0.0)),
 (21, np.float64(0.0)),
 (22, np.float64(0.0)),
 (23, np.float64(0.0)),
 (24, np.float64(0.0)),
 (25, np.float64(0.0)),
 (26, np.float64(0.0)),
 (27, np.float64(0.0)),
 (28, np.float64(0.0)),
 (29, np.float64(0.0)),
 (30, np.float64(0.019596547850930127)),
 (31, np.float64(0.15088998615324578)),
 (32, np.float64(0.0)),
 (33, np.float64(0.0)),
 (34, np.float64(0.0)),
 (35, np.float64(0.0)),
 (36, np.float64(0.0)),
 (37, np.float64(0.023583777495659763)),

In [39]:
# Rank the (position, score) pairs from most to least similar, then keep ranks 1..10;
# rank 0 is skipped because it is expected to be the selected movie itself.
sim_score = sorted(sim_score, key=lambda pair: pair[1], reverse=True)[1:11]

In [40]:
# Keep only the row positions of the selected movies; the scores are no longer needed.
movie_indices = [position for position, _score in sim_score]
movie_indices

[79, 31, 1868, 7, 538, 119, 1290, 4574, 2044, 2633]

In [42]:
# Map the selected row positions back to movie titles.
df['title'].iloc[movie_indices]

79                     Iron Man 2
31                     Iron Man 3
1868           Cradle 2 the Grave
7         Avengers: Age of Ultron
538                       Hostage
119                 Batman Begins
1290     Baahubali: The Beginning
4574                     Roadside
2044           The Little Vampire
2633    The Clan of the Cave Bear
Name: title, dtype: object

In [48]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies whose overviews are most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``. When several rows share the
        title, the first one is used.
    cosine_sim : array-like, optional
        Square pairwise-similarity matrix whose rows and columns follow the row
        order of ``df``. Defaults to the matrix computed earlier in the notebook.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` movies, most similar first. The queried
        movie itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Row *position* of the requested title: the similarity matrix is addressed
    # by position, not by index label.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"title not found in df['title']: {title!r}")
    idx = int(matches[0])

    # (position, similarity) pairs for every other movie. The query is removed by
    # position instead of by "drop rank 0": a movie is not guaranteed to rank
    # first against itself (e.g. an empty overview yields an all-zero row, and a
    # duplicated overview ties at the top).
    sim_score = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # Most similar first; sorted() is stable, so ties keep their original order.
    sim_score = sorted(sim_score, key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _score in sim_score[:top_n]]
    # Positions -> titles, hence .iloc rather than the label-based .loc.
    return df['title'].iloc[movie_indices]

In [50]:
# Try the helper on a known title; the result should match the step-by-step cells above.
get_recommendations('Iron Man')

79                     Iron Man 2
31                     Iron Man 3
1868           Cradle 2 the Grave
7         Avengers: Age of Ultron
538                       Hostage
119                 Batman Begins
1290     Baahubali: The Beginning
4574                     Roadside
2044           The Little Vampire
2633    The Clan of the Cave Bear
Name: title, dtype: object

In [53]:
# Independent copy holding just the 'id' and 'title' columns.
movies = df.loc[:, ['id', 'title']].copy()
movies

Unnamed: 0,id,title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter
...,...,...
4798,9367,El Mariachi
4799,72766,Newlyweds
4800,231617,"Signed, Sealed, Delivered"
4801,126186,Shanghai Calling


In [54]:
# Re-display the similarity matrix.
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]])

In [55]:
import pickle

In [56]:
# Persist the id/title table. The context manager guarantees the file is flushed
# and closed even if pickling fails (the bare open() call never closed its handle).
with open('data/movies.pickle', 'wb') as movies_file:
    pickle.dump(movies, movies_file)


In [57]:
# Persist the similarity matrix. The context manager guarantees the file is flushed
# and closed even if pickling fails (the bare open() call never closed its handle).
with open('data/cosine_sim.pickle', 'wb') as cosine_sim_file:
    pickle.dump(cosine_sim, cosine_sim_file)
