In [53]:
import pandas as pd # 데이터 핸들링 
import numpy as np 
import warnings
import json
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings("ignore")

In [54]:
ratings = pd.read_csv("./data/ratings.csv")

In [55]:
movie = pd.read_csv("./data/movies.csv")

In [56]:
movies= pd.merge(ratings.groupby('movieId', as_index=False)[['rating']].mean(), movie, on= 'movieId', how= 'inner')

In [57]:
movies

Unnamed: 0,movieId,rating,title,genres
0,1,3.920930,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,3.431818,Jumanji (1995),Adventure|Children|Fantasy
2,3,3.259615,Grumpier Old Men (1995),Comedy|Romance
3,4,2.357143,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,3.071429,Father of the Bride Part II (1995),Comedy
...,...,...,...,...
9719,193581,4.000000,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9720,193583,3.500000,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9721,193585,3.500000,Flint (2017),Drama
9722,193587,3.500000,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [58]:
# 파이썬 객체 변환을 위한 함수 선언 후 적용

def transform(a):
    return a.split('|') 

movies['genres']= movies['genres'].apply(transform)

In [59]:
movies['genres']

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9719                 [Action, Animation, Comedy, Fantasy]
9720                         [Animation, Comedy, Fantasy]
9721                                              [Drama]
9722                                  [Action, Animation]
9723                                             [Comedy]
Name: genres, Length: 9724, dtype: object

In [60]:
# 객체 변환한 movies['genres']를 movies_literal 로 저장

movies['genres_literal']= movies['genres'].apply(lambda x : (' ').join(x))

In [61]:
# 코사인 유사도 계산을 위해 데이터 피쳐의 벡터화

count_vect= CountVectorizer(min_df= 0)
genre_mat= count_vect.fit_transform(movies['genres_literal'])

In [62]:
genre_sim= cosine_similarity(genre_mat, genre_mat) # 저장시켰던 것을 불러올 수 있도록
genre_sim

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

In [75]:
genre_sim 데이터를 컬럼과 인덱스를 title 로 하는 데이터프레임으로 데이터 저장

data= pd.DataFrame(data= genre_sim, columns= movies.title, index= movies.title)

In [76]:
genre_sim= data.values

In [77]:
genre_sim

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

In [65]:
# 유사도가 높은 순서로 정렬 된 비교행 위치 인덱스 값 구한 후 저장

genre_sim_sorted_ind= genre_sim.argsort()[:, ::-1]

In [66]:
print(genre_sim_sorted_ind[:1]) # 0번 레코드를 제외하면 앞에 두개는 유사도 높은순, 뒤에 두개는 유사도 낮은순

[[   0 1705 6177 ... 3498 3499 4861]]


In [67]:
# 장르가 유사한 영화 추천 함수 선언

def REC(df, sorted_ind, title_name, top_n= 10):
    
    title_movie= df[df['title']== title_name] # title 해당 데이터 추출
    
    # title_name 을 가진 데이터프레임의 index 객체를 ndarray로 반환 / sorted_ind 인자로 입력 된 genre_sim_sorted_ind 객체에서 유사도 순 추출
    
    title_index= title_movie.index.values
    similar_indexes= sorted_ind[title_index, :(top_n)]
    
    #  top_n index 는 2차원 데이터 ㅡ> 1차원 array로 변경
    
    print(similar_indexes)
    similar_indexes= similar_indexes.reshape(-1)
    
    # index 에 해당되는 데이터프레임 반환
    
    return df.iloc[similar_indexes]

In [68]:
# 영화 선택에 따른 유사 영화 추천

similar_movies= REC(movies, genre_sim_sorted_ind, 'Jumanji (1995)', 10)
similar_movies[['title', 'rating', 'genres']]

[[6372 6734   53 9318 1616 8623 8782 1798 6058 1617]]


Unnamed: 0,title,rating,genres
6372,Bridge to Terabithia (2007),2.777778,"[Adventure, Children, Fantasy]"
6734,"Chronicles of Narnia: Prince Caspian, The (2008)",3.541667,"[Adventure, Children, Fantasy]"
53,"Indian in the Cupboard, The (1995)",3.235294,"[Adventure, Children, Fantasy]"
9318,Pete's Dragon (2016),3.0,"[Adventure, Children, Fantasy]"
1616,"NeverEnding Story, The (1984)",3.581395,"[Adventure, Children, Fantasy]"
8623,Seventh Son (2014),2.25,"[Adventure, Children, Fantasy]"
8782,Pan (2015),2.5,"[Adventure, Children, Fantasy]"
1798,Santa Claus: The Movie (1985),2.25,"[Adventure, Children, Fantasy]"
6058,"Chronicles of Narnia: The Lion, the Witch and ...",3.443548,"[Adventure, Children, Fantasy]"
1617,"NeverEnding Story II: The Next Chapter, The (1...",2.5,"[Adventure, Children, Fantasy]"


In [69]:
usr_input = 'Jumanji (1995)' #사용자 데이터
genre_sim_sorted_ind = genre_sim_sorted_ind= genre_sim.argsort()[:, ::-1]
# usr_input 의 index값을 먼저 추출 후 movies 타이틀의 인덱스가 함수로 넘어가야 함
similar_index= REC(movies, genre_sim_sorted_ind, 'Jumanji (1995)', 10)
# db에서 위 인덱스에 해당하는 영화정보 추출해 와야함

[[6372 6734   53 9318 1616 8623 8782 1798 6058 1617]]


In [80]:
movies

Unnamed: 0,movieId,rating,title,genres,genres_literal
0,1,3.920930,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",Adventure Animation Children Comedy Fantasy
1,2,3.431818,Jumanji (1995),"[Adventure, Children, Fantasy]",Adventure Children Fantasy
2,3,3.259615,Grumpier Old Men (1995),"[Comedy, Romance]",Comedy Romance
3,4,2.357143,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",Comedy Drama Romance
4,5,3.071429,Father of the Bride Part II (1995),[Comedy],Comedy
...,...,...,...,...,...
9719,193581,4.000000,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]",Action Animation Comedy Fantasy
9720,193583,3.500000,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]",Animation Comedy Fantasy
9721,193585,3.500000,Flint (2017),[Drama],Drama
9722,193587,3.500000,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]",Action Animation


In [79]:
movies.to_csv('./data/movies_df.csv')

In [72]:
# indices 를 title 를 인덱스로 하는 movie 인덱스 값을 시리즈로 저장

indices= pd.Series(movies.index, index= movies['title'])
indices.head()

title
Toy Story (1995)                      0
Jumanji (1995)                        1
Grumpier Old Men (1995)               2
Waiting to Exhale (1995)              3
Father of the Bride Part II (1995)    4
dtype: int64

In [83]:
idx= indices['Toy Story (1995)']
print(idx)

0


In [73]:
# 장르가 유사한 영화 추천 함수 선언

def REC(title, genre_sim= genre_sim):
    choice= []
    
    # 앞서 저장한 시리즈 값의 title 을 idx 로 저장
    idx= indices[title]
    
    # 유사도 계산 된 값을 리스트로 반환
    sim_scores= list(enumerate(genre_sim[idx]))
    
    # 반환 된 리스트 값을 유사도 높은 순으로 정렬
    sim_scores= sorted(sim_scores, key= lambda x: x[1], reverse= True)
    
    # 높은 순으로 정렬 된 유사도중 높은 것 10개 추출
    sim_scores= sim_scores[1:11]
    
    # 추출 된 높은 유사도를 가진 10개의 영화를 movie_indices 에 저장
    movie_indices= [i[0] for i in sim_scores]
    
    for i in range(10):
        choice.append(movies['title'][movie_indices[i]])
        
    print('**영화 추천 목록**')
    for i in range(10):
        print(str(i+1) + choice[i])

In [84]:
REC('Jumanji (1995)')

**영화 추천 목록**
1Indian in the Cupboard, The (1995)
2NeverEnding Story III, The (1994)
3Escape to Witch Mountain (1975)
4Darby O'Gill and the Little People (1959)
5Return to Oz (1985)
6NeverEnding Story, The (1984)
7NeverEnding Story II: The Next Chapter, The (1990)
8Santa Claus: The Movie (1985)
9Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
10Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)
