In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim # word2vec의 알고리즘을 호출
import warnings
warnings.filterwarnings('ignore')

# 1. Load movie data

In [14]:
# Read the per-user rating events (columns: userId, movieId, rating, timestamp).
path = '../input/recom/movies/ratings.csv'
movie = pd.read_csv(
    path,
    low_memory=False,  # parse the whole file at once so dtypes are inferred consistently
)
movie.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [15]:
# Order the rating events chronologically (oldest first), then renumber the
# rows so the index follows the new order.
movie = movie.sort_values('timestamp')  # ascending is the default direction
movie = movie.reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,383,21,3.0,789652009
1,383,47,5.0,789652009
2,383,1079,3.0,789652009
3,409,21,5.0,828212412
4,409,25,4.0,828212412


In [21]:
# Read the movie metadata table (one row per movie: id, titles, genres, ...).
path = '../input/recom/movies/movies_metadata.csv'
meta = pd.read_csv(
    path,
    low_memory=False,  # parse the whole file at once so dtypes are inferred consistently
)
meta.head(n=2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


# 2. Movie Id를 활용하여 데이터 합치기

In [23]:
# Attach each rated movie's original title by joining ratings to the metadata
# on a shared string key.
#
# NOTE(review): this matches ratings.movieId against the metadata `id` column.
# Confirm both columns use the same ID scheme; if the ratings carry a different
# catalogue's ids than the metadata, an id-mapping table is needed before this
# join or the attached titles will be wrong. TODO confirm.
meta = meta.rename(columns={'id': 'movieId'})

# Cast both keys to str so the join compares like with like.
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

# Keep exactly one title per key: a repeated id on the right-hand side of a
# left join would otherwise duplicate the matching rating rows.
titles = meta[['movieId', 'original_title']].drop_duplicates(subset='movieId', keep='first')

# Drop any title column left over from a previous run of this cell so that
# re-running it does not produce original_title_x / original_title_y.
movie = pd.merge(
    movie.drop(columns='original_title', errors='ignore'),
    titles,
    on='movieId',
    how='left',
)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,383,21,3.0,789652009,The Endless Summer
1,383,47,5.0,789652009,


In [24]:
# Discard ratings whose movieId found no match in the metadata (missing title).
has_title = movie['original_title'].notna()
movie = movie.loc[has_title].reset_index(drop=True)
movie.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,383,21,3.0,789652009,The Endless Summer
1,409,21,5.0,828212412,The Endless Summer


In [25]:
# Collect, for every user, the distinct titles they rated. The result is a
# frame indexed by userId with a single column named 'unique' holding an array
# of titles in order of first appearance in `movie`.
titles_per_user = movie.groupby(['userId'])['original_title']
agg = titles_per_user.agg(['unique'])
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Jay and Silent Bob Strike Back, Vivement dima..."
2,"[Terminator 3: Rise of the Machines, The Conve..."
3,"[300, The Killing, Shortbus, Finding Neverland..."
4,"[David, The Wedding Planner, Casablanca, Sleep..."
5,"[Gleaming the Cube, Cool Hand Luke, Hidalgo, U..."


In [27]:
# Peek at the distinct titles that remain after the join and the null filter.
pd.unique(movie['original_title'])

array(['The Endless Summer', 'Jarhead', '彼女の想いで', ...,
       'The Lonedale Operator', 'Violeta se fue a los cielos',
       'To Kill a Priest'], dtype=object)

In [30]:
# Build the Word2Vec corpus: one "sentence" per user whose tokens are the
# titles that user rated. Every token is passed through str() so the model
# only ever sees string tokens (a no-op for values that are already strings).
sentence = [
    [str(title) for title in watched]
    for watched in agg['unique'].values
]
sentence[:2]

[['Jay and Silent Bob Strike Back',
  'Vivement dimanche!',
  'Rocky III',
  'American Pie',
  'My Tutor',
  'Greed'],
 ['Terminator 3: Rise of the Machines',
  'The Conversation',
  'The Hours',
  '48 Hrs.',
  'Back to the Future Part II',
  'Silent Hill',
  'Crustacés et coquillages',
  'Lost in Translation',
  'Night on Earth',
  "Dave Chappelle's Block Party",
  "Ocean's Eleven",
  'Sissi',
  'Live and Let Die',
  'A Clockwork Orange',
  'Солярис',
  'Sommer vorm Balkon',
  'La science des rêves',
  'Trois couleurs : Rouge',
  'Grbavica',
  'Czlowiek z zelaza',
  'Le Mépris',
  'Batman Returns',
  'Romeo + Juliet',
  'Monsoon Wedding',
  'Stand by Me',
  'Lucky Number Slevin',
  'Cat on a Hot Tin Roof',
  'The Dark',
  'The Devil Wears Prada',
  'Lili Marleen',
  'Star Trek IV: The Voyage Home',
  'A Nightmare on Elm Street',
  'Notting Hill',
  'Once Were Warriors',
  'Reservoir Dogs',
  '2001: A Space Odyssey',
  'Rebecca',
  'Psycho',
  'The Poseidon Adventure',
  'Batman Begins

# 3. Word2Vec 학습

In [34]:
# Train title embeddings on the per-user title lists built above in `sentence`.
from gensim.models import Word2Vec

# NOTE(review): with more than one worker thread the trained vectors may vary
# slightly between runs; if exact reproducibility matters, confirm whether
# workers=1 is required for this gensim version.
embedding_model = Word2Vec(
    sentence,
    vector_size=20,  # dimensionality of each title vector
    window=5,        # max distance between a title and its context titles
    min_count=1,     # keep every title, even ones that occur only once
    workers=4,       # number of training threads
    epochs=200,      # passes over the corpus
    sg=1,            # 1 = skip-gram, 0 = CBOW
)

In [35]:
# Show the ten titles whose learned vectors are most similar to the query
# title, as (title, similarity) pairs.
query_title = 'Spider-Man 2'
embedding_model.wv.most_similar(positive=[query_title], topn=10)

[('Snow Cake', 0.8173764944076538),
 ('Forrest Gump', 0.7496981620788574),
 ('Cop Land', 0.7461470365524292),
 ('Gandhi', 0.7373486161231995),
 ('Heavenly Creatures', 0.7373000979423523),
 ('Domicile Conjugal', 0.7340729236602783),
 ("L'Aile ou la Cuisse", 0.7210757732391357),
 ('Star Trek: Nemesis', 0.7198588848114014),
 ('Conquest of the Planet of the Apes', 0.7191023826599121),
 ('The Wedding Planner', 0.7180280685424805)]