In [33]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides deprecation warnings raised
# by the libraries used below - consider narrowing it to specific categories.
import warnings;
warnings.filterwarnings('ignore')

In [34]:
# Read the movie catalogue and the user ratings from CSV files located in the
# current working directory (movies first, ratings second).
df_movies, df_ratings = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

In [35]:
# Preview the first rows of the movies table.
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [36]:
# Preview the first rows of the ratings table.
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [37]:
# Attach each rating to the metadata of the movie it refers to.
# The join key and join type are spelled out so the merge cannot silently start
# joining on some other column the two frames might come to share.
df = pd.merge(df_movies, df_ratings, on='movieId', how='inner')

In [38]:
# Preview the first rows of the merged movies/ratings table.
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5,1141415820
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1439472215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0,1573944252
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,858625949
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,890492517


In [39]:
# (rows, columns) of the merged table.
df.shape

(25000095, 6)

In [40]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [41]:
# Store movie IDs as strings; the lookups further down (e.g. watch_dict['307'])
# address movies by string keys.
movie_ids_as_text = df['movieId'].astype(str)
df['movieId'] = movie_ids_as_text

In [42]:
# Distinct user IDs, in order of first appearance, as a plain Python list.
users = df["userId"].drop_duplicates().tolist()
len(users)

162541

In [43]:
# Seed the RNG so the user split is the same on every Restart & Run All
# (14 is the same seed value the Word2Vec model below is given).
# Note: `users` is shuffled in place, so re-running only this cell shuffles an
# already-shuffled list; re-run the cell that builds `users` first.
random.seed(14)
random.shuffle(users)

# keep the first 90% of the shuffled user IDs for training
n_train_users = round(0.9 * len(users))
users_train = users[:n_train_users]

# split data into train and validation set by user; the membership mask is
# computed once and reused so the full frame is scanned a single time
in_train = df['userId'].isin(users_train)
train_df = df[in_train]
validation_df = df[~in_train]

In [44]:
#list to capture watch history of the users
watch_train = []

# populate the list with the movie ID
for i in tqdm(users_train):
    temp = train_df[train_df["userId"] == i]["movieId"].tolist()
    watch_train.append(temp)

100%|████████████████████████████████████████████████████████████████████████| 146287/146287 [2:21:01<00:00, 17.29it/s]


In [None]:
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, 
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(watch_train, progress_per=200)

model.train(watch_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

In [14]:
# Stack the embedding of every vocabulary entry into one 2-D array.
# Vectors are read through model.wv: indexing the Word2Vec object directly and
# the wv.vocab mapping only exist in gensim 3.x, while gensim 4 exposes the
# vocabulary keys as wv.index_to_key. On gensim 3.x the key order is the same
# as before (iteration order of wv.vocab).
word_vectors = model.wv
if hasattr(word_vectors, "index_to_key"):
    vocab_keys = word_vectors.index_to_key
else:
    vocab_keys = list(word_vectors.vocab)
X = word_vectors[vocab_keys]
X.shape

(30099, 100)

In [15]:
# One (movieId, title) row per movie seen in the training data.
# drop_duplicates is chained onto the column selection so it returns a fresh
# frame, rather than mutating a slice of train_df in place.
watch = train_df[["movieId", "title"]].drop_duplicates(subset='movieId', keep="last")

# create movie id -> [title] dictionary (each value is a one-element list)
watch_dict = watch.groupby('movieId')['title'].apply(list).to_dict()

In [32]:
# Spot-check the title lookup for movie ID '307'.
print(watch_dict['307'])

['Three Colors: Blue (Trois couleurs: Bleu) (1993)']


In [17]:
def similar_watch(v, n = 6):
    """Return the titles and similarity scores of the movies closest to ``v``.

    Relies on two notebook-level globals: ``model`` (the trained Word2Vec
    model) and ``watch_dict`` (movie ID -> one-element list holding the title).

    Parameters
    ----------
    v : vector
        Embedding to search around, typically the vector of a movie ID.
    n : int, default 6
        Number of neighbours to return.

    Returns
    -------
    list of (title, similarity) tuples, in the order the model returned them.

    Raises
    ------
    KeyError
        If a returned movie ID has no entry in ``watch_dict``.

    Notes
    -----
    ``n + 1`` neighbours are requested and the top hit is discarded. This
    assumes ``v`` is the vector of a movie in the vocabulary, so that the top
    hit is that movie itself.
    """
    # Query through model.wv: the similarity helpers live on the KeyedVectors
    # object, and the shortcuts on the Word2Vec object were removed in gensim 4.
    neighbours = model.wv.similar_by_vector(v, topn=n + 1)[1:]

    # translate each movie ID into its title, keeping the similarity score
    return [(watch_dict[movie_id][0], score) for movie_id, score in neighbours]

In [25]:
print(watch_dict['307'])
# Fetch the movie's vector through model.wv; indexing the Word2Vec object
# directly is not available in gensim 4.
similar_watch(model.wv['307'])

['Three Colors: Blue (Trois couleurs: Bleu) (1993)']


[('Three Colors: Red (Trois couleurs: Rouge) (1994)', 0.9889792799949646),
 ('Three Colors: White (Trzy kolory: Bialy) (1994)', 0.9852802753448486),
 ('Shallow Grave (1994)', 0.9403821229934692),
 ('Queen Margot (Reine Margot, La) (1994)', 0.8966005444526672),
 ('Once Were Warriors (1994)', 0.8932065963745117),
 ('To Live (Huozhe) (1994)', 0.8869592547416687)]

In [26]:
print(watch_dict['1'])
# Fetch the movie's vector through model.wv; indexing the Word2Vec object
# directly is not available in gensim 4.
similar_watch(model.wv['1'])

['Toy Story (1995)']


[('Balto (1995)', 0.7646825313568115),
 ('Jumanji (1995)', 0.7470300793647766),
 ('Clueless (1995)', 0.7298091650009155),
 ('Wings of Courage (1995)', 0.7201133966445923),
 ('Persuasion (1995)', 0.7152753472328186),
 ('Misérables, Les (1995)', 0.711568295955658)]

In [27]:
print(watch_dict['23'])
# Fetch the movie's vector through model.wv; indexing the Word2Vec object
# directly is not available in gensim 4.
similar_watch(model.wv['23'])

['Assassins (1995)']


[('Money Train (1995)', 0.9601371884346008),
 ('Cutthroat Island (1995)', 0.8917180299758911),
 ('Powder (1995)', 0.8563741445541382),
 ('Dead Presidents (1995)', 0.8489820957183838),
 ('Now and Then (1995)', 0.8438311815261841),
 ('Four Rooms (1995)', 0.8427832126617432)]