In [33]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

In [34]:
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

In [35]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [36]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [37]:
df = pd.merge(df_movies,df_ratings)

In [38]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5,1141415820
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1439472215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0,1573944252
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,858625949
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,890492517


In [39]:
df.shape

(25000095, 6)

In [40]:
df.dropna(inplace=True)

In [41]:
df['movieId']= df['movieId'].astype(str)

In [42]:
users = df["userId"].unique().tolist()
len(users)

162541

In [43]:
random.shuffle(users)

# extract 90% of user ID's
users_train = [users[i] for i in range(round(0.9*len(users)))]

# split data into train and validation set
train_df = df[df['userId'].isin(users_train)]
validation_df = df[~df['userId'].isin(users_train)]

In [44]:
#list to capture watch history of the users
watch_train = []

# populate the list with the movie ID
for i in tqdm(users_train):
    temp = train_df[train_df["userId"] == i]["movieId"].tolist()
    watch_train.append(temp)

100%|████████████████████████████████████████████████████████████████████████| 146287/146287 [2:21:01<00:00, 17.29it/s]


In [65]:
watch_train

[['6',
  '47',
  '111',
  '150',
  '215',
  '318',
  '356',
  '527',
  '593',
  '608',
  '778',
  '805',
  '858',
  '1089',
  '1208',
  '1221',
  '1222',
  '1228',
  '1230',
  '1244',
  '1246',
  '1343',
  '1580',
  '1682',
  '1704',
  '2028',
  '2324',
  '2357',
  '2427',
  '2571',
  '2858',
  '2932',
  '3105',
  '3252',
  '3362',
  '3418',
  '3420',
  '3735',
  '3741',
  '4027',
  '4226',
  '4448',
  '4628',
  '4973',
  '4975',
  '4995',
  '5010',
  '5995',
  '6016',
  '6373',
  '6378',
  '6711',
  '6934',
  '7361',
  '8638',
  '8798',
  '8961',
  '30707',
  '41285',
  '44555',
  '46578',
  '48394',
  '55820',
  '58559',
  '64614',
  '69122',
  '69757',
  '72011',
  '80463',
  '81791',
  '86882',
  '86898',
  '88129',
  '90430',
  '91658',
  '95873',
  '97304',
  '100714',
  '104374',
  '104841',
  '106916',
  '109487',
  '112290',
  '115569',
  '116797',
  '134214',
  '139415',
  '139644',
  '142488',
  '149334',
  '158972',
  '159193',
  '160980',
  '164179',
  '164909',
  '168250'

In [45]:
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, 
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(watch_train, progress_per=200)

model.train(watch_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(223722668, 224802590)

In [46]:
X = model[model.wv.vocab]
X.shape

(31673, 100)

In [47]:
watch = train_df[["movieId", "title"]]

# remove duplicates
watch.drop_duplicates(inplace=True, subset='movieId', keep="last")

# create movie id and tittle dictionary
watch_dict = watch.groupby('movieId')['title'].apply(list).to_dict()

In [66]:
watch.head()

Unnamed: 0,movieId,title
57308,1,Toy Story (1995)
81536,2,Jumanji (1995)
93340,3,Grumpier Old Men (1995)
95863,4,Waiting to Exhale (1995)
107577,5,Father of the Bride Part II (1995)


In [63]:
watch_dict

{'1': ['Toy Story (1995)'],
 '10': ['GoldenEye (1995)'],
 '100': ['City Hall (1996)'],
 '1000': ['Curdled (1996)'],
 '100001': ['Comic, The (1969)'],
 '100003': ['Up in Smoke (1957)'],
 '100008': ['Flaw, The (2011)'],
 '100015': ['Chicago Massacre: Richard Speck (2007)'],
 '100017': ['Keep the Lights On (2012)'],
 '100032': ['Beauty Is Embarrassing (2012)'],
 '100034': ['Girl Model (2011)'],
 '100036': ['Crossfire Hurricane (2012)'],
 '100038': ['Middle of Nowhere (2012)'],
 '100040': ['True Blue (2001) '],
 '100042': ['Guns of Fort Petticoat, The (1957)'],
 '100044': ['Human Planet (2011)'],
 '100046': ['Madagascar (2011)'],
 '100048': ["Omar Killed Me (Omar m'a tuer) (2011)"],
 '100052': ['Red Hook Summer (2012)'],
 '100054': ['Stella Maris (1918)'],
 '100056': ['Die (2010)'],
 '100058': ["Patrice O'Neal: Elephant in the Room (2011)"],
 '100060': ['Sunny (Sseo-ni) (2011)'],
 '100062': ['My Way (Mai Wei) (2011)'],
 '100070': ['Punching the Clown (2009)'],
 '100072': ['Metsän tarina (2

In [64]:
print(watch_dict['10'])

['GoldenEye (1995)']


In [49]:
def similar_watch(v, n = 6):
    
    # extract most similar movies for the input vector
    ms = model.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar movies
    new_ms = []
    for j in ms:
        pair = (watch_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms        

In [50]:
print(watch_dict['307'])
similar_watch(model['307'])

['Three Colors: Blue (Trois couleurs: Bleu) (1993)']


[('Three Colors: Red (Trois couleurs: Rouge) (1994)', 0.9883521795272827),
 ('Three Colors: White (Trzy kolory: Bialy) (1994)', 0.985454261302948),
 ('Shallow Grave (1994)', 0.9445573091506958),
 ('Once Were Warriors (1994)', 0.8964623212814331),
 ('Secret of Roan Inish, The (1994)', 0.8813999891281128),
 ('Queen Margot (Reine Margot, La) (1994)', 0.8778895139694214)]

In [51]:
print(watch_dict['1'])
similar_watch(model['1'])

['Toy Story (1995)']


[('Balto (1995)', 0.7544434666633606),
 ('Wings of Courage (1995)', 0.7466940879821777),
 ('Clueless (1995)', 0.7247020602226257),
 ('Jumanji (1995)', 0.7224775552749634),
 ('Now and Then (1995)', 0.7146523594856262),
 ('Across the Sea of Time (1995)', 0.7133582830429077)]

In [52]:
print(watch_dict['23'])
similar_watch(model['23'])

['Assassins (1995)']


[('Money Train (1995)', 0.9605783820152283),
 ('Cutthroat Island (1995)', 0.8987183570861816),
 ('Dead Presidents (1995)', 0.8757539987564087),
 ('Four Rooms (1995)', 0.8435318470001221),
 ('Now and Then (1995)', 0.8397533297538757),
 ('Powder (1995)', 0.8395148515701294)]

In [53]:
print(watch_dict['231'])
similar_watch(model['231'])

['Dumb & Dumber (Dumb and Dumber) (1994)']


[('Billy Madison (1995)', 0.7691785097122192),
 ('Mighty Morphin Power Rangers: The Movie (1995)', 0.7655748128890991),
 ('Waterworld (1995)', 0.7643465995788574),
 ('Mallrats (1995)', 0.7625446319580078),
 ('Don Juan DeMarco (1995)', 0.7608530521392822),
 ('Before Sunrise (1995)', 0.7590253353118896)]