In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

In [2]:
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

In [3]:
gk = df_movies.groupby(df_ratings['rating'])

In [4]:
df = pd.merge(df_movies,df_ratings)

In [5]:
df.shape

(25000095, 6)

In [6]:
df.dropna(inplace=True)

In [7]:
df['movieId']= df['movieId'].astype(str)

In [8]:
customers = df["userId"].unique().tolist()
len(customers)

162541

In [9]:
random.shuffle(customers)

# extract 90% of customer ID's
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]

# split data into train and validation set
train_df = df[df['userId'].isin(customers_train)]
validation_df = df[~df['userId'].isin(customers_train)]

In [10]:
purchases_train = []

# populate the list with the product codes
for i in tqdm(customers_train):
    temp = train_df[train_df["userId"] == i]["movieId"].tolist()
    purchases_train.append(temp)

100%|████████████████████████████████████████████████████████████████████████| 146287/146287 [2:46:22<00:00, 14.65it/s]


In [11]:

# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(223936419, 224846530)

In [12]:
model.init_sims(replace=True)

In [13]:
print(model)

Word2Vec(vocab=30099, size=100, alpha=0.03)


In [14]:
X = model[model.wv.vocab]

X.shape

(30099, 100)

In [15]:

products = train_df[["movieId", "title"]]

# remove duplicates
products.drop_duplicates(inplace=True, subset='movieId', keep="last")

# create product-ID and product-description dictionary
products_dict = products.groupby('movieId')['title'].apply(list).to_dict()

In [16]:
products_dict['296']


['Pulp Fiction (1994)']

In [17]:
def similar_products(v, n = 6):
    
    # extract most similar products for the input vector
    ms = model.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (products_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms        

In [18]:
similar_products(model['307'])

[('Three Colors: Red (Trois couleurs: Rouge) (1994)', 0.9889792799949646),
 ('Three Colors: White (Trzy kolory: Bialy) (1994)', 0.9852802753448486),
 ('Shallow Grave (1994)', 0.9403821229934692),
 ('Queen Margot (Reine Margot, La) (1994)', 0.8966005444526672),
 ('Once Were Warriors (1994)', 0.8932065963745117),
 ('To Live (Huozhe) (1994)', 0.8869592547416687)]

In [19]:
similar_products(model['1'])

[('Balto (1995)', 0.7646825313568115),
 ('Jumanji (1995)', 0.7470300793647766),
 ('Clueless (1995)', 0.7298091650009155),
 ('Wings of Courage (1995)', 0.7201133966445923),
 ('Persuasion (1995)', 0.7152753472328186),
 ('Misérables, Les (1995)', 0.711568295955658)]

In [20]:
similar_products(model['23'])

[('Money Train (1995)', 0.9601371884346008),
 ('Cutthroat Island (1995)', 0.8917180299758911),
 ('Powder (1995)', 0.8563741445541382),
 ('Dead Presidents (1995)', 0.8489820957183838),
 ('Now and Then (1995)', 0.8438311815261841),
 ('Four Rooms (1995)', 0.8427832126617432)]

In [21]:
similar_products(model['3'])

[('Sudden Death (1995)', 0.8445239067077637),
 ('Sabrina (1995)', 0.8309876918792725),
 ('Cutthroat Island (1995)', 0.820378303527832),
 ('Dracula: Dead and Loving It (1995)', 0.8156200051307678),
 ('Tom and Huck (1995)', 0.8052608370780945),
 ('Father of the Bride Part II (1995)', 0.8032545447349548)]