In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel('Online Retail.xlsx')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
df.shape

(541909, 8)

In [4]:
#check for missing values
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [5]:
#remove missing values
df.dropna(inplace=True)

In [6]:
#Convert StockCode to str
df['StockCode']= df['StockCode'].astype(str)

In [7]:
#no. of unique customers
customers = df["CustomerID"].unique().tolist()
len(customers)

4372

In [8]:
#set a small part of the dataset for validation purposes. Use the data of 90% of the 
#customers to create word2vec embeddings. Let’s split the data.
# shuffle customer ID's
random.shuffle(customers)

# extract 90% of customer ID's
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]

# split data into train and validation set
train_df = df[df['CustomerID'].isin(customers_train)]
validation_df = df[~df['CustomerID'].isin(customers_train)]

In [9]:
#create sequences of purchases made by the customers 
#in the dataset for both the train and validation set.
# list to capture purchase history of the customers
purchases_train = []

# populate the list with the product codes
for i in tqdm(customers_train):
    temp = train_df[train_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_train.append(temp)

100%|████████████████████████████████████████████████████████████████████████████| 3935/3935 [00:03<00:00, 1249.49it/s]


In [10]:
# list to capture purchase history of the customers
purchases_val = []

# populate the list with the product codes
for i in tqdm(validation_df['CustomerID'].unique()):
    temp = validation_df[validation_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_val.append(temp)

100%|██████████████████████████████████████████████████████████████████████████████| 437/437 [00:00<00:00, 1475.16it/s]


In [11]:
#Build word2vec embeddings for products
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(3657471, 3694520)

In [12]:
#Since we are not planning to train the model any further, we 
#are calling init_sims( ) here. This will make the model much more memory-efficient
model.init_sims(replace=True)

In [13]:
print(model)

Word2Vec(vocab=3161, size=100, alpha=0.03)


In [14]:
#Our model has a vocabulary of 3,182 unique words and their vectors of size 100 each. 
#Next, we will extract the vectors of all the words in our vocabulary and store it in 
#one place for easy access.
# extract all vectors
X = model[model.wv.vocab]

X.shape

(3161, 100)

In [26]:
products = train_df[["StockCode", "Description"]]

# remove duplicates
products.drop_duplicates(inplace=True, subset='StockCode', keep="last")

# create product-ID and product-description dictionary
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

In [27]:
#Let’s first create a product-ID and product-description 
#dictionary to easily map a product’s description to its ID and vice versa.
# test the dictionary
products_dict['84029E']

['RED WOOLLY HOTTIE WHITE HEART.']

In [28]:
#take a product’s vector (n) as input and return top 6 similar products:
def similar_products(v, n = 6):
    
    # extract most similar products for the input vector
    ms = model.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (products_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms        

In [29]:
similar_products(model['90019A'])

[('SILVER M.O.P ORBIT DROP EARRINGS', 0.8259179592132568),
 ('PINK HEART OF GLASS BRACELET', 0.7778369188308716),
 ('SILVER M.O.P. ORBIT NECKLACE', 0.7639400362968445),
 ('PINK BOUDICCA LARGE BRACELET', 0.7577416896820068),
 ('WHITE VINT ART DECO CRYSTAL NECKLAC', 0.756995677947998),
 ('ANT COPPER RED BOUDICCA BRACELET', 0.7507659196853638)]

In [None]:
#this output is based on the vector of a single product only. What 
#if we want to recommend products based on the multiple purchases he or 
#she has made in the past?
#One simple solution is to take the average of all the vectors of the 
#products the user has bought so far and use this resultant vector to find 
#similar products. We will use the function below that takes in a list of product 
#IDs and gives out a 100-dimensional vector which is a mean of vectors of the 
#products in the input list

In [30]:
def aggregate_vectors(products):
    product_vec = []
    for i in products:
        try:
            product_vec.append(model[i])
        except KeyError:
            continue
        
    return np.mean(product_vec, axis=0)

In [31]:
len(purchases_val[0])

196

In [32]:
aggregate_vectors(purchases_val[0]).shape

(100,)

In [33]:
similar_products(aggregate_vectors(purchases_val[0]))

[('JAM MAKING SET WITH JARS', 0.6887817978858948),
 ('SET OF 3 REGENCY CAKE TINS', 0.6810645461082458),
 ('SET OF TEA COFFEE SUGAR TINS PANTRY', 0.6778563857078552),
 ('JAM MAKING SET PRINTED', 0.6732151508331299),
 ('REGENCY CAKESTAND 3 TIER', 0.6690331697463989),
 ('SET OF 3 CAKE TINS PANTRY DESIGN ', 0.658674418926239)]

In [None]:
#The system has recommended 6 products based on the entire purchase 
#history of a user. Moreover, if you want to get product suggestions 
#based on the last few purchases, only then also you can use the same set of functions.

#Below is the last 10 products purchased as input:

In [34]:
similar_products(aggregate_vectors(purchases_val[0][-10:]))

[("BOX OF 6 MINI 50'S CRACKERS", 0.6572619676589966),
 ('PACK OF SIX LED TEA LIGHTS', 0.6454877853393555),
 ('PAPER CHAIN KIT VINTAGE CHRISTMAS', 0.6448880434036255),
 ('SET OF 12 FAIRY CAKE BAKING CASES', 0.6440631747245789),
 ('SET OF 3 REGENCY CAKE TINS', 0.6433660387992859),
 ("PAPER CHAIN KIT 50'S CHRISTMAS ", 0.6421225070953369)]