In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel('online-retail-data.xlsx')

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


Here is the description of the fields in this dataset:

    InvoiceNo: Invoice number - a unique number assigned to each transaction

    StockCode: Product/item code - a unique number assigned to each distinct product

    Description: Product description

    Quantity: The quantities of each product per transaction

    InvoiceDate: Invoice date and time

    CustomerID: Customer number - a unique number assigned to each customer

In [4]:
df.shape

(541909, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [6]:
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

Without CustomerID, we cannot map to a unique customer to make sense of purchases. Similarly, without a description, word2vec embeddings are impossible. We should be ok to drop and use remaining data.

In [7]:
# This works but maybe making things a bit too difficult ;)
df1 = df[~df['Description'].isna()]

In [8]:
df.dropna(inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406829 non-null  object        
 1   StockCode    406829 non-null  object        
 2   Description  406829 non-null  object        
 3   Quantity     406829 non-null  int64         
 4   InvoiceDate  406829 non-null  datetime64[ns]
 5   UnitPrice    406829 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      406829 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [10]:
df['StockCode'] = df['StockCode'].astype(str)

In [11]:
customers = set(df['CustomerID'])
customers = list(customers)
len(customers)

4372

In [12]:
(round(len(customers)*.9))

3935

Setting aside 10% of customers as a validation set.

In [13]:
# Seed
random.seed(117)

# # Shuffling Customer IDs in the list
# random.shuffle(customers)

# Training list
customers_train = random.sample(customers, (round(len(customers)*.9)))
print(len(customers_train))

# Training DF using customer list
train_df = df[df['CustomerID'].isin(customers_train)]
test_df = df[~df['CustomerID'].isin(customers_train)]
print(len(train_df))
print(len(test_df))

3935
366644
40185


In [14]:
# Purchase history sequence - Train
purchases_train = []

for i in tqdm(customers_train):
    in_training = train_df[train_df['CustomerID'] == i]['StockCode'].tolist()
    purchases_train.append(in_training)

100%|██████████| 3935/3935 [00:04<00:00, 901.50it/s] 


In [15]:
# Same thing, purchase history - Test
purchases_test = []

for i in tqdm(test_df['CustomerID'].unique()):
    in_training = test_df[test_df['CustomerID'] == i]['StockCode'].tolist()
    purchases_test.append(in_training)

100%|██████████| 437/437 [00:00<00:00, 1204.58it/s]


In [16]:
# Good sanity check - Customer IDs add up to 4372

In [17]:
# Build & train W2V model
model = Word2Vec(window=10, sg=1, hs=0, negative=10, alpha=0.03, min_alpha=0.0007, seed=4)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples=model.corpus_count, epochs=10, report_delay=1)

(3630458, 3666440)

In [18]:
print(model)

Word2Vec(vocab=3177, size=100, alpha=0.03)


In [19]:
# extract all vectors
X = model[model.wv.vocab]

X.shape

(3177, 100)

In [20]:
import umap

reducer = umap.UMAP(n_neighbors=30, min_dist=0.0, n_components=2, random_state=42)


AttributeError: module 'umap' has no attribute 'UMAP'

In [74]:
cluster_embedding = umap.UMAP(n_neighbors=30, min_dist=0.0,
                              n_components=2, random_state=42).fit_transform(X)

plt.figure(figsize=(10,9))
plt.scatter(cluster_embedding[:, 0], cluster_embedding[:, 1], s=3, cmap='Spectral')

AttributeError: module 'umap' has no attribute 'UMAP'

In [21]:

products = train_df[["StockCode", "Description"]]

# remove duplicates
products.drop_duplicates(inplace=True, subset='StockCode', keep="last")

# create product-ID and product-description dictionary
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

In [22]:
# test the dictionary
products_dict['84029E']

['RED WOOLLY HOTTIE WHITE HEART.']

In [23]:
def similar_products(v, n = 6):
    
    # extract most similar products for the input vector
    ms = model.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (products_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms      

In [24]:
similar_products(model['90019A'])

[('SILVER M.O.P ORBIT DROP EARRINGS', 0.7904248237609863),
 ('PINK HEART OF GLASS BRACELET', 0.7410053610801697),
 ('PINK BOUDICCA LARGE BRACELET', 0.7337702512741089),
 ('SILVER LARIAT 40CM', 0.729084849357605),
 ('GREEN HEART OF GLASS BRACELET', 0.7288806438446045),
 ('MIDNIGHT BLUE GLASS/SILVER BRACELET', 0.7271733283996582)]

In [25]:
similar_products(model['84029E'])

[('KNITTED UNION FLAG HOT WATER BOTTLE', 0.8280028700828552),
 ('RETROSPOT HEART HOT WATER BOTTLE', 0.7420725226402283),
 ('WHITE SKULL HOT WATER BOTTLE ', 0.7399932146072388),
 ('SCOTTIE DOG HOT WATER BOTTLE', 0.7398636341094971),
 ('CHICK GREY HOT WATER BOTTLE', 0.686881422996521),
 ('CHOCOLATE HOT WATER BOTTLE', 0.6762336492538452)]

In [26]:
def aggregate_vectors(products):
    product_vec = []
    for i in products:
        try:
            product_vec.append(model[i])
        except KeyError:
            continue
        
    return np.mean(product_vec, axis=0)

In [28]:
similar_products(aggregate_vectors(purchases_test[0]))

[('SET OF 20 VINTAGE CHRISTMAS NAPKINS', 0.6576131582260132),
 ('PAPER CHAIN KIT VINTAGE CHRISTMAS', 0.6481579542160034),
 ("BOX OF 6 MINI 50'S CRACKERS", 0.6350186467170715),
 ('CHRISTMAS CRAFT TREE TOP ANGEL', 0.631074070930481),
 ('VINTAGE CHRISTMAS GIFT SACK', 0.6308308839797974),
 ('CHRISTMAS CRAFT LITTLE FRIENDS', 0.6290112733840942)]

In [31]:
similar_products(aggregate_vectors(purchases_test[0][-10:]))


[('GARDENERS KNEELING PAD KEEP CALM ', 0.6836995482444763),
 ('ROTATING SILVER ANGELS T-LIGHT HLDR', 0.6665749549865723),
 ('CHRISTMAS CRAFT WHITE FAIRY ', 0.6661779880523682),
 ('CHRISTMAS CRAFT TREE TOP ANGEL', 0.66289222240448),
 ('SET OF 4 SANTA PLACE SETTINGS', 0.6485909819602966),
 ('CHRISTMAS CRAFT LITTLE FRIENDS', 0.6472638845443726)]