<a href="https://colab.research.google.com/github/Otobi1/Online-Retail-Transactions/blob/master/Online_Retail_Transactions_Collaborative_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the necessary libraries 

import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from matplotlib.image import imread
import seaborn as sns
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import  cosine_similarity
import datetime as dt

%matplotlib inline
sns.set_context('notebook')
plt.style.use('fivethirtyeight')
from warnings import filterwarnings
filterwarnings('ignore')

print ('Libraries successfully imported!')

Libraries successfully imported!


In [2]:
# Mount the drive, because the data is already on the GDrive

from google.colab import drive # for more on writing and reading files through colab https://towardsdatascience.com/reading-and-writing-files-with-google-colaboratory-f0c234683946
drive.mount("/drive") 

Mounted at /drive


In [3]:
# Loading the dataset from the GDrive

clean_final_retail_data = pd.read_csv("/drive/My Drive/Colab Notebooks/clean_final_retail_data.csv")

In [4]:
# Printing out the keys to the features in the original complete dataset

clean_final_retail_data.keys()

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

In [5]:
# Creating a customer-item matrix
# Essentially matching the customers and the items they've bought

customer_item_matrix = clean_final_retail_data.pivot_table(index='Customer ID', columns='StockCode', values='Quantity', aggfunc='sum')

customer_item_matrix.loc[18284:].head()

StockCode,10002,10080,10109,10120,10123C,10123G,10124A,10124G,10125,10133,10134,10135,10138,11001,15030,15034,15036,15039,15044A,15044B,15044C,15044D,15056BL,15056N,15056P,15058A,15058B,15058C,15059A,15060B,16008,16010,16011,16012,16014,16015,16016,16020C,16033,16043,...,90211A,90211B,90212B,90212C,90214A,90214B,90214C,90214D,90214E,90214F,90214G,90214H,90214I,90214J,90214K,90214L,90214M,90214N,90214O,90214P,90214R,90214S,90214T,90214U,90214V,90214W,90214Y,90214Z,ADJUST,ADJUST2,BANK CHARGES,C2,D,DOT,M,PADS,POST,SP1002,TEST001,TEST002
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
18284.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,
18285.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,
18286.0,,,,,,,,,,,,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,
18287.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,


In [6]:
# Exploring the shape of the customer-item matrix. 

customer_item_matrix.shape

(5878, 4631)

In [7]:
# Confirming the number of unique customers in the cleaned final dataset

clean_final_retail_data['Customer ID'].nunique()

5878

In [8]:
# Confirming the number of unique stockcodes/items in the cleaned final dataset

clean_final_retail_data['StockCode'].nunique()

4631

In [9]:
# Exploring the number of items purchased in the total transaction record

customer_item_matrix.loc[13090.0].sum()

3445.0

In [10]:
# Mapping the stock/item purchased to 1 and items/stocks not purchased to 0
# Essentially, any purchase above 0 units is mapped to 1 and everything else is mapped to zero - onehot encoding using a lambda function

customer_item_matrix = customer_item_matrix.applymap(lambda x: 1 if x > 0 else 0)

In [11]:
# Exploring the first 4 entries of the mapped customer-item-matrix

customer_item_matrix.loc[18284:].head()

StockCode,10002,10080,10109,10120,10123C,10123G,10124A,10124G,10125,10133,10134,10135,10138,11001,15030,15034,15036,15039,15044A,15044B,15044C,15044D,15056BL,15056N,15056P,15058A,15058B,15058C,15059A,15060B,16008,16010,16011,16012,16014,16015,16016,16020C,16033,16043,...,90211A,90211B,90212B,90212C,90214A,90214B,90214C,90214D,90214E,90214F,90214G,90214H,90214I,90214J,90214K,90214L,90214M,90214N,90214O,90214P,90214R,90214S,90214T,90214U,90214V,90214W,90214Y,90214Z,ADJUST,ADJUST2,BANK CHARGES,C2,D,DOT,M,PADS,POST,SP1002,TEST001,TEST002
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
18284.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
18285.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
18286.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
18287.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [12]:
# User-based Collaborative Filtering 

# Here, the user-based collaborative filtering aims to explore the similarities among the distinct users based on the items they have purchased based within the customer-item matrix

# Applying cosine similarity to the customer item matrix (read more here - https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a (incognito))

# Cosine Similarity explores the similarity between two or more non zero vectors and it has a value between 0 and 1
# The smaller the angle between the vectors, the more similar they are 

user_to_user_sim_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix))

In [13]:
# Exploring the dimensions of the user-to-user similarity matrix

user_to_user_sim_matrix.shape

(5878, 5878)

In [14]:
# Exploring the first 5 observations of the user similarity matrix

# As observed below, the user_to_user_similarity_matrix shows the similarity among the customers based on the cosine similarities computed above

user_to_user_sim_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,5838,5839,5840,5841,5842,5843,5844,5845,5846,5847,5848,5849,5850,5851,5852,5853,5854,5855,5856,5857,5858,5859,5860,5861,5862,5863,5864,5865,5866,5867,5868,5869,5870,5871,5872,5873,5874,5875,5876,5877
0,1.0,0.0,0.0,0.13106,0.0,0.0,0.023002,0.0,0.0,0.0,0.0,0.178685,0.180021,0.066118,0.015215,0.0,0.081446,0.080257,0.0,0.0,0.0,0.0,0.090722,0.0,0.0,0.03325,0.0,0.0,0.0,0.0,0.0,0.0,0.080364,0.0,0.030817,0.0,0.0,0.15097,0.095482,0.0,...,0.061633,0.0,0.0,0.0,0.068853,0.218218,0.0,0.0,0.0,0.0,0.0,0.0,0.039653,0.0,0.0,0.0,0.0,0.058026,0.0,0.046676,0.06415,0.0,0.0,0.0,0.015982,0.0,0.0,0.0,0.0,0.21148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070535,0.0
1,0.0,1.0,0.053452,0.045502,0.043214,0.038881,0.031944,0.055728,0.023395,0.090351,0.131036,0.093744,0.047619,0.122427,0.091558,0.0,0.131958,0.055728,0.05324,0.075974,0.0,0.053722,0.020998,0.093906,0.013746,0.138527,0.09167,0.059391,0.080185,0.095238,0.0,0.117655,0.138177,0.093582,0.099857,0.149071,0.048601,0.087357,0.121549,0.0,...,0.014265,0.062994,0.058321,0.047756,0.071714,0.0,0.0,0.0,0.0,0.114379,0.048795,0.0,0.055067,0.020438,0.084515,0.046393,0.059391,0.107443,0.0,0.0,0.0,0.067344,0.0,0.0,0.051788,0.0,0.026861,0.0,0.059391,0.048948,0.037987,0.0,0.067344,0.06482,0.102869,0.113961,0.067344,0.0,0.076186,0.024398
2,0.0,0.053452,1.0,0.017025,0.048507,0.0,0.023905,0.0,0.026261,0.067612,0.137281,0.01238,0.026726,0.011452,0.063246,0.04714,0.028214,0.29192,0.143427,0.04264,0.0,0.060302,0.0,0.042164,0.01543,0.017277,0.0343,0.033333,0.045004,0.053452,0.060302,0.052827,0.107379,0.026261,0.048038,0.047809,0.043644,0.039223,0.024807,0.0,...,0.032026,0.0,0.043644,0.235865,0.017889,0.0,0.0,0.0,0.0,0.067574,0.0,0.0,0.109888,0.0,0.094868,0.130189,0.0,0.0,0.106904,0.0,0.0,0.0,0.0,0.0,0.033218,0.0,0.060302,0.0,0.155556,0.027472,0.0,0.0,0.0,0.0,0.0,0.13858,0.0,0.0,0.0,0.054772
3,0.13106,0.045502,0.017025,1.0,0.041292,0.037152,0.152617,0.071,0.04471,0.057555,0.141903,0.210766,0.125129,0.175474,0.074028,0.080257,0.138099,0.05325,0.09157,0.036298,0.098295,0.025666,0.040129,0.044865,0.065676,0.088245,0.116791,0.05675,0.067042,0.022751,0.051333,0.104929,0.116798,0.08942,0.095417,0.081396,0.120744,0.133556,0.116144,0.080757,...,0.109048,0.0,0.130032,0.009126,0.129436,0.064349,0.042563,0.021281,0.0,0.166815,0.077709,0.03576,0.087697,0.039058,0.094217,0.011082,0.042563,0.102665,0.0,0.020646,0.113501,0.0,0.020646,0.0,0.162594,0.0,0.076999,0.0,0.037834,0.093543,0.054447,0.0,0.032174,0.020646,0.049147,0.140654,0.016087,0.0,0.062399,0.038854
4,0.0,0.043214,0.048507,0.041292,1.0,0.0,0.028989,0.050572,0.0,0.0,0.071348,0.045038,0.03241,0.027775,0.057522,0.057166,0.085536,0.0,0.057977,0.0,0.0,0.073127,0.057166,0.025565,0.018712,0.020952,0.083189,0.040423,0.054575,0.06482,0.073127,0.085416,0.014468,0.063693,0.058255,0.028989,0.052926,0.047565,0.030083,0.076696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049167,0.044281,0.0,0.033315,0.0,0.0,0.031575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.038782,0.0,0.0,0.02963,0.0


In [15]:
# Using the index of the customer_item_matrix as the column label in the user_to_user_sim_matrix

user_to_user_sim_matrix.columns = customer_item_matrix.index

In [16]:
# Mapping the index of the user_to_user_sim_matrix as the custmer_item_matrix index, such that the customer id is also the row label

user_to_user_sim_matrix['Customer ID'] = customer_item_matrix.index

In [17]:
# Setting the index as the customer id

user_to_user_sim_matrix = user_to_user_sim_matrix.set_index('Customer ID')

In [18]:
# Exploring the first 5 observations of the user_to_user_sim_matrix

user_to_user_sim_matrix.head()

Customer ID,12346.0,12347.0,12348.0,12349.0,12350.0,12351.0,12352.0,12353.0,12354.0,12355.0,12356.0,12357.0,12358.0,12359.0,12360.0,12361.0,12362.0,12363.0,12364.0,12365.0,12366.0,12367.0,12368.0,12369.0,12370.0,12371.0,12372.0,12373.0,12374.0,12375.0,12376.0,12377.0,12378.0,12379.0,12380.0,12381.0,12383.0,12384.0,12385.0,12386.0,...,18248.0,18249.0,18250.0,18251.0,18252.0,18253.0,18254.0,18255.0,18256.0,18257.0,18258.0,18259.0,18260.0,18261.0,18262.0,18263.0,18264.0,18265.0,18266.0,18267.0,18268.0,18269.0,18270.0,18271.0,18272.0,18273.0,18274.0,18275.0,18276.0,18277.0,18278.0,18279.0,18280.0,18281.0,18282.0,18283.0,18284.0,18285.0,18286.0,18287.0
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
12346.0,1.0,0.0,0.0,0.13106,0.0,0.0,0.023002,0.0,0.0,0.0,0.0,0.178685,0.180021,0.066118,0.015215,0.0,0.081446,0.080257,0.0,0.0,0.0,0.0,0.090722,0.0,0.0,0.03325,0.0,0.0,0.0,0.0,0.0,0.0,0.080364,0.0,0.030817,0.0,0.0,0.15097,0.095482,0.0,...,0.061633,0.0,0.0,0.0,0.068853,0.218218,0.0,0.0,0.0,0.0,0.0,0.0,0.039653,0.0,0.0,0.0,0.0,0.058026,0.0,0.046676,0.06415,0.0,0.0,0.0,0.015982,0.0,0.0,0.0,0.0,0.21148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070535,0.0
12347.0,0.0,1.0,0.053452,0.045502,0.043214,0.038881,0.031944,0.055728,0.023395,0.090351,0.131036,0.093744,0.047619,0.122427,0.091558,0.0,0.131958,0.055728,0.05324,0.075974,0.0,0.053722,0.020998,0.093906,0.013746,0.138527,0.09167,0.059391,0.080185,0.095238,0.0,0.117655,0.138177,0.093582,0.099857,0.149071,0.048601,0.087357,0.121549,0.0,...,0.014265,0.062994,0.058321,0.047756,0.071714,0.0,0.0,0.0,0.0,0.114379,0.048795,0.0,0.055067,0.020438,0.084515,0.046393,0.059391,0.107443,0.0,0.0,0.0,0.067344,0.0,0.0,0.051788,0.0,0.026861,0.0,0.059391,0.048948,0.037987,0.0,0.067344,0.06482,0.102869,0.113961,0.067344,0.0,0.076186,0.024398
12348.0,0.0,0.053452,1.0,0.017025,0.048507,0.0,0.023905,0.0,0.026261,0.067612,0.137281,0.01238,0.026726,0.011452,0.063246,0.04714,0.028214,0.29192,0.143427,0.04264,0.0,0.060302,0.0,0.042164,0.01543,0.017277,0.0343,0.033333,0.045004,0.053452,0.060302,0.052827,0.107379,0.026261,0.048038,0.047809,0.043644,0.039223,0.024807,0.0,...,0.032026,0.0,0.043644,0.235865,0.017889,0.0,0.0,0.0,0.0,0.067574,0.0,0.0,0.109888,0.0,0.094868,0.130189,0.0,0.0,0.106904,0.0,0.0,0.0,0.0,0.0,0.033218,0.0,0.060302,0.0,0.155556,0.027472,0.0,0.0,0.0,0.0,0.0,0.13858,0.0,0.0,0.0,0.054772
12349.0,0.13106,0.045502,0.017025,1.0,0.041292,0.037152,0.152617,0.071,0.04471,0.057555,0.141903,0.210766,0.125129,0.175474,0.074028,0.080257,0.138099,0.05325,0.09157,0.036298,0.098295,0.025666,0.040129,0.044865,0.065676,0.088245,0.116791,0.05675,0.067042,0.022751,0.051333,0.104929,0.116798,0.08942,0.095417,0.081396,0.120744,0.133556,0.116144,0.080757,...,0.109048,0.0,0.130032,0.009126,0.129436,0.064349,0.042563,0.021281,0.0,0.166815,0.077709,0.03576,0.087697,0.039058,0.094217,0.011082,0.042563,0.102665,0.0,0.020646,0.113501,0.0,0.020646,0.0,0.162594,0.0,0.076999,0.0,0.037834,0.093543,0.054447,0.0,0.032174,0.020646,0.049147,0.140654,0.016087,0.0,0.062399,0.038854
12350.0,0.0,0.043214,0.048507,0.041292,1.0,0.0,0.028989,0.050572,0.0,0.0,0.071348,0.045038,0.03241,0.027775,0.057522,0.057166,0.085536,0.0,0.057977,0.0,0.0,0.073127,0.057166,0.025565,0.018712,0.020952,0.083189,0.040423,0.054575,0.06482,0.073127,0.085416,0.014468,0.063693,0.058255,0.028989,0.052926,0.047565,0.030083,0.076696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049167,0.044281,0.0,0.033315,0.0,0.0,0.031575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.038782,0.0,0.0,0.02963,0.0


In [19]:
# Checking the complete user_to_user_sim_matrix for one customer with id 13085

# As expected, the customer is perfectly similar to itself and completely dissimilar to the customers on the lower part of the list, based on the sorted values

user_to_user_sim_matrix.loc[13085.0].sort_values(ascending = False)

Customer ID
13085.0    1.000000
18087.0    0.213201
13750.0    0.198030
13658.0    0.187734
12411.0    0.180907
             ...   
13747.0    0.000000
14658.0    0.000000
16598.0    0.000000
13751.0    0.000000
15314.0    0.000000
Name: 13085.0, Length: 5878, dtype: float64

In [20]:
# Listing out the actual items (with stockcode) bought by the specific customer of interest 13085

items_bought_by_A = set(customer_item_matrix.loc[13085.0].iloc[customer_item_matrix.loc[13085.0].to_numpy().nonzero()].index)

In [21]:
# The list of the specific items bought by customer 13085

items_bought_by_A

{'20749',
 '20750',
 '21068',
 '21137',
 '21198',
 '21199',
 '21232',
 '21523',
 '21563',
 '21564',
 '21790',
 '21791',
 '21871',
 '21889',
 '21955',
 '22041',
 '22064',
 '22136',
 '22138',
 '22147',
 '22179',
 '22192',
 '22193',
 '22195',
 '22200',
 '22201',
 '22202',
 '22204',
 '22244',
 '22245',
 '22271',
 '22299',
 '22326',
 '22328',
 '22349',
 '22350',
 '22353',
 '22414',
 '22418',
 '22617',
 '22745',
 '22746',
 '22748',
 '23242',
 '40046A',
 '48138',
 '79323P',
 '79323W',
 '84992',
 '85048'}

In [22]:
# Exploring the specific items/stock bought by customer B with id 18284

items_bought_by_B = set(customer_item_matrix.loc[18284.0].iloc[customer_item_matrix.loc[18284.0].to_numpy().nonzero()].index)

In [23]:
# Specific items bought by customer 18284

items_bought_by_B

{'16237',
 '20970',
 '21051',
 '21498',
 '21559',
 '21561',
 '21805',
 '21807',
 '21817',
 '21819',
 '21826',
 '21827',
 '21828',
 '21830',
 '22045',
 '22149',
 '22150',
 '22155',
 '22187',
 '22418',
 '22569',
 '22570',
 '22591',
 '22741',
 '51014A',
 '51014L',
 '84270',
 'C2'}

In [24]:
# Deciding what to recommend to customer B based on the difference with the item bought by A

items_to_recommend_User_B = items_bought_by_A - items_bought_by_B

In [25]:
# List of specific items to recommend to user B

items_to_recommend_User_B

{'20749',
 '20750',
 '21068',
 '21137',
 '21198',
 '21199',
 '21232',
 '21523',
 '21563',
 '21564',
 '21790',
 '21791',
 '21871',
 '21889',
 '21955',
 '22041',
 '22064',
 '22136',
 '22138',
 '22147',
 '22179',
 '22192',
 '22193',
 '22195',
 '22200',
 '22201',
 '22202',
 '22204',
 '22244',
 '22245',
 '22271',
 '22299',
 '22326',
 '22328',
 '22349',
 '22350',
 '22353',
 '22414',
 '22617',
 '22745',
 '22746',
 '22748',
 '23242',
 '40046A',
 '48138',
 '79323P',
 '79323W',
 '84992',
 '85048'}

In [26]:
# Printing out the specific list of items (the stockcode and description) to recommend to user B based on the collaborative filtering 

clean_final_retail_data.loc[clean_final_retail_data['StockCode'].isin(items_to_recommend_User_B), 
                            ['StockCode', 'Description']].drop_duplicates().set_index('StockCode')

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS
79323P,PINK CHERRY LIGHTS
79323W,WHITE CHERRY LIGHTS
22041,"RECORD FRAME 7"" SINGLE SIZE"
21232,STRAWBERRY CERAMIC TRINKET BOX
...,...
20750,RED RETROSPOT MINI CASES
22202,MILK PAN PINK POLKADOT
22179,SET 10 NIGHT OWL LIGHTS
23242,TREASURE TIN BUFFALO BILL


In [27]:
# Item-based Collaborative Filtering 

# Instead of the user, the item-based filtering focuses on the stock, and explores the similarities among the stock

# Applying cosine similarity to the item item matrix (read more here - https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a (incognito))

# Cosine Similarity explores the similarity between two or more non zero vectors and it has a value between 0 and 1
# The smaller the angle between the vectors, the more similar they are 

item_item_sim_matrix = pd.DataFrame(cosine_similarity(customer_item_matrix.T))

In [28]:
# Exploring the first 5 observations of the item_item_sim_matrix

# As observed below, the cosine similarities range from 0 to 1, with each product, perfectly similar to itself

item_item_sim_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4591,4592,4593,4594,4595,4596,4597,4598,4599,4600,4601,4602,4603,4604,4605,4606,4607,4608,4609,4610,4611,4612,4613,4614,4615,4616,4617,4618,4619,4620,4621,4622,4623,4624,4625,4626,4627,4628,4629,4630
0,1.0,0.032564,0.0,0.075801,0.075023,0.045083,0.147242,0.078087,0.100462,0.06313,0.035316,0.076087,0.062609,0.071885,0.032564,0.112654,0.139065,0.110865,0.130503,0.134785,0.109322,0.15241,0.117676,0.104798,0.080159,0.089572,0.077484,0.083478,0.0,0.100636,0.087848,0.11713,0.111425,0.085858,0.046053,0.03704,0.0724,0.083241,0.095265,0.047088,...,0.069843,0.027608,0.029514,0.034922,0.060111,0.043315,0.03408,0.047088,0.046852,0.047088,0.08663,0.02087,0.049386,0.071657,0.032564,0.052382,0.071657,0.023544,0.070632,0.021657,0.080648,0.045942,0.052058,0.0,0.059028,0.0,0.0,0.034922,0.046852,0.045083,0.035829,0.011641,0.0,0.0,0.145514,0.0,0.085364,0.110432,0.0,0.0
1,0.032564,1.0,0.0,0.028916,0.033389,0.060193,0.049147,0.0,0.067065,0.033715,0.031435,0.046886,0.0,0.019195,0.0,0.131608,0.047915,0.037005,0.01936,0.021171,0.020851,0.040698,0.023276,0.033581,0.045867,0.023918,0.025863,0.055728,0.0,0.047422,0.052129,0.0,0.045775,0.050948,0.0,0.0,0.0,0.0,0.031798,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062869,0.057831,0.0,0.0,0.047836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059779,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080845,0.0,0.074953,0.267261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055815,0.053683,0.0,0.0,0.0,0.133631,0.0,0.07581,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047782,0.0,0.0,0.0,0.0,0.0
3,0.075801,0.028916,0.0,1.0,0.044412,0.080064,0.065372,0.046225,0.05947,0.044845,0.0,0.051971,0.037062,0.089363,0.057831,0.100031,0.079667,0.061527,0.038627,0.056321,0.041603,0.067666,0.085141,0.044667,0.040673,0.031814,0.017201,0.037062,0.0,0.07359,0.017334,0.069338,0.091329,0.152477,0.06134,0.043853,0.055104,0.059131,0.063443,0.041812,...,0.062017,0.0,0.052414,0.062017,0.0,0.0,0.060523,0.083624,0.0,0.0,0.038462,0.037062,0.0,0.031814,0.0,0.062017,0.031814,0.041812,0.0,0.038462,0.035806,0.054393,0.046225,0.0,0.0,0.0,0.0,0.0,0.027735,0.0,0.0,0.0,0.0,0.0,0.106018,0.0,0.013782,0.098058,0.0,0.0
4,0.075023,0.033389,0.0,0.044412,1.0,0.323575,0.075485,0.053376,0.154508,0.012946,0.02414,0.0,0.085592,0.058964,0.0,0.028877,0.036796,0.014209,0.029735,0.032517,0.032026,0.046881,0.008937,0.025788,0.023482,0.055104,0.039723,0.085592,0.0,0.0,0.020016,0.0,0.123034,0.078251,0.070829,0.025318,0.021209,0.0,0.024419,0.193122,...,0.0,0.0,0.0,0.0,0.0,0.044412,0.0,0.04828,0.096077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041345,0.062807,0.0,0.0,0.0,0.0,0.0,0.0,0.032026,0.0,0.0,0.0,0.0,0.0,0.091815,0.0,0.039784,0.0,0.0,0.0


In [29]:
# Mapping the index of the item_item_sim_matrix to the customer_item_matrix

item_item_sim_matrix.columns = customer_item_matrix.T.index

# Mapping the rows for the item_item_sim_matrix to be the index of the customer_item _matrix index

item_item_sim_matrix['StockCode'] = customer_item_matrix.T.index


item_item_sim_matrix = item_item_sim_matrix.set_index('StockCode')

In [30]:
# Exploring the first 5 item_item_similarity matrix for the stocks

item_item_sim_matrix.head()

StockCode,10002,10080,10109,10120,10123C,10123G,10124A,10124G,10125,10133,10134,10135,10138,11001,15030,15034,15036,15039,15044A,15044B,15044C,15044D,15056BL,15056N,15056P,15058A,15058B,15058C,15059A,15060B,16008,16010,16011,16012,16014,16015,16016,16020C,16033,16043,...,90211A,90211B,90212B,90212C,90214A,90214B,90214C,90214D,90214E,90214F,90214G,90214H,90214I,90214J,90214K,90214L,90214M,90214N,90214O,90214P,90214R,90214S,90214T,90214U,90214V,90214W,90214Y,90214Z,ADJUST,ADJUST2,BANK CHARGES,C2,D,DOT,M,PADS,POST,SP1002,TEST001,TEST002
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
10002,1.0,0.032564,0.0,0.075801,0.075023,0.045083,0.147242,0.078087,0.100462,0.06313,0.035316,0.076087,0.062609,0.071885,0.032564,0.112654,0.139065,0.110865,0.130503,0.134785,0.109322,0.15241,0.117676,0.104798,0.080159,0.089572,0.077484,0.083478,0.0,0.100636,0.087848,0.11713,0.111425,0.085858,0.046053,0.03704,0.0724,0.083241,0.095265,0.047088,...,0.069843,0.027608,0.029514,0.034922,0.060111,0.043315,0.03408,0.047088,0.046852,0.047088,0.08663,0.02087,0.049386,0.071657,0.032564,0.052382,0.071657,0.023544,0.070632,0.021657,0.080648,0.045942,0.052058,0.0,0.059028,0.0,0.0,0.034922,0.046852,0.045083,0.035829,0.011641,0.0,0.0,0.145514,0.0,0.085364,0.110432,0.0,0.0
10080,0.032564,1.0,0.0,0.028916,0.033389,0.060193,0.049147,0.0,0.067065,0.033715,0.031435,0.046886,0.0,0.019195,0.0,0.131608,0.047915,0.037005,0.01936,0.021171,0.020851,0.040698,0.023276,0.033581,0.045867,0.023918,0.025863,0.055728,0.0,0.047422,0.052129,0.0,0.045775,0.050948,0.0,0.0,0.0,0.0,0.031798,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062869,0.057831,0.0,0.0,0.047836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059779,0.0,0.0,0.0,0.0,0.0
10109,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080845,0.0,0.074953,0.267261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055815,0.053683,0.0,0.0,0.0,0.133631,0.0,0.07581,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047782,0.0,0.0,0.0,0.0,0.0
10120,0.075801,0.028916,0.0,1.0,0.044412,0.080064,0.065372,0.046225,0.05947,0.044845,0.0,0.051971,0.037062,0.089363,0.057831,0.100031,0.079667,0.061527,0.038627,0.056321,0.041603,0.067666,0.085141,0.044667,0.040673,0.031814,0.017201,0.037062,0.0,0.07359,0.017334,0.069338,0.091329,0.152477,0.06134,0.043853,0.055104,0.059131,0.063443,0.041812,...,0.062017,0.0,0.052414,0.062017,0.0,0.0,0.060523,0.083624,0.0,0.0,0.038462,0.037062,0.0,0.031814,0.0,0.062017,0.031814,0.041812,0.0,0.038462,0.035806,0.054393,0.046225,0.0,0.0,0.0,0.0,0.0,0.027735,0.0,0.0,0.0,0.0,0.0,0.106018,0.0,0.013782,0.098058,0.0,0.0
10123C,0.075023,0.033389,0.0,0.044412,1.0,0.323575,0.075485,0.053376,0.154508,0.012946,0.02414,0.0,0.085592,0.058964,0.0,0.028877,0.036796,0.014209,0.029735,0.032517,0.032026,0.046881,0.008937,0.025788,0.023482,0.055104,0.039723,0.085592,0.0,0.0,0.020016,0.0,0.123034,0.078251,0.070829,0.025318,0.021209,0.0,0.024419,0.193122,...,0.0,0.0,0.0,0.0,0.0,0.044412,0.0,0.04828,0.096077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041345,0.062807,0.0,0.0,0.0,0.0,0.0,0.0,0.032026,0.0,0.0,0.0,0.0,0.0,0.091815,0.0,0.039784,0.0,0.0,0.0


In [31]:
# Exploring the top 10 similar items to the first stock code 10002

top_10_similar_items = list(
    item_item_sim_matrix\
    .loc['10002']\
    .sort_values(ascending = False)\
    .iloc[:10]\
    .index)

In [32]:
# Printing out the stockcode of the top 10 similar items to item 10002

# As expected, stock 10002 is perfectly similar to itself

top_10_similar_items

['10002',
 '22631',
 '22243',
 '21988',
 '22328',
 '21544',
 '20725',
 '21989',
 '22556',
 '84212']

In [33]:
# Printing out the actual items

clean_final_retail_data.loc[clean_final_retail_data['StockCode'].isin(top_10_similar_items), 
                            ['StockCode', 'Description']].drop_duplicates().set_index('StockCode').loc[top_10_similar_items]

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
10002,INFLATABLE POLITICAL GLOBE
22631,CIRCUS PARADE LUNCHBOX
22631,CIRCUS PARADE LUNCH BOX
22243,"HOOK, 5 HANGER ,MAGIC TOADSTOOL RED"
22243,5 HOOK HANGER RED MAGIC TOADSTOOL
21988,PACK OF 6 SKULL PAPER PLATES
22328,"ROUND SNACK BOXES ,SET 4, FRUITS"
22328,ROUND SNACK BOXES SET OF 4 FRUITS
21544,SKULLS WATER TRANSFER TATTOOS
20725,LUNCH BAG RED SPOTTY
