In [None]:
# Data source: UCI Machine Learning Repository, "Online Retail" dataset
# https://archive.ics.uci.edu/ml/datasets/online+retail#

# The following analysis is adapted from the book "Hands-On Data Science for Marketing".

In [19]:
import pandas as pd
import numpy as np

In [20]:
# Read the "Online Retail" worksheet of the workbook into a DataFrame.
df = pd.read_excel(
    '../data/Online Retail.xlsx',
    sheet_name='Online Retail',
)
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [21]:
# Keep only line items with a positive quantity, then discard rows that carry no
# CustomerID, since those cannot be attributed to any customer.
df = (
    df
    .loc[lambda frame: frame['Quantity'] > 0]
    .dropna(subset=['CustomerID'])
)


In [22]:
# (rows, columns) remaining after the positive-quantity and CustomerID filters.
df.shape

(397924, 8)

In [23]:
# Build a customer x item matrix: one row per CustomerID, one column per StockCode,
# each cell holding the total quantity that customer bought of that item
# (NaN where the customer never bought it).
customer_item_matrix = df.pivot_table(
    index='CustomerID',
    columns='StockCode',
    values='Quantity',
    aggfunc='sum',
)

# Binarise the matrix: 1 if the customer bought the item at all, 0 otherwise.
# `NaN > 0` evaluates to False, so never-bought cells become 0 without a separate fillna.
# The vectorised comparison replaces `applymap(lambda ...)`, which is deprecated in
# recent pandas and makes one Python call per cell.
customer_item_matrix = customer_item_matrix.gt(0).astype('int64')
customer_item_matrix.head()

StockCode,10002,10080,10120,10125,10133,10135,11001,15030,15034,15036,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12349.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12350.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Pairwise cosine similarity between every pair of customer rows, wrapped in a
# DataFrame (rows and columns are positional at this point, not customer IDs).
pairwise_scores = cosine_similarity(customer_item_matrix)
user_user_similarity_matrix = pd.DataFrame(pairwise_scores)

# Each customer compared with itself scores 1, hence the diagonal of 1s below.
user_user_similarity_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4329,4330,4331,4332,4333,4334,4335,4336,4337,4338
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.063022,0.04613,0.047795,0.038484,0.0,0.025876,0.136641,0.094742,...,0.0,0.029709,0.052668,0.0,0.032844,0.062318,0.0,0.113776,0.109364,0.012828
2,0.0,0.063022,1.0,0.024953,0.051709,0.027756,0.0,0.027995,0.118262,0.146427,...,0.0,0.064282,0.113961,0.0,0.0,0.0,0.0,0.0,0.170905,0.083269
3,0.0,0.04613,0.024953,1.0,0.056773,0.137137,0.0,0.030737,0.032461,0.144692,...,0.0,0.105868,0.0,0.0,0.039014,0.0,0.0,0.067574,0.137124,0.030475
4,0.0,0.047795,0.051709,0.056773,1.0,0.031575,0.0,0.0,0.0,0.033315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044866,0.0


In [25]:
# Label both axes of the similarity matrix with customer IDs instead of positions.
#
# This relies on the similarity matrix having been computed directly from
# customer_item_matrix, so row/column position i corresponds to the i-th CustomerID
# in customer_item_matrix.index (the rows were not reordered in between).
#
# The index is assigned directly rather than by adding a temporary 'CustomerID'
# column and calling set_index: that detour briefly mixes a string column label in
# with the numeric customer-ID labels. A length mismatch raises here, which also
# acts as a sanity check on the shapes.
customer_ids = customer_item_matrix.index
user_user_similarity_matrix.columns = customer_ids
user_user_similarity_matrix.index = customer_ids

In [28]:
# Rank all customers by cosine similarity to customer 12350 and show the top five.
# In the output below the first row is the customer itself (score 1.0), so the
# closest *other* customer is the second row.
user_user_similarity_matrix.loc[12350.0].sort_values(ascending=False).head()

CustomerID
12350.0    1.000000
17935.0    0.183340
12414.0    0.181902
12652.0    0.175035
16692.0    0.171499
Name: 12350.0, dtype: float64

In [45]:
# Compare the items bought by customer 12350 (A) and its closest neighbour 17935 (B).
# Boolean masking is used to pick the purchased items; it replaces `Series.nonzero()`,
# which current pandas releases no longer provide.
purchases_A = customer_item_matrix.loc[12350.0]
purchases_B = customer_item_matrix.loc[17935.0]
item_list_A = set(purchases_A[purchases_A != 0].index)
item_list_B = set(purchases_B[purchases_B != 0].index)

# B - A = items B has bought that A has not, i.e. candidates to recommend to A.
# This is the direction that is justified: B is A's most similar customer, but A is
# not necessarily B's most similar customer (B may have a higher pairwise score with
# someone else), so A's purchases are not automatically the best recommendations for B.
# The book frames the recommendation the other way round, which looks wrong for that reason.
item_list_B - item_list_A

{20657,
 20659,
 20828,
 20856,
 21051,
 21867,
 22208,
 22209,
 22210,
 22211,
 22449,
 22450,
 22553,
 22640,
 22659,
 22749,
 22752,
 22753,
 22754,
 22755,
 23290,
 23292,
 23309,
 '85099B'}

In [51]:
# Same ranking, this time from customer 17935's point of view.
user_user_similarity_matrix.loc[17935.0].sort_values(ascending=False).head()
# Suspicion confirmed: 12350 is not among 17935's closest customers in the output
# below (14813 ranks first), so "most similar customer" is not a mutual relation
# even though the pairwise score itself is the same in both directions.

CustomerID
17935.0    1.000000
14813.0    0.264575
12650.0    0.211289
16305.0    0.201456
18174.0    0.188982
Name: 17935.0, dtype: float64

In [None]:
# Now let's do this programmatically. Assume that we only want to show at most 10 recommended items,
# taken from the user with the highest pairwise similarity score.

In [58]:
# Create a dictionary to store the recommended products
from collections import Counter
# Maps CustomerID -> list of recommended StockCodes.
# A plain dict is used because the values are lists, not counts.
recommend_dict = {}

# Upper bound on the number of recommendations stored per customer.
no_of_items_to_show = 10

for customerID in user_user_similarity_matrix.index:

    # Similarity of this customer to every *other* customer. The customer's own entry
    # is dropped explicitly instead of assuming it sorts first: another customer with
    # an identical basket also scores 1.0, and the order among ties is not guaranteed.
    scores_vs_others = user_user_similarity_matrix.loc[customerID].drop(customerID)

    if scores_vs_others.empty:
        # Only one customer in the matrix: there is nobody to borrow recommendations from.
        recommend_dict[customerID] = []
        continue

    # idxmax is a single linear scan, so the whole row no longer has to be sorted.
    # NOTE(review): if the best score is 0 the customer shares no items with anyone and
    # the neighbour picked here is arbitrary — consider skipping such customers.
    similar_custID = scores_vs_others.idxmax()

    # Items each of the two customers has purchased (non-zero cells in their rows).
    # Boolean masking replaces `Series.nonzero()`, which current pandas no longer provides.
    purchases_A = customer_item_matrix.loc[customerID]
    purchases_B = customer_item_matrix.loc[similar_custID]
    item_list_A = set(purchases_A[purchases_A != 0].index)
    item_list_B = set(purchases_B[purchases_B != 0].index)

    # Items the similar customer bought that this customer has not bought yet.
    # Sets have no defined order, so sort before truncating to make the selection
    # reproducible; key=str because stock codes mix ints and strings.
    recommend_item_list = sorted(item_list_B - item_list_A, key=str)

    # Slicing past the end is safe: customers with fewer candidates simply get fewer items.
    recommend_dict[customerID] = recommend_item_list[:no_of_items_to_show]

# Future improvements:
# - Sort the items by R, F, M scores before showing them to the customer in the UI.
# - Add backend logic so that customers with fewer than 10 recommendations are topped up
#   using other criteria.

In [55]:
# Sanity check: slicing beyond the end of a list does not raise; it just returns
# every element that exists. Here a 7-element list is sliced with [:10].
temp_list = list(range(1, 8))
temp_list[:10]

[1, 2, 3, 4, 5, 6, 7]