In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the sales transactions from 'sales_data.csv' (path is relative to the
# working directory) and preview the first rows to check the columns.
df = pd.read_csv('sales_data.csv')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
# (rows, columns) of the data as loaded, before any cleaning
df.shape

(1609280, 8)

In [4]:
# Tidy the transaction rows before any baskets are built.
# 1. Trim stray leading/trailing whitespace from the product descriptions.
df = df.assign(Description=df['Description'].str.strip())
# 2. Discard rows that have no invoice number.
df = df.dropna(subset=['InvoiceNo'], axis=0)
# 3. Store invoice numbers as strings so the .str accessor below works on every value.
df = df.assign(InvoiceNo=df['InvoiceNo'].astype('str'))
# 4. Remove all credit transactions (invoice numbers containing 'C').
credit_mask = df['InvoiceNo'].str.contains('C')
df = df[~credit_mask]

In [5]:
# (rows, columns) after cleaning, to see how many rows were removed
df.shape

(1580498, 8)

In [6]:
# Number of transaction rows per country, most frequent first
df.Country.value_counts()

United Kingdom          1452302
Germany                   25745
EIRE                      25248
France                    22349
Netherlands                7456
Spain                      6205
Switzerland                5104
Belgium                    5100
Portugal                   4063
Australia                  3000
Norway                     2509
Channel Islands            2317
Italy                      2226
Sweden                     1789
Cyprus                     1769
Finland                    1717
Austria                    1320
Unspecified                1198
Denmark                    1178
Poland                      842
Japan                       806
Greece                      802
Israel                      664
Hong Kong                   642
USA                         588
Singapore                   561
United Arab Emirates        535
Iceland                     435
Malta                       394
Canada                      379
RSA                         226
Lithuani

In [7]:
# Restrict the analysis to Netherlands customers.
netherlands_rows = df[df['Country'] == 'Netherlands']

# Total quantity of each product on each invoice.
quantity_per_item = netherlands_rows.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()

# Pivot to one row per invoice and one column per product description;
# products that do not appear on an invoice are filled with 0.
my_basket = (
    quantity_per_item
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [8]:
# Preview the invoice-by-product quantity matrix
my_basket.head()

Description,10 COLOUR SPACEBOY PEN,12 ASS ZINC CHRISTMAS DECORATIONS,12 EGG HOUSE PAINTED WOOD,12 IVORY ROSE PEG PLACE SETTINGS,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,12 PINK ROSE PEG PLACE SETTINGS,12 RED ROSE PEG PLACE SETTINGS,...,YELLOW METAL CHICKEN HEART,YULETIDE IMAGES GIFT WRAP SET,ZINC HEART T-LIGHT HOLDER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HEART LATTICE T-LIGHT HOLDER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC WILLIE WINKIE CANDLE STICK
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,216.0,0.0,0.0,0.0
489890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490158,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0
490964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
492248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Convert all positive values to 1 and every other thing to 0
def encoded_units(x):
    """Binarise a basket quantity into a presence flag.

    Parameters
    ----------
    x : number
        Quantity of one product on one invoice.

    Returns
    -------
    int
        1 if ``x`` is strictly positive, otherwise 0. Zero, negative values
        and NaN all map to 0.
    """
    # A single comparison covers every input; the previous pair of checks
    # (x <= 0 / x >= 1) returned None for values between 0 and 1 and for NaN.
    return 1 if x > 0 else 0

# Binarise every quantity cell by cell. Mapping column-wise with Series.map
# avoids DataFrame.applymap, which newer pandas releases deprecate.
my_basket_sets = my_basket.apply(lambda column: column.map(encoded_units))
# Remove postage as an item as it doesn't make much sense. errors='ignore'
# keeps the cell working when the selected invoices have no POSTAGE column.
my_basket_sets = my_basket_sets.drop('POSTAGE', axis='columns', errors='ignore')
# Store the flags as booleans, the input type mlxtend recommends for apriori.
my_basket_sets = my_basket_sets.astype(bool)

### Train Model

In [10]:
# Mine the itemsets whose support is at least 0.07, i.e. that appear in at
# least 7% of the baskets. use_colnames=True reports each itemset by product
# name rather than by column position.
freq_itemsets = apriori(
    my_basket_sets,
    min_support=0.07,
    use_colnames=True,
)



In [11]:
# Preview the first frequent itemsets and their support
freq_itemsets.head()

Unnamed: 0,support,itemsets
0,0.082969,(CARD DOLLY GIRL)
1,0.087336,(CHARLOTTE BAG DOLLY GIRL DESIGN)
2,0.087336,(CREAM CUPID HEARTS COAT HANGER)
3,0.183406,(DOLLY GIRL LUNCH BOX)
4,0.091703,(FAWN BLUE HOT WATER BOTTLE)


In [12]:
# Derive association rules from the frequent itemsets, scoring them by lift
# with a minimum threshold of 1.
rules = association_rules(
    freq_itemsets,
    metric='lift',
    min_threshold=1,
)

In [14]:
# Preview the first generated rules with their metrics
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(DOLLY GIRL LUNCH BOX),(ROUND SNACK BOXES SET OF 4 FRUITS),0.183406,0.144105,0.082969,0.452381,3.13925,0.05654,1.562939
1,(ROUND SNACK BOXES SET OF 4 FRUITS),(DOLLY GIRL LUNCH BOX),0.144105,0.183406,0.082969,0.575758,3.13925,0.05654,1.924828
2,(ROUND SNACK BOXES SET OF4 WOODLAND),(DOLLY GIRL LUNCH BOX),0.196507,0.183406,0.091703,0.466667,2.544444,0.055663,1.531114
3,(DOLLY GIRL LUNCH BOX),(ROUND SNACK BOXES SET OF4 WOODLAND),0.183406,0.196507,0.091703,0.5,2.544444,0.055663,1.606987
4,(DOLLY GIRL LUNCH BOX),(SPACEBOY LUNCH BOX),0.183406,0.20524,0.165939,0.904762,4.408308,0.128297,8.344978


### Make Recommendations

In [18]:
# Keep only the rules with lift of at least 3 and confidence of at least 0.3.
has_high_lift = rules['lift'].ge(3)
has_enough_confidence = rules['confidence'].ge(0.3)
rule_filters = rules[has_high_lift & has_enough_confidence]
rule_filters.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(DOLLY GIRL LUNCH BOX),(ROUND SNACK BOXES SET OF 4 FRUITS),0.183406,0.144105,0.082969,0.452381,3.13925,0.05654,1.562939
1,(ROUND SNACK BOXES SET OF 4 FRUITS),(DOLLY GIRL LUNCH BOX),0.144105,0.183406,0.082969,0.575758,3.13925,0.05654,1.924828
4,(DOLLY GIRL LUNCH BOX),(SPACEBOY LUNCH BOX),0.183406,0.20524,0.165939,0.904762,4.408308,0.128297,8.344978
5,(SPACEBOY LUNCH BOX),(DOLLY GIRL LUNCH BOX),0.20524,0.183406,0.165939,0.808511,4.408308,0.128297,4.264435
6,(PLASTERS IN TIN SPACEBOY),(PLASTERS IN TIN WOODLAND ANIMALS),0.117904,0.10917,0.087336,0.740741,6.785185,0.074465,3.436057
