In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the transaction data from a CSV file located in the working directory.
df = pd.read_csv('sales_data.csv')
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
# (rows, columns) of the frame as loaded, i.e. before the cleaning cell runs.
df.shape

(1609280, 8)

In [5]:
# Tidy the transactions before any baskets are built. Each step returns a new
# frame, so nothing is modified in place.
df = (
    df.assign(Description=df['Description'].str.strip())  # trim stray whitespace around item names
      .dropna(subset=['InvoiceNo'])                       # rows without an invoice number cannot be grouped into a basket
      .astype({'InvoiceNo': 'str'})                       # string values so the .str accessor below can be used
)
# Invoice numbers containing a 'C' are treated as credit transactions; remove them all.
is_credit = df['InvoiceNo'].str.contains('C')
df = df.loc[~is_credit]

In [6]:
# Number of rows per country after cleaning, most frequent first.
df.Country.value_counts()

United Kingdom          1452302
Germany                   25745
EIRE                      25248
France                    22349
Netherlands                7456
Spain                      6205
Switzerland                5104
Belgium                    5100
Portugal                   4063
Australia                  3000
Norway                     2509
Channel Islands            2317
Italy                      2226
Sweden                     1789
Cyprus                     1769
Finland                    1717
Austria                    1320
Unspecified                1198
Denmark                    1178
Poland                      842
Japan                       806
Greece                      802
Israel                      664
Hong Kong                   642
USA                         588
Singapore                   561
United Arab Emirates        535
Iceland                     435
Malta                       394
Canada                      379
RSA                         226
Lithuani

In [7]:
# Only run analysis for German customers
german_rows = df.loc[df['Country'] == 'Germany']

# Total quantity of every item on every invoice.
quantity_per_item = german_rows.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()

# Pivot to one row per invoice and one column per item description;
# combinations that never occurred become 0 instead of NaN.
my_basket = (
    quantity_per_item
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [8]:
# Preview the invoice-by-item quantity matrix built in the previous cell.
my_basket.head()

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 MINI TOADSTOOL PEGS,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE RED SPOTTY,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,...,ZINC HEART T-LIGHT HOLDER,ZINC STAR T-LIGHT HOLDER,ZINC BOX SIGN HOME,ZINC FOLKART SLEIGH BELLS,ZINC HEART LATTICE T-LIGHT HOLDER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,24.0
490395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Convert all positive values to 1 and every other thing to 0
def encoded_units(x):
    """Binarise a basket quantity: 1 if the item was bought, otherwise 0.

    Parameters
    ----------
    x : int or float
        Quantity of one item on one invoice.

    Returns
    -------
    int
        1 when ``x`` is strictly positive; 0 for zero, negative values and NaN.
    """
    # A single strict comparison also maps fractional values in (0, 1) to 1
    # and NaN to 0, so the function can never fall through and return None.
    return 1 if x > 0 else 0

# Encode every cell with encoded_units. Mapping column by column through
# Series.map gives the same element-wise result as DataFrame.applymap without
# relying on that method, which recent pandas releases deprecate.
my_basket_sets = my_basket.apply(lambda column: column.map(encoded_units))

# Remove postage as an item as it doesn't make much sense.
# drop() returns a new frame (no in-place mutation), and errors='ignore' keeps
# the cell re-runnable and usable for baskets that have no POSTAGE column.
my_basket_sets = my_basket_sets.drop(columns='POSTAGE', errors='ignore')

### Train Model

In [11]:
# Generate frequent itemsets
# min_support=0.07 is the minimum share of rows (invoices) an itemset must
# appear in to be kept; use_colnames=True reports itemsets by column name
# (the item descriptions) rather than by column position.
freq_itemsets = apriori(my_basket_sets, min_support=0.07, use_colnames=True)



In [12]:
# Create rules
# Rules are scored by lift and only those reaching the threshold of 1 are kept.
# NOTE(review): some mlxtend releases require an extra `num_itemsets` argument
# here — confirm against the installed version.
rules = association_rules(freq_itemsets, metric='lift', min_threshold=1)

In [22]:
# Preview the first generated rules together with their metrics.
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(PLASTERS IN TIN WOODLAND ANIMALS),(ROUND SNACK BOXES SET OF4 WOODLAND),0.135615,0.262357,0.072243,0.53271,2.030475,0.036664,1.578555
1,(ROUND SNACK BOXES SET OF4 WOODLAND),(PLASTERS IN TIN WOODLAND ANIMALS),0.262357,0.135615,0.072243,0.275362,2.030475,0.036664,1.192852
2,(ROUND SNACK BOXES SET OF 4 FRUITS),(ROUND SNACK BOXES SET OF4 WOODLAND),0.169835,0.262357,0.136882,0.80597,3.072031,0.092325,3.801696
3,(ROUND SNACK BOXES SET OF4 WOODLAND),(ROUND SNACK BOXES SET OF 4 FRUITS),0.262357,0.169835,0.136882,0.521739,3.072031,0.092325,1.735799
4,(ROUND SNACK BOXES SET OF4 WOODLAND),(WOODLAND CHARLOTTE BAG),0.262357,0.166033,0.08365,0.318841,1.920345,0.04009,1.224335


### Make Recommendations

In [18]:
# Filter rules based on condition: keep only rules whose lift is at least 3
# and whose confidence is at least 0.3.
has_high_lift = rules['lift'].ge(3)
has_enough_confidence = rules['confidence'].ge(0.3)
rules.loc[has_high_lift & has_enough_confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(ROUND SNACK BOXES SET OF 4 FRUITS),(ROUND SNACK BOXES SET OF4 WOODLAND),0.169835,0.262357,0.136882,0.80597,3.072031,0.092325,3.801696
3,(ROUND SNACK BOXES SET OF4 WOODLAND),(ROUND SNACK BOXES SET OF 4 FRUITS),0.262357,0.169835,0.136882,0.521739,3.072031,0.092325,1.735799


In [20]:
# Sum of the encoded column for this item, i.e. how many invoices contain it.
my_basket_sets['ROUND SNACK BOXES SET OF4 WOODLAND'].sum()

207

In [19]:
# Sum of the encoded column for this item, i.e. how many invoices contain it.
my_basket_sets['ROUND SNACK BOXES SET OF 4 FRUITS'].sum()

134