# Apriori Content Recommendations

In [1]:
import warnings
import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
%matplotlib inline

df = pd.read_excel('Online Retail.xlsx')

### Data cleaning

In [2]:
df['Description'] = df['Description'].str.strip() # Removes blank spaces 
df['InvoiceNo']   = df['InvoiceNo'].astype('str') # Converts to string
df = df[~df['InvoiceNo'].str.contains('C')]       # Removes 'Cancelled' transactions 
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


### Handling missing values

In [3]:
for feature in df.columns:
    print(feature, 'has', round(df[feature].isnull().mean(),4), '% of missing values')   
df.shape

InvoiceNo has 0.0 % of missing values
StockCode has 0.0 % of missing values
Description has 0.0027 % of missing values
Quantity has 0.0 % of missing values
InvoiceDate has 0.0 % of missing values
UnitPrice has 0.0 % of missing values
CustomerID has 0.2529 % of missing values
Country has 0.0 % of missing values


(532621, 8)

In [4]:
df.dropna(axis=0, inplace=True) # Removes missing values
df.shape

(397924, 8)

### Create Market Basket by Country

In [5]:
top_countries = df['Country'].value_counts()
top_countries

United Kingdom          354345
Germany                   9042
France                    8342
EIRE                      7238
Spain                     2485
Netherlands               2363
Belgium                   2031
Switzerland               1842
Portugal                  1462
Australia                 1185
Norway                    1072
Italy                      758
Channel Islands            748
Finland                    685
Cyprus                     614
Sweden                     451
Austria                    398
Denmark                    380
Poland                     330
Japan                      321
Israel                     248
Unspecified                244
Singapore                  222
Iceland                    182
USA                        179
Canada                     151
Greece                     145
Malta                      112
United Arab Emirates        68
European Community          60
RSA                         58
Lebanon                     45
Lithuani

In [6]:
baskets = []
i = 0
    
for country in top_countries.index:
    individual_basket = (df[ df['Country'] == country ]
                         .groupby(['InvoiceNo', 'Description'])['Quantity']
                         .sum()
                         .unstack()
                         .reset_index()
                         .fillna(0)
                         .set_index('InvoiceNo'))
    baskets.insert(i, individual_basket)
    i += 1


Convert to one-hot DataFrame

In [7]:
for i in range(len(baskets)):
    baskets[i] = baskets[i].apply(lambda x: x>=1, 1).replace(False, 0).replace(True,1)

In [8]:
baskets[1].head() # Germany's Market Basket

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE SKULLS,...,YULETIDE IMAGES GIFT WRAP SET,ZINC HEART T-LIGHT HOLDER,ZINC STAR T-LIGHT HOLDER,ZINC BOX SIGN HOME,ZINC FOLKART SLEIGH BELLS,ZINC HEART LATTICE T-LIGHT HOLDER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC WILLIE WINKIE CANDLE STICK
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536861,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536983,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train Recommendation Models
#### Examples
- Germany 
- Brazil

In [17]:
print(top_countries.index.get_loc('Germany'))
print(top_countries.index.get_loc('Spain'))
len(baskets[33])

1
4


1

In [21]:
frequent_products_germany = apriori(baskets[1], 
                                    min_support  = 0.07,
                                    use_colnames = True)

frequent_products_spain   = apriori(baskets[4], 
                                    min_support  = 0.07,
                                    use_colnames = True)

In [22]:
germany_rules = association_rules(frequent_products_germany,
                             metric        = "lift",
                             min_threshold = 1)

spain_rules  = association_rules(frequent_products_spain,
                             metric        = "lift",
                             min_threshold = 1)

- Association Rules 

In [23]:
germany_rules.head().sort_values(by = ['lift'])

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(POSTAGE),(PLASTERS IN TIN CIRCUS PARADE),0.818381,0.115974,0.100656,0.122995,1.060539,0.005746,1.008006
2,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.100656,0.818381,0.087527,0.869565,1.062544,0.005152,1.392414
3,(POSTAGE),(JUMBO BAG WOODLAND ANIMALS),0.818381,0.100656,0.087527,0.106952,1.062544,0.005152,1.007049
0,(6 RIBBONS RUSTIC CHARM),(POSTAGE),0.102845,0.818381,0.091904,0.893617,1.091933,0.007738,1.707221
1,(POSTAGE),(6 RIBBONS RUSTIC CHARM),0.818381,0.102845,0.091904,0.112299,1.091933,0.007738,1.010651


In [24]:
spain_rules.head().sort_values(by = ['lift'])

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(6 RIBBONS RUSTIC CHARM),(POSTAGE),0.166667,0.688889,0.144444,0.866667,1.258065,0.02963,2.333333
3,(POSTAGE),(6 RIBBONS RUSTIC CHARM),0.688889,0.166667,0.144444,0.209677,1.258065,0.02963,1.054422
4,(POSTAGE),(ASSORTED COLOUR BIRD ORNAMENT),0.688889,0.133333,0.122222,0.177419,1.330645,0.03037,1.053595
1,(ASSORTED COLOUR BIRD ORNAMENT),(6 RIBBONS RUSTIC CHARM),0.133333,0.166667,0.1,0.75,4.5,0.077778,3.333333
0,(6 RIBBONS RUSTIC CHARM),(ASSORTED COLOUR BIRD ORNAMENT),0.166667,0.133333,0.1,0.6,4.5,0.077778,2.166667


In [25]:
baskets[1].filter(items=['ROUND SNACK BOXES SET OF4 WOODLAND']).sum()

Description
ROUND SNACK BOXES SET OF4 WOODLAND    112
dtype: int64

###  Recommendations based on Rules 
    - lift
    - confidence
    - support

In [26]:
germany_rules[(germany_rules['lift'] >= 2) & 
              (germany_rules['confidence'] >= 0.5) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
11,(PLASTERS IN TIN WOODLAND ANIMALS),(ROUND SNACK BOXES SET OF4 WOODLAND),0.137856,0.245077,0.074398,0.539683,2.202098,0.040613,1.640006
24,(ROUND SNACK BOXES SET OF4 WOODLAND),(ROUND SNACK BOXES SET OF 4 FRUITS),0.245077,0.157549,0.131291,0.535714,3.400298,0.092679,1.814509
25,(ROUND SNACK BOXES SET OF 4 FRUITS),(ROUND SNACK BOXES SET OF4 WOODLAND),0.157549,0.245077,0.131291,0.833333,3.400298,0.092679,4.52954
27,(SPACEBOY LUNCH BOX),(ROUND SNACK BOXES SET OF4 WOODLAND),0.102845,0.245077,0.070022,0.680851,2.778116,0.044817,2.365427
28,"(POSTAGE, ROUND SNACK BOXES SET OF 4 FRUITS)",(ROUND SNACK BOXES SET OF4 WOODLAND),0.150985,0.245077,0.124726,0.826087,3.37073,0.087724,4.34081
29,"(POSTAGE, ROUND SNACK BOXES SET OF4 WOODLAND)",(ROUND SNACK BOXES SET OF 4 FRUITS),0.225383,0.157549,0.124726,0.553398,3.51254,0.089218,1.886357
32,(ROUND SNACK BOXES SET OF 4 FRUITS),"(POSTAGE, ROUND SNACK BOXES SET OF4 WOODLAND)",0.157549,0.225383,0.124726,0.791667,3.51254,0.089218,3.718162
33,(ROUND SNACK BOXES SET OF4 WOODLAND),"(POSTAGE, ROUND SNACK BOXES SET OF 4 FRUITS)",0.245077,0.150985,0.124726,0.508929,3.37073,0.087724,1.728904


In [32]:
spain_rules[(spain_rules['confidence'] >= 0.7) & 
              (spain_rules['lift'] >= 2) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1,(ASSORTED COLOUR BIRD ORNAMENT),(6 RIBBONS RUSTIC CHARM),0.133333,0.166667,0.1,0.75,4.5,0.077778,3.333333
14,(POPPY'S PLAYHOUSE KITCHEN),(POPPY'S PLAYHOUSE BEDROOM),0.077778,0.088889,0.077778,1.0,11.25,0.070864,inf
15,(POPPY'S PLAYHOUSE BEDROOM),(POPPY'S PLAYHOUSE KITCHEN),0.088889,0.077778,0.077778,0.875,11.25,0.070864,7.377778
26,"(POSTAGE, ASSORTED COLOUR BIRD ORNAMENT)",(6 RIBBONS RUSTIC CHARM),0.122222,0.166667,0.1,0.818182,4.909091,0.07963,4.583333
29,(ASSORTED COLOUR BIRD ORNAMENT),"(6 RIBBONS RUSTIC CHARM, POSTAGE)",0.133333,0.144444,0.1,0.75,5.192308,0.080741,3.422222
