# Activity 3 : Association Rule Mining
Extract frequent itemsets and association rules countrywise from the transactional data: https://archive.ics.uci.edu/ml/datasets/online+retail

In [2]:
#!pip install mlxtend

In [3]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori

In [4]:
#Dataset citation: Daqing Chen, Sai Liang Sain, and Kun Guo, Data mining for the online retail industry:
#A case study of RFM model-based customer segmentation using data mining, Journal of Database Marketing and 
#Customer Strategy Management, Vol. 19, No. 3, pp. 197–208, 2012
#(Published online before print: 27 August 2012. doi: 10.1057/dbm.2012.17).
#Url: https://archive.ics.uci.edu/ml/datasets/online+retail#

In [5]:
# Read the Online Retail workbook directly from the UCI archive (requires network access)
# and preview the first rows.
ONLINE_RETAIL_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
transaction_df = pd.read_excel(ONLINE_RETAIL_URL)
transaction_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [6]:
# Dimensions of the raw data as a (rows, columns) tuple.
transaction_df.shape

(541909, 8)

In [7]:
# How often each item description occurs; display the first 100 entries of the count table.
transaction_df['Description'].value_counts().head(100)

WHITE HANGING HEART T-LIGHT HOLDER     2369
REGENCY CAKESTAND 3 TIER               2200
JUMBO BAG RED RETROSPOT                2159
PARTY BUNTING                          1727
LUNCH BAG RED RETROSPOT                1638
ASSORTED COLOUR BIRD ORNAMENT          1501
SET OF 3 CAKE TINS PANTRY DESIGN       1473
PACK OF 72 RETROSPOT CAKE CASES        1385
LUNCH BAG  BLACK SKULL.                1350
NATURAL SLATE HEART CHALKBOARD         1280
POSTAGE                                1252
JUMBO BAG PINK POLKADOT                1251
HEART OF WICKER SMALL                  1237
JAM MAKING SET WITH JARS               1229
JUMBO STORAGE BAG SUKI                 1214
PAPER CHAIN KIT 50'S CHRISTMAS         1210
JUMBO SHOPPER VINTAGE RED PAISLEY      1202
LUNCH BAG CARS BLUE                    1197
LUNCH BAG SPACEBOY DESIGN              1192
JAM MAKING SET PRINTED                 1182
RECIPE BOX PANTRY YELLOW DESIGN        1180
SPOTTY BUNTING                         1172
LUNCH BAG SUKI DESIGN           

In [8]:
# Restrict the data to rows whose Description is among the first 100 entries
# of the description count table (the most frequently occurring items).
#
# The labels are taken from the value_counts() index directly. Looking them up
# via `.reset_index()['index']` raises KeyError on pandas >= 2.0, where the
# reset column is named after the counted Series ('Description'), not 'index'.
top_descriptions = transaction_df['Description'].value_counts().head(100).index
transaction_df_small = transaction_df[transaction_df['Description'].isin(top_descriptions)]

In [9]:
# Dimensions of the filtered data as a (rows, columns) tuple.
transaction_df_small.shape

(99739, 8)

In [10]:
# Number of rows per country in the filtered data.
transaction_df_small['Country'].value_counts()

United Kingdom          90086
Germany                  2082
France                   2068
EIRE                     1492
Belgium                   571
Netherlands               531
Spain                     484
Portugal                  339
Switzerland               338
Australia                 280
Norway                    179
Italy                     174
Channel Islands           161
Finland                   138
Sweden                    103
Denmark                    84
Cyprus                     83
Austria                    72
Poland                     66
Israel                     55
Unspecified                47
Hong Kong                  38
Singapore                  38
Japan                      37
Iceland                    34
USA                        28
Greece                     25
Canada                     19
RSA                        19
Malta                      17
United Arab Emirates       13
Brazil                     12
Lebanon                     9
European C

In [11]:
# Build one "basket" per country: the set of distinct item descriptions that
# appear in that country's rows. Every country therefore contributes exactly
# one transaction to the mining step below.
descriptions_by_country = (
    transaction_df_small
    .groupby('Country')['Description']
    .agg(lambda items: set(items))
)
dataset = [[str(item) for item in basket] for basket in descriptions_by_country]

# One-hot encode the baskets: one row per country, one boolean column per item.
te = TransactionEncoder()
te.fit(dataset)
te_ary = te.transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Mine itemsets present in at least 55% of the rows (i.e. of the countries).
frequent_itemsets = apriori(df, min_support=0.55, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.675676,(BAKING SET 9 PIECE RETROSPOT )
1,0.567568,(GREEN REGENCY TEACUP AND SAUCER)
2,0.675676,(JAM MAKING SET PRINTED)
3,0.702703,(JAM MAKING SET WITH JARS)
4,0.567568,(PACK OF 72 RETROSPOT CAKE CASES)
5,0.621622,(PLASTERS IN TIN WOODLAND ANIMALS)
6,0.648649,(POSTAGE)
7,0.810811,(REGENCY CAKESTAND 3 TIER)
8,0.594595,(RETROSPOT TEA SET CERAMIC 11 PC )
9,0.567568,(ROSES REGENCY TEACUP AND SAUCER )


In [12]:
# Derive association rules from the frequent itemsets, filtering on the
# confidence metric with a threshold of 0.55.
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.55)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(JAM MAKING SET WITH JARS),(BAKING SET 9 PIECE RETROSPOT ),0.702703,0.675676,0.567568,0.807692,1.195385,0.092768,1.686486
1,(BAKING SET 9 PIECE RETROSPOT ),(JAM MAKING SET WITH JARS),0.675676,0.702703,0.567568,0.84,1.195385,0.092768,1.858108
2,(REGENCY CAKESTAND 3 TIER),(BAKING SET 9 PIECE RETROSPOT ),0.810811,0.675676,0.621622,0.766667,1.134667,0.073776,1.389961
3,(BAKING SET 9 PIECE RETROSPOT ),(REGENCY CAKESTAND 3 TIER),0.675676,0.810811,0.621622,0.92,1.134667,0.073776,2.364865
4,(BAKING SET 9 PIECE RETROSPOT ),(SET OF 3 REGENCY CAKE TINS),0.675676,0.648649,0.567568,0.84,1.295,0.129291,2.195946
5,(SET OF 3 REGENCY CAKE TINS),(BAKING SET 9 PIECE RETROSPOT ),0.648649,0.675676,0.567568,0.875,1.295,0.129291,2.594595
6,(GREEN REGENCY TEACUP AND SAUCER),(REGENCY CAKESTAND 3 TIER),0.567568,0.810811,0.567568,1.0,1.233333,0.107378,inf
7,(REGENCY CAKESTAND 3 TIER),(GREEN REGENCY TEACUP AND SAUCER),0.810811,0.567568,0.567568,0.7,1.233333,0.107378,1.441441
8,(JAM MAKING SET PRINTED),(JAM MAKING SET WITH JARS),0.675676,0.702703,0.567568,0.84,1.195385,0.092768,1.858108
9,(JAM MAKING SET WITH JARS),(JAM MAKING SET PRINTED),0.702703,0.675676,0.567568,0.807692,1.195385,0.092768,1.686486
