In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import scale

## Load Data

In [2]:
df_retail = pd.read_csv('./retail_transactions.csv', encoding='ISO-8859-1')

In [3]:
# Replace spaces and remove characters

df_retail['clean_description'] = df_retail['Description']
df_retail['clean_description'] = df_retail['clean_description'].str.replace(' ','_')
df_retail['clean_description'].str.replace('\W','')

0           WHITE_HANGING_HEART_TLIGHT_HOLDER
1                         WHITE_METAL_LANTERN
2              CREAM_CUPID_HEARTS_COAT_HANGER
3         KNITTED_UNION_FLAG_HOT_WATER_BOTTLE
4               RED_WOOLLY_HOTTIE_WHITE_HEART
                         ...                 
541904            PACK_OF_20_SPACEBOY_NAPKINS
541905            CHILDRENS_APRON_DOLLY_GIRL_
541906          CHILDRENS_CUTLERY_DOLLY_GIRL_
541907        CHILDRENS_CUTLERY_CIRCUS_PARADE
541908          BAKING_SET_9_PIECE_RETROSPOT_
Name: clean_description, Length: 541909, dtype: object

In [4]:
# Drop missing values
df_retail.dropna(inplace=True)

In [5]:
# Convert to list format

retail_list = df_retail.groupby('InvoiceNo').clean_description.apply(list)

## Build Model

Association Rule Model

In [6]:
# Encode data as transaction matrix

from mlxtend.preprocessing import TransactionEncoder

te_model = TransactionEncoder()
te_model_arr = te_model.fit(retail_list).transform(retail_list)
df_retail = pd.DataFrame(te_model_arr, columns=te_model.columns_)
df_retail

Unnamed: 0,10_COLOUR_SPACEBOY_PEN,12_COLOURED_PARTY_BALLOONS,12_DAISY_PEGS_IN_WOOD_BOX,12_EGG_HOUSE_PAINTED_WOOD,12_HANGING_EGGS_HAND_PAINTED,12_IVORY_ROSE_PEG_PLACE_SETTINGS,12_MESSAGE_CARDS_WITH_ENVELOPES,12_PENCILS_SMALL_TUBE_RED_RETROSPOT,12_PENCILS_SMALL_TUBE_SKULL,12_PENCILS_TALL_TUBE_POSY,...,_DOLLY_GIRL_BEAKER,_I_LOVE_LONDON_MINI_BACKPACK,_I_LOVE_LONDON_MINI_RUCKSACK,_NINE_DRAWER_OFFICE_TIDY,_OVAL_WALL_MIRROR_DIAMANTE_,_RED_SPOT_GIFT_BAG_LARGE,_SET_2_TEA_TOWELS_I_LOVE_LONDON_,_SPACEBOY_BABY_GIFT_SET,_TOADSTOOL_BEDSIDE_LIGHT_,_TRELLIS_COAT_RACK
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22185,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22186,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22187,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22188,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Determine the items and itemsets with at least 1% support

from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df_retail, min_support=0.01, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.011221,(10_COLOUR_SPACEBOY_PEN)
1,0.014015,(12_PENCILS_SMALL_TUBE_RED_RETROSPOT)
2,0.013249,(12_PENCILS_SMALL_TUBE_SKULL)
3,0.010680,(12_PENCILS_TALL_TUBE_RED_RETROSPOT)
4,0.012528,(12_PENCIL_SMALL_TUBE_WOODLAND)
...,...,...
734,0.010140,"(LUNCH_BAG_SUKI_DESIGN_, LUNCH_BAG_RED_RETROSP..."
735,0.011447,"(LUNCH_BAG_SUKI_DESIGN_, LUNCH_BAG_RED_RETROSP..."
736,0.010455,"(LUNCH_BAG__BLACK_SKULL., LUNCH_BAG_RED_RETROS..."
737,0.012213,"(PINK_REGENCY_TEACUP_AND_SAUCER, REGENCY_CAKES..."


In [8]:
# Evaluate metrics and filter items/itemsets that have at least 70% confidence

from mlxtend.frequent_patterns import association_rules

a_rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)
a_rules.sort_values(by=['confidence'],ascending=False, inplace=True)
a_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
35,"(PINK_REGENCY_TEACUP_AND_SAUCER, REGENCY_CAKES...",(GREEN_REGENCY_TEACUP_AND_SAUCER),0.012213,0.033033,0.010861,0.889299,26.921613,0.010457,8.734936
24,"(PINK_REGENCY_TEACUP_AND_SAUCER, ROSES_REGENCY...",(GREEN_REGENCY_TEACUP_AND_SAUCER),0.020324,0.033033,0.017891,0.880266,26.648164,0.01722,8.075966
33,"(GREEN_REGENCY_TEACUP_AND_SAUCER, PINK_REGENCY...",(ROSES_REGENCY_TEACUP_AND_SAUCER_),0.012348,0.037675,0.010861,0.879562,23.34627,0.010396,7.990217
21,"(PINK_REGENCY_TEACUP_AND_SAUCER, REGENCY_CAKES...",(GREEN_REGENCY_TEACUP_AND_SAUCER),0.014376,0.033033,0.012348,0.858934,26.002386,0.011873,6.854722
32,"(PINK_REGENCY_TEACUP_AND_SAUCER, REGENCY_CAKES...",(ROSES_REGENCY_TEACUP_AND_SAUCER_),0.014376,0.037675,0.012213,0.84953,22.549122,0.011671,6.395454
