### Imports

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from time import time
from scipy.stats import fisher_exact
from jupyterthemes import jtplot
jtplot.style(theme='onedork', context='talk', fscale=1.8, spines=False, gridlines='--', ticks=True, grid=False, figsize=(12, 8))
import warnings
warnings.filterwarnings('ignore')

### Load the data & build the product-transaction matrix

In [2]:
def get_transaction_data():
    """Load groceries transaction data into DataFrame"""
    df = pd.read_csv('grocery_transactions.csv')
    df = df.stack().reset_index(-1, drop=True)
    df.index.names = ['tx_id']
    return pd.get_dummies(df, prefix='', prefix_sep='').groupby(level='tx_id').sum()

In [3]:
data = get_transaction_data()

item_id = pd.Series(dict(enumerate(data.columns)))
transactions = data.values
n_txn = transactions.shape[0]
min_support = 0.01

item_length = 1
candidates = list(zip(item_id.index))
itemsets = pd.DataFrame(columns=['support', 'length'])

new_rules = []
rule_data = ['itemset', 'antecedent', 'consequent',
             'support_rule', 'support_antecedent', 'support_consequent',
             'confidence', 'lift', 'pvalue']
rules = pd.DataFrame(columns=rule_data)

In [4]:
def prune_candidates(all_txn, candidates, candidate_size, min_support):
    """Return DataFrame with itemsets of candidate_size with min_support
        all_txn: numpy array of transaction-product matrix
        candidates: list of tuples containing product id
        candidate_size: length of item set
        min_support: support threshold
        """
    itemsets = {}
    for candidate in candidates:
        candidate_txn = all_txn[:, candidate].reshape(-1, candidate_size)
        relevant_txn = candidate_txn[(candidate_txn == 1).all(axis=1)]
        support = relevant_txn.shape[0] / all_txn.shape[0]
        if support >= min_support:
            itemsets[frozenset(candidate)] = support
    return pd.Series(itemsets).to_frame('support').assign(length=candidate_size)

In [5]:
def find_association_rules(itemsets, n_txn, n_items, min_confidence=0, min_lift=0, min_pvalue=0):
    """Find rules {antecedent} => {consequent} with min_confidence, min_lift and min_pvalue
    itemsets: DataFrame containing all itemsets and their support
    min_confidence, min_lift, min_pvalue: confidence & lift & pvalue thresholds
    """
    support = itemsets.loc[:, 'support'].to_dict()
    new_rules = []
    for itemset in itemsets.loc[itemsets.length == n_items].index:
        for n_antecedents in range(1, n_items):
            antecedents = [frozenset(a)
                           for a in combinations(itemset, r=n_antecedents)]
            for antecedent in antecedents:
                consequent = itemset.difference(antecedent)
                sAC = support[itemset]
                sA, sC = support[antecedent], support[consequent]
                confidence = sAC / sA
                lift = sAC / (sA * sC)
                contingency_table = n_txn * np.array([[sAC, sA - sAC],
                                        [sC - sAC, 1 - sA - sC + sAC]])
                _, p_value = fisher_exact(contingency_table,
                                          alternative='greater')

                if (confidence >= min_confidence) and (lift >= min_lift) and (p_value >= min_pvalue):
                    new_rule = [itemset, antecedent, consequent,
                                support[itemset], support[antecedent], support[consequent],
                                confidence, lift, p_value]
                    new_rules.append(new_rule)
    return new_rules

In [6]:
while candidates:
    new_items = prune_candidates(transactions, candidates, item_length, min_support)
    itemsets = itemsets.append(new_items)

    if item_length > 1:
        new_rules = find_association_rules(itemsets, n_txn, item_length)
        rules = pd.concat([rules, pd.DataFrame(new_rules, columns=rules.columns)], ignore_index=True)
        
    print('Itemset Length {}\tCandidates: {:>7,.0f}\tNew Items: {:>7,.0f}\tNew Rules: {:>7,.0f}'.format(
            item_length, len(candidates), len(new_items), len(new_rules)))
    
    item_length += 1
    remaining_items = np.unique([item for t in new_items.index for item in t])
    candidates = list(combinations(remaining_items, r=item_length))

rules = rules.apply(pd.to_numeric, errors='ignore')

Itemset Length 1	Candidates:     169	New Items:      88	New Rules:       0
Itemset Length 2	Candidates:   3,828	New Items:     213	New Rules:     426
Itemset Length 3	Candidates:  16,215	New Items:      32	New Rules:     192
Itemset Length 4	Candidates:   3,060	New Items:       0	New Rules:       0


In [7]:
rules.info()
rules.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 9 columns):
itemset               618 non-null object
antecedent            618 non-null object
consequent            618 non-null object
support_rule          618 non-null float64
support_antecedent    618 non-null float64
support_consequent    618 non-null float64
confidence            618 non-null float64
lift                  618 non-null float64
pvalue                618 non-null float64
dtypes: float64(6), object(3)
memory usage: 43.5+ KB


Unnamed: 0,itemset,antecedent,consequent,support_rule,support_antecedent,support_consequent,confidence,lift,pvalue
0,"(9, 103)",(9),(103),0.019727,0.052471,0.193512,0.375969,1.942869,2.229585e-23
1,"(9, 103)",(103),(9),0.019727,0.193512,0.052471,0.101944,1.942869,2.229585e-23
2,"(9, 123)",(9),(123),0.013626,0.052471,0.183954,0.25969,1.411714,7.615064e-06
3,"(9, 123)",(123),(9),0.013626,0.183954,0.052471,0.074074,1.411714,7.655712e-06
4,"(9, 124)",(9),(124),0.017389,0.052471,0.10901,0.331395,3.040058,7.774255e-45


In [8]:
with pd.HDFStore('rules.h5') as store:
    store.put('rules', rules)