In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import itertools

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
df = pd.read_csv("GroceryStoreDataSet.csv",names=['products'])
df

Unnamed: 0,products
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"
5,"BREAD,TEA,BOURNVITA"
6,"MAGGI,TEA,CORNFLAKES"
7,"MAGGI,BREAD,TEA,BISCUIT"
8,"JAM,MAGGI,BREAD,TEA"
9,"BREAD,MILK"


In [3]:
df.shape

(20, 1)

In [4]:
data = list(df["products"].apply(lambda x:x.split(',')))
te = TransactionEncoder()
te_data = te.fit(data).transform(data)
df = pd.DataFrame(te_data,columns=te.columns_).astype(int)

df

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,1,0,1,0,0,0,0,0,1,0,0
1,1,0,1,0,0,1,0,0,1,0,0
2,0,1,1,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,1,1,1,0,0
4,1,0,0,0,0,0,0,1,0,0,1
5,0,1,1,0,0,0,0,0,0,0,1
6,0,0,0,0,0,1,0,1,0,0,1
7,1,0,1,0,0,0,0,1,0,0,1
8,0,0,1,0,0,0,1,1,0,0,1
9,0,0,1,0,0,0,0,0,1,0,0


In [5]:
# Find Frequency of Items
df.sum()

BISCUIT        7
BOURNVITA      4
BREAD         13
COCK           3
COFFEE         8
CORNFLAKES     6
JAM            2
MAGGI          5
MILK           5
SUGER          6
TEA            7
dtype: int64

In [6]:
# Product Frequency / Total Sales
first = pd.DataFrame(df.sum() / df.shape[0], columns = ["Support"]).sort_values("Support", ascending = False)
first

Unnamed: 0,Support
BREAD,0.65
COFFEE,0.4
BISCUIT,0.35
TEA,0.35
CORNFLAKES,0.3
SUGER,0.3
MAGGI,0.25
MILK,0.25
BOURNVITA,0.2
COCK,0.15


In [7]:
# Elimination by Support Value
first[first.Support >= 0.15]

Unnamed: 0,Support
BREAD,0.65
COFFEE,0.4
BISCUIT,0.35
TEA,0.35
CORNFLAKES,0.3
SUGER,0.3
MAGGI,0.25
MILK,0.25
BOURNVITA,0.2
COCK,0.15


In [8]:
second = list(itertools.combinations(first.index, 2))
second = [list(i) for i in second]
# Sample of combinations
second[:10]

[['BREAD', 'COFFEE'],
 ['BREAD', 'BISCUIT'],
 ['BREAD', 'TEA'],
 ['BREAD', 'CORNFLAKES'],
 ['BREAD', 'SUGER'],
 ['BREAD', 'MAGGI'],
 ['BREAD', 'MILK'],
 ['BREAD', 'BOURNVITA'],
 ['BREAD', 'COCK'],
 ['BREAD', 'JAM']]

In [9]:
# Finding support values
value = []
for i in range(0, len(second)):
    temp = df.T.loc[second[i]].sum() 
    temp = len(temp[temp == df.T.loc[second[i]].shape[0]]) / df.shape[0]
    value.append(temp)
# Create a data frame            
secondIteration = pd.DataFrame(value, columns = ["Support"])
secondIteration["index"] = [tuple(i) for i in second]
secondIteration['length'] = secondIteration['index'].apply(lambda x:len(x))
secondIteration = secondIteration.set_index("index").sort_values("Support", ascending = False)
# Elimination by Support Value
secondIteration = secondIteration[secondIteration.Support > 0.1]
secondIteration

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(TEA, MAGGI)",0.2,2
"(BREAD, TEA)",0.2,2
"(BREAD, SUGER)",0.2,2
"(BREAD, MILK)",0.2,2
"(BREAD, BISCUIT)",0.2,2
"(COFFEE, CORNFLAKES)",0.2,2
"(COFFEE, SUGER)",0.2,2
"(BREAD, COFFEE)",0.15,2
"(BREAD, MAGGI)",0.15,2
"(BREAD, BOURNVITA)",0.15,2


In [10]:
def ar_iterations(data, num_iter = 1, support_value = 0.1, iterationIndex = None):
    
    # Next Iterations
    def ar_calculation(iterationIndex = iterationIndex): 
        # Calculation of support value
        value = []
        for i in range(0, len(iterationIndex)):
            result = data.T.loc[iterationIndex[i]].sum() 
            result = len(result[result == data.T.loc[iterationIndex[i]].shape[0]]) / data.shape[0]
            value.append(result)
        # Bind results
        result = pd.DataFrame(value, columns = ["Support"])
        result["index"] = [tuple(i) for i in iterationIndex]
        result['length'] = result['index'].apply(lambda x:len(x))
        result = result.set_index("index").sort_values("Support", ascending = False)
        # Elimination by Support Value
        result = result[result.Support > support_value]
        return result    
    
    # First Iteration
    first = pd.DataFrame(df.T.sum(axis = 1) / df.shape[0], columns = ["Support"]).sort_values("Support", ascending = False)
    first = first[first.Support > support_value]
    first["length"] = 1
    
    if num_iter == 1:
        res = first.copy()
        
    # Second Iteration
    elif num_iter == 2:
        
        second = list(itertools.combinations(first.index, 2))
        second = [list(i) for i in second]
        res = ar_calculation(second)
        
    # All Iterations > 2
    else:
        nth = list(itertools.combinations(set(list(itertools.chain(*iterationIndex))), num_iter))
        nth = [list(i) for i in nth]
        res = ar_calculation(nth)
    
    return res

In [11]:
iteration1 = ar_iterations(df, num_iter=1, support_value=0.1)
iteration1

Unnamed: 0,Support,length
BREAD,0.65,1
COFFEE,0.4,1
BISCUIT,0.35,1
TEA,0.35,1
CORNFLAKES,0.3,1
SUGER,0.3,1
MAGGI,0.25,1
MILK,0.25,1
BOURNVITA,0.2,1
COCK,0.15,1


In [12]:
iteration2 = ar_iterations(df, num_iter=2, support_value=0.1)
iteration2

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(COFFEE, CORNFLAKES)",0.2,2
"(BREAD, TEA)",0.2,2
"(TEA, MAGGI)",0.2,2
"(BREAD, SUGER)",0.2,2
"(BREAD, BISCUIT)",0.2,2
"(BREAD, MILK)",0.2,2
"(COFFEE, SUGER)",0.2,2
"(BREAD, COFFEE)",0.15,2
"(COFFEE, COCK)",0.15,2
"(BISCUIT, CORNFLAKES)",0.15,2


In [13]:
iteration3 = ar_iterations(df, num_iter=3, support_value=0.01,
              iterationIndex=iteration2.index)
iteration3

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(CORNFLAKES, COCK, BISCUIT)",0.1,3
"(COFFEE, BREAD, SUGER)",0.1,3
"(COFFEE, COCK, BISCUIT)",0.1,3
"(COFFEE, CORNFLAKES, BISCUIT)",0.1,3
"(TEA, MAGGI, BISCUIT)",0.1,3
"(COFFEE, CORNFLAKES, COCK)",0.1,3
"(TEA, BREAD, MAGGI)",0.1,3
"(MILK, BREAD, BISCUIT)",0.1,3
"(TEA, BOURNVITA, BREAD)",0.1,3
"(TEA, COFFEE, MILK)",0.05,3


In [14]:
iteration4 = ar_iterations(df, num_iter=4, support_value=0.01,
              iterationIndex=iteration3.index)
iteration4

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(COFFEE, CORNFLAKES, COCK, BISCUIT)",0.1,4
"(MILK, BREAD, CORNFLAKES, BISCUIT)",0.05,4
"(TEA, COFFEE, MILK, CORNFLAKES)",0.05,4
"(TEA, BREAD, MAGGI, BISCUIT)",0.05,4


In [15]:
# Apriori
freq_items = apriori(df, min_support = 0.1, use_colnames = True, verbose = 1)
freq_items.sort_values("support", ascending = False)

Processing 110 combinations | Sampling itemset size 2Processing 234 combinations | Sampling itemset size 3Processing 108 combinations | Sampling itemset size 4




Unnamed: 0,support,itemsets
2,0.65,(BREAD)
4,0.4,(COFFEE)
0,0.35,(BISCUIT)
10,0.35,(TEA)
5,0.3,(CORNFLAKES)
9,0.3,(SUGER)
7,0.25,(MAGGI)
8,0.25,(MILK)
30,0.2,"(SUGER, COFFEE)"
34,0.2,"(TEA, MAGGI)"


In [16]:
freq_items.sort_values("support", ascending = False).head(5)

Unnamed: 0,support,itemsets
2,0.65,(BREAD)
4,0.4,(COFFEE)
0,0.35,(BISCUIT)
10,0.35,(TEA)
5,0.3,(CORNFLAKES)


In [17]:
freq_items.sort_values("support", ascending = False).tail(5)

Unnamed: 0,support,itemsets
15,0.1,"(BISCUIT, MAGGI)"
16,0.1,"(BISCUIT, MILK)"
17,0.1,"(BISCUIT, TEA)"
22,0.1,"(JAM, BREAD)"
45,0.1,"(CORNFLAKES, BISCUIT, COFFEE, COCK)"


In [18]:
# Association Rules & Info
df_ar = association_rules(freq_items, metric = "confidence", min_threshold = 0.5)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(BISCUIT),(BREAD),0.35,0.65,0.20,0.571429,0.879121,-0.0275,0.816667,-0.174603
1,(COCK),(BISCUIT),0.15,0.35,0.10,0.666667,1.904762,0.0475,1.950000,0.558824
2,(CORNFLAKES),(BISCUIT),0.30,0.35,0.15,0.500000,1.428571,0.0450,1.300000,0.428571
3,(BOURNVITA),(BREAD),0.20,0.65,0.15,0.750000,1.153846,0.0200,1.400000,0.166667
4,(BOURNVITA),(SUGER),0.20,0.30,0.10,0.500000,1.666667,0.0400,1.400000,0.500000
...,...,...,...,...,...,...,...,...,...,...
61,"(CORNFLAKES, COCK)","(BISCUIT, COFFEE)",0.10,0.10,0.10,1.000000,10.000000,0.0900,inf,1.000000
62,"(BISCUIT, COFFEE)","(CORNFLAKES, COCK)",0.10,0.10,0.10,1.000000,10.000000,0.0900,inf,1.000000
63,"(BISCUIT, COCK)","(COFFEE, CORNFLAKES)",0.10,0.20,0.10,1.000000,5.000000,0.0800,inf,0.888889
64,"(COFFEE, COCK)","(BISCUIT, CORNFLAKES)",0.15,0.15,0.10,0.666667,4.444444,0.0775,2.550000,0.911765


In [19]:
df_ar[(df_ar.support > 0.15) & (df_ar.confidence > 0.5)].sort_values("confidence", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
8,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75,0.25
19,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25,0.75
9,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05,0.035714
13,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
15,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
0,(BISCUIT),(BREAD),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667,-0.174603
10,(TEA),(BREAD),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667,-0.174603
18,(TEA),(MAGGI),0.35,0.25,0.2,0.571429,2.285714,0.1125,1.75,0.865385
