In [4]:
# Apriori algorithm: Generating frequent itemsets
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from collections import defaultdict


In [8]:
# load basket dataset
#  transforming the data such that each row is a transaction
basket = pd.read_csv(r"C:\Users\Checkout\Downloads\baskets.csv")

combined = defaultdict(list)
for index, row in basket.iterrows():
    if row['Item'] != 'NONE' :
        combined[(row["Transaction"])].append(row['Item']) #created dictionary where key is transaction number and value is list of items

records = []
for key, val in combined.items():
    records.append(list(set(val))) #created list of lists from the dictionary to get the data in required data structure for further data transformation
display(records)


[['Bread'],
 ['Scandinavian'],
 ['Jam', 'Cookies', 'Hot chocolate'],
 ['Muffin'],
 ['Bread', 'Pastry', 'Coffee'],
 ['Medialuna', 'Muffin', 'Pastry'],
 ['Medialuna', 'Tea', 'Pastry', 'Coffee'],
 ['Bread', 'Pastry'],
 ['Muffin', 'Bread'],
 ['Medialuna', 'Scandinavian'],
 ['Medialuna', 'Bread'],
 ['Pastry', 'Coffee', 'Tartine', 'Jam', 'Tea'],
 ['Basket', 'Bread', 'Coffee'],
 ['Medialuna', 'Bread', 'Pastry'],
 ['Mineral water', 'Scandinavian'],
 ['Medialuna', 'Bread', 'Coffee'],
 ['Hot chocolate'],
 ['Farm House'],
 ['Farm House', 'Bread'],
 ['Medialuna', 'Bread'],
 ['Medialuna', 'Bread', 'Coffee'],
 ['Jam'],
 ['Muffin', 'Scandinavian'],
 ['Bread'],
 ['Scandinavian'],
 ['Fudge'],
 ['Scandinavian'],
 ['Bread', 'Coffee'],
 ['Jam', 'Bread'],
 ['Bread'],
 ['Basket'],
 ['Muffin', 'Scandinavian'],
 ['Coffee'],
 ['Muffin', 'Coffee'],
 ['Muffin', 'Scandinavian'],
 ['Tea', 'Bread'],
 ['Bread', 'Coffee'],
 ['Tea', 'Bread'],
 ['Scandinavian'],
 ['Tartine', 'Muffin', 'Juice', 'Coffee'],
 ['Scandinavia

In [6]:
# TransactionEncoder transforms the data into the correct format. Pandas helps us to create the dataframe:

te = TransactionEncoder()
te_ary = te.fit(records).transform(records) #records in form of list of lists to load it in TransactionEncoder() 
df = pd.DataFrame(te_ary, columns=te.columns_)
display(df)

Unnamed: 0,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9527,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
9528,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9529,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Let us return the items and itemsets with at least 50% support: By default, apriori returns the column indices of the items, which is helpful for association rule mining. Set use_colnames=True to convert these integer values into the respective item names:

from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)
#As the there are frequent itemsets with min_support of 0.5, there's not output below.
display(frequent_itemsets)
#After changing value as low as 0.01, found frequent itemset and the maximum support for the dataset is 0.475 for {Coffee}
frequent_itemsets = apriori(df, min_support=0.0001, use_colnames=True)
display(frequent_itemsets)

# The association_rules() function allows to (1) specify your metric of interest (2) the according threshold. In this notebook, the implemented measures are confidence and lift. Let's say you are interested in rules derived from the frequent itemsets only if the level of confidence is above the 60 % threshold (min_threshold=0.6):
from mlxtend.frequent_patterns import association_rules
"""
For confidence metric with min thrshold of 0.6 there's only one association rule, {Toast}=>{Coffee}
On decreasing the min_threshold value to 0.1 for confidence metrics whoch has frequent itemset of min support of 0.01, gives some more 
association rules like {Spanish Brunch} => {Coffee} with confidence = 0.598837, slightly less than 0.6
Also, found the rules from same itemset have same support values.
"""
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
display(rules)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)
display(rules)

# Q1. Display associations rules for metric = 'lift' and min_threshold = 1.2
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
print("Q1")
display(rules)
# Q2. Display associations rules for metric = 'support' and min_threshold = 0.6
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.05)
print("Q2")
display(rules)
# # Let us add a new feature to the dataframe showing the length of antecedents. Following code does this feature creation:
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
display(rules)
# Q3. At least 2 antecedents and confidence greater than or equal to 0.75
print("Q3")
"""
There are no rules with atleast 2 antecedent and confidence >= 0.75 even if the min support for frequent item set 
is reduced to 0.0001
"""
# frequent_itemsets = apriori(df, min_support=0.0001, use_colnames=True)
# # display(frequent_itemsets)
# display(rules[(rules['antecedent_len'] >=2) & (rules['confidence'] >= 0.75) ])

# frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
# display(frequent_itemsets)
display(rules[(rules['antecedent_len'] >=1) & (rules['confidence'] >= 0.45) ])
# Q4. support atleast 0.8 and lift atleast 1.00
print("Q4")
"""
There are no rules with support>=0.8 and lift >= 1.00 even if the min support for frequent item set 
is 0.1
"""
display(rules[(rules['support'] >=0.8) & (rules['lift'] >= 1.00) ])
"""
reducing values to support>=0.05 and lift >= 0.005
"""
display(rules[(rules['support'] >=0.05) & (rules['lift'] >= 0.005) ])
# Q5. sort the rules in descending order first by length of antecedents and then by lift
print("Q5")
display(rules.sort_values(['antecedent_len', 'lift'], ascending =[False, False]))

Unnamed: 0,support,itemsets


Unnamed: 0,support,itemsets
0,0.000105,(Adjustment)
1,0.004512,(Afternoon with the baker)
2,0.036093,(Alfajores)
3,0.000734,(Argentina Night)
4,0.003987,(Art Tray)
...,...,...
14496,0.000105,"(Pastry, Coffee, Cookies, Juice, Tiffin, Tea, ..."
14497,0.000105,"(Pastry, Coffee, Cookies, Juice, Tiffin, Tea, ..."
14498,0.000105,"(Brownie, Bare Popcorn, Art Tray, Coffee, Cook..."
14499,0.000105,"(Chicken Stew, Duck egg, Brownie, Coffee, Cook..."


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bacon),(Art Tray),0.000105,0.003987,0.000105,1.000000,250.815789,0.000105,inf
1,(Art Tray),(Coffee),0.003987,0.475081,0.002728,0.684211,1.440197,0.000834,1.662243
2,(Hack the stack),(Art Tray),0.000210,0.003987,0.000210,1.000000,250.815789,0.000209,inf
3,(Bacon),(Coffee),0.000105,0.475081,0.000105,1.000000,2.104903,0.000055,inf
4,(Bacon),(Juice),0.000105,0.038296,0.000105,1.000000,26.112329,0.000101,inf
...,...,...,...,...,...,...,...,...,...
167382,"(Medialuna, Bread, Granola)","(Coffee, Sandwich, Cake, Hot chocolate, Muffin...",0.000105,0.000105,0.000105,1.000000,9531.000000,0.000105,inf
167383,"(Sandwich, Muffin, Cake)","(Coffee, Granola, Hot chocolate, Tea, NONE, Me...",0.000105,0.000105,0.000105,1.000000,9531.000000,0.000105,inf
167384,"(Medialuna, Sandwich, Hot chocolate)","(Coffee, Granola, Cake, Muffin, Tea, NONE, Bread)",0.000105,0.000105,0.000105,1.000000,9531.000000,0.000105,inf
167385,"(Muffin, Granola)","(Coffee, Sandwich, Cake, Hot chocolate, Tea, N...",0.000105,0.000105,0.000105,1.000000,9531.000000,0.000105,inf


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Argentina Night),(Afternoon with the baker),0.000734,0.004512,0.000210,0.285714,63.328904,0.000207,1.393684
1,(Alfajores),(Bread),0.036093,0.324940,0.010282,0.284884,0.876728,-0.001446,0.943987
2,(Brioche and salami),(Alfajores),0.000315,0.036093,0.000105,0.333333,9.235465,0.000094,1.445861
3,(Alfajores),(Cake),0.036093,0.103137,0.004092,0.113372,1.099236,0.000369,1.011544
4,(Chocolates),(Alfajores),0.000944,0.036093,0.000105,0.111111,3.078488,0.000071,1.084396
...,...,...,...,...,...,...,...,...,...
266296,"(Muffin, Granola)","(Coffee, Sandwich, Cake, Hot chocolate, Tea, N...",0.000105,0.000105,0.000105,1.000000,9531.000000,0.000105,inf
266297,"(Tea, Granola)","(Coffee, Sandwich, Cake, Hot chocolate, Muffin...",0.000734,0.000105,0.000105,0.142857,1361.571429,0.000105,1.166544
266298,"(NONE, Granola)","(Coffee, Sandwich, Cake, Hot chocolate, Muffin...",0.000839,0.000105,0.000105,0.125000,1191.375000,0.000105,1.142737
266299,"(Medialuna, Granola)","(Coffee, Sandwich, Cake, Hot chocolate, Muffin...",0.000105,0.000105,0.000105,1.000000,9531.000000,0.000105,inf


Q1


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Argentina Night),(Afternoon with the baker),0.000734,0.004512,0.000210,0.285714,63.328904,0.000207,1.393684
1,(Afternoon with the baker),(Argentina Night),0.004512,0.000734,0.000210,0.046512,63.328904,0.000207,1.048010
2,(Afternoon with the baker),(Duck egg),0.004512,0.001259,0.000105,0.023256,18.470930,0.000099,1.022520
3,(Duck egg),(Afternoon with the baker),0.001259,0.004512,0.000105,0.083333,18.470930,0.000099,1.085987
4,(Afternoon with the baker),(Extra Salami or Feta),0.004512,0.003987,0.000105,0.023256,5.832925,0.000087,1.019728
...,...,...,...,...,...,...,...,...,...
408669,(Muffin),"(Granola, Coffee, Sandwich, Cake, Hot chocolat...",0.038191,0.000105,0.000105,0.002747,26.184066,0.000101,1.002650
408670,(Tea),"(Granola, Coffee, Sandwich, Cake, Hot chocolat...",0.141643,0.000105,0.000105,0.000741,7.060000,0.000090,1.000636
408671,(NONE),"(Granola, Coffee, Sandwich, Cake, Hot chocolat...",0.079005,0.000105,0.000105,0.001328,12.657371,0.000097,1.001225
408672,(Medialuna),"(Granola, Coffee, Sandwich, Cake, Hot chocolat...",0.061379,0.000105,0.000105,0.001709,16.292308,0.000098,1.001607


Q2


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bread),(Coffee),0.32494,0.475081,0.089393,0.275105,0.579069,-0.06498,0.724131
1,(Coffee),(Bread),0.475081,0.32494,0.089393,0.188163,0.579069,-0.06498,0.831522
2,(Cake),(Coffee),0.103137,0.475081,0.054349,0.526958,1.109196,0.00535,1.109667
3,(Coffee),(Cake),0.475081,0.103137,0.054349,0.114399,1.109196,0.00535,1.012717


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(Bread),(Coffee),0.32494,0.475081,0.089393,0.275105,0.579069,-0.06498,0.724131,1
1,(Coffee),(Bread),0.475081,0.32494,0.089393,0.188163,0.579069,-0.06498,0.831522,1
2,(Cake),(Coffee),0.103137,0.475081,0.054349,0.526958,1.109196,0.00535,1.109667,1
3,(Coffee),(Cake),0.475081,0.103137,0.054349,0.114399,1.109196,0.00535,1.012717,1


Q3


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
2,(Cake),(Coffee),0.103137,0.475081,0.054349,0.526958,1.109196,0.00535,1.109667,1


Q4


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(Bread),(Coffee),0.32494,0.475081,0.089393,0.275105,0.579069,-0.06498,0.724131,1
1,(Coffee),(Bread),0.475081,0.32494,0.089393,0.188163,0.579069,-0.06498,0.831522,1
2,(Cake),(Coffee),0.103137,0.475081,0.054349,0.526958,1.109196,0.00535,1.109667,1
3,(Coffee),(Cake),0.475081,0.103137,0.054349,0.114399,1.109196,0.00535,1.012717,1


Q5


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
2,(Cake),(Coffee),0.103137,0.475081,0.054349,0.526958,1.109196,0.00535,1.109667,1
3,(Coffee),(Cake),0.475081,0.103137,0.054349,0.114399,1.109196,0.00535,1.012717,1
0,(Bread),(Coffee),0.32494,0.475081,0.089393,0.275105,0.579069,-0.06498,0.724131,1
1,(Coffee),(Bread),0.475081,0.32494,0.089393,0.188163,0.579069,-0.06498,0.831522,1
