In [2]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [3]:
# Load the raw purchase log: one row per purchased item, with the buying
# member's number and the purchase date alongside the item description.
df = pd.read_csv(
    'Groceries_dataset.csv',
    encoding='latin-1',
)
df


Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
...,...,...,...
38760,4471,08-10-2014,sliced cheese
38761,2022,23-02-2014,candy
38762,1097,16-04-2014,cake bar
38763,1510,03-12-2014,fruit/vegetable juice


In [4]:
# Collect every item a member ever bought into a single Python list.
# Grouping is on Member_number only, so the Date column is ignored and
# purchases made on different days end up in the same list.
# NOTE(review): if a basket is meant to be one shopping trip, the grouping
# key would need to include Date as well -- confirm the intended definition.
items_by_member = df.groupby('Member_number')['itemDescription']
df_grouped = items_by_member.apply(list).reset_index()
df_grouped

Unnamed: 0,Member_number,itemDescription
0,1000,"[soda, canned beer, sausage, sausage, whole mi..."
1,1001,"[frankfurter, frankfurter, beef, sausage, whol..."
2,1002,"[tropical fruit, butter milk, butter, frozen v..."
3,1003,"[sausage, root vegetables, rolls/buns, deterge..."
4,1004,"[other vegetables, pip fruit, root vegetables,..."
...,...,...
3893,4996,"[dessert, salty snack, rolls/buns, misc. bever..."
3894,4997,"[tropical fruit, white wine, whole milk, curd,..."
3895,4998,"[rolls/buns, curd]"
3896,4999,"[bottled water, butter milk, tropical fruit, b..."


Build one transaction per member: the list of all items that member bought.


In [5]:
# Unwrap the per-member item lists from the frame into a plain list of lists.
item_lists = df_grouped['itemDescription']
transactions = item_lists.tolist()

In [6]:
# One-hot encode the transactions: the encoder learns the distinct item
# labels and produces a boolean array with one row per transaction and one
# column per item (True where the item occurs in that transaction).
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)

# Wrap the array in a DataFrame labelled with the item names the encoder learned.
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
df_encoded

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3893,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3894,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3895,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3896,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,True,False


In [7]:
# Minimum fraction of rows in df_encoded that must contain an itemset for it
# to be kept. Named so the threshold is easy to find and tune.
MIN_SUPPORT = 0.002

# Mine the frequent itemsets with Apriori; use_colnames=True reports the item
# names (the column labels of df_encoded) instead of column indices.
frequent_itemsets = apriori(df_encoded, min_support=MIN_SUPPORT, use_colnames=True)

# Bare last expression so the notebook renders the frame with its rich table
# display rather than print()'s plain-text dump.
frequent_itemsets

        support                                           itemsets
0      0.015393                 frozenset({Instant food products})
1      0.078502                              frozenset({UHT-milk})
2      0.005644                      frozenset({abrasive cleaner})
3      0.007440                      frozenset({artif. sweetener})
4      0.031042                         frozenset({baking powder})
...         ...                                                ...
52596  0.002565  frozenset({soda, rolls/buns, root vegetables, ...
52597  0.002309  frozenset({soda, rolls/buns, whole milk, yogur...
52598  0.002052  frozenset({soda, root vegetables, whole milk, ...
52599  0.002052  frozenset({soda, rolls/buns, root vegetables, ...
52600  0.002052  frozenset({pastry, rolls/buns, root vegetables...

[52601 rows x 2 columns]


In [8]:
# Derive association rules from the frequent itemsets, keeping rules whose
# lift is at least the given threshold.
# NOTE(review): a lift of 1 corresponds to independent antecedent/consequent,
# so a threshold of 0.1 filters out very little -- confirm this is intended.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=0.1,
)

In [9]:
# Keep only the columns needed for reporting and order the rules from the
# highest lift to the lowest.
report_columns = ["antecedents", "consequents", "support", "confidence", "lift"]
rules_sorted = rules[report_columns].sort_values(by="lift", ascending=False)

In [10]:
# Show the ten rules with the highest lift (rules_sorted is already ordered
# by descending lift).
print("Top 10 Association Rules:")

# print() splits this wide frame into backslash-continued column chunks and
# truncates long frozensets with "...". Render it through the notebook's rich
# display instead, with the column-width limit lifted for this one call so the
# full antecedent/consequent sets are readable.
with pd.option_context("display.max_colwidth", None):
    display(rules_sorted.head(10))

Top 10 Association Rules:
                                              antecedents  \
648776  frozenset({other vegetables, bottled water, po...   
648765            frozenset({berries, pip fruit, yogurt})   
648759  frozenset({other vegetables, yogurt, pork, bot...   
648782                    frozenset({berries, pip fruit})   
648768      frozenset({pip fruit, yogurt, bottled water})   
648773       frozenset({berries, other vegetables, pork})   
648763        frozenset({pip fruit, bottled water, pork})   
648778     frozenset({other vegetables, berries, yogurt})   
648774           frozenset({yogurt, pork, bottled water})   
648767  frozenset({berries, pip fruit, other vegetables})   

                                              consequents   support  \
648776            frozenset({berries, pip fruit, yogurt})  0.002052   
648765  frozenset({other vegetables, bottled water, po...  0.002052   
648759                    frozenset({berries, pip fruit})  0.002052   
648782  frozenset(