In [22]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import warnings
warnings.filterwarnings('ignore')


In [23]:
data=pd.read_excel('Online retail.xlsx',names=['products'],header=None)

In [24]:
data.head()

Unnamed: 0,products
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


In [25]:
data.duplicated().sum()

2325

In [26]:
data.drop_duplicates(inplace=True)

In [27]:
data.describe()

Unnamed: 0,products
count,5176
unique,5176
top,"shrimp,almonds,avocado,vegetables mix,green gr..."
freq,1


In [28]:
data['products'].unique()

array(['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil',
       'burgers,meatballs,eggs', 'chutney', ...,
       'butter,light mayo,fresh bread',
       'burgers,frozen vegetables,eggs,french fries,magazines,green tea',
       'eggs,frozen smoothie,yogurt cake,low fat yogurt'], dtype=object)

In [29]:
transactions = data['products'].apply(lambda x: x.split(',')).tolist()

transactions[0:5]

[['shrimp',
  'almonds',
  'avocado',
  'vegetables mix',
  'green grapes',
  'whole weat flour',
  'yams',
  'cottage cheese',
  'energy drink',
  'tomato juice',
  'low fat yogurt',
  'green tea',
  'honey',
  'salad',
  'mineral water',
  'salmon',
  'antioxydant juice',
  'frozen smoothie',
  'spinach',
  'olive oil'],
 ['burgers', 'meatballs', 'eggs'],
 ['chutney'],
 ['turkey', 'avocado'],
 ['mineral water', 'milk', 'energy bar', 'whole wheat rice', 'green tea']]

In [31]:
# Use TransactionEncoder to transform the transactions into a binary matrix
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_) # type: ignore

# Display the transformed dataframe
print(df.head())

    asparagus  almonds  antioxydant juice  asparagus  avocado  babies food  \
0       False     True               True      False     True        False   
1       False    False              False      False    False        False   
2       False    False              False      False    False        False   
3       False    False              False      False     True        False   
4       False    False              False      False    False        False   

   bacon  barbecue sauce  black tea  blueberries  ...  turkey  vegetables mix  \
0  False           False      False        False  ...   False            True   
1  False           False      False        False  ...   False           False   
2  False           False      False        False  ...   False           False   
3  False           False      False        False  ...    True           False   
4  False           False      False        False  ...   False           False   

   water spray  white wine  whole weat flour

In [62]:
# Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df, min_support=0.02, use_colnames=True)
display(frequent_itemsets)

Unnamed: 0,support,itemsets
0,0.029366,(almonds)
1,0.045981,(avocado)
2,0.020479,(black tea)
3,0.045015,(brownies)
4,0.113794,(burgers)
...,...,...
169,0.020093,"(milk, chocolate, mineral water)"
170,0.022991,"(spaghetti, chocolate, mineral water)"
171,0.020672,"(spaghetti, eggs, mineral water)"
172,0.024730,"(spaghetti, mineral water, ground beef)"


In [64]:
# Generate the association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
display(rules)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(chocolate),(burgers),0.205178,0.113794,0.024536,0.119586,1.050892,0.001188,1.006578,0.060929
1,(burgers),(chocolate),0.113794,0.205178,0.024536,0.215620,1.050892,0.001188,1.013312,0.054646
2,(eggs),(burgers),0.208076,0.113794,0.036128,0.173630,1.525826,0.012450,1.072408,0.435164
3,(burgers),(eggs),0.113794,0.208076,0.036128,0.317487,1.525826,0.012450,1.160307,0.388868
4,(french fries),(burgers),0.192620,0.113794,0.029366,0.152457,1.339761,0.007447,1.045618,0.314100
...,...,...,...,...,...,...,...,...,...,...
237,"(milk, mineral water)",(spaghetti),0.067813,0.229521,0.022604,0.333333,1.452301,0.007040,1.155719,0.334093
238,"(spaghetti, mineral water)",(milk),0.085008,0.170015,0.022604,0.265909,1.564029,0.008152,1.130629,0.394130
239,(milk),"(spaghetti, mineral water)",0.170015,0.085008,0.022604,0.132955,1.564029,0.008152,1.055299,0.434497
240,(spaghetti),"(milk, mineral water)",0.229521,0.067813,0.022604,0.098485,1.452301,0.007040,1.034023,0.404213


# What is lift and why is it important in Association rules?

Lift in association rule mining is a standard metric used to measure the strength of associations between items in a dataset. The lift value is an important measure in association rule mining because it provides a way to distinguish between significant and insignificant association rules. A high lift value indicates that the association rule is more significant, as it suggests that the two items are highly dependent on each other. On the other hand, a low lift value indicates that the association rule is not very significant, as it suggests that the two items are not strongly dependent on each other.

# What is support and Confidence. How do you calculate them?

Support measures how frequently an itemset appears in the dataset. It gives an idea of how popular an itemset is. The support of an itemset "A"
is defined as the proportion of transactions in the dataset that contain "A".
                    
                    Support(A) = (Number of transactions containing A) / (Total number of transactions)

Confidence measures the likelihood that item "B" is also purchased when item "A" is purchased. It is used to evaluate the reliability of an association rule. The confidence of an association rule A→B is defined as the proportion of transactions containing A that also contain B.

                    Confidence(A→B)= Support(AUB) / Support(A)

# What are some limitations or challenges of Association rules mining?

Association rule mining faces challenges like scalability, interpretability, and noise sensitivity. It can produce overwhelming, redundant rules, struggles with rare items, and requires careful threshold setting. Despite its power, ensuring actionable, updated, and relevant rules necessitates advanced algorithms, complementary metrics, and domain expertise.