In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [2]:
# Read the source workbook into a DataFrame.
# NOTE(review): read_excel uses the first spreadsheet row as column labels by
# default. If this sheet has no header row, the first transaction is consumed
# as the header and silently excluded — confirm, and pass header=None if so.
file_path = "Online retail.xlsx"   # Adjust path if needed
data = pd.read_excel(io=file_path)

In [3]:
# Step 1: Convert the single column into a list of transactions.
# Each cell of the first column is treated as one comma-separated basket.
def _split_basket(cell):
    """Split one comma-separated cell into a list of clean item names.

    Whitespace around every item is stripped and empty fragments (for example
    from a trailing or doubled comma) are discarded, so " tea" and "tea" are
    counted as the same product instead of two different ones.
    """
    return [item.strip() for item in str(cell).split(',') if item.strip()]


# Blank cells arrive as NaN (a float, which has no .split()), so drop them
# before parsing instead of letting the whole cell fail with AttributeError.
transactions = data.iloc[:, 0].dropna().apply(_split_basket)


In [4]:
# Step 2: One-hot encode the transactions, one column per item label
# learned by the encoder.
te = TransactionEncoder()
# fit_transform learns the item vocabulary and encodes in a single call.
te_array = te.fit_transform(transactions)
df = pd.DataFrame(te_array, columns=te.columns_)

In [5]:
# Step 3: Mine frequent itemsets from the one-hot frame with Apriori.
# The 0.02 support floor and name-based itemsets match the printed sample
# output (item names such as "almonds", supports at or above 0.02).
frequent_itemsets = apriori(
    df,
    min_support=0.02,
    use_colnames=True,
)


In [6]:
# Step 4: Derive association rules from the frequent itemsets,
# filtering on the lift metric with a threshold of 1.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)


In [7]:
# Step 5: Order the rules from highest to lowest confidence, breaking ties
# by lift (also descending), so the strongest rules come first.
rules_sorted = rules.sort_values(by=["confidence", "lift"], ascending=False)


In [8]:
# Show the first few rows of each result table as plain text.
# sep="\n" puts each heading on its own line directly above its table.
print("Frequent Itemsets:", frequent_itemsets.head(), sep="\n")
print("\nAssociation Rules:", rules_sorted.head(), sep="\n")

Frequent Itemsets:
    support    itemsets
0  0.020267   (almonds)
1  0.033200   (avocado)
2  0.033733  (brownies)
3  0.087200   (burgers)
4  0.030133    (butter)

Association Rules:
      antecedents      consequents  antecedent support  consequent support  \
80         (soup)  (mineral water)            0.050533            0.238267   
74    (olive oil)  (mineral water)            0.065733            0.238267   
64  (ground beef)  (mineral water)            0.098267            0.238267   
66  (ground beef)      (spaghetti)            0.098267            0.174133   
29  (cooking oil)  (mineral water)            0.051067            0.238267   

     support  confidence      lift  representativity  leverage  conviction  \
80  0.023067    0.456464  1.915771               1.0  0.011026    1.401441   
74  0.027467    0.417850  1.753707               1.0  0.011805    1.308483   
64  0.040933    0.416554  1.748266               1.0  0.017520    1.305576   
66  0.039200    0.398915  2.290857  

In [9]:
# Save results to Excel
def _frozensets_as_text(frame):
    """Return a copy of ``frame`` with frozenset cells rendered as text.

    The itemset / antecedent / consequent columns hold frozenset objects,
    which are not a native Excel cell type: depending on the writer engine
    they are either rejected or written as their raw ``frozenset({...})``
    repr. Each frozenset becomes a comma-separated, alphabetically sorted
    string (e.g. "mineral water, soup"); every other cell is left untouched.
    The input frame itself is not modified.
    """
    def render(value):
        if isinstance(value, frozenset):
            return ", ".join(sorted(map(str, value)))
        return value

    out = frame.copy()
    # Only object-dtype columns can contain frozensets; numeric metric
    # columns are skipped so their dtypes stay exactly as they were.
    for column in out.select_dtypes(include="object").columns:
        out[column] = out[column].map(render)
    return out


_frozensets_as_text(frequent_itemsets).to_excel("frequent_itemsets.xlsx", index=False)
_frozensets_as_text(rules_sorted).to_excel("association_rules.xlsx", index=False)

In [None]:
'''
Interview Questions:
1.	What is lift and why is it important in association rules?
Lift measures how strongly two items are associated compared to their expected co-occurrence if they were independent:
lift(A -> B) = confidence(A -> B) / support(B) = support(A and B) / (support(A) * support(B)).
A lift greater than 1 means the items occur together more often than by chance, a lift of exactly 1 means they are independent, and a lift less than 1 means they occur together less often.
It is important because it helps identify truly meaningful relationships, avoiding the misleading results that can arise from relying only on confidence (a rule can have high confidence simply because its consequent is very common).
'''

In [None]:
'''
2.	What are support and confidence? How do you calculate them?
Support shows how frequently a set of items appears in the dataset. It tells us the overall importance of a rule.
support(A -> B) = (number of transactions containing both A and B) / (total number of transactions)

Confidence measures how often the consequent is found in transactions that already contain the antecedent. It reflects the reliability of the rule.
confidence(A -> B) = support(A and B) / support(A)

Together, support ensures the rule is relevant, and confidence shows its strength.
'''

In [None]:
'''
3.	What are some limitations or challenges of association rule mining?
It can generate a very large number of rules, many of which are not useful.

Rare but important item associations may be missed.

Processing large datasets can be computationally expensive.

Not all rules are meaningful or actionable; they need careful filtering.

It does not capture the order or timing of purchases.

It shows correlation but not causation.
'''