In [1]:
# load Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_excel('/content/Online retail.xlsx')
data.head()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [3]:
data.columns = ['Items']
data

Unnamed: 0,Items
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt
...,...
7495,"butter,light mayo,fresh bread"
7496,"burgers,frozen vegetables,eggs,french fries,ma..."
7497,chicken
7498,"escalope,green tea"


In [4]:
# Drop missing values and duplicates
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)
data

Unnamed: 0,Items
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt
...,...
7492,"burgers,salmon,pancakes,french fries,frozen sm..."
7493,"turkey,burgers,dessert wine,shrimp,pasta,tomat..."
7495,"butter,light mayo,fresh bread"
7496,"burgers,frozen vegetables,eggs,french fries,ma..."


In [5]:
# Convert transaction strings into lists
transactions = data['Items'].apply(lambda x: x.strip().split(','))
transactions

Unnamed: 0,Items
0,"[burgers, meatballs, eggs]"
1,[chutney]
2,"[turkey, avocado]"
3,"[mineral water, milk, energy bar, whole wheat ..."
4,[low fat yogurt]
...,...
7492,"[burgers, salmon, pancakes, french fries, froz..."
7493,"[turkey, burgers, dessert wine, shrimp, pasta,..."
7495,"[butter, light mayo, fresh bread]"
7496,"[burgers, frozen vegetables, eggs, french frie..."


In [6]:
# Clean the items by stripping extra spaces
data['items'] = data['Items'].apply(lambda x: [item.strip() for item in x])
data

Unnamed: 0,Items,items
0,"burgers,meatballs,eggs","[b, u, r, g, e, r, s, ,, m, e, a, t, b, a, l, ..."
1,chutney,"[c, h, u, t, n, e, y]"
2,"turkey,avocado","[t, u, r, k, e, y, ,, a, v, o, c, a, d, o]"
3,"mineral water,milk,energy bar,whole wheat rice...","[m, i, n, e, r, a, l, , w, a, t, e, r, ,, m, i..."
4,low fat yogurt,"[l, o, w, , f, a, t, , y, o, g, u, r, t]"
...,...,...
7492,"burgers,salmon,pancakes,french fries,frozen sm...","[b, u, r, g, e, r, s, ,, s, a, l, m, o, n, ,, ..."
7493,"turkey,burgers,dessert wine,shrimp,pasta,tomat...","[t, u, r, k, e, y, ,, b, u, r, g, e, r, s, ,, ..."
7495,"butter,light mayo,fresh bread","[b, u, t, t, e, r, ,, l, i, g, h, t, , m, a, y..."
7496,"burgers,frozen vegetables,eggs,french fries,ma...","[b, u, r, g, e, r, s, ,, f, r, o, z, e, n, , v..."


In [7]:
# One-hot encode the transactions
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
data_encoded = pd.DataFrame(te_array, columns=te.columns_)

In [8]:
# Find frequent itemsets with a minimum support threshold (e.g., 0.01 = 1%)
frequent_itemsets = apriori(data_encoded, min_support=0.01, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.029179,(almonds)
1,0.011014,(antioxydant juice)
2,0.045797,(avocado)
3,0.012560,(bacon)
4,0.015459,(barbecue sauce)
...,...,...
431,0.014686,"(olive oil, spaghetti, mineral water)"
432,0.016618,"(pancakes, spaghetti, mineral water)"
433,0.012367,"(spaghetti, shrimp, mineral water)"
434,0.010821,"(spaghetti, mineral water, soup)"


In [9]:
# Sort by support
frequent_itemsets.sort_values(by="support", ascending=False).head()

Unnamed: 0,support,itemsets
54,0.29971,(mineral water)
73,0.229565,(spaghetti)
24,0.208116,(eggs)
17,0.205217,(chocolate)
30,0.192657,(french fries)


In [10]:
# Generate rules from the frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(mineral water),(almonds),0.29971,0.029179,0.010821,0.036106,1.237399,1.0,0.002076,1.007186,0.273962,0.034022,0.007135,0.203483
1,(almonds),(mineral water),0.029179,0.29971,0.010821,0.370861,1.237399,1.0,0.002076,1.113092,0.197619,0.034022,0.101602,0.203483
2,(avocado),(chocolate),0.045797,0.205217,0.010242,0.223629,1.089716,1.0,0.000843,1.023715,0.086281,0.042536,0.023165,0.136767
3,(chocolate),(avocado),0.205217,0.045797,0.010242,0.049906,1.089716,1.0,0.000843,1.004325,0.103588,0.042536,0.004306,0.136767
4,(avocado),(french fries),0.045797,0.192657,0.011594,0.253165,1.314069,1.0,0.002771,1.081019,0.250476,0.051107,0.074947,0.156673


In [11]:
# Sort rules by lift
rules_sorted = rules.sort_values(by="lift", ascending=False)
rules_sorted.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
425,(olive oil),(whole wheat pasta),0.087536,0.04058,0.011014,0.125828,3.100757,1.0,0.007462,1.097519,0.742493,0.094059,0.088854,0.198628
424,(whole wheat pasta),(olive oil),0.04058,0.087536,0.011014,0.271429,3.100757,1.0,0.007462,1.252401,0.706154,0.094059,0.201534,0.198628
797,(soup),"(milk, mineral water)",0.070918,0.067826,0.012367,0.174387,2.571089,1.0,0.007557,1.129069,0.657703,0.097859,0.114314,0.178362
792,"(milk, mineral water)",(soup),0.067826,0.070918,0.012367,0.182336,2.571089,1.0,0.007557,1.136264,0.655521,0.097859,0.119923,0.178362
298,(ground beef),(herb & pepper),0.135845,0.066473,0.022802,0.167852,2.5251,1.0,0.013772,1.121828,0.698921,0.127018,0.108597,0.255438


In [12]:
# Interpretation:

# Meaning: Customers who buy milk and bread are also likely to buy butter.

# Support (5%): 5% of all transactions include milk, bread, and butter together.

# Confidence (70%): 70% of the time, if milk and bread are purchased, butter is also purchased.

# Lift (3.2): The chance of buying butter increases 3.2x when milk and bread are bought together — indicating a strong association.



In [13]:
# Insights

# Frequently Bought Together:
  # Items like spaghetti, olive oil, and tomato sauce often appear in rules, suggesting users tend to shop for pasta ingredients together.

# High Lift Combinations:

  # Rules with lift > 2 often reflect items with strong cross-sell opportunities (e.g., mineral water → green tea).

# Seasonal/Occasional Buying:

  #Items like dessert wine, champagne, frozen smoothie might cluster together during weekends or holidays.

#Complementary Items:

  #Pairings such as whole wheat bread → eggs, or frozen vegetables → soup show consumers are buying ingredients to complete meals.

# Interview Questions

In [None]:
# 1. What is Lift and why is it important in Association Rules?

# Lift tells us how much more likely two items are to be bought together than if they were bought independently.

 Why it's important:
Lift helps us find strong, non-random relationships between products. It’s more reliable than confidence alone, especially when items are very common.




In [None]:
# 2. What is Support and Confidence? How do you calculate them?

upport shows how often an item or itemset appears in all transactions.
For example, if 3 out of 10 people buy milk and bread together, the support is 30%.

Confidence shows how often the rule is actually correct.
If 4 out of 5 people who bought milk also bought bread, the confidence is 80%

Support = (Number of transactions with both A and B) / Total transactions

Confidence = (Transactions with A and B) / Transactions with A

In [None]:
 # 3. What are some challenges of Association Rules Mining?

Too many rules: You might end up with hundreds or thousands of rules, many of which are not useful.

* Rare itemsets: Some combinations don’t show up enough, so they’re ignored even if they’re important.

* Takes a lot of time: Especially with large datasets — finding frequent itemsets and rules can take a while.

* No sense of time/order: It doesn’t know when things were bought — just that they were bought together.