# ASSOCIATION RULES

In [2]:
!pip install mlxtend



In [3]:
import mlxtend

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [5]:
data=pd.read_excel("Online retail.xlsx", sheet_name='Sheet1')

In [6]:
data.head()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [7]:
# Split each transaction into a list of items and strip any leading/trailing whitespace
data['items'] = data.iloc[:, 0].str.split(',').apply(lambda x: [item.strip() for item in x])

# Ensure each item list contains unique items
data['items'] = data['items'].apply(lambda x: list(set(x)))

data.head()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil",items
0,"burgers,meatballs,eggs","[burgers, eggs, meatballs]"
1,chutney,[chutney]
2,"turkey,avocado","[avocado, turkey]"
3,"mineral water,milk,energy bar,whole wheat rice...","[milk, green tea, mineral water, energy bar, w..."
4,low fat yogurt,[low fat yogurt]


# Data Preprocessing:

In [8]:
data.isnull().sum()

shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil    0
items                                                                                                                                                                                                                              0
dtype: int64

In [9]:
# Create a list of all unique items
all_items = list(set(item for sublist in data['items'] for item in sublist))

# One-hot encode the transactions
encoded_data = pd.DataFrame(0, index=data.index, columns=all_items)

# unique is refered as 1 and 0 for non unique
for index, items in data['items'].items():
    encoded_data.loc[index, items] = 1
encoded_data

Unnamed: 0,green beans,shrimp,low fat yogurt,tea,french fries,flax seed,clothes accessories,cake,herb & pepper,black tea,...,honey,asparagus,spaghetti,cider,chicken,fresh bread,french wine,antioxydant juice,body spray,pickles
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7496,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Create a list of all unique items
all_items = list(set(item for sublist in data['items'] for item in sublist))
all_items

['green beans',
 'shrimp',
 'low fat yogurt',
 'tea',
 'french fries',
 'flax seed',
 'clothes accessories',
 'cake',
 'herb & pepper',
 'black tea',
 'mayonnaise',
 'corn',
 'melons',
 'green tea',
 'protein bar',
 'hand protein bar',
 'gums',
 'mushroom cream sauce',
 'turkey',
 'almonds',
 'shampoo',
 'muffins',
 'blueberries',
 'frozen smoothie',
 'pancakes',
 'carrots',
 'cookies',
 'rice',
 'pasta',
 'extra dark chocolate',
 'burgers',
 'toothpaste',
 'zucchini',
 'ham',
 'water spray',
 'frozen vegetables',
 'bramble',
 'energy bar',
 'sparkling water',
 'fromage blanc',
 'parmesan cheese',
 'nonfat milk',
 'salmon',
 'eggplant',
 'strong cheese',
 'milk',
 'grated cheese',
 'mineral water',
 'cream',
 'ketchup',
 'whole wheat rice',
 'spinach',
 'escalope',
 'hot dogs',
 'sandwich',
 'light mayo',
 'chocolate bread',
 'olive oil',
 'magazines',
 'champagne',
 'white wine',
 'cooking oil',
 'soup',
 'tomatoes',
 'cauliflower',
 'tomato juice',
 'vegetables mix',
 'candy bars',
 

In [17]:
#One-hot encode
basket = pd.DataFrame(0, index=data.index, columns=all_items)
basket.head()

Unnamed: 0,green beans,shrimp,low fat yogurt,tea,french fries,flax seed,clothes accessories,cake,herb & pepper,black tea,...,honey,asparagus,spaghetti,cider,chicken,fresh bread,french wine,antioxydant juice,body spray,pickles
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# unique is refered as 1 and 0 for non unique
for index, items in data['items'].items():
    basket.loc[index, items] = 1
basket

Unnamed: 0,green beans,shrimp,low fat yogurt,tea,french fries,flax seed,clothes accessories,cake,herb & pepper,black tea,...,honey,asparagus,spaghetti,cider,chicken,fresh bread,french wine,antioxydant juice,body spray,pickles
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7496,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.071333,(shrimp)
1,0.076400,(low fat yogurt)
2,0.170933,(french fries)
3,0.081067,(cake)
4,0.049467,(herb & pepper)
...,...,...
254,0.014267,"(mineral water, eggs, spaghetti)"
255,0.010933,"(chocolate, ground beef, mineral water)"
256,0.017067,"(mineral water, ground beef, spaghetti)"
257,0.015867,"(chocolate, mineral water, spaghetti)"


In [27]:
# Generate the association rules
rules = association_rules(frequent_itemsets,num_itemsets=0, metric="lift", min_threshold=1.0)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(green tea),(shrimp),0.132000,0.071333,0.011333,0.085859,1.203625,1.0,0.001917,1.015890,0.194904,0.059028,0.015641,0.122369
1,(shrimp),(green tea),0.071333,0.132000,0.011333,0.158879,1.203625,1.0,0.001917,1.031956,0.182171,0.059028,0.030966,0.122369
2,(pancakes),(shrimp),0.095067,0.071333,0.010533,0.110799,1.553263,1.0,0.003752,1.044384,0.393614,0.067579,0.042498,0.129231
3,(shrimp),(pancakes),0.071333,0.095067,0.010533,0.147664,1.553263,1.0,0.003752,1.061709,0.383554,0.067579,0.058122,0.129231
4,(frozen vegetables),(shrimp),0.095333,0.071333,0.016667,0.174825,2.450820,1.0,0.009866,1.125418,0.654355,0.111111,0.111441,0.204235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,"(chocolate, spaghetti)",(eggs),0.039200,0.179733,0.010533,0.268707,1.495034,1.0,0.003488,1.121667,0.344628,0.050544,0.108470,0.163656
404,"(eggs, spaghetti)",(chocolate),0.036533,0.163867,0.010533,0.288321,1.759486,1.0,0.004547,1.174875,0.448020,0.055478,0.148845,0.176301
405,(chocolate),"(eggs, spaghetti)",0.163867,0.036533,0.010533,0.064280,1.759486,1.0,0.004547,1.029653,0.516248,0.055478,0.028799,0.176301
406,(eggs),"(chocolate, spaghetti)",0.179733,0.039200,0.010533,0.058605,1.495034,1.0,0.003488,1.020613,0.403672,0.050544,0.020197,0.163656


In [28]:
rules1 = rules[["antecedents","consequents","support","confidence","lift"]]
rules1

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(green tea),(shrimp),0.011333,0.085859,1.203625
1,(shrimp),(green tea),0.011333,0.158879,1.203625
2,(pancakes),(shrimp),0.010533,0.110799,1.553263
3,(shrimp),(pancakes),0.010533,0.147664,1.553263
4,(frozen vegetables),(shrimp),0.016667,0.174825,2.450820
...,...,...,...,...,...
403,"(chocolate, spaghetti)",(eggs),0.010533,0.268707,1.495034
404,"(eggs, spaghetti)",(chocolate),0.010533,0.288321,1.759486
405,(chocolate),"(eggs, spaghetti)",0.010533,0.064280,1.759486
406,(eggs),"(chocolate, spaghetti)",0.010533,0.058605,1.495034


In [31]:
rules2 = rules1[(rules1["confidence"] >= 0.5) & (rules1["support"] >= 0.01) & (rules1["lift"] >= 1.5)]
rules2

Unnamed: 0,antecedents,consequents,support,confidence,lift
336,"(milk, ground beef)",(mineral water),0.011067,0.50303,2.111207
366,"(eggs, ground beef)",(mineral water),0.010133,0.506667,2.126469


In [32]:
# 1.What is lift and why is it important in Association rules?
#Lift is a metric used to evaluate the strength of an association rule by 
#measuring how much more likely the consequent (B) is to occur when the 
#antecedent (A) occurs, compared to when A and B are independent.
#Lift helps identify rules that are not due to chance.

In [None]:
#2.What is support and Confidence. How do you calculate them?
#Support measures how frequently an itemset appears in the dataset.
#Support(A > B)=  Number of transactions containing both A and B  / Total number of transactions 
#Confidence measures the likelihood that the consequent (B) is purchased given that the antecedent (A) is purchased.
#Confidence(A > B)=Support(A and B) / Support(A) 

In [None]:
#3.What are some limitations or challenges of Association rules mining?
#Generating frequent itemsets requires storing and processing a large number of combinations
#Multiple rules might convey the same relationship, making it difficult to identify truly valuable insights.