In [1]:
import pandas as pd

In [2]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [3]:
# Toy market-basket data: one inner list per transaction (five baskets).
# The last basket lists 'Onion' twice; that is kept exactly as given.
items_list = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Onion', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]


In [4]:
# Encoder used below to turn the list of transactions into a boolean
# item-presence matrix (one column per distinct item).
te = TransactionEncoder()

In [5]:
# Learn the item vocabulary from the transactions, then encode every
# transaction as a boolean row (True = the item occurs in that basket).
te_arr = te.fit(items_list).transform(items_list)
te_arr

array([[False, False, False,  True, False,  True,  True,  True,  True,
         True],
       [False, False,  True,  True, False,  True, False,  True,  True,
         True],
       [ True, False, False,  True, False,  True,  True, False, False,
        False],
       [False,  True, False, False, False,  True,  True, False,  True,
         True],
       [False,  True, False,  True,  True,  True, False, False,  True,
        False]])

In [6]:
# Item labels learned by the encoder; they are used as the column names
# of the DataFrame built from te_arr.
te.columns_

['Apple',
 'Corn',
 'Dill',
 'Eggs',
 'Ice cream',
 'Kidney Beans',
 'Milk',
 'Nutmeg',
 'Onion',
 'Yogurt']

In [7]:
# Wrap the boolean matrix in a DataFrame so that each column carries its
# item name; this is the input handed to apriori.
df = pd.DataFrame(data=te_arr, columns=te.columns_)
df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Yogurt
0,False,False,False,True,False,True,True,True,True,True
1,False,False,True,True,False,True,False,True,True,True
2,True,False,False,True,False,True,True,False,False,False
3,False,True,False,False,False,True,True,False,True,True
4,False,True,False,True,True,True,False,False,True,False


## Apply Apriori Algorithm

In [8]:
# Mine every itemset whose support is at least 0.6 (i.e. present in at
# least 3 of the 5 transactions). use_colnames=True reports item names
# rather than column indices in the `itemsets` column.
freq_items = apriori(
    df,
    min_support=0.6,
    use_colnames=True,
)
freq_items

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.8,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Eggs, Kidney Beans)"
6,0.6,"(Eggs, Onion)"
7,0.6,"(Milk, Kidney Beans)"
8,0.8,"(Onion, Kidney Beans)"
9,0.6,"(Yogurt, Kidney Beans)"


In [9]:
# Record how many items each frequent itemset contains. `len` is passed
# directly; wrapping it in a lambda adds nothing.
freq_items['length'] = freq_items['itemsets'].map(len)

In [10]:
# Show the frequent itemsets again, now with the added `length` column.
freq_items

Unnamed: 0,support,itemsets,length
0,0.8,(Eggs),1
1,1.0,(Kidney Beans),1
2,0.6,(Milk),1
3,0.8,(Onion),1
4,0.6,(Yogurt),1
5,0.8,"(Eggs, Kidney Beans)",2
6,0.6,"(Eggs, Onion)",2
7,0.6,"(Milk, Kidney Beans)",2
8,0.8,"(Onion, Kidney Beans)",2
9,0.6,"(Yogurt, Kidney Beans)",2


In [11]:
# Generate association rules from the frequent itemsets and keep only those
# whose confidence is 1.0, i.e. the consequent appears in every transaction
# that contains the antecedent.
#
# - `metric` is spelled out so it is clear what `min_threshold` applies to
#   (confidence is the library default, so the result is unchanged).
# - `num_itemsets` is the number of transactions in the encoded data. It is
#   derived from `df` rather than hard-coded to 5 so that it cannot drift
#   out of sync if the transaction list is edited.
rules = association_rules(
    freq_items,
    metric="confidence",
    min_threshold=1,
    num_itemsets=len(df),
)
rules

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,1.0,0.0,inf,0.0,0.8,0.0,0.9
1,(Milk),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
2,(Onion),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,1.0,0.0,inf,0.0,0.8,0.0,0.9
3,(Yogurt),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
4,(Yogurt),(Onion),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
5,"(Onion, Eggs)",(Kidney Beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
6,"(Yogurt, Onion)",(Kidney Beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
7,"(Yogurt, Kidney Beans)",(Onion),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
8,(Yogurt),"(Onion, Kidney Beans)",0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875


In [12]:
# Sanity check: confirm that the rules are returned as a pandas DataFrame,
# so ordinary DataFrame filtering can be applied to them.
type(rules)

pandas.core.frame.DataFrame

In [13]:
# Keep only the rules with lift strictly greater than 1, i.e. those whose
# antecedent and consequent co-occur more often than independence predicts.
rules.loc[rules['lift'].gt(1)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
4,(Yogurt),(Onion),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
7,"(Yogurt, Kidney Beans)",(Onion),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
8,(Yogurt),"(Onion, Kidney Beans)",0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875


## Interview questions

In [32]:
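## 1. What is lift and why is it important in association rules?
# Lift measures how much more often the antecedent (A) and the consequent (B) are bought together than would be expected if they were independent. It is calculated as:
#    Lift(A -> B) = Confidence(A -> B) / Support(B) = Support(A and B) / (Support(A) * Support(B))
# Interpretation:
# Lift > 1: Positive association (A and B occur together more often than expected under independence).
# Lift = 1: No association (independence).
# Lift < 1: Negative association (the presence of A makes B less likely).
# Lift is important because confidence alone can be misleading when the consequent is very common: in the rules above, (Eggs) -> (Kidney Beans) has confidence 1.0 but lift 1.0, because Kidney Beans appears in every transaction. Lift corrects for the consequent's popularity and so identifies the meaningful relationships, helping to uncover patterns in customer purchasing behavior. Note that lift by itself is not a test of statistical significance.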

In [34]:
## 2. What are support and confidence? How do you calculate them?
# Support: the proportion of transactions that contain a specific item or itemset.
# Formula:
#
#    Support(X) = (Transactions containing X) / (Total transactions)
#
# Confidence: the likelihood that the consequent occurs given that the antecedent is present.
# Formula:
#    Confidence(A -> B) = (Transactions containing both A and B) / (Transactions containing A) = Support(A and B) / Support(A)


In [None]:
## 3. What are some limitations or challenges of association rule mining?
# Limitations/challenges of association rule mining:
# High computational cost: The number of candidate itemsets grows rapidly with the number of items, so generating rules for large datasets can be time-consuming and resource-intensive.
# Choosing thresholds: Setting appropriate support, confidence, and lift thresholds is subjective and can strongly affect the results.
# Too many rules: Mining may produce a very large number of rules, making it difficult to identify the useful insights.
# Redundancy: Many rules may be similar or redundant (for example, several rules above differ only by adding Kidney Beans, which is in every transaction).
# Interpretability: Complex rules with many items can be hard to interpret and act upon.
# Data quality: Missing, inconsistent, or noisy data reduces the accuracy of the results.
# Sparsity: When there are many distinct items and each appears in only a few transactions, most itemsets fall below the support threshold, so few patterns are found.
