In [61]:
#Importing Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # for visualizations
import seaborn as sns # for visualizations
import scipy
import warnings

In [62]:
#Importing functions from the above library
from scipy import stats
warnings.filterwarnings('ignore')
from mlxtend.frequent_patterns import apriori,association_rules #Association Rules libraries
from mlxtend.preprocessing import TransactionEncoder #Association Rules libraries

# **01. EDA**

In [63]:
# Upload dataset - Online retail.xlsx
#Reading our dataset
df = pd.read_excel('Online retail.xlsx', header=None)
# Assign a column name to the single column
df.columns = ['Items']
df

Unnamed: 0,Items
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7501 entries, 0 to 7500
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Items   7501 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [65]:
df.describe()

Unnamed: 0,Items
count,7501
unique,5176
top,cookies
freq,223


In [66]:
df.duplicated().sum()

2325

In [67]:
df.isnull().sum()

Items    0
dtype: int64

In [68]:
'''# Remove duplicates
df.drop_duplicates(inplace=True)'''

'# Remove duplicates\ndf.drop_duplicates(inplace=True)'

In [69]:
df

Unnamed: 0,Items
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


# **02. Association Rules**

In [70]:
# Generate transactions
transactions = []
for i in range(len(df)):
    transactions.append([str(item) for item in df.iloc[i, 0].split(',')])

# Display number of transactions
print("Number of transactions:", len(transactions))

Number of transactions: 7501


In [71]:
# Association Rule Mining
# Initialize the TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_trans = pd.DataFrame(te_ary, columns=te.columns_)
df_trans

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7497,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7498,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7499,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [72]:
# Implement Apriori algorithm
frequent_itemsets = apriori(df_trans, min_support=0.02, use_colnames=True)
# Display the frequent itemsets
print(frequent_itemsets)

      support                           itemsets
0    0.020397                          (almonds)
1    0.033329                          (avocado)
2    0.033729                         (brownies)
3    0.087188                          (burgers)
4    0.030129                           (butter)
..        ...                                ...
98   0.020131  (whole wheat rice, mineral water)
99   0.022930             (olive oil, spaghetti)
100  0.025197              (pancakes, spaghetti)
101  0.021197                (spaghetti, shrimp)
102  0.020931              (tomatoes, spaghetti)

[103 rows x 2 columns]


In [73]:
#A better way to sort the Association rules is through conviction, then automatically lift ratios get sorted in the rigth order
#Therefore, whenever the lift values are same, check for confidence. If the confidence values are also same check for support
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head(20)
rules.sort_values('conviction',ascending = False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
78,(soup),(mineral water),0.050527,0.238368,0.023064,0.456464,1.914955,0.01102,1.401255,0.503221
64,(ground beef),(spaghetti),0.098254,0.17411,0.039195,0.398915,2.291162,0.022088,1.373997,0.624943
72,(olive oil),(mineral water),0.065858,0.238368,0.027596,0.419028,1.757904,0.011898,1.310962,0.461536
62,(ground beef),(mineral water),0.098254,0.238368,0.040928,0.416554,1.747522,0.017507,1.305401,0.474369
86,(olive oil),(spaghetti),0.065858,0.17411,0.02293,0.348178,1.999758,0.011464,1.267048,0.535186
29,(cooking oil),(mineral water),0.05106,0.238368,0.020131,0.394256,1.653978,0.00796,1.257349,0.416672
10,(chicken),(mineral water),0.059992,0.238368,0.022797,0.38,1.594172,0.008497,1.228438,0.396502
1,(burgers),(eggs),0.087188,0.179709,0.028796,0.330275,1.83783,0.013128,1.224818,0.499424
54,(frozen vegetables),(mineral water),0.095321,0.238368,0.035729,0.374825,1.572463,0.013007,1.21827,0.402413
68,(milk),(mineral water),0.129583,0.238368,0.047994,0.37037,1.553774,0.017105,1.20965,0.409465


In [74]:
# Filter rules by support, confidence, and lift
filtered_rules = rules[(rules['support'] >= 0.01) &
              (rules['confidence'] >= 0.2) &
              (rules['lift'] >= 1)]

In [75]:
filtered_rules.sort_values('conviction',ascending = False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
78,(soup),(mineral water),0.050527,0.238368,0.023064,0.456464,1.914955,0.01102,1.401255,0.503221
64,(ground beef),(spaghetti),0.098254,0.17411,0.039195,0.398915,2.291162,0.022088,1.373997,0.624943
72,(olive oil),(mineral water),0.065858,0.238368,0.027596,0.419028,1.757904,0.011898,1.310962,0.461536
62,(ground beef),(mineral water),0.098254,0.238368,0.040928,0.416554,1.747522,0.017507,1.305401,0.474369
86,(olive oil),(spaghetti),0.065858,0.17411,0.02293,0.348178,1.999758,0.011464,1.267048,0.535186
29,(cooking oil),(mineral water),0.05106,0.238368,0.020131,0.394256,1.653978,0.00796,1.257349,0.416672
10,(chicken),(mineral water),0.059992,0.238368,0.022797,0.38,1.594172,0.008497,1.228438,0.396502
1,(burgers),(eggs),0.087188,0.179709,0.028796,0.330275,1.83783,0.013128,1.224818,0.499424
54,(frozen vegetables),(mineral water),0.095321,0.238368,0.035729,0.374825,1.572463,0.013007,1.21827,0.402413
68,(milk),(mineral water),0.129583,0.238368,0.047994,0.37037,1.553774,0.017105,1.20965,0.409465


# **03. Analysis and Interpretation**

In [76]:
# Analyze the generated rules
def analyze_rules(rules):
    for _, rule in rules.iterrows():
        print(f"Rule: {rule['antecedents']} -> {rule['consequents']}")
        print(f"Support: {rule['support']}")
        print(f"Confidence: {rule['confidence']}")
        print(f"Lift: {rule['lift']}")
        print("=====================================")

In [77]:
# Interpret the results
analyze_rules(filtered_rules)

Rule: frozenset({'burgers'}) -> frozenset({'eggs'})
Support: 0.02879616051193174
Confidence: 0.33027522935779813
Lift: 1.8378297443715457
Rule: frozenset({'burgers'}) -> frozenset({'french fries'})
Support: 0.021997067057725635
Confidence: 0.25229357798165136
Lift: 1.4761732671141707
Rule: frozenset({'burgers'}) -> frozenset({'mineral water'})
Support: 0.024396747100386616
Confidence: 0.2798165137614679
Lift: 1.1738834841861134
Rule: frozenset({'burgers'}) -> frozenset({'spaghetti'})
Support: 0.021463804826023197
Confidence: 0.24617737003058102
Lift: 1.4139176513012162
Rule: frozenset({'cake'}) -> frozenset({'mineral water'})
Support: 0.027463004932675644
Confidence: 0.33881578947368424
Lift: 1.4213966649005065
Rule: frozenset({'chicken'}) -> frozenset({'mineral water'})
Support: 0.022796960405279298
Confidence: 0.38000000000000006
Lift: 1.5941722595078303
Rule: frozenset({'chocolate'}) -> frozenset({'eggs'})
Support: 0.03319557392347687
Confidence: 0.20260374288039054
Lift: 1.12739664

In [78]:
# Create a dataframe with specified columns
rules_df = pd.DataFrame({
    'Support': filtered_rules['support'],
    'Base Item': filtered_rules['antecedents'].apply(lambda x: ', '.join(list(x))), # Add all the antecedent items which are in the form of a set are converted into a list and joined by using',' to form a string
    'Add Item': filtered_rules['consequents'].apply(lambda x: ', '.join(list(x))), # Similarly, Add all the consequent items which are in the form of a set are converted into a list and joined by using',' to form a string
    'Confidence': filtered_rules['confidence'],
    'Lift': filtered_rules['lift']
})

In [82]:
rules_df.head(10)

Unnamed: 0,Support,Base Item,Add Item,Confidence,Lift
1,0.028796,burgers,eggs,0.330275,1.83783
2,0.021997,burgers,french fries,0.252294,1.476173
4,0.024397,burgers,mineral water,0.279817,1.173883
6,0.021464,burgers,spaghetti,0.246177,1.413918
8,0.027463,cake,mineral water,0.338816,1.421397
10,0.022797,chicken,mineral water,0.38,1.594172
13,0.033196,chocolate,eggs,0.202604,1.127397
14,0.034395,french fries,chocolate,0.201248,1.228284
15,0.034395,chocolate,french fries,0.209927,1.228284
16,0.02293,frozen vegetables,chocolate,0.240559,1.468215


Q1. What is lift and why is it important in Association rules?

A. Lift is a metric used in association rule mining to evaluate the strength of an association rule. It is calculated as the ratio of the observed support for a rule to the expected support if the items were independent. Specifically, lift measures how much more likely the consequent (e.g., item B) is to occur given the antecedent (e.g., item A) than it would be if the two items were independent of each other.

Lift = Support of (A U B) / Support (A) x Support (B)

Lift is important for several reasons:

1. Independence Testing: Lift helps to determine whether the occurrence of one item (or set of items) influences the occurrence of another item (or set of items).
2. Actionable Insights: High lift values suggest strong associations between items, which can be leveraged for actionable insights in various applications such as marketing, inventory management, and recommendation systems.
3. Prioritizing Rules: When there are many association rules generated, lift can help prioritize which rules are more significant. Rules with higher lift values are typically more interesting and useful because they indicate stronger relationships between items.


Q2. What is support and Confidence. How do you calculate them?

A. Support is a measure of how frequently an itemset appears in the dataset. It indicates the proportion of transactions in the dataset that contain a particular itemset. Support helps in identifying the popularity of an itemset.

Support = No. of transactions containing A/ Total no. of transactions

Confidence is a measure of the reliability of an association rule. It is defined as the proportion of transactions that contain the antecedent which also contain the consequent. Confidence indicates the likelihood that the consequent is purchased when the antecedent is purchased.

Confidence = Support (A u B) / Support (A)

Q3. What are some limitations or challenges of Association rules mining?

A. Association rule mining is a powerful technique for uncovering interesting relationships in large datasets, but it also comes with several limitations and challenges:

1. Scalability: As the size of the dataset and the number of items increase, the computational complexity grows exponentially. The algorithm has to examine a potentially vast number of itemsets and their combinations, making it computationally intensive and time-consuming.
2. Redundancy: Many rules might be redundant or provide similar information, making it hard to extract unique insights. Eg - many rules, such as "bread → milk" and "milk → bread," which might not provide new insights.
3. Rare Item Problem: Items that occur infrequently might not be identified as significant, even if they have strong associations with other items.
4. Dynamic Inventory: Seasonal products and changing inventory levels require frequent updates to the analysis.
5. Sparse Data Issues: Most transactions contain only a few items, making it harder to find strong associations.