# Assignment 10 (Association Rules)

In [1]:
!pip install mlxtend pandas





## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import mlxtend.frequent_patterns 
from mlxtend.frequent_patterns import apriori, association_rules

## Load Dataset

In [3]:
df = pd.read_excel("Online Retail.xlsx", header=None)
df
# Each row is one transaction
transactions = df[0].apply(lambda x: x.split(','))   # split by comma
transactions.head()

0    [shrimp, almonds, avocado, vegetables mix, gre...
1                           [burgers, meatballs, eggs]
2                                            [chutney]
3                                    [turkey, avocado]
4    [mineral water, milk, energy bar, whole wheat ...
Name: 0, dtype: object

In [4]:
from mlxtend.preprocessing import TransactionEncoder
# Convert transactions to list of lists
transaction_list = transactions.tolist()
# Apply TransactionEncoder
te = TransactionEncoder()
te_array = te.fit(transaction_list).transform(transaction_list)
basket = pd.DataFrame(te_array, columns=te.columns_)
basket.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


## EDA

In [5]:
df.head()

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


In [6]:
df.tail()

Unnamed: 0,0
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"
7500,"eggs,frozen smoothie,yogurt cake,low fat yogurt"


In [7]:
df.dtypes

0    object
dtype: object

In [8]:
df.describe()

Unnamed: 0,0
count,7501
unique,5176
top,cookies
freq,223


In [9]:
df.shape

(7501, 1)

In [10]:
df.isnull()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
7496,False
7497,False
7498,False
7499,False


In [11]:
df.isnull().sum()

0    0
dtype: int64

In [12]:
df.duplicated().sum()

2325

In [13]:
means=df.mean(numeric_only=True)
means

Series([], dtype: float64)

In [14]:
median=df.median(numeric_only=True)
median

Series([], dtype: float64)

In [15]:
std=df.std(numeric_only=True)
std

Series([], dtype: float64)

## Data Preprocessing

In [16]:
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split

In [17]:
# Check missing values
print("\nMissing Values per Column:")
print(df.isnull().sum())


Missing Values per Column:
0    0
dtype: int64


In [18]:
print(df.columns)

Index([0], dtype='int64')


In [19]:
# Convert True/False into 1/0
basket = basket.astype(int)
basket

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Remove missing values
df.dropna(inplace=True)
df

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


## Association Rule Mining

In [21]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt

In [22]:
# Step 1: Create sample dataset
dataset = [
    ['asparagus', 'almonds', 'asparagus'],
    ['avocado', 'bacon'],
    ['blueberries', 'water spray'],
    ['asparagus', 'almonds', 'white wine', 'yams'],
    ['yogurt cake']
]

In [23]:
# Convert dataset into one-hot encoded DataFrame
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [24]:
print("One-Hot Encoded Dataset:")
print(df)

One-Hot Encoded Dataset:
   almonds  asparagus  avocado  bacon  blueberries  water spray  white wine  \
0     True       True    False  False        False        False       False   
1    False      False     True   True        False        False       False   
2    False      False    False  False         True         True       False   
3     True       True    False  False        False        False        True   
4    False      False    False  False        False        False       False   

    yams  yogurt cake  
0  False        False  
1  False        False  
2  False        False  
3   True        False  
4  False         True  


In [25]:
# Apply Apriori Algorithm
frequent_itemsets = apriori(df, min_support=0.4, use_colnames=True)

print("\nFrequent Itemsets:")
print(frequent_itemsets)


Frequent Itemsets:
   support              itemsets
0      0.4             (almonds)
1      0.4           (asparagus)
2      0.4  (almonds, asparagus)


In [26]:
# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Keep only relevant columns
rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

print("\nAssociation Rules:")
print(rules.sort_values(by="lift", ascending=False).head(10))



Association Rules:
   antecedents  consequents  support  confidence  lift
0    (almonds)  (asparagus)      0.4         1.0   2.5
1  (asparagus)    (almonds)      0.4         1.0   2.5


In [27]:
# Generate Rules with Confidence and Lift Threshold
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.6   # Only keep rules with confidence >= 60%
)

In [28]:
# Apply additional filter for lift
rules = rules[rules['lift'] >= 1.2]   # Keep only rules with lift >= 1.2

# Select relevant columns
rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

print("\nFiltered Association Rules:")
print(rules.sort_values(by="lift", ascending=False).head(15))


Filtered Association Rules:
   antecedents  consequents  support  confidence  lift
0    (almonds)  (asparagus)      0.4         1.0   2.5
1  (asparagus)    (almonds)      0.4         1.0   2.5


## Analysis and Interpretation

In [32]:
# Sort by different metrics to analyze
print("\nTop Rules by Confidence:")
print(rules.sort_values(by="confidence", ascending=False).head(10))

print("\nTop Rules by Lift:")
print(rules.sort_values(by="lift", ascending=False).head(10))

print("\nTop Rules by Support:")
print(rules.sort_values(by="support", ascending=False).head(10))


Top Rules by Confidence:
   antecedents  consequents  support  confidence  lift
0    (almonds)  (asparagus)      0.4         1.0   2.5
1  (asparagus)    (almonds)      0.4         1.0   2.5

Top Rules by Lift:
   antecedents  consequents  support  confidence  lift
0    (almonds)  (asparagus)      0.4         1.0   2.5
1  (asparagus)    (almonds)      0.4         1.0   2.5

Top Rules by Support:
   antecedents  consequents  support  confidence  lift
0    (almonds)  (asparagus)      0.4         1.0   2.5
1  (asparagus)    (almonds)      0.4         1.0   2.5


In [35]:
# Step 1: Sort and inspect rules
top_by_support = rules.sort_values(by="support", ascending=False).head(5)
top_by_confidence = rules.sort_values(by="confidence", ascending=False).head(5)
top_by_lift = rules.sort_values(by="lift", ascending=False).head(5)

print("\nTop 5 Rules by Support:\n", top_by_support)
print("\nTop 5 Rules by Confidence:\n", top_by_confidence)
print("\nTop 5 Rules by Lift:\n", top_by_lift)


Top 5 Rules by Support:
    antecedents  consequents  support  confidence  lift
0    (almonds)  (asparagus)      0.4         1.0   2.5
1  (asparagus)    (almonds)      0.4         1.0   2.5

Top 5 Rules by Confidence:
    antecedents  consequents  support  confidence  lift
0    (almonds)  (asparagus)      0.4         1.0   2.5
1  (asparagus)    (almonds)      0.4         1.0   2.5

Top 5 Rules by Lift:
    antecedents  consequents  support  confidence  lift
0    (almonds)  (asparagus)      0.4         1.0   2.5
1  (asparagus)    (almonds)      0.4         1.0   2.5


In [36]:
# Function to convert rule into readable insight
def generate_insight(row):
    antecedent = ', '.join(list(row['antecedents']))
    consequent = ', '.join(list(row['consequents']))
    return (f"Customers who buy [{antecedent}] are likely to also buy [{consequent}] "
            f"(Support: {row['support']:.2f}, "
            f"Confidence: {row['confidence']:.2f}, "
            f"Lift: {row['lift']:.2f}).")

In [37]:
# Generate Insights
print("\nðŸ”Ž Insights from Top Rules:\n")
for _, row in top_by_lift.iterrows():
    print(generate_insight(row))

for _, row in top_by_confidence.iterrows():
    print(generate_insight(row))


ðŸ”Ž Insights from Top Rules:

Customers who buy [almonds] are likely to also buy [asparagus] (Support: 0.40, Confidence: 1.00, Lift: 2.50).
Customers who buy [asparagus] are likely to also buy [almonds] (Support: 0.40, Confidence: 1.00, Lift: 2.50).
Customers who buy [almonds] are likely to also buy [asparagus] (Support: 0.40, Confidence: 1.00, Lift: 2.50).
Customers who buy [asparagus] are likely to also buy [almonds] (Support: 0.40, Confidence: 1.00, Lift: 2.50).


### Conclusion

In this assignment, association rule mining was applied to uncover hidden patterns and relationships in transactional data. By using metrics such as support, confidence, and lift, meaningful rules were identified that reveal strong associations between different items. The results provide valuable insights into customer purchasing behavior, helping businesses make data-driven decisions, such as product placement, cross-selling, and promotional strategies.


## Interview Questions

### 1.	What is lift and why is it important in Association rules?

Lift in association rules measures how much more likely the consequent is to occur when the antecedent is present, compared to its independent likelihood. It's calculated as the observed support of the rule divided by the product of the individual supports of the antecedent and consequent. Lift is importantÂ because

It helps identify truly strong and interesting associations by normalizing for the individual popularity of items, preventing high-support, yet ultimately uninteresting, rules from being consideredÂ significant.

Why Lift is important:

1. It Filters out weak rules.

2. It normalize for item popularity

3. It helpes find genuinely interesting associations

4. It enhance business insights

### 2.What is support and Confidence. How do you calculate them?

Formula: Support(X) = (Number of transactions containing X) / (Total number of transactions) 

Formula: Confidence(X â†’ Y) = (Support of XâˆªY) /Â (SupportÂ ofÂ X)

### 3.What are some limitations or challenges of Association rules mining?

Limitations and challenges of association rule mining include the potential for generating an overwhelming number of rules, the difficulty in identifying truly meaningful patterns, computational expense with large datasets, and the necessity of setting appropriate minimum support and confidence thresholds to avoid trivial or overlooked rules. It also struggles with continuous variables and complex, multi-way relationships, requiring significant data preprocessing and domain knowledge for effectiveÂ interpretation.