# Association Rules(Assignment)

In [9]:
import pandas as pd
import numpy as np

## Loading and Preprocessing the dataset

In [10]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
df = pd.read_excel('Online retail.xlsx')

# Step 1: Rename the long unnamed column to 'Items'
df.columns = ['Items']

# Step 2: Convert the comma-separated string of items into a list
df['Items'] = df['Items'].apply(lambda x: [item.strip() for item in str(x).split(',') if item.strip() != ''])

# Step 3: Drop duplicate transactions
df = df.drop_duplicates()

# Step 4: Convert transactions to one-hot encoded format using TransactionEncoder
transactions = df['Items'].tolist()
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Step 5: Find frequent itemsets using Apriori
frequent_itemsets = apriori(df_encoded, min_support=0.02, use_colnames=True)

# Step 6: Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Step 7: Show the resulting rules with useful metrics
result = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
print(result.sort_values(by='lift', ascending=False))


                    antecedents                 consequents   support  \
122             (herb & pepper)               (ground beef)  0.022802   
123               (ground beef)             (herb & pepper)  0.022802   
204  (spaghetti, mineral water)               (ground beef)  0.024734   
209               (ground beef)  (spaghetti, mineral water)  0.024734   
114                  (tomatoes)         (frozen vegetables)  0.022609   
..                          ...                         ...       ...   
94               (french fries)                      (milk)  0.033816   
70                       (eggs)               (ground beef)  0.028792   
71                (ground beef)                      (eggs)  0.028792   
11              (mineral water)                   (burgers)  0.034589   
10                    (burgers)             (mineral water)  0.034589   

     confidence      lift  
122    0.343023  2.525100  
123    0.167852  2.525100  
204    0.290909  2.141472  
209    0.18

## Association Rule Mining

In [11]:
from mlxtend.frequent_patterns import apriori, association_rules

# Rename and process the single column
df.columns = ['Items']
df['Items'] = df['Items'].apply(lambda x: [item.strip() for item in str(x).split(',') if item.strip() != ''])
df = df.drop_duplicates()

# Convert to list of lists format for transactions
transactions = df['Items'].tolist()

# One-hot encode the transactions
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df_encoded, min_support=0.02, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)

# Show rules sorted by lift
rules = rules.sort_values(by='lift', ascending=False)
rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
print(rules.head(10))  # Show top 10 interesting rules


          antecedents        consequents   support  confidence      lift
44      ('spaghetti')    ('ground beef')  0.041159    0.242045  2.470582
45    ('ground beef')      ('spaghetti')  0.041159    0.420118  2.470582
77       ('tomatoes')      ('spaghetti')  0.021256    0.383275  2.253920
76      ('spaghetti')       ('tomatoes')  0.021256    0.125000  2.253920
61           ('soup')  ('mineral water')  0.026280    0.470588  2.153222
60  ('mineral water')           ('soup')  0.026280    0.120248  2.153222
50      ('olive oil')           ('milk')  0.021836    0.297368  2.140308
51           ('milk')      ('olive oil')  0.021836    0.157163  2.140308
73      ('olive oil')      ('spaghetti')  0.026473    0.360526  2.120141
72      ('spaghetti')      ('olive oil')  0.026473    0.155682  2.120141


## Extracting patterns

In [12]:
from mlxtend.frequent_patterns import apriori, association_rules

# Generate frequent itemsets with lower min_support
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

# Generate rules with lower confidence
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.4)

# Relax lift threshold to capture more associations
meaningful_rules = rules[(rules['lift'] > 1.0) & (rules['confidence'] >= 0.4)]

# Sort and view the rules
meaningful_rules = meaningful_rules.sort_values(by='lift', ascending=False)
print(meaningful_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))


                               antecedents        consequents   support  \
0                              ('burgers')        (['turkey')  0.014106   
16                 ('ground beef', 'milk')      ('spaghetti')  0.010242   
20                        ('soup', 'milk')  ('mineral water')  0.010435   
2                          ('ground beef')      ('spaghetti')  0.041159   
17        ('mineral water', 'ground beef')      ('spaghetti')  0.017391   
8                               ('pepper')      ('spaghetti')  0.010821   
13  ('mineral water', 'frozen vegetables')      ('spaghetti')  0.010435   
4                        ('herb & pepper')      ('spaghetti')  0.010628   
15                 ('ground beef', 'milk')  ('mineral water')  0.011594   
14      ('spaghetti', 'frozen vegetables')  ('mineral water')  0.010435   

    confidence       lift  
0     0.948052  11.709233  
16    0.456897   2.686863  
20    0.574468   2.628534  
2     0.420118   2.470582  
17    0.416667   2.450284  
8     

## Analysis and Interpretation

**1. Spaghetti**

    (frozen vegetables, ground beef) ⇒ spaghetti
    (eggs, ground beef) ⇒ spaghetti
    (chocolate, olive oil) ⇒ spaghetti

    Customers who buy basic ingredients like ground beef, eggs, or vegetables are also highly likely to buy spaghetti.

**2. Mineral Water is a Frequent Pair**

    (soup, milk) ⇒ mineral water
    (frozen vegetables, ground beef) ⇒ mineral water

    Customers who buy healthy meals also purchase mineral water.

**3. Ground Beef**

    (ground beef, milk) ⇒ spaghetti
    (chocolate, ground beef) ⇒ spaghetti
    (ground beef, mineral water) ⇒ spaghetti

    Whenever ground beef is bought, people tend to buy several other items with it.

## Interview Questions

#### Q.	What is lift and why is it important in Association rules?

Lift tells you how much more likely two items are to be bought together than separately by chance.

Lift shows how interesting or useful a rule is.

A high lift means: "If someone buys X, they're way more likely to also buy Y — not just by chance."

#### Q.	What is support and Confidence. How do you calculate them?

**Support:**
    Support tells you how often an item or set of items appears in all transactions.

    eg- 100 transactions,
        15 of those include milk
        Support(milk) = 15 / 100 = 0.15 (or 15%)
        **Support(X) = transactions with X / total**

**Confidence:**
    Confidence tells you how often Y is bought when X is bought.
    
    "If someone buys X, how confident are we that they also buy Y?"
    
    eg- 15 people bought milk
        10 of them also bought bread
        Confidence(milk ⇒ bread) = 10 / 15 = 0.67 (or 67%)
        **Confidence(X ⇒ Y) = Support(X and Y) / Support(X)**

#### Q. What are some limitations or challenges of Association rules mining?

The limitations include:

**1. Too Many Rules:**

    Even with small datasets, you can generate thousands of rules.

**2. Not All Rules Are Meaningful:**

    Just because two products are bought together doesn’t mean there’s a useful or causal relationship.

**3. Ignores Item Quantities:**

    Association rules typically only consider whether an item was bought or not

**4. No Time Factor:**

    Rules do not consider the order or timing of purchases.