# Assignment 19: Association Rules (Market Basket Analysis)

## Dataset: Online Retail

**Topics Covered:**
- Apriori Algorithm
- Support, Confidence, Lift
- Market Basket Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Read the retail workbook into a DataFrame and report its dimensions.
source_file = 'Online retail.xlsx'
df = pd.read_excel(source_file)
loaded_shape = df.shape
print("Dataset loaded! Shape:", loaded_shape)
# Preview the first rows (rendered as the cell output).
df.head()

In [None]:
# Clean data: a row is kept only if it passes all three checks below.
# Invoice numbers starting with 'C' are treated as cancelled transactions.
is_cancelled = df['InvoiceNo'].astype(str).str.startswith('C')
# Rows without a product description cannot be assigned to an item.
has_description = df['Description'].notna()
# Only strictly positive quantities count as purchases.
is_positive_qty = df['Quantity'] > 0

df = df[~is_cancelled & has_description & is_positive_qty]

print("After cleaning:", df.shape)

In [None]:
# Group by invoice to create transactions: one row per InvoiceNo, one column
# per Description, each cell holding the summed Quantity for that pair.
# fill_value=0 fills the absent (invoice, item) combinations during the pivot
# itself, so no separate fillna pass or reset_index/set_index round trip
# (each of which copies the whole matrix) is needed.
basket = df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack(fill_value=0)

# Convert to binary (1 if item was purchased)
def encode_units(x):
    """Map a quantity to a purchase flag: 0 when x <= 0, otherwise 1."""
    return 0 if x <= 0 else 1

# Vectorised comparison: True where the summed quantity is positive, False
# otherwise. This replaces one Python-level encode_units call per cell and
# produces a boolean matrix, the input type mlxtend recommends for apriori.
basket_encoded = basket > 0
print("Basket matrix shape:", basket_encoded.shape)

In [None]:
# Apply Apriori algorithm
print("=== Apriori Algorithm ===")

# Mine every itemset whose support is at least 0.02 (2% of transactions);
# use_colnames=True reports item names instead of column positions.
mined_itemsets = apriori(basket_encoded, min_support=0.02, use_colnames=True)
print("Number of frequent itemsets:", len(mined_itemsets))

# Order the itemsets from most to least frequent.
frequent_itemsets = mined_itemsets.sort_values(by='support', ascending=False)
print("\nTop 10 frequent itemsets:")
frequent_itemsets.head(10)

In [None]:
# Generate association rules
print("=== Association Rules ===")

# Derive rules using lift as the metric with a threshold of 1.0, then rank
# them from the highest lift downwards.
rules = association_rules(
    frequent_itemsets, metric='lift', min_threshold=1.0
).sort_values(by='lift', ascending=False)

print("Number of rules:", len(rules))
print("\nTop 10 rules by lift:")
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules[rule_columns].head(10)

In [None]:
# Visualize rules: one point per rule, positioned by support and confidence
# and coloured by lift.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    rules['support'],
    rules['confidence'],
    c=rules['lift'],
    cmap='viridis',
    alpha=0.5,
)
fig.colorbar(points, ax=ax, label='Lift')
ax.set_xlabel('Support')
ax.set_ylabel('Confidence')
ax.set_title('Association Rules: Support vs Confidence')
# Write the image to disk before showing it.
fig.savefig('rules_scatter.png')
plt.show()

In [None]:
# Filter strong rules: lift of at least 2 and confidence of at least 0.5.
high_lift = rules['lift'] >= 2
high_confidence = rules['confidence'] >= 0.5
strong_rules = rules[high_lift & high_confidence]

print("=== Strong Rules (Lift>=2, Confidence>=0.5) ===")
print("Number of strong rules:", len(strong_rules))
summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
strong_rules[summary_columns].head(10)

## Summary

**Key Concepts:**
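- **Support**: Fraction of all transactions in which the items appear together
- **Confidence**: Probability of buying B given that A is bought, i.e. support(A and B) / support(A)
- **Lift**: How many times more likely B is bought when A is bought, compared with B's overall purchase rate, i.e. confidence(A -> B) / support(B); lift > 1 indicates a positive association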

**Business Application:**
- Product placement
- Cross-selling recommendations
- Bundle offers