In [2]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Sample transaction data (in a real scenario, this would be loaded from a CSV/database)
# Each inner list is one shopping basket: the item names bought together
# in a single transaction.
transactions = [
    ["bread", "milk", "eggs"],
    ["bread", "butter", "jam"],
    ["milk", "butter", "eggs", "yogurt"],
    ["bread", "milk", "butter", "eggs"],
    ["bread", "milk", "butter"],
    ["eggs", "bacon", "bread"],
    ["milk", "eggs", "cereal", "fruit"],
    ["bread", "eggs", "bacon"],
    ["bread", "milk", "eggs", "yogurt"],
    ["coffee", "sugar", "milk"],
    ["coffee", "bread", "butter"],
    ["bread", "eggs", "milk", "cheese"],
    ["milk", "yogurt", "fruit"],
    ["bread", "coffee", "sugar"],
    ["eggs", "bacon"],
    ["bread", "milk", "cheese"],
    ["bread", "butter", "eggs", "bacon"],
    ["cereal", "milk", "sugar"],
    ["bread", "jam", "milk"],
    ["bread", "cheese", "eggs"],
]

# Function to convert transactions to one-hot encoded format
def encode_transactions(transactions):
    """Return a one-hot encoded DataFrame built from a list of baskets.

    Args:
        transactions: Iterable of item-name lists, one list per transaction.

    Returns:
        A DataFrame with one row per transaction, whose columns are taken
        from the fitted ``TransactionEncoder``'s ``columns_`` attribute.
    """
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    encoded_matrix = encoder.transform(transactions)
    return pd.DataFrame(encoded_matrix, columns=encoder.columns_)

# Apply Apriori algorithm to find frequent itemsets
def find_frequent_itemsets(df, min_support=0.1):
    """Run Apriori on an encoded transaction table.

    Args:
        df: One-hot encoded transactions (as produced by
            ``encode_transactions``).
        min_support: Minimum support passed through to ``apriori``.

    Returns:
        The DataFrame returned by ``apriori`` (with item names rather than
        column indices), extended with an ``itemsets_len`` column holding
        the number of items in each itemset.
    """
    itemsets = apriori(df, min_support=min_support, use_colnames=True)
    # Record each itemset's size so callers can filter by it.
    itemsets['itemsets_len'] = itemsets['itemsets'].apply(len)
    return itemsets

# Generate association rules from frequent itemsets
def generate_rules(frequent_itemsets, min_threshold=0.5):
    """Derive association rules from frequent itemsets, filtered by confidence.

    Args:
        frequent_itemsets: Frequent-itemset DataFrame (as produced by
            ``find_frequent_itemsets``).
        min_threshold: Minimum value of the ``confidence`` metric a rule
            must reach to be kept.

    Returns:
        The rules DataFrame returned by ``association_rules``.
    """
    return association_rules(
        frequent_itemsets,
        metric="confidence",
        min_threshold=min_threshold,
    )

# Visualize the support of frequent itemsets
def plot_support(frequent_itemsets):
    """Draw a horizontal bar chart of support for small frequent itemsets.

    Only itemsets containing one or two items are shown, to keep the axis
    labels readable. Bars are ordered from highest to lowest support.

    Args:
        frequent_itemsets: DataFrame with ``itemsets``, ``support`` and
            ``itemsets_len`` columns.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure current.
    """
    plt.figure(figsize=(10, 6))

    is_small = frequent_itemsets['itemsets_len'] <= 2
    small_sets = frequent_itemsets[is_small].copy()

    # Frozensets are not usable as axis labels; render them as text.
    small_sets['itemsets_str'] = small_sets['itemsets'].apply(', '.join)

    small_sets = small_sets.sort_values(by='support', ascending=False)

    sns.barplot(data=small_sets, x='support', y='itemsets_str')
    plt.title('Support of Frequent Itemsets')
    plt.xlabel('Support')
    plt.ylabel('Itemsets')
    plt.tight_layout()

    return plt

# Visualize the association rules
def plot_rules(rules):
    """Scatter-plot the ten highest-lift rules as support vs. confidence.

    Each point is one rule; marker size encodes lift, and the point is
    annotated with the rule text ("antecedents → consequents").

    Exactly one figure is created. An earlier version opened an extra,
    never-drawn figure before the real one, which leaked an empty figure
    (and rendered as a blank output in notebooks).

    Args:
        rules: DataFrame with ``antecedents``, ``consequents``, ``support``,
            ``confidence`` and ``lift`` columns. It is not modified.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure current.
    """
    # Work on a copy so the caller's DataFrame gains no helper columns.
    rules_plot = rules.copy()
    rules_plot['antecedents_str'] = rules_plot['antecedents'].apply(', '.join)
    rules_plot['consequents_str'] = rules_plot['consequents'].apply(', '.join)
    rules_plot['rule'] = (
        rules_plot['antecedents_str'] + ' → ' + rules_plot['consequents_str']
    )

    # Keep only the ten strongest rules so the annotations stay legible.
    rules_plot = rules_plot.sort_values('lift', ascending=False).head(10)

    # Single figure, same size as the one the original actually drew on.
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x='support', y='confidence', size='lift',
                    sizes=(50, 400), data=rules_plot, alpha=0.7)

    # Label every point with its rule text, nudged right of the marker.
    for label, support, confidence in zip(rules_plot['rule'],
                                          rules_plot['support'],
                                          rules_plot['confidence']):
        plt.annotate(label,
                     (support, confidence),
                     xytext=(7, 0),
                     textcoords='offset points',
                     fontsize=8)

    plt.title('Association Rules: Support vs Confidence (size represents lift)')
    plt.xlabel('Support')
    plt.ylabel('Confidence')
    plt.tight_layout()

    return plt

# Main analysis function
def perform_market_basket_analysis(transactions, min_support=0.1, min_confidence=0.5):
    """Run the full pipeline: encode, mine frequent itemsets, derive rules.

    Args:
        transactions: Iterable of item-name lists, one per transaction.
        min_support: Minimum support forwarded to ``find_frequent_itemsets``.
        min_confidence: Minimum confidence forwarded to ``generate_rules``.

    Returns:
        A tuple ``(encoded, itemsets, rules)``: the one-hot encoded
        transactions, the frequent itemsets, and the association rules.
    """
    encoded = encode_transactions(transactions)
    itemsets = find_frequent_itemsets(encoded, min_support=min_support)
    mined_rules = generate_rules(itemsets, min_threshold=min_confidence)
    return encoded, itemsets, mined_rules

# Run the pipeline on the sample baskets
df, frequent_itemsets, rules = perform_market_basket_analysis(
    transactions,
    min_support=0.15,  # keep itemsets present in at least 15% of transactions
    min_confidence=0.6,  # keep rules with at least 60% confidence
)

# Summarise the dataset, then show the strongest itemsets and rules
print("Dataset Overview:")
print(f"Number of transactions: {len(transactions)}")
print(f"Number of unique items: {df.shape[1]}")

print("\nTop 10 Frequent Itemsets by Support:")
print(frequent_itemsets.sort_values(by='support', ascending=False).head(10))

print("\nTop 10 Association Rules by Lift:")
print(
    rules.sort_values(by='lift', ascending=False)
    .head(10)
    [['antecedents', 'consequents', 'support', 'confidence', 'lift']]
)

# Calculate some basic business insights
def extract_business_insights(rules, frequent_itemsets):
    """Pick out the headline itemsets and rules for reporting.

    Args:
        rules: Rules DataFrame with at least ``lift`` and ``support`` columns.
        frequent_itemsets: Itemset DataFrame with ``itemsets_len`` and
            ``support`` columns.

    Returns:
        A tuple ``(single_items, top_associations, products_together)``:
        all single-item itemsets ordered by descending support, the five
        rules with the highest lift, and the five rules with the highest
        support.
    """
    is_single = frequent_itemsets['itemsets_len'].eq(1)
    single_items = frequent_itemsets.loc[is_single].sort_values(
        by='support', ascending=False
    )

    def _top_five(metric):
        # Five best rules according to the given metric column.
        return rules.sort_values(by=metric, ascending=False).head(5)

    top_associations = _top_five('lift')
    products_together = _top_five('support')

    return single_items, top_associations, products_together

# Derive the headline figures from the mined rules and itemsets
single_items, top_associations, products_together = extract_business_insights(
    rules, frequent_itemsets
)

# Report the insights in plain language
print("\n=== BUSINESS INSIGHTS ===")

print("\nMost Commonly Purchased Products:")
for entry in single_items.itertuples(index=False):
    # Single-item itemsets hold exactly one element.
    item = next(iter(entry.itemsets))
    print(f"- {item}: appears in {entry.support*100:.1f}% of transactions")

print("\nStrongest Product Associations (by lift):")
for rule in top_associations.itertuples(index=False):
    antecedent = ', '.join(rule.antecedents)
    consequent = ', '.join(rule.consequents)
    print(f"- If customer buys {antecedent}, they are {rule.lift:.2f}x more likely to buy {consequent}")
    print(f"  (Support: {rule.support*100:.1f}%, Confidence: {rule.confidence*100:.1f}%)")

print("\nProducts Most Often Bought Together (by support):")
for rule in products_together.itertuples(index=False):
    antecedent = ', '.join(rule.antecedents)
    consequent = ', '.join(rule.consequents)
    print(f"- {antecedent} and {consequent} appear together in {rule.support*100:.1f}% of transactions")
    print(f"  (Confidence: {rule.confidence*100:.1f}%, Lift: {rule.lift:.2f})")


Dataset Overview:
Number of transactions: 20
Number of unique items: 12

Top 10 Frequent Itemsets by Support:
    support         itemsets  itemsets_len
1      0.70          (bread)             1
6      0.60           (milk)             1
5      0.55           (eggs)             1
13     0.40    (eggs, bread)             2
14     0.35    (milk, bread)             2
2      0.30         (butter)             1
17     0.30     (eggs, milk)             2
11     0.25  (butter, bread)             2
0      0.20          (bacon)             1
10     0.20    (eggs, bacon)             2

Top 10 Association Rules by Lift:
      antecedents    consequents  support  confidence      lift
8         (bacon)  (eggs, bread)     0.15    0.750000  1.875000
1         (bacon)         (eggs)     0.20    1.000000  1.818182
7  (bacon, bread)         (eggs)     0.15    1.000000  1.818182
5        (yogurt)         (milk)     0.15    1.000000  1.666667
3        (cheese)        (bread)     0.15    1.000000  1.42857