In [8]:
# Title: Implement Apriori for Association Rule Mining

# Task 1: Market Basket Analysis with Simple Transactions
# Step 1: Define a simple dataset of transactions.
# Step 2: Implement the Apriori algorithm using the apyori library.

# Task 2: Apriori Implementation with a Groceries Dataset
# Step 1: Load a sample dataset of grocery transactions. (Consider creating or downloading a CSV file of transactions)
# Step 2: Convert transactions for Apriori and execute the algorithm.
    
# Task 3: Finding Frequent Itemsets in Large Dataset
# Step 1: Use a pre-existing large dataset or generate a synthetic dataset.
# Step 2: Run Apriori and identify frequent itemsets.


In [9]:
import pandas as pd
from apyori import apriori

# Task 1: Market Basket Analysis with Simple Transactions

# Step 1: Define a simple dataset of transactions.
# Each sublist represents a single transaction, containing items purchased together.
transactions_data = [
    ['milk', 'bread', 'butter'],
    ['milk', 'sugar', 'coffee'],
    ['bread', 'eggs'],
    ['milk', 'bread', 'sugar'],
    ['coffee', 'bread'],
    ['milk', 'bread', 'eggs', 'sugar'],
    ['butter', 'sugar', 'coffee'],
    ['milk', 'bread', 'butter', 'coffee']
]

print("--- Original Transactions Dataset ---")
for i, transaction in enumerate(transactions_data):
    print(f"Transaction {i+1}: {transaction}")
print("\n")

# Step 2: Implement the Apriori algorithm using the apyori library.

# The apriori function expects a list of lists, where each inner list is a transaction.
# Our 'transactions_data' is already in this format.

# Parameters for the Apriori algorithm:
# min_support: The minimum support threshold for an itemset to be considered frequent.
#              Support(A) = (Number of transactions containing A) / (Total number of transactions)
# min_confidence: The minimum confidence threshold for a rule to be considered interesting.
#                 Confidence(A -> B) = Support(A U B) / Support(A)
# min_lift: The minimum lift threshold for a rule. Lift(A -> B) = Confidence(A -> B) / Support(B)
#           Lift > 1 indicates a positive correlation.
# min_length: Minimum number of items in the itemset.
# max_length: Maximum number of items in the itemset.

# Let's set some common thresholds for demonstration.
# These values can be adjusted based on the dataset and desired output.
rules = apriori(transactions_data,
                min_support=0.25,      # Items appearing in at least 25% of transactions
                min_confidence=0.5,    # Rules must have at least 50% confidence
                min_lift=1.0,          # Only consider rules with lift >= 1 (positive correlation)
                min_length=2,          # Consider itemsets with at least 2 items
                max_length=4)          # Consider itemsets with at most 4 items

# Convert the rules (generator object) into a list for easier inspection
results = list(rules)

print("--- Apriori Algorithm Results ---")
print(f"Found {len(results)} frequent itemsets/rules.\n")

# Display the results in a readable format
if not results:
    print("No frequent itemsets or rules found with the given thresholds.")
else:
    for item in results:
        # First itemset of the rule
        # The 'items' attribute contains the frozenset of items in the itemset.
        # The 'support' attribute contains the support value of the itemset.
        pair = item[0]
        items = [x for x in pair]
        print(f"Itemset: {items}")
        print(f"Support: {item[1]:.4f}")

        # Second itemset of the rule (association rules)
        # The 'ordered_statistics' attribute contains information about the rules derived from the itemset.
        # Each rule has a 'items_base' (antecedent), 'items_add' (consequent), 'confidence', and 'lift'.
        if item[2]: # Check if there are any ordered statistics (rules)
            for rule in item[2]:
                antecedent = [x for x in rule[0]]
                consequent = [x for x in rule[1]]
                confidence = rule[2]
                lift = rule[3]

                print(f"  Rule: {antecedent} -> {consequent}")
                print(f"    Confidence: {confidence:.4f}")
                print(f"    Lift: {lift:.4f}")
        print("-" * 30)



--- Original Transactions Dataset ---
Transaction 1: ['milk', 'bread', 'butter']
Transaction 2: ['milk', 'sugar', 'coffee']
Transaction 3: ['bread', 'eggs']
Transaction 4: ['milk', 'bread', 'sugar']
Transaction 5: ['coffee', 'bread']
Transaction 6: ['milk', 'bread', 'eggs', 'sugar']
Transaction 7: ['butter', 'sugar', 'coffee']
Transaction 8: ['milk', 'bread', 'butter', 'coffee']


--- Apriori Algorithm Results ---
Found 12 frequent itemsets/rules.

Itemset: ['bread']
Support: 0.7500
  Rule: [] -> ['bread']
    Confidence: 0.7500
    Lift: 1.0000
------------------------------
Itemset: ['coffee']
Support: 0.5000
  Rule: [] -> ['coffee']
    Confidence: 0.5000
    Lift: 1.0000
------------------------------
Itemset: ['milk']
Support: 0.6250
  Rule: [] -> ['milk']
    Confidence: 0.6250
    Lift: 1.0000
------------------------------
Itemset: ['sugar']
Support: 0.5000
  Rule: [] -> ['sugar']
    Confidence: 0.5000
    Lift: 1.0000
------------------------------
Itemset: ['bread', 'eggs']


In [10]:
import pandas as pd
from apyori import apriori
import io # Import io for simulating file reading from a string

# Task 1: Market Basket Analysis with Simple Transactions

# Step 1: Define a simple dataset of transactions.
# Each sublist represents a single transaction, containing items purchased together.
transactions_data = [
    ['milk', 'bread', 'butter'],
    ['milk', 'sugar', 'coffee'],
    ['bread', 'eggs'],
    ['milk', 'bread', 'sugar'],
    ['coffee', 'bread'],
    ['milk', 'bread', 'eggs', 'sugar'],
    ['butter', 'sugar', 'coffee'],
    ['milk', 'bread', 'butter', 'coffee']
]

print("--- Original Transactions Dataset (Task 1) ---")
for i, transaction in enumerate(transactions_data):
    print(f"Transaction {i+1}: {transaction}")
print("\n")

# Step 2: Implement the Apriori algorithm using the apyori library for Task 1.

# The apriori function expects a list of lists, where each inner list is a transaction.
# Our 'transactions_data' is already in this format.

# Parameters for the Apriori algorithm:
# min_support: The minimum support threshold for an itemset to be considered frequent.
#              Support(A) = (Number of transactions containing A) / (Total number of transactions)
# min_confidence: The minimum confidence threshold for a rule to be considered interesting.
#                 Confidence(A -> B) = Support(A U B) / Support(A)
# min_lift: The minimum lift threshold for a rule. Lift(A -> B) = Confidence(A -> B) / Support(B)
#           Lift > 1 indicates a positive correlation.
# min_length: Minimum number of items in the itemset.
# max_length: Maximum number of items in the itemset.

# Let's set some common thresholds for demonstration.
# These values can be adjusted based on the dataset and desired output.
rules_task1 = apriori(transactions_data,
                      min_support=0.25,      # Items appearing in at least 25% of transactions
                      min_confidence=0.5,    # Rules must have at least 50% confidence
                      min_lift=1.0,          # Only consider rules with lift >= 1 (positive correlation)
                      min_length=2,          # Consider itemsets with at least 2 items
                      max_length=4)          # Consider itemsets with at most 4 items

# Convert the rules (generator object) into a list for easier inspection
results_task1 = list(rules_task1)

print("--- Apriori Algorithm Results (Task 1) ---")
print(f"Found {len(results_task1)} frequent itemsets/rules.\n")

# Display the results in a readable format
if not results_task1:
    print("No frequent itemsets or rules found for Task 1 with the given thresholds.")
else:
    for item in results_task1:
        pair = item[0]
        items = [x for x in pair]
        print(f"Itemset: {items}")
        print(f"Support: {item[1]:.4f}")

        if item[2]: # Check if there are any ordered statistics (rules)
            for rule in item[2]:
                antecedent = [x for x in rule[0]]
                consequent = [x for x in rule[1]]
                confidence = rule[2]
                lift = rule[3]

                print(f"  Rule: {antecedent} -> {consequent}")
                print(f"    Confidence: {confidence:.4f}")
                print(f"    Lift: {lift:.4f}")
        print("-" * 30)

print("\n" + "="*50 + "\n")

# Task 2: Apriori Implementation with a Groceries Dataset

# Step 1: Load a sample dataset of grocery transactions.
# We'll simulate a CSV file as a string for demonstration purposes.
# In a real scenario, you would load from a file like:
# df = pd.read_csv('groceries.csv', header=None)

groceries_csv_data = """
milk,bread,butter,eggs,sugar
coffee,tea,sugar,milk
bread,butter,jam
milk,eggs,cheese
bread,coffee,sugar
milk,tea,jam,butter
eggs,cheese
bread,milk,sugar,butter
coffee,tea
milk,bread,eggs
"""

# Use io.StringIO to treat the string data as a file
groceries_df = pd.read_csv(io.StringIO(groceries_csv_data), header=None)

print("--- Loaded Groceries Dataset (Task 2) ---")
print(groceries_df.head())
print("\n")

# Step 2: Convert transactions for Apriori and execute the algorithm.
# The `apyori` library expects a list of lists, where each sublist is a transaction.
# We need to convert our DataFrame rows into this format.

# Create a list of lists, where each inner list contains the non-NaN items from a row.
groceries_transactions = []
for i in range(len(groceries_df)):
    # Convert each row to a list, drop NaN values, and convert to string type
    groceries_transactions.append([str(item) for item in groceries_df.iloc[i].dropna()])

print("--- Converted Groceries Transactions (Task 2) ---")
for i, transaction in enumerate(groceries_transactions[:5]): # Print first 5 for brevity
    print(f"Transaction {i+1}: {transaction}")
print(f"... and {len(groceries_transactions) - 5} more transactions.\n")


# Execute Apriori algorithm on the groceries dataset
# Adjust thresholds as needed for this dataset
rules_task2 = apriori(groceries_transactions,
                      min_support=0.3,      # Example: Items appearing in at least 30% of transactions
                      min_confidence=0.6,   # Example: Rules must have at least 60% confidence
                      min_lift=1.2,         # Example: Only consider rules with lift >= 1.2
                      min_length=2,
                      max_length=4)

results_task2 = list(rules_task2)

print("--- Apriori Algorithm Results (Task 2 - Groceries) ---")
print(f"Found {len(results_task2)} frequent itemsets/rules.\n")

# Display the results for the groceries dataset
if not results_task2:
    print("No frequent itemsets or rules found for Task 2 with the given thresholds.")
else:
    for item in results_task2:
        pair = item[0]
        items = [x for x in pair]
        print(f"Itemset: {items}")
        print(f"Support: {item[1]:.4f}")

        if item[2]:
            for rule in item[2]:
                antecedent = [x for x in rule[0]]
                consequent = [x for x in rule[1]]
                confidence = rule[2]
                lift = rule[3]

                print(f"  Rule: {antecedent} -> {consequent}")
                print(f"    Confidence: {confidence:.4f}")
                print(f"    Lift: {lift:.4f}")
        print("-" * 30)



--- Original Transactions Dataset (Task 1) ---
Transaction 1: ['milk', 'bread', 'butter']
Transaction 2: ['milk', 'sugar', 'coffee']
Transaction 3: ['bread', 'eggs']
Transaction 4: ['milk', 'bread', 'sugar']
Transaction 5: ['coffee', 'bread']
Transaction 6: ['milk', 'bread', 'eggs', 'sugar']
Transaction 7: ['butter', 'sugar', 'coffee']
Transaction 8: ['milk', 'bread', 'butter', 'coffee']


--- Apriori Algorithm Results (Task 1) ---
Found 12 frequent itemsets/rules.

Itemset: ['bread']
Support: 0.7500
  Rule: [] -> ['bread']
    Confidence: 0.7500
    Lift: 1.0000
------------------------------
Itemset: ['coffee']
Support: 0.5000
  Rule: [] -> ['coffee']
    Confidence: 0.5000
    Lift: 1.0000
------------------------------
Itemset: ['milk']
Support: 0.6250
  Rule: [] -> ['milk']
    Confidence: 0.6250
    Lift: 1.0000
------------------------------
Itemset: ['sugar']
Support: 0.5000
  Rule: [] -> ['sugar']
    Confidence: 0.5000
    Lift: 1.0000
------------------------------
Itemset: 

In [11]:
import pandas as pd
from apyori import apriori
import io # Import io for simulating file reading from a string
import numpy as np # For generating synthetic large dataset
import random # For generating synthetic large dataset

# Task 1: Market Basket Analysis with Simple Transactions

# Step 1: Define a simple dataset of transactions.
# Each sublist represents a single transaction, containing items purchased together.
transactions_data = [
    ['milk', 'bread', 'butter'],
    ['milk', 'sugar', 'coffee'],
    ['bread', 'eggs'],
    ['milk', 'bread', 'sugar'],
    ['coffee', 'bread'],
    ['milk', 'bread', 'eggs', 'sugar'],
    ['butter', 'sugar', 'coffee'],
    ['milk', 'bread', 'butter', 'coffee']
]

print("--- Original Transactions Dataset (Task 1) ---")
for i, transaction in enumerate(transactions_data):
    print(f"Transaction {i+1}: {transaction}")
print("\n")

# Step 2: Implement the Apriori algorithm using the apyori library for Task 1.

# The apriori function expects a list of lists, where each inner list is a transaction.
# Our 'transactions_data' is already in this format.

# Parameters for the Apriori algorithm:
# min_support: The minimum support threshold for an itemset to be considered frequent.
#              Support(A) = (Number of transactions containing A) / (Total number of transactions)
# min_confidence: The minimum confidence threshold for a rule to be considered interesting.
#                 Confidence(A -> B) = Support(A U B) / Support(A)
# min_lift: The minimum lift threshold for a rule. Lift(A -> B) = Confidence(A -> B) / Support(B)
#           Lift > 1 indicates a positive correlation.
# min_length: Minimum number of items in the itemset.
# max_length: Maximum number of items in the itemset.

# Let's set some common thresholds for demonstration.
# These values can be adjusted based on the dataset and desired output.
rules_task1 = apriori(transactions_data,
                      min_support=0.25,      # Items appearing in at least 25% of transactions
                      min_confidence=0.5,    # Rules must have at least 50% confidence
                      min_lift=1.0,          # Only consider rules with lift >= 1 (positive correlation)
                      min_length=2,          # Consider itemsets with at least 2 items
                      max_length=4)          # Consider itemsets with at most 4 items

# Convert the rules (generator object) into a list for easier inspection
results_task1 = list(rules_task1)

print("--- Apriori Algorithm Results (Task 1) ---")
print(f"Found {len(results_task1)} frequent itemsets/rules.\n")

# Display the results in a readable format
if not results_task1:
    print("No frequent itemsets or rules found for Task 1 with the given thresholds.")
else:
    for item in results_task1:
        pair = item[0]
        items = [x for x in pair]
        print(f"Itemset: {items}")
        print(f"Support: {item[1]:.4f}")

        if item[2]: # Check if there are any ordered statistics (rules)
            for rule in item[2]:
                antecedent = [x for x in rule[0]]
                consequent = [x for x in rule[1]]
                confidence = rule[2]
                lift = rule[3]

                print(f"  Rule: {antecedent} -> {consequent}")
                print(f"    Confidence: {confidence:.4f}")
                print(f"    Lift: {lift:.4f}")
        print("-" * 30)

print("\n" + "="*50 + "\n")

# Task 2: Apriori Implementation with a Groceries Dataset

# Step 1: Load a sample dataset of grocery transactions.
# We'll simulate a CSV file as a string for demonstration purposes.
# In a real scenario, you would load from a file like:
# df = pd.read_csv('groceries.csv', header=None)

groceries_csv_data = """
milk,bread,butter,eggs,sugar
coffee,tea,sugar,milk
bread,butter,jam
milk,eggs,cheese
bread,coffee,sugar
milk,tea,jam,butter
eggs,cheese
bread,milk,sugar,butter
coffee,tea
milk,bread,eggs
"""

# Use io.StringIO to treat the string data as a file
groceries_df = pd.read_csv(io.StringIO(groceries_csv_data), header=None)

print("--- Loaded Groceries Dataset (Task 2) ---")
print(groceries_df.head())
print("\n")

# Step 2: Convert transactions for Apriori and execute the algorithm.
# The `apyori` library expects a list of lists, where each sublist is a transaction.
# We need to convert our DataFrame rows into this format.

# Create a list of lists, where each inner list contains the non-NaN items from a row.
groceries_transactions = []
for i in range(len(groceries_df)):
    # Convert each row to a list, drop NaN values, and convert to string type
    groceries_transactions.append([str(item) for item in groceries_df.iloc[i].dropna()])

print("--- Converted Groceries Transactions (Task 2) ---")
for i, transaction in enumerate(groceries_transactions[:5]): # Print first 5 for brevity
    print(f"Transaction {i+1}: {transaction}")
print(f"... and {len(groceries_transactions) - 5} more transactions.\n")


# Execute Apriori algorithm on the groceries dataset
# Adjust thresholds as needed for this dataset
rules_task2 = apriori(groceries_transactions,
                      min_support=0.3,      # Example: Items appearing in at least 30% of transactions
                      min_confidence=0.6,   # Example: Rules must have at least 60% confidence
                      min_lift=1.2,         # Example: Only consider rules with lift >= 1.2
                      min_length=2,
                      max_length=4)

results_task2 = list(rules_task2)

print("--- Apriori Algorithm Results (Task 2 - Groceries) ---")
print(f"Found {len(results_task2)} frequent itemsets/rules.\n")

# Display the results for the groceries dataset
if not results_task2:
    print("No frequent itemsets or rules found for Task 2 with the given thresholds.")
else:
    for item in results_task2:
        pair = item[0]
        items = [x for x in pair]
        print(f"Itemset: {items}")
        print(f"Support: {item[1]:.4f}")

        if item[2]:
            for rule in item[2]:
                antecedent = [x for x in rule[0]]
                consequent = [x for x in rule[1]]
                confidence = rule[2]
                lift = rule[3]

                print(f"  Rule: {antecedent} -> {consequent}")
                print(f"    Confidence: {confidence:.4f}")
                print(f"    Lift: {lift:.4f}")
        print("-" * 30)

print("\n" + "="*50 + "\n")

# Task 3: Finding Frequent Itemsets in Large Dataset

# Step 1: Use a pre-existing large dataset or generate a synthetic dataset.
# We will generate a synthetic dataset to simulate a large number of transactions.

def generate_synthetic_transactions(num_transactions=1000, num_unique_items=50, max_items_per_transaction=10):
    """
    Generates a synthetic list of transactions.
    """
    all_items = [f"item_{i+1}" for i in range(num_unique_items)]
    synthetic_transactions = []
    for _ in range(num_transactions):
        num_items = random.randint(1, max_items_per_transaction)
        transaction = random.sample(all_items, num_items)
        synthetic_transactions.append(transaction)
    return synthetic_transactions

# Generate a larger dataset
large_transactions_data = generate_synthetic_transactions(num_transactions=5000, num_unique_items=100, max_items_per_transaction=15)

print("--- Generated Large Synthetic Transactions Dataset (Task 3) ---")
print(f"Number of transactions: {len(large_transactions_data)}")
print(f"First 5 transactions: {large_transactions_data[:5]}\n")

# Step 2: Run Apriori and identify frequent itemsets.
# For a larger dataset, we might need to adjust the thresholds to get meaningful results
# without generating an overwhelming number of rules.
# Lowering min_support can lead to many more rules.

rules_task3 = apriori(large_transactions_data,
                      min_support=0.01,    # Example: Items appearing in at least 1% of transactions
                      min_confidence=0.2,  # Example: Rules must have at least 20% confidence
                      min_lift=1.0,        # Only consider rules with lift >= 1
                      min_length=2,
                      max_length=3)        # Limiting max_length for faster processing on larger data

results_task3 = list(rules_task3)

print("--- Apriori Algorithm Results (Task 3 - Large Dataset) ---")
print(f"Found {len(results_task3)} frequent itemsets/rules.\n")

# Display the results for the large dataset
if not results_task3:
    print("No frequent itemsets or rules found for Task 3 with the given thresholds.")
else:
    # Print only the first few results for brevity, as large datasets can generate many rules
    print("Displaying first 10 results (if available):")
    for i, item in enumerate(results_task3):
        if i >= 10: # Limit output to 10 results
            break
        pair = item[0]
        items = [x for x in pair]
        print(f"Itemset: {items}")
        print(f"Support: {item[1]:.4f}")

        if item[2]:
            for rule in item[2]:
                antecedent = [x for x in rule[0]]
                consequent = [x for x in rule[1]]
                confidence = rule[2]
                lift = rule[3]

                print(f"  Rule: {antecedent} -> {consequent}")
                print(f"    Confidence: {confidence:.4f}")
                print(f"    Lift: {lift:.4f}")
        print("-" * 30)
    if len(results_task3) > 10:
        print(f"... and {len(results_task3) - 10} more results (truncated for display).\n")


--- Original Transactions Dataset (Task 1) ---
Transaction 1: ['milk', 'bread', 'butter']
Transaction 2: ['milk', 'sugar', 'coffee']
Transaction 3: ['bread', 'eggs']
Transaction 4: ['milk', 'bread', 'sugar']
Transaction 5: ['coffee', 'bread']
Transaction 6: ['milk', 'bread', 'eggs', 'sugar']
Transaction 7: ['butter', 'sugar', 'coffee']
Transaction 8: ['milk', 'bread', 'butter', 'coffee']


--- Apriori Algorithm Results (Task 1) ---
Found 12 frequent itemsets/rules.

Itemset: ['bread']
Support: 0.7500
  Rule: [] -> ['bread']
    Confidence: 0.7500
    Lift: 1.0000
------------------------------
Itemset: ['coffee']
Support: 0.5000
  Rule: [] -> ['coffee']
    Confidence: 0.5000
    Lift: 1.0000
------------------------------
Itemset: ['milk']
Support: 0.6250
  Rule: [] -> ['milk']
    Confidence: 0.6250
    Lift: 1.0000
------------------------------
Itemset: ['sugar']
Support: 0.5000
  Rule: [] -> ['sugar']
    Confidence: 0.5000
    Lift: 1.0000
------------------------------
Itemset: 