In [1]:
# Title: Implement Apriori for Association Rule Mining

# Task 1: Market Basket Analysis with Simple Transactions
# Step 1: Define a simple dataset of transactions.
# Step 2: Implement the Apriori algorithm using the apyori library.

# Task 2: Apriori Implementation with a Groceries Dataset
# Step 1: Load a sample dataset of grocery transactions. (Consider creating or downloading a CSV file of transactions)
# Step 2: Convert transactions for Apriori and execute the algorithm.
    
# Task 3: Finding Frequent Itemsets in Large Dataset
# Step 1: Use a pre-existing large dataset or generate a synthetic dataset.
# Step 2: Run Apriori and identify frequent itemsets.


In [2]:
import pandas as pd
from apyori import apriori

# Task 1: Market Basket Analysis with Simple Transactions
# Step 1: Define a simple dataset of transactions.
transactions1 = [
    ['milk', 'bread', 'butter'],
    ['milk', 'sugar'],
    ['bread', 'butter'],
    ['milk', 'bread', 'sugar', 'butter'],
    ['sugar', 'bread']
]

# Step 2: Implement the Apriori algorithm using the apyori library.
association_rules1 = apriori(transactions1, min_support=0.4, min_confidence=0.6, min_lift=1, min_length=2)
results1 = list(association_rules1)

# You can print the results for Task 1 to see the generated rules
print("--- Task 1 Results ---")
for relation_record in results1:
    for rule in relation_record.ordered_statistics:
        print(f"Rule: {list(rule.items_base)} -> {list(rule.items_add)} (Support: {relation_record.support}, Confidence: {rule.confidence}, Lift: {rule.lift})")
print("\n")

# ---

# Task 2: Apriori Implementation with a Groceries Dataset
# Step 1: Load a sample dataset of grocery transactions.
# Assuming 'groceries.csv' is in the same directory as your script
# You can create a 'groceries.csv' file with content like:
# milk,bread,butter
# milk,sugar,eggs
# bread,butter,cheese
# milk,bread,sugar,butter
# sugar,bread,yogurt,milk

try:
    df_groceries = pd.read_csv('groceries.csv', header=None)
except FileNotFoundError:
    print("Error: 'groceries.csv' not found. Please create the file or ensure it's in the correct directory.")
    # Create a dummy dataframe for demonstration if file not found
    df_groceries = pd.DataFrame([
        "milk,bread,butter",
        "milk,sugar,eggs",
        "bread,butter,cheese",
        "milk,bread,sugar,butter",
        "sugar,bread,yogurt,milk"
    ])

# Step 2: Convert transactions for Apriori and execute the algorithm.
transactions2 = []
for index, row in df_groceries.iterrows():
    # Assuming each row is a comma-separated string of items
    transactions2.append([item.strip() for item in str(row[0]).split(',') if item.strip()])

association_rules2 = apriori(transactions2, min_support=0.01, min_confidence=0.2, min_lift=3, min_length=2)
results2 = list(association_rules2)

# You can print the results for Task 2
print("--- Task 2 Results ---")
for relation_record in results2:
    for rule in relation_record.ordered_statistics:
        print(f"Rule: {list(rule.items_base)} -> {list(rule.items_add)} (Support: {relation_record.support}, Confidence: {rule.confidence}, Lift: {rule.lift})")
print("\n")

# ---

# Task 3: Finding Frequent Itemsets in Large Dataset
# Step 1: Use a pre-existing large dataset or generate a synthetic dataset.
# Assuming 'large_transactions.csv' is in the same directory as your script
# You can create a 'large_transactions.csv' file with more extensive data, e.g.:
# apples,bananas,oranges,grapes
# milk,bread,butter,eggs,sugar,flour
# chicken,broccoli,rice
# yogurt,granola,berries
# apples,grapes,pears
# milk,coffee,sugar
# bread,butter,jam,honey
# toothpaste,shampoo,soap
# milk,cereal,banana
# chicken,potatoes,carrots,onions
try:
    df_large_dataset = pd.read_csv('large_transactions.csv', header=None)
except FileNotFoundError:
    print("Error: 'large_transactions.csv' not found. Please create the file or ensure it's in the correct directory.")
    # Create a dummy dataframe for demonstration if file not found
    df_large_dataset = pd.DataFrame([
        "apples,bananas,oranges,grapes",
        "milk,bread,butter,eggs,sugar,flour",
        "chicken,broccoli,rice",
        "yogurt,granola,berries",
        "apples,grapes,pears",
        "milk,coffee,sugar",
        "bread,butter,jam,honey",
        "toothpaste,shampoo,soap",
        "milk,cereal,banana",
        "chicken,potatoes,carrots,onions",
        "apples,oranges,strawberries,blueberries"
    ])

# Step 2: Run Apriori and identify frequent itemsets.
transactions3 = []
for index, row in df_large_dataset.iterrows():
    transactions3.append([item.strip() for item in str(row[0]).split(',') if item.strip()])

association_rules3 = apriori(transactions3, min_support=0.05, min_confidence=0.2, min_lift=3, min_length=2)
results3 = list(association_rules3)

# You can print the results for Task 3
print("--- Task 3 Results ---")
for relation_record in results3:
    for rule in relation_record.ordered_statistics:
        print(f"Rule: {list(rule.items_base)} -> {list(rule.items_add)} (Support: {relation_record.support}, Confidence: {rule.confidence}, Lift: {rule.lift})")

--- Task 1 Results ---
Rule: [] -> ['bread'] (Support: 0.8, Confidence: 0.8, Lift: 1.0)
Rule: [] -> ['butter'] (Support: 0.6, Confidence: 0.6, Lift: 1.0)
Rule: [] -> ['milk'] (Support: 0.6, Confidence: 0.6, Lift: 1.0)
Rule: [] -> ['sugar'] (Support: 0.6, Confidence: 0.6, Lift: 1.0)
Rule: [] -> ['bread', 'butter'] (Support: 0.6, Confidence: 0.6, Lift: 1.0)
Rule: ['bread'] -> ['butter'] (Support: 0.6, Confidence: 0.7499999999999999, Lift: 1.2499999999999998)
Rule: ['butter'] -> ['bread'] (Support: 0.6, Confidence: 1.0, Lift: 1.25)
Rule: ['butter'] -> ['milk'] (Support: 0.4, Confidence: 0.6666666666666667, Lift: 1.1111111111111114)
Rule: ['milk'] -> ['butter'] (Support: 0.4, Confidence: 0.6666666666666667, Lift: 1.1111111111111114)
Rule: ['milk'] -> ['sugar'] (Support: 0.4, Confidence: 0.6666666666666667, Lift: 1.1111111111111114)
Rule: ['sugar'] -> ['milk'] (Support: 0.4, Confidence: 0.6666666666666667, Lift: 1.1111111111111114)
Rule: ['butter'] -> ['bread', 'milk'] (Support: 0.4, Confid

ParserError: Error tokenizing data. C error: Expected 3 fields in line 4, saw 4
