In [53]:
import csv
import random
from itertools import combinations

# Fixed product catalogue; a literal list keeps every run identical
items = [
    "Milk", "Bread", "Eggs", "Diapers", "Soap",
    "Shampoo", "Towel", "Juice", "Cereal", "Cheese",
]

# Pin the RNG state so the datasets generated below are repeatable
random.seed(2020)

# Persist the catalogue as a one-column CSV (one item per row)
items_file = "items.csv"
with open(items_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows([item] for item in items)

# Function to read the items list from a file
def read_items(file_path):
    """Read the item list back from a one-item-per-row CSV file.

    The file is parsed with ``csv.reader`` so that it mirrors how the items
    are written with ``csv.writer``: an item that needed CSV quoting (for
    example one containing a comma) is returned without the quote characters.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with the first field of every non-blank row, stripped of
        surrounding whitespace. Rows whose first field is blank are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline="" is what the csv module expects of the file objects it reads.
    with open(file_path, "r", newline="") as f:
        return [
            row[0].strip()
            for row in csv.reader(f)
            if row and row[0].strip()
        ]

# Function to generate a random number of items per transaction (between 3 and 8)
def get_random_count():
    """Draw how many items one transaction holds: an integer from 3 to 8 inclusive."""
    fewest, most = 3, 8
    return random.randint(fewest, most)

# Function to generate a set of transactions
def generate_transactions(item_list, num_transactions=20):
    """Build a list of random transactions drawn from ``item_list``.

    Each transaction is a sample (without replacement) of the items, with the
    sample size chosen by ``get_random_count()``.

    Args:
        item_list: Sequence of items to sample from.
        num_transactions: How many transactions to generate (default 20).

    Returns:
        A list of ``num_transactions`` lists; each inner list is sorted so a
        transaction's items always appear in the same order.
    """
    transactions = []
    for _ in range(num_transactions):
        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the requested size at the catalogue size.
        num_items = min(get_random_count(), len(item_list))
        transaction = random.sample(item_list, num_items)
        transactions.append(sorted(transaction))
    return transactions

# Function to save transactions into a CSV file
def save_transactions(transactions, file_path):
    """Write every transaction as one CSV row of ``file_path``, overwriting the file."""
    with open(file_path, "w", newline="") as out:
        csv.writer(out).writerows(transactions)

# Load the items from the saved file
# Re-read the catalogue from disk so generation uses exactly what was saved
loaded_items = read_items(items_file)

# Produce five datasets, written to data1.csv through data5.csv
for i in range(1, 6):
    output_path = f"data{i}.csv"
    transactions = generate_transactions(loaded_items)
    save_transactions(transactions, output_path)

print("Data generation complete. Files generated: items.csv, data1.csv, data2.csv, data3.csv, data4.csv, data5.csv")

Data generation complete. Files generated: items.csv, data1.csv, data2.csv, data3.csv, data4.csv, data5.csv


In [54]:
import csv
import itertools
import time
from apriori_python import apriori

# Function to read a CSV file and return list of transactions (each transaction is a list)
def read_csv(filename):
    """Read a CSV file into a list of transactions.

    Every field is stripped of surrounding whitespace and blank fields are
    dropped. Rows that contain no non-blank field are skipped entirely, so a
    line such as `` , `` does not become an empty transaction that would
    inflate the transaction count used for support.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of transactions, each a non-empty list of strings. If reading
        fails, the error is printed and whatever rows were read before the
        failure (possibly none) are returned.
    """
    data = []
    try:
        # newline="" leaves line-ending handling to the csv module, as its
        # documentation requires for files passed to csv.reader.
        with open(filename, "r", newline="") as f:
            for row in csv.reader(f):
                cleaned = [item.strip() for item in row if item.strip()]
                if cleaned:
                    data.append(cleaned)
    except Exception as e:
        # Deliberate best-effort: report the problem and return what we have.
        print(f"Error reading {filename}: {e}")
    return data

# Calculate how many transactions contain the itemset
def support_count(itemset, transactions):
    """Count the transactions that contain every item of ``itemset``.

    Args:
        itemset: Iterable of items that must all be present.
        transactions: Iterable of transactions, each an iterable of items.

    Returns:
        The number of transactions that are a superset of ``itemset``. An
        empty ``itemset`` is contained in every transaction.
    """
    # Build the lookup set once instead of once per transaction; this also
    # keeps the result correct when ``itemset`` is a one-shot iterator.
    required = set(itemset)
    # issubset accepts any iterable, so the transaction needs no set() copy.
    return sum(1 for trans in transactions if required.issubset(trans))

# Generate association rules from frequent itemsets using a simple approach
def generate_rules(frequent_itemsets, transactions, min_confidence):
    """Derive association rules from frequent itemsets.

    For every itemset with at least two items, each non-empty proper subset is
    tried as an antecedent; the remaining items form the consequent. A rule is
    kept when ``support(itemset) / support(antecedent) >= min_confidence``.

    Args:
        frequent_itemsets: Iterable of itemsets (sized, iterable collections).
        transactions: List of transactions used to count support.
        min_confidence: Minimum confidence a rule must reach to be kept.

    Returns:
        A list of dicts with keys ``full_itemset`` (the itemset as given),
        ``antecedent`` and ``consequent`` (lists, in the itemset's own order),
        ``support`` and ``confidence`` (both rounded to 4 decimals).
    """
    rules = []
    for itemset in frequent_itemsets:
        if len(itemset) < 2:
            continue
        # The itemset's own support does not depend on the antecedent, so
        # count it once per itemset rather than once per candidate rule.
        count_itemset = support_count(itemset, transactions)
        for size in range(1, len(itemset)):
            for combo in itertools.combinations(itemset, size):
                antecedent = list(combo)
                chosen = set(combo)
                # Keep the itemset's order so output is identical across runs
                # (a set difference would depend on string hash ordering).
                consequent = [item for item in itemset if item not in chosen]
                count_antecedent = support_count(antecedent, transactions)
                if count_antecedent == 0:
                    # Also covers an empty transaction list: no division below.
                    continue
                confidence = count_itemset / count_antecedent
                if confidence >= min_confidence:
                    rules.append({
                        "full_itemset": itemset,
                        "antecedent": antecedent,
                        "consequent": consequent,
                        "support": round(count_itemset/len(transactions), 4),
                        "confidence": round(confidence, 4)
                    })
    return rules

# Brute-force method: generate all possible item combinations and filter by support
def brute_force(transactions, items, min_support, min_confidence):
    """Find frequent itemsets by enumerating item combinations, then build rules.

    Combinations of size 1, 2, 3, ... are tested in turn; the search stops at
    the first size that yields no frequent itemset, or once every size up to
    the number of items has been tried.

    Args:
        transactions: List of transactions (each an iterable of items).
        items: Iterable of all candidate items.
        min_support: Minimum fraction of transactions an itemset must appear in.
        min_confidence: Minimum confidence passed on to ``generate_rules``.

    Returns:
        A tuple ``(frequent_itemsets, rules, exec_time)`` where
        ``frequent_itemsets`` is a list of item tuples, ``rules`` is the output
        of ``generate_rules`` and ``exec_time`` is the elapsed time in seconds
        (itemset search plus rule generation).
    """
    n = len(transactions)
    pool = list(items)
    all_frequent = []
    start_time = time.perf_counter()
    # With no transactions nothing can be frequent (and support is undefined).
    if n:
        for k in range(1, len(pool) + 1):
            # Compare the support ratio itself rather than count against
            # min_support * n: that product can round upward
            # (0.07 * 100 == 7.000000000000001) and wrongly reject an itemset
            # sitting exactly on the threshold.
            freq = [
                combo
                for combo in itertools.combinations(pool, k)
                if support_count(combo, transactions) / n >= min_support
            ]
            if not freq:
                break
            all_frequent.extend(freq)
    rules = generate_rules(all_frequent, transactions, min_confidence)
    exec_time = time.perf_counter() - start_time
    return all_frequent, rules, exec_time

# Run the Apriori algorithm from the package
def run_apriori_algo(transactions, min_support, min_confidence):
    """Run the packaged ``apriori`` function and normalise its output.

    Args:
        transactions: List of transactions (each a list of items).
        min_support: Passed to ``apriori`` as ``minSup``.
        min_confidence: Passed to ``apriori`` as ``minConf``.

    Returns:
        A tuple ``(frequent_itemsets, rules, exec_time)``: a flat list of item
        tuples, a list of rule dicts with the keys ``full_itemset``,
        ``antecedent``, ``consequent``, ``support`` and ``confidence``, and the
        elapsed time in seconds.
    """
    start_time = time.perf_counter()
    freq_itemsets, assoc_rules = apriori(transactions, minSup=min_support, minConf=min_confidence)
    # freq_itemsets is consumed as a mapping whose values are collections of
    # itemsets; the keys are not needed here.
    # NOTE(review): itemsets and rule sides are presumably unordered sets --
    # sorting gives tuples whose element order is the same on every run.
    # Assumes items are mutually comparable (strings when read from CSV).
    flat_freq = [
        tuple(sorted(itemset))
        for itemset_list in freq_itemsets.values()
        for itemset in itemset_list
    ]
    rules = []
    for rule in assoc_rules:
        # Each rule is indexed as [antecedent, consequent, confidence].
        antecedent = tuple(sorted(rule[0]))
        consequent = tuple(sorted(rule[1]))
        full_itemset = tuple(sorted(set(antecedent) | set(consequent)))
        count_full = support_count(full_itemset, transactions)
        rules.append({
            "full_itemset": full_itemset,
            "antecedent": antecedent,
            "consequent": consequent,
            "support": round(count_full/len(transactions), 4),
            "confidence": round(rule[2], 4)
        })
    exec_time = time.perf_counter() - start_time
    return flat_freq, rules, exec_time

In [55]:

def _print_algorithm_results(header, no_rules_message, freq_itemsets, rules,
                             transactions, max_itemsets=10, max_rules=5):
    """Print one algorithm's frequent itemsets and a sample of its rules."""
    print(header)
    print(f"Total Frequent Itemsets Found: {len(freq_itemsets)}")
    print("Frequent Itemsets:")
    for itemset in freq_itemsets[:max_itemsets]:  # Only the first few, for brevity
        count = support_count(itemset, transactions)
        support_val = round(count/len(transactions), 4)
        print(f"{itemset} - Count: {count}, Support: {support_val}")
    if len(freq_itemsets) > max_itemsets:
        print("... (more frequent itemsets found)")

    print("\nTotal Association Rules Generated: ", len(rules))
    print("Sample Rules:")
    if rules:
        for rule in rules[:max_rules]:  # Only the first few, for brevity
            print(f"{rule['antecedent']} => {rule['consequent']}, Support: {rule['support']}, Confidence: {rule['confidence']}")
        if len(rules) > max_rules:
            print("... (more rules found)")
    else:
        print(no_rules_message)


def main():
    """Interactively compare the packaged Apriori with the brute-force search.

    Prompts for a dataset number (1-5), a minimum support and a minimum
    confidence, runs both algorithms on ``data<N>.csv`` using the items listed
    in ``items.csv``, and prints their itemsets, sample rules and timings.
    Returns early, after printing a message, on any invalid input or when the
    dataset or item list is empty.
    """
    # Prompt user for input
    ds_num = input("Enter dataset number (1-5): ").strip()
    if ds_num not in ['1', '2', '3', '4', '5']:
        print("Invalid dataset selection. Exiting.")
        return
    dataset_file = f"data{ds_num}.csv"
    try:
        transactions = read_csv(dataset_file)
        if not transactions:
            print("No transactions found in the selected dataset.")
            return
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    # Read the items list from items.csv
    try:
        items_data = read_csv("items.csv")
        # Each row in items.csv has one item; flatten the list
        items = [row[0] for row in items_data if row]
    except Exception as e:
        print(f"Error loading items: {e}")
        return
    if not items:
        # Without candidate items the brute-force search would silently find
        # nothing, making the comparison with Apriori misleading.
        print("No items found in items.csv.")
        return

    try:
        min_support = float(input("Enter minimum support (e.g., 0.3): ").strip())
        min_confidence = float(input("Enter minimum confidence (e.g., 0.6): ").strip())
    except Exception:
        print("Invalid support or confidence value. Exiting.")
        return
    # Both thresholds are fractions; this check also rejects NaN, which
    # float() accepts but which fails every comparison.
    if not (0 < min_support <= 1 and 0 <= min_confidence <= 1):
        print("Invalid support or confidence value. Exiting.")
        return

    print("\nRunning Apriori algorithm (Python package)...")
    apriori_freq, apriori_rules, time_apriori = run_apriori_algo(transactions, min_support, min_confidence)

    print("\nRunning Brute-force algorithm...")
    brute_freq, brute_rules, time_brute = brute_force(transactions, items, min_support, min_confidence)

    # Output results
    print("\n=== RESULTS ===")
    print(f"Dataset: {dataset_file}")
    print(f"Minimum Support: {min_support} | Minimum Confidence: {min_confidence}\n")

    _print_algorithm_results(
        "--- APRIORI RESULTS ---",
        "No association rules generated by Apriori.",
        apriori_freq, apriori_rules, transactions,
    )
    _print_algorithm_results(
        "\n--- BRUTE-FORCE RESULTS ---",
        "No association rules generated by brute-force.",
        brute_freq, brute_rules, transactions,
    )

    print("\n--- EXECUTION TIME COMPARISON ---")
    print(f"Apriori execution time: {time_apriori:.4f} seconds")
    print(f"Brute-force execution time: {time_brute:.4f} seconds")

    if time_apriori < time_brute:
        print("Apriori algorithm is faster.")
    else:
        print("Brute-force algorithm is faster.")

    print("\n--- FINAL COMPARISON SUMMARY ---")
    print(f"Total Frequent Itemsets: Apriori = {len(apriori_freq)}, Brute-force = {len(brute_freq)}")
    print(f"Total Association Rules: Apriori = {len(apriori_rules)}, Brute-force = {len(brute_rules)}")
    print(f"Execution Time: Apriori = {time_apriori:.4f}s, Brute-force = {time_brute:.4f}s")

# Start the interactive comparison only when this code runs as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Enter dataset number (1-5): 1
Enter minimum support (e.g., 0.3): 0.3
Enter minimum confidence (e.g., 0.6): 0.6

Running Apriori algorithm (Python package)...

Running Brute-force algorithm...

=== RESULTS ===
Dataset: data1.csv
Minimum Support: 0.3 | Minimum Confidence: 0.6

--- APRIORI RESULTS ---
Total Frequent Itemsets Found: 66
Frequent Itemsets:
('Soap',) - Count: 8, Support: 0.4
('Towel',) - Count: 11, Support: 0.55
('Eggs',) - Count: 12, Support: 0.6
('Cereal',) - Count: 15, Support: 0.75
('Juice',) - Count: 14, Support: 0.7
('Diapers',) - Count: 14, Support: 0.7
('Cheese',) - Count: 8, Support: 0.4
('Shampoo',) - Count: 10, Support: 0.5
('Bread',) - Count: 13, Support: 0.65
('Milk',) - Count: 9, Support: 0.45
... (more frequent itemsets found)

Total Association Rules Generated:  130
Sample Rules:
('Towel',) => ('Eggs', 'Juice'), Support: 0.35, Confidence: 0.6364
('Towel',) => ('Juice', 'Cereal'), Support: 0.35, Confidence: 0.6364
('Juice',) => ('Eggs',), Support: 0.45, Confide