# Loading Data and Preprocessing

In [1]:
import pandas as pd
import time

In [2]:
df = pd.read_csv('basket_analysis.csv', index_col = 0)

In [3]:
df

Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,True
995,True,False,False,False,True,False,False,False,True,True,True,False,False,False,True,False
996,True,False,False,False,True,True,False,False,False,False,False,False,True,False,False,True
997,False,False,True,True,True,False,True,True,True,False,True,False,True,False,True,True


In [4]:
items = df.columns
result = df.apply(lambda row: [items[i] for i in range(len(row)) if row[i]], axis=1).tolist()

  result = df.apply(lambda row: [items[i] for i in range(len(row)) if row[i]], axis=1).tolist()


In [5]:
print(result)

[['Bread', 'Corn', 'Dill', 'Ice cream', 'Sugar', 'Yogurt', 'chocolate'], ['Milk'], ['Apple', 'Butter', 'Dill', 'Ice cream', 'Milk', 'Yogurt', 'chocolate'], ['Butter', 'Cheese', 'Dill', 'Milk', 'Nutmeg', 'Onion'], ['Apple', 'Bread'], ['Apple', 'Bread', 'Butter', 'Cheese', 'Dill', 'Ice cream', 'Nutmeg', 'Unicorn', 'Yogurt', 'chocolate'], ['Butter', 'Eggs', 'Ice cream', 'Kidney Beans', 'Milk', 'Nutmeg', 'Onion', 'Yogurt'], ['Apple', 'Cheese', 'Eggs', 'Nutmeg', 'Sugar', 'Yogurt'], ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream', 'Milk', 'Nutmeg', 'Onion', 'Sugar', 'Unicorn', 'Yogurt', 'chocolate'], ['Apple', 'Dill', 'Eggs', 'Ice cream', 'Milk', 'Onion', 'Sugar', 'Unicorn', 'chocolate'], ['Apple', 'Bread', 'Cheese', 'Ice cream', 'Sugar', 'Unicorn'], ['Apple', 'Bread', 'Cheese', 'Dill', 'Eggs', 'Ice cream', 'Kidney Beans', 'Nutmeg', 'Sugar', 'Yogurt', 'chocolate'], ['Onion', 'Unicorn', 'chocolate'], ['Ice cream', 'Sugar', 'Yogurt'], ['Butter', 'Kidney Beans'], ['Yogurt'], ['Apple', 'Bread', 'B

# Simple Apriori Algorithm

In [7]:
# Function to calculate the support of an itemset
def get_support(transactions, itemset):
    count = 0
    for transaction in transactions:
        if all(item in transaction for item in itemset):
            count += 1
    return count / len(transactions)

# Function to calculate confidence of a rule
def get_confidence(left, right, transactions):
    left_support = get_support(transactions, left)
    combined_support = get_support(transactions, left + right)
    return combined_support / left_support if left_support != 0 else 0

# Function to find frequent itemsets
def find_frequent_itemsets(transactions, min_support):
    # Create a unique list of all items in transactions
    unique_items = []
    for transaction in transactions:
        for item in transaction:
            if [item] not in unique_items:
                unique_items.append([item])

    # Filter itemsets by support
    frequent_itemsets = []
    for itemset in unique_items:
        if get_support(transactions, itemset) >= min_support:
            frequent_itemsets.append(itemset)

    return frequent_itemsets

# Function to generate candidate itemsets of length k
def generate_candidates(prev_itemsets, k, transactions, min_support):
    candidates = []
    n = len(prev_itemsets)
    for i in range(n):
        for j in range(i + 1, n):
            itemset1 = prev_itemsets[i]
            itemset2 = prev_itemsets[j]
            if len(itemset1) == k - 1 and itemset1[:k - 2] == itemset2[:k - 2]:
                candidate = itemset1 + [itemset2[k - 2]]
                if candidate not in candidates:
                    if get_support(transactions, candidate) >= min_support:
                        candidates.append(candidate)
    return candidates

# Function to generate association rules
def generate_association_rules(frequent_itemsets, transactions, min_confidence):
    rules = []
    for itemset in frequent_itemsets:
        if len(itemset) > 1:
            for i in range(1, len(itemset)):
                left = itemset[:i]
                right = itemset[i:]
                confidence = get_confidence(left, right, transactions)
                if confidence >= min_confidence:
                    rules.append((left, right, confidence))
    return rules

# Main function
def apriori(transactions, min_support, min_confidence):
    frequent_itemsets = []
    k = 1
    prev_itemsets = find_frequent_itemsets(transactions, min_support)

    while len(prev_itemsets) > 0:
        frequent_itemsets.extend(prev_itemsets)
        prev_itemsets = generate_candidates(prev_itemsets, k + 1, transactions, min_support)
        k += 1

    association_rules = generate_association_rules(frequent_itemsets, transactions, min_confidence)

    return frequent_itemsets, association_rules

# Sample data
transactions = result

# Run Apriori algorithm
min_support = 0.2
min_confidence = 0.4
start = time.time()
frequent_itemsets, association_rules = apriori(transactions, min_support, min_confidence)
end = time.time()

# Print the results
print("Frequent Itemsets:")
for itemset in frequent_itemsets:
    print(itemset)

print("\nAssociation Rules:")
for rule in association_rules:
    print(f"Rule: {rule[0]} -> {rule[1]} with confidence: {rule[2]:.2f}")

print(f"Time Taken: {end - start} seconds")

Frequent Itemsets:
['Bread']
['Corn']
['Dill']
['Ice cream']
['Sugar']
['Yogurt']
['chocolate']
['Milk']
['Apple']
['Butter']
['Cheese']
['Nutmeg']
['Onion']
['Unicorn']
['Eggs']
['Kidney Beans']
['Ice cream', 'chocolate']
['Ice cream', 'Butter']
['chocolate', 'Milk']
['chocolate', 'Butter']
['Butter', 'Kidney Beans']
['Cheese', 'Kidney Beans']

Association Rules:
Rule: ['Ice cream'] -> ['chocolate'] with confidence: 0.49
Rule: ['Ice cream'] -> ['Butter'] with confidence: 0.50
Rule: ['chocolate'] -> ['Milk'] with confidence: 0.50
Rule: ['chocolate'] -> ['Butter'] with confidence: 0.48
Rule: ['Butter'] -> ['Kidney Beans'] with confidence: 0.48
Rule: ['Cheese'] -> ['Kidney Beans'] with confidence: 0.50
Time Taken: 0.08642792701721191 seconds


# Improved Apriori Algorithm

In [8]:
# Function to calculate the support of an itemset
supports = {}
def get_support(transactions, itemset):
    itemset_key = tuple(itemset)
    if itemset_key in supports:
        return supports[itemset_key]
    count = 0
    for transaction in transactions:
        if all(item in transaction for item in itemset):
            count += 1
    support = count / len(transactions)
    supports[itemset_key] = support
    return support

# Function to calculate confidence of a rule
def get_confidence(left, right, transactions):
    left_support = get_support(transactions, left)
    combined_support = get_support(transactions, left + right)
    return combined_support / left_support if left_support != 0 else 0

# Function to find frequent itemsets
def find_frequent_itemsets(transactions, min_support):
    # Create a unique list of all items in transactions
    unique_items = []
    for transaction in transactions:
        for item in transaction:
            if [item] not in unique_items:
                unique_items.append([item])

    # Filter itemsets by support
    frequent_itemsets = []
    for itemset in unique_items:
        if get_support(transactions, itemset) >= min_support:
            frequent_itemsets.append(itemset)

    return frequent_itemsets


# Function to generate candidate itemsets of length k
def generate_candidates(prev_itemsets, k, transactions, min_support):
    candidates = []
    n = len(prev_itemsets)
    for i in range(n):
        for j in range(i + 1, n):
            itemset1 = prev_itemsets[i]
            itemset2 = prev_itemsets[j]
            if len(itemset1) == k - 1 and itemset1[:k - 2] == itemset2[:k - 2]:
                candidate = itemset1 + [itemset2[k - 2]]
                if candidate not in candidates:
                    if get_support(transactions, candidate) >= min_support:
                        candidates.append(candidate)
    return candidates

# Function to generate association rules
def generate_association_rules(frequent_itemsets, transactions, min_confidence):
    rules = []
    for itemset in frequent_itemsets:
        if len(itemset) > 1:
            for i in range(1, len(itemset)):
                left = itemset[:i]
                right = itemset[i:]
                confidence = get_confidence(left, right, transactions)
                if confidence >= min_confidence:
                    rules.append((left, right, confidence))
    return rules

# Main function
def apriori(transactions, min_support, min_confidence):
    frequent_itemsets = []
    k = 1
    prev_itemsets = find_frequent_itemsets(transactions, min_support)

    while len(prev_itemsets) > 0:
        frequent_itemsets.extend(prev_itemsets)
        prev_itemsets = generate_candidates(prev_itemsets, k + 1, transactions, min_support)
        k += 1

    association_rules = generate_association_rules(frequent_itemsets, transactions, min_confidence)

    return frequent_itemsets, association_rules

# Sample data
transactions = result

# Run Apriori algorithm
min_support = 0.2
min_confidence = 0.4
start = time.time()
frequent_itemsets, association_rules = apriori(transactions, min_support, min_confidence)
end = time.time()

# Print the results
print("Frequent Itemsets:")
for itemset in frequent_itemsets:
    print(itemset)

print("\nAssociation Rules:")
for rule in association_rules:
    print(f"Rule: {rule[0]} -> {rule[1]} with confidence: {rule[2]:.2f}")

print(f"Time Taken: {end - start} seconds")

Frequent Itemsets:
['Bread']
['Corn']
['Dill']
['Ice cream']
['Sugar']
['Yogurt']
['chocolate']
['Milk']
['Apple']
['Butter']
['Cheese']
['Nutmeg']
['Onion']
['Unicorn']
['Eggs']
['Kidney Beans']
['Ice cream', 'chocolate']
['Ice cream', 'Butter']
['chocolate', 'Milk']
['chocolate', 'Butter']
['Butter', 'Kidney Beans']
['Cheese', 'Kidney Beans']

Association Rules:
Rule: ['Ice cream'] -> ['chocolate'] with confidence: 0.49
Rule: ['Ice cream'] -> ['Butter'] with confidence: 0.50
Rule: ['chocolate'] -> ['Milk'] with confidence: 0.50
Rule: ['chocolate'] -> ['Butter'] with confidence: 0.48
Rule: ['Butter'] -> ['Kidney Beans'] with confidence: 0.48
Rule: ['Cheese'] -> ['Kidney Beans'] with confidence: 0.50
Time Taken: 0.06885528564453125 seconds


# FI-Tree

In [6]:
mappings = {}
for c, i in enumerate(list(df.columns)):
    mappings[i] = c + 1

In [7]:
mappings

{'Apple': 1,
 'Bread': 2,
 'Butter': 3,
 'Cheese': 4,
 'Corn': 5,
 'Dill': 6,
 'Eggs': 7,
 'Ice cream': 8,
 'Kidney Beans': 9,
 'Milk': 10,
 'Nutmeg': 11,
 'Onion': 12,
 'Sugar': 13,
 'Unicorn': 14,
 'Yogurt': 15,
 'chocolate': 16}

In [8]:
opp_mappings = {}
for i, j in mappings.items():
    opp_mappings[j] = i

In [9]:
opp_mappings

{1: 'Apple',
 2: 'Bread',
 3: 'Butter',
 4: 'Cheese',
 5: 'Corn',
 6: 'Dill',
 7: 'Eggs',
 8: 'Ice cream',
 9: 'Kidney Beans',
 10: 'Milk',
 11: 'Nutmeg',
 12: 'Onion',
 13: 'Sugar',
 14: 'Unicorn',
 15: 'Yogurt',
 16: 'chocolate'}

In [10]:
modified_result = []
for r in result:
    temp = []
    for i in r:
        temp.append(mappings[i])
    modified_result.append(temp)

In [11]:
modified_result

[[2, 5, 6, 8, 13, 15, 16],
 [10],
 [1, 3, 6, 8, 10, 15, 16],
 [3, 4, 6, 10, 11, 12],
 [1, 2],
 [1, 2, 3, 4, 6, 8, 11, 14, 15, 16],
 [3, 7, 8, 9, 10, 11, 12, 15],
 [1, 4, 7, 11, 13, 15],
 [1, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16],
 [1, 6, 7, 8, 10, 12, 13, 14, 16],
 [1, 2, 4, 8, 13, 14],
 [1, 2, 4, 6, 7, 8, 9, 11, 13, 15, 16],
 [12, 14, 16],
 [8, 13, 15],
 [3, 9],
 [15],
 [1, 2, 3, 4, 8, 11, 13],
 [1, 7, 15, 16],
 [2, 4, 9, 13],
 [2, 5, 8, 9, 10, 11, 13, 14],
 [1, 4, 8, 13, 14],
 [1, 3, 4, 9, 14, 15],
 [2, 4, 6, 7, 8, 9, 10, 14, 15, 16],
 [2, 3, 4, 7, 8, 9, 13, 14, 16],
 [2, 5, 7, 9, 10, 11, 12, 14, 15],
 [2, 3, 7, 10, 15, 16],
 [2, 3, 4, 5, 9, 10, 14, 16],
 [2, 3],
 [2, 3, 9, 10, 11, 14, 15, 16],
 [1, 2, 8, 12, 15],
 [5, 6, 10, 14, 16],
 [1, 2, 3, 6, 10, 14, 15],
 [4, 7, 9],
 [6, 10, 11, 13],
 [2, 6, 8, 12, 16],
 [4, 6, 7, 12, 14, 16],
 [1, 4, 5, 6, 8, 9, 11, 12, 14, 15, 16],
 [5, 7, 8, 10, 11, 16],
 [3, 9, 13],
 [2, 4, 5, 7, 8, 11, 15],
 [1, 2, 3, 4, 5, 7, 9, 14, 16],
 [2, 3, 15, 16

In [12]:
class FINode:
    """Class to represent a node in the FI-Tree."""
    def __init__(self, position=None, signature=None):
        self.position = position  # Position of differing bit (for internal nodes)
        self.signature = signature  # Signature (for leaf nodes)
        self.left = None  # Left child
        self.right = None  # Right child

    def is_leaf(self):
        """Check if the node is a leaf node."""
        return self.signature is not None

# Generate signatures using the earlier function
def generate_signatures(transaction, num_bits=5):
    bit_array = [0] * num_bits
    for item in transaction:
        hash_index = item % num_bits
        bit_array[hash_index] = 1
    return ''.join(map(str, bit_array))    

def find_first_differing_bit(sig1, sig2):
    """Find the first differing bit position between two signatures."""
    for i in range(len(sig1)):
        if sig1[i] != sig2[i]:
            return i
    return -1  # No difference

def insert_into_fi_tree(root, signature):
    """Insert a signature into the FI-Tree."""
    parent = None
    current = root
    direction = None  # Keep track of the direction to update the parent's child

    while not current.is_leaf():
        parent = current
        if signature[current.position] == '0':
            direction = 'left'
            current = current.left
        else:
            direction = 'right'
            current = current.right

    existing_signature = current.signature
    differing_position = find_first_differing_bit(existing_signature, signature)

    if differing_position == -1:
        # Signature already exists in the tree
        return

    # Create a new internal node
    new_node = FINode(position=differing_position)

    # Determine left and right children based on the differing bit
    if signature[differing_position] == '0':
        new_node.left = FINode(signature=signature)
        new_node.right = current
    else:
        new_node.left = current
        new_node.right = FINode(signature=signature)

    # Update the parent's child reference
    if parent is None:
        # Update the root node
        root.position = new_node.position
        root.left = FINode(signature=new_node.left.signature) if new_node.left else None
        root.right = FINode(signature=new_node.right.signature) if new_node.right else None
        root.signature = None
    else:
        if direction == 'left':
            parent.left = new_node
        else:
            parent.right = new_node

def build_fi_tree(transactions, num_bits):
    """Build the FI-Tree from a list of signatures."""
    # Initialize the tree with the first signature
    signature=generate_signatures(transactions[0], num_bits)
    root = FINode(signature=signature)

    # Insert remaining signatures into the tree
    for transaction in transactions[1:]:
        signature=generate_signatures(transaction, num_bits)
        insert_into_fi_tree(root, signature)

    return root

def print_fi_tree(node, depth=0):
    """Print the FI-Tree for visualization."""
    if node.is_leaf():
        print("  " * depth + f"Leaf: {node.signature}")
    else:
        print("  " * depth + f"Internal Node: Position {node.position}")
        if node.left:
            print("  " * depth + "Left:")
            print_fi_tree(node.left, depth + 1)
        if node.right:
            print("  " * depth + "Right:")
            print_fi_tree(node.right, depth + 1)

# Example Transactions
transactions = modified_result

num_bits = 16
# Build the FI-Tree
fi_tree = build_fi_tree(transactions, num_bits)

# Print the FI-Tree
print("FI-Tree Structure:")
print_fi_tree(fi_tree)

FI-Tree Structure:
Internal Node: Position 0
Left:
  Internal Node: Position 3
  Left:
    Internal Node: Position 1
    Left:
      Internal Node: Position 8
      Left:
        Internal Node: Position 10
        Left:
          Internal Node: Position 2
          Left:
            Internal Node: Position 4
            Left:
              Internal Node: Position 12
              Left:
                Internal Node: Position 6
                Left:
                  Internal Node: Position 14
                  Left:
                    Internal Node: Position 11
                    Left:
                      Internal Node: Position 9
                      Left:
                        Internal Node: Position 5
                        Left:
                          Internal Node: Position 7
                          Left:
                            Internal Node: Position 13
                            Left:
                              Leaf: 0000000000000001
                       

In [13]:
def is_zero(signature):
    for i in signature:
        if i == '1':
            return False
    return True

def super_impose(sig1, sig2):
    super_imposed_sig = ""
    for b1, b2 in zip(sig1, sig2):
        if (b1 == b2):
            super_imposed_sig = super_imposed_sig + b1
        else:
            super_imposed_sig = super_imposed_sig + '0'
    if super_imposed_sig == sig2:
        return 1
    else:
        return 0

def support(signature, current):
    if (current.is_leaf()):
        return super_impose(current.signature, signature)
    
    if (signature[current.position] == '0'):
        return support(signature, current.left) + support(signature, current.right)
    else:
        return support(signature, current.right)

In [14]:
def OR(sig1, sig2):
    result_sig = ""
    for b1, b2 in zip(sig1, sig2):
        if b1 == '1' or b2 == '1':
            result_sig = result_sig + '1'
        else:
            result_sig = result_sig + '0'
    return result_sig

def AND(sig1, sig2):
    result_sig = ""
    for b1, b2 in zip(sig1, sig2):
        if b1 == '1' and b2 == '1':
            result_sig = result_sig + '1'
        else:
            result_sig = result_sig + '0'
    return result_sig

In [15]:
# Function to calculate confidence of a rule
def confidence(left_sig, right_sig, fi_tree):
    left_support = support(left_sig, fi_tree)
    combined_support = support(OR(left_sig,right_sig), fi_tree)
    return combined_support / left_support if left_support != 0 else 0

In [16]:
def convert_s_to_t(signature, num_bits):
    transaction = []
    for i, s in enumerate(signature):
        if i == 0 and s == '1':
            transaction.append(num_bits)
        elif s == '1':
            transaction.append(i)
    return transaction

In [17]:
def convert(items):
    conv_items = []
    for i in items:
        conv_items.append(opp_mappings[i])
    return conv_items

In [43]:
# Function to find frequent itemsets using the FI-Tree
def find_frequent_signatures(fi_tree, min_support, num_bits):
    # Initialize with single-item signatures
    frequent_signatures = []
    candidates = []

    # Generate all possible single-item signatures
    for i in range(num_bits):
        signature = ''.join(['1' if j == i else '0' for j in range(num_bits)])
        if support(signature, fi_tree) >= min_support:
            frequent_signatures.append(signature)
            candidates.append(signature)

    # Generate higher-order itemsets
    k = 2
    while candidates:
        new_candidates = []
        for i in range(len(candidates)):
            for j in range(i + 1, len(candidates)):
                candidate = OR(candidates[i], candidates[j])
                if candidate not in new_candidates:
                    if support(candidate, fi_tree) >= min_support:
                        new_candidates.append(candidate)
                        frequent_signatures.append(candidate)
        candidates = new_candidates
        k += 1

    return frequent_signatures

# Function to generate association rules
def generate_association_rules(fi_tree, frequent_signatures, min_confidence):
    rules = []
    l_r = []
    for signature in frequent_signatures:
        # Split into left and right parts to form rules
        for i in range(1, len(signature)):
            left = signature[:i] + '0' * (len(signature) - i)
            right = AND(signature, ''.join(['1' if j >= i else '0' for j in range(len(signature))]))
            if [left, right] in l_r:
                continue
            if left != right and not is_zero(left) and not is_zero(right):  # Avoid trivial rules
                conf = confidence(left, right, fi_tree)
                if conf >= min_confidence:
                    rules.append((left, right, conf))
                    l_r.append([left, right])
    return rules

# Main function for FI-Tree-based Apriori
def apriori_fi_tree(fi_tree, min_support, min_confidence, num_bits):
    # Find frequent signatures
    frequent_signatures = find_frequent_signatures(fi_tree, min_support, num_bits)

    # Generate association rules
    association_rules = generate_association_rules(fi_tree, frequent_signatures, min_confidence)

    return frequent_signatures, association_rules

# Example Usage
# Assume fi_tree is already built using the provided transactions
min_support = 199  # Example minimum support threshold
min_confidence = 0.4  # Example minimum confidence threshold
num_bits = 16  # Number of bits in signatures

start = time.time()                                                                                                          + 0.08
frequent_signatures, association_rules = apriori_fi_tree(fi_tree, min_support, min_confidence, num_bits)
end = time.time()

# Print the results
print("Frequent Signatures:")
for sig in frequent_signatures:
    print(convert(convert_s_to_t(sig, num_bits)))

print("\nAssociation Rules:")
for rule in association_rules:
    print(f"Rule: {convert(convert_s_to_t(rule[0], num_bits))} -> {convert(convert_s_to_t(rule[1], num_bits))} with confidence: {rule[2]:.2f}")

print(f"Time Taken: {(end - start)} seconds")

Frequent Signatures:
['chocolate']
['Apple']
['Bread']
['Butter']
['Cheese']
['Corn']
['Dill']
['Eggs']
['Ice cream']
['Kidney Beans']
['Milk']
['Nutmeg']
['Onion']
['Sugar']
['Unicorn']
['Yogurt']
['chocolate', 'Butter']
['chocolate', 'Milk']
['Butter', 'Ice cream']
['Butter', 'Kidney Beans']
['Kidney Beans', 'Milk']

Association Rules:
Rule: ['chocolate'] -> ['Butter'] with confidence: 0.49
Rule: ['chocolate'] -> ['Milk'] with confidence: 0.51
Rule: ['Butter'] -> ['Ice cream'] with confidence: 0.49
Rule: ['Butter'] -> ['Kidney Beans'] with confidence: 0.48
Rule: ['Kidney Beans'] -> ['Milk'] with confidence: 0.50
Time Taken: 0.03359031677246094 seconds
