In [1]:
# Question 4: Combine hierarchical clustering with Apriori to analyze clustered data and find frequent patterns within each cluster of a given dataset.

In [None]:
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
from collections import defaultdict
from itertools import combinations

def generate_frequent_itemsets(transactions, min_support):
    item_counts = defaultdict(int)
    for transaction in transactions:
        for item in transaction:
            item_counts[frozenset([item])] += 1
    
    L1 = {itemset: count for itemset, count in item_counts.items() if count >= min_support}
    
    frequent_itemsets = {1: L1}
    k = 2
    while True:
        Ck = generate_candidate_itemsets(frequent_itemsets[k-1], k)
        Lk = defaultdict(int)
        for transaction in transactions:
            for candidate in Ck:
                if candidate.issubset(transaction):
                    Lk[candidate] += 1
        
        Lk = {itemset: count for itemset, count in Lk.items() if not Lk[itemset] < min_support}
        if not Lk:
            break
        frequent_itemsets[k] = Lk
        k += 1
    return frequent_itemsets

def generate_candidate_itemsets(Lk_minus_1, k):
    candidates = set()
    items = sorted(list(Lk_minus_1.keys()))
    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            itemset1 = list(items[i])
            itemset2 = list(items[j])
            
            itemset1.sort()
            itemset2.sort()

            if k == 2 or itemset1[:-1] == itemset2[:-1]:
                new_candidate = frozenset(sorted(list(itemset1) + list(itemset2)))
                candidates.add(new_candidate)
    return candidates

def generate_association_rules(frequent_itemsets, min_confidence):
    rules = []
    for k, Lk in frequent_itemsets.items():
        if k > 1:
            for itemset in Lk:
                for antecedent_tuple in combinations(itemset, 1):
                    antecedent = frozenset(antecedent_tuple)
                    consequent = itemset - antecedent
                    if consequent:
                        confidence = Lk[itemset] / frequent_itemsets[len(antecedent)][antecedent]
                        if confidence >= min_confidence:
                            rules.append((antecedent, consequent, confidence, Lk[itemset]))
    return rules

def preprocess_transactions_for_clustering(transactions):
    unique_items = sorted(list(set(item for sublist in transactions for item in sublist)))
    item_to_idx = {item: idx for idx, item in enumerate(unique_items)}
    
    binary_vectors = []
    for transaction in transactions:
        vector = [0] * len(unique_items)
        for item in transaction:
            vector[item_to_idx[item]] = 1
        binary_vectors.append(vector)
    return np.array(binary_vectors), unique_items

transactions_data = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
    ['Bread', 'Butter'],
    ['Bread', 'Milk'],
    ['Butter', 'Milk'],
    ['Bread', 'Butter', 'Milk'],
    ['Cheese', 'Wine']
]

min_support = 2
min_confidence = 0.6
num_clusters = 3
binary_data, all_items = preprocess_transactions_for_clustering(transactions_data)
distances = pdist(binary_data, metric='jaccard')
linked = linkage(distances, method='ward')
clusters = fcluster(linked, num_clusters, criterion='maxclust')
clustered_transactions = defaultdict(list)
for i, cluster_id in enumerate(clusters):
    clustered_transactions[cluster_id].append(transactions_data[i])
print("Hierarchical Clustering Results:")
for cluster_id, cluster_transactions in clustered_transactions.items():
    print(f"\nCluster {cluster_id} (Number of transactions: {len(cluster_transactions)}):")
    for trans in cluster_transactions:
        print(f"  {trans}")
    
    print(f"\nApplying Apriori to Cluster {cluster_id}:")
    frequent_itemsets_cluster = generate_frequent_itemsets(cluster_transactions, min_support)
    if frequent_itemsets_cluster:
        for k, itemsets in frequent_itemsets_cluster.items():
            if itemsets:
                print(f"  Frequent {k}-itemsets:")
                for itemset, count in itemsets.items():
                    print(f"    {list(itemset)}: {count}")
        
        association_rules_cluster = generate_association_rules(frequent_itemsets_cluster, min_confidence)
        if association_rules_cluster:
            print("  Association Rules:")
            for antecedent, consequent, confidence, support in association_rules_cluster:
                print(f"    {list(antecedent)} -> {list(consequent)} (Support: {support}, Confidence: {confidence:.2f})")
        else:
            print("  No association rules found for this cluster with the given confidence.")
    else:
        print("  No frequent itemsets found for this cluster with the given support.")

Hierarchical Clustering Results:

Cluster 2 (Number of transactions: 5):
  ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt']
  ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt']
  ['Milk', 'Apple', 'Kidney Beans', 'Eggs']
  ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt']
  ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']

Applying Apriori to Cluster 2:
  Frequent 1-itemsets:
    ['Milk']: 3
    ['Onion']: 4
    ['Nutmeg']: 2
    ['Kidney Beans']: 5
    ['Eggs']: 4
    ['Yogurt']: 3
    ['Corn']: 2
  Frequent 2-itemsets:
    ['Onion', 'Nutmeg']: 2
    ['Onion', 'Kidney Beans']: 3
    ['Nutmeg', 'Yogurt']: 2
    ['Milk', 'Kidney Beans']: 3
    ['Eggs', 'Yogurt']: 2
    ['Eggs', 'Nutmeg']: 2
    ['Eggs', 'Kidney Beans']: 4
    ['Nutmeg', 'Kidney Beans']: 2
    ['Onion', 'Eggs']: 3
    ['Onion', 'Yogurt']: 2
    ['Milk', 'Eggs']: 2
    ['Milk', 'Yogurt']: 2
    ['Kidney Beans', 'Yogurt']: 3
    ['Kidney Beans', 'Corn']: 2
  Frequent 3-itemsets:
 