In [1]:
import pandas as pd
import itertools

# 1️⃣ Load Dataset
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that file exists.
# NOTE(review): read_csv treats the first line as a header by default; if the
# CSV holds one transaction per line with no header row, the first transaction
# is consumed as column names — confirm the file layout.
data = pd.read_csv('/home/varun/Desktop/scratch/data.csv')

# 2️⃣ Preprocess Data
def preprocess_data(df):
    """Turn each DataFrame row into a list of item strings.

    Missing cells are dropped so they never show up as items. Both real
    missing values (NaN / None, which is what ``pd.read_csv`` produces for
    empty cells) and the literal placeholder string ``'Nan'`` are treated as
    missing.

    Args:
        df: DataFrame in which every row is one record (transaction) and
            every non-missing cell is one item.

    Returns:
        A list with one entry per row, each a list of the row's remaining
        cell values converted with ``str``, in column order.
    """
    records = []
    for _, row in df.iterrows():
        # Comparing against 'Nan' alone lets float NaN through (NaN != 'Nan'
        # is True) and it would then be stringified into a bogus 'nan' item.
        record = [
            str(item)
            for item in row
            if pd.notna(item) and item != 'Nan'
        ]
        records.append(record)
    return records

# One list of item strings per DataFrame row; this is the input to apriori().
records = preprocess_data(data)

# 3️⃣ Generate Frequent 1-itemsets
def get_frequent_1_itemsets(records, min_support_count):
    """Find the single items that occur in enough records.

    Support is counted per record: an item repeated inside one record
    contributes 1, matching how larger itemsets are counted by subset tests.

    Args:
        records: Iterable of records, each an iterable of hashable items.
        min_support_count: Minimum number of records an item must appear in
            (an absolute count, not a fraction).

    Returns:
        Dict mapping each frequent item (the bare item, not a 1-tuple) to the
        number of records containing it.
    """
    # dict.fromkeys removes repeats within a record while keeping item order,
    # so each record votes at most once per item.
    items = [item for record in records for item in dict.fromkeys(record)]
    if not items:
        return {}

    item_counts = pd.Series(items).value_counts()

    # Filter items by minimum support count
    return item_counts[item_counts >= min_support_count].to_dict()

# 4️⃣ Generate Candidate Itemsets
def generate_candidates(prev_frequent_itemsets, k):
    """Build the candidate k-itemsets from the frequent (k-1)-itemsets.

    Pairs of previous itemsets are joined; a union is kept when it has
    exactly ``k`` items and every one of its (k-1)-item subsets is itself in
    ``prev_frequent_itemsets`` (the Apriori prune step: a superset of an
    infrequent itemset cannot be frequent).

    Args:
        prev_frequent_itemsets: Mapping whose keys are the frequent itemsets
            of the previous level. A key may be a tuple of items or a bare
            string, which is treated as a single item.
        k: Size of the candidates to generate.

    Returns:
        List of distinct candidates, each a sorted tuple of ``k`` items, in
        order of first generation.
    """
    # A bare string key is ONE item; set('Milk') would wrongly explode it
    # into characters, so wrap it in a 1-tuple first.
    prev_items = [
        (key,) if isinstance(key, str) else tuple(key)
        for key in prev_frequent_itemsets
    ]
    known = {frozenset(items) for items in prev_items}

    # A dict is used as an insertion-ordered set: for k >= 3 the same union is
    # reachable from several pairs and must be emitted only once.
    candidates = {}
    for first, second in itertools.combinations(prev_items, 2):
        candidate = tuple(sorted(set(first) | set(second)))
        if len(candidate) != k or candidate in candidates:
            continue
        if all(
            frozenset(subset) in known
            for subset in itertools.combinations(candidate, k - 1)
        ):
            candidates[candidate] = None
    return list(candidates)

# 5️⃣ Calculate Support for Itemsets
def calculate_support(candidates, records):
    """Count, for each candidate, the records that contain all of its items.

    Args:
        candidates: Iterable of candidate itemsets (hashable collections of
            items, e.g. tuples). Repeated candidates are counted once.
        records: Iterable of records, each an iterable of hashable items.

    Returns:
        Dict mapping each distinct candidate to the number of records that
        contain every item of that candidate. Candidates matching no record
        map to 0.
    """
    # Build each record's set once rather than once per (record, candidate).
    record_sets = [set(record) for record in records]

    # Keying by candidate collapses duplicates; iterating the raw list would
    # bump the same counter several times for a single record.
    candidate_sets = {candidate: frozenset(candidate) for candidate in candidates}

    return {
        candidate: sum(1 for record in record_sets if items <= record)
        for candidate, items in candidate_sets.items()
    }

# 6️⃣ Apriori Algorithm Implementation
def apriori(records, min_support_count):
    """Collect the frequent itemsets of every size, level by level.

    Starts from the frequent single items, then repeatedly generates the
    next-size candidates from the current level, counts their support and
    keeps those meeting the threshold, until a level comes back empty.

    Args:
        records: The records passed through to the helper functions.
        min_support_count: Minimum support count an itemset needs to be kept.

    Returns:
        Dict mapping every frequent itemset found to its support count.
    """
    found = {}
    level = get_frequent_1_itemsets(records, min_support_count)
    size = 1

    while True:
        if not level:
            break

        found.update(level)
        size += 1

        next_candidates = generate_candidates(level, size)
        counts = calculate_support(next_candidates, records)

        # Only candidates that reach the threshold seed the next level.
        level = {}
        for itemset, count in counts.items():
            if count >= min_support_count:
                level[itemset] = count

    return found

# 7️⃣ Run Apriori Algorithm
# The threshold is an absolute count that supports are compared against,
# not a fraction of the dataset.
min_support_count = 2
frequent_itemsets = apriori(records, min_support_count)

# 8️⃣ Display Results
# Single items are keyed by the bare item string; larger itemsets are keyed by
# a sorted tuple of items, which is why the two print differently.
print("Frequent Itemsets Found:")
for itemset, count in frequent_itemsets.items():
    print(f"{itemset}: {count} occurrences")


Frequent Itemsets Found:
A: 3 occurrences
B: 2 occurrences
C: 2 occurrences
('A', 'C'): 2 occurrences
