Data Cleaning

In [26]:
# Keep only the lines of basket.csv that split into exactly five
# comma-separated fields; every other line is recorded together with its
# 1-based line number.
# NOTE(review): a plain split(",") does not understand quoted fields, so a
# value that itself contains a comma would make its row count as invalid --
# confirm the raw file has no such values.
cleaned_rows = []
invalid_rows = []

with open("basket.csv", "r") as f:
    for line_no, raw_line in enumerate(f, start=1):
        stripped = raw_line.strip()
        if len(stripped.split(",")) != 5:
            invalid_rows.append((line_no, stripped))
            continue
        cleaned_rows.append(stripped)

# Persist the surviving rows, one per line.
with open("basket_cleaned.csv", "w") as f:
    f.writelines(row + "\n" for row in cleaned_rows)

print(f"Cleaning complete: {len(cleaned_rows)} valid rows saved.")
print(f"{len(invalid_rows)} invalid rows removed.")


Cleaning complete: 1104 valid rows saved.
316 invalid rows removed.


Helper function that generates all non-empty subsets of a list

In [8]:
from collections import OrderedDict
import pandas as pd
from itertools import combinations

def genSubsets(l):
    """Return every non-empty subset of ``l`` as a list of lists.

    Subsets are enumerated by bitmask, from 1 up to ``2 ** len(l) - 1``;
    bit ``pos`` of the mask decides whether ``l[pos]`` is included.
    Elements inside each subset keep their relative order from ``l``.
    An empty ``l`` yields an empty list.
    """
    size = len(l)
    return [
        [l[pos] for pos in range(size) if (mask >> pos) & 1]
        for mask in range(1, 2 ** size)
    ]


In [9]:
def initPass(txList):
    """Count, for every item, the number of transactions that contain it.

    Args:
        txList: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        An ``OrderedDict`` mapping item -> transaction count, with keys in
        ascending sorted order.
    """
    counts = {}
    for tx in txList:
        # Support is defined per transaction, so an item that appears more
        # than once in the same transaction must only be counted once;
        # otherwise its support could even exceed 1.0.
        for item in set(tx):
            counts[item] = counts.get(item, 0) + 1
    return OrderedDict((item, counts[item]) for item in sorted(counts))


Apriori Algorithm Implementation

In [10]:
def genCandidate(Fk1):
    """Generate candidate k-itemsets from the frequent (k-1)-itemsets.

    Two itemsets are joined when they share the same first k-2 items and the
    last item of the first is smaller than the last item of the second. A
    joined candidate is kept only if every one of its (k-1)-item subsets is
    itself in ``Fk1`` (the Apriori prune step).

    Args:
        Fk1: list of frequent itemsets, each a list of hashable, comparable
            items. An empty list is allowed.

    Returns:
        List of candidate itemsets (lists), in the order they were joined.
    """
    if not Fk1:
        # Nothing to join; also avoids indexing Fk1[0] on an empty list.
        return []

    k1 = len(Fk1[0])
    # Set of tuples gives constant-time membership tests during pruning.
    frequent = {tuple(itemset) for itemset in Fk1}

    Ck = []
    for i in range(len(Fk1) - 1):
        f1 = Fk1[i]
        for j in range(i + 1, len(Fk1)):
            f2 = Fk1[j]
            if f1[:-1] == f2[:-1] and f1[-1] < f2[-1]:
                tempC = f1 + [f2[-1]]
                # Only the subsets of size k-1 matter for pruning, so
                # enumerate exactly those instead of the whole power set.
                if all(sub in frequent for sub in combinations(tempC, k1)):
                    Ck.append(tempC)
    return Ck

In [11]:
def searchInT(t, candid):
    """Return True when every element of ``candid`` occurs in ``t``.

    An empty ``candid`` is trivially contained and yields True.
    """
    for wanted in candid:
        if wanted not in t:
            return False
    return True

def apriori(T, minSup):
    """Find all frequent itemsets in ``T`` level by level.

    Args:
        T: list of transactions, each a list of items.
        minSup: minimum support, compared against ``count / len(T)``.

    Returns:
        List of frequent itemsets (lists): the frequent single items first,
        then each larger level in sorted order.
    """
    n_tx = len(T)

    # Level 1: single items whose relative count reaches the threshold.
    item_counts = initPass(T)
    level = [[item] for item, cnt in item_counts.items() if cnt / n_tx >= minSup]
    finalSet = list(level)

    while level:
        candidates = genCandidate(level)

        # Count, per candidate, the transactions that contain all its items.
        hits = {}
        for t in T:
            for cand in candidates:
                if searchInT(t, cand):
                    key = tuple(cand)
                    hits[key] = hits.get(key, 0) + 1

        # Keep the candidates that are frequent; sorting keeps the next
        # level in the order genCandidate's join condition relies on.
        level = sorted(
            (list(key) for key, cnt in hits.items() if cnt / n_tx >= minSup),
            key=lambda x: (len(x), *x),
        )
        finalSet.extend(level)

    return finalSet
def generate_rules(frequent_itemsets, transactions, min_confidence):
    """Derive association rules from frequent itemsets.

    For every frequent itemset with at least two items, each non-empty proper
    subset is tried as the antecedent and the remaining items form the
    consequent. A rule is kept when
    ``support(itemset) / support(antecedent) >= min_confidence``.

    Args:
        frequent_itemsets: list of itemsets (lists of hashable items).
        transactions: list of transactions (lists of items).
        min_confidence: minimum confidence for a rule to be reported.

    Returns:
        List of ``(antecedent, consequent, rule_support, confidence)`` tuples,
        where antecedent and consequent are lists. Empty when there are no
        transactions.
    """
    total_tx = len(transactions)
    if total_tx == 0:
        # No data means no measurable support; avoids dividing by zero.
        return []

    # Build each transaction's set once instead of once per itemset.
    tx_sets = [set(tx) for tx in transactions]

    supports = {}
    for itemset in frequent_itemsets:
        wanted = set(itemset)
        hits = sum(1 for tx in tx_sets if wanted <= tx)
        supports[tuple(itemset)] = hits / total_tx

    rules = []
    for itemset in frequent_itemsets:
        if len(itemset) < 2:
            continue
        rule_support = supports[tuple(itemset)]
        for size in range(1, len(itemset)):
            for antecedent in combinations(itemset, size):
                # An antecedent that is not itself a listed frequent itemset
                # has no recorded support and therefore produces no rule.
                antecedent_support = supports.get(antecedent, 0)
                if antecedent_support <= 0:
                    continue
                confidence = rule_support / antecedent_support
                if confidence >= min_confidence:
                    consequent = [x for x in itemset if x not in antecedent]
                    rules.append((list(antecedent), consequent, rule_support, confidence))
    return rules

Running with different support and confidence values

In [12]:
data = pd.read_csv("basket_cleaned.csv", header=None, names=['Cert1', 'Cert2', 'Ethnicity', 'City', 'ZIP'])

# One transaction per row: the non-missing values of the item columns.
# NOTE(review): the saved output of this cell lists city names (e.g.
# 'New York', 'Brooklyn') among the items, yet only Cert1, Cert2 and
# Ethnicity are collected here -- confirm whether City should be included
# or whether the column names line up with the file.
item_columns = ['Cert1', 'Cert2', 'Ethnicity']
transactions = [
    [row[col] for col in item_columns if pd.notna(row[col])]
    for _, row in data.iterrows()
]

support_values = [0.05, 0.1, 0.2]
confidence_values = [0.1,0.3,0.5, 0.7, 0.9]

for min_support in support_values:
    # Frequent itemsets depend only on the support threshold, so mine them
    # once per support value rather than once per (support, confidence) pair.
    frequent_itemsets = apriori(transactions, min_support)
    for min_confidence in confidence_values:
        print(f"\nSupport: {min_support}, Confidence: {min_confidence}")
        rules = generate_rules(frequent_itemsets, transactions, min_confidence)

        print(f"Number of frequent itemsets: {len(frequent_itemsets)}")
        print(f"Number of rules: {len(rules)}")
        # Show only the first five rules to keep the output short.
        for rule in rules[:5]:
            antecedent, consequent, support, confidence = rule
            print(f"{antecedent} -> {consequent}, Support: {support:.3f}, Confidence: {confidence:.3f}")


Support: 0.05, Confidence: 0.1
Number of frequent itemsets: 23
Number of rules: 38
['ASIAN'] -> ['MBE'], Support: 0.196, Confidence: 0.986
['MBE'] -> ['ASIAN'], Support: 0.196, Confidence: 0.319
['ASIAN'] -> ['New York'], Support: 0.056, Confidence: 0.283
['New York'] -> ['ASIAN'], Support: 0.056, Confidence: 0.184
['BLACK'] -> ['Brooklyn'], Support: 0.060, Confidence: 0.224

Support: 0.05, Confidence: 0.3
Number of frequent itemsets: 23
Number of rules: 26
['ASIAN'] -> ['MBE'], Support: 0.196, Confidence: 0.986
['MBE'] -> ['ASIAN'], Support: 0.196, Confidence: 0.319
['Brooklyn'] -> ['BLACK'], Support: 0.060, Confidence: 0.415
['BLACK'] -> ['MBE'], Support: 0.267, Confidence: 1.000
['MBE'] -> ['BLACK'], Support: 0.267, Confidence: 0.436

Support: 0.05, Confidence: 0.5
Number of frequent itemsets: 23
Number of rules: 13
['ASIAN'] -> ['MBE'], Support: 0.196, Confidence: 0.986
['BLACK'] -> ['MBE'], Support: 0.267, Confidence: 1.000
['Bronx'] -> ['MBE'], Support: 0.063, Confidence: 0.909


Maximum size of a rule that can be created

In [18]:
# Rebuild the transactions: one list per row holding the non-missing values
# of the item columns.
item_columns = ['Cert1', 'Cert2', 'Ethnicity']
transactions = [
    [row[col] for col in item_columns if pd.notna(row[col])]
    for _, row in data.iterrows()
]

min_support = 0.05
frequent_itemsets = apriori(transactions, min_support)
# The largest frequent itemset is reported as the largest rule size;
# 0 when nothing is frequent.
max_size = max(map(len, frequent_itemsets), default=0)
print(f"Maximum size of frequent itemsets: {max_size}")
print(f"Maximum size of rule possible: {max_size}")

Maximum size of frequent itemsets: 3
Maximum size of rule possible: 3


In [19]:
# Rebuild the transactions: one list per row holding the non-missing values
# of the item columns.
item_columns = ['Cert1', 'Cert2', 'Ethnicity']
transactions = [
    [row[col] for col in item_columns if pd.notna(row[col])]
    for _, row in data.iterrows()
]

min_support = 0.1
frequent_itemsets = apriori(transactions, min_support)
# The largest frequent itemset is reported as the largest rule size;
# 0 when nothing is frequent.
max_size = max(map(len, frequent_itemsets), default=0)
print(f"Maximum size of frequent itemsets: {max_size}")
print(f"Maximum size of rule possible: {max_size}")

Maximum size of frequent itemsets: 3
Maximum size of rule possible: 3


In [20]:
# Rebuild the transactions: one list per row holding the non-missing values
# of the item columns.
item_columns = ['Cert1', 'Cert2', 'Ethnicity']
transactions = [
    [row[col] for col in item_columns if pd.notna(row[col])]
    for _, row in data.iterrows()
]

min_support = 0.2
frequent_itemsets = apriori(transactions, min_support)
# The largest frequent itemset is reported as the largest rule size;
# 0 when nothing is frequent.
max_size = max(map(len, frequent_itemsets), default=0)
print(f"Maximum size of frequent itemsets: {max_size}")
print(f"Maximum size of rule possible: {max_size}")

Maximum size of frequent itemsets: 2
Maximum size of rule possible: 2


Confidence value for which the minimum number of rules is generated

In [25]:
# Mine the frequent itemsets once at a fixed support, then count how many
# rules survive each confidence threshold.
min_support = 0.05
confidence_values = [0.1,0.3,0.5, 0.7, 0.9]
frequent_itemsets = apriori(transactions, min_support)

rule_counts = {}
for min_confidence in confidence_values:
    rules = generate_rules(frequent_itemsets, transactions, min_confidence)
    rule_counts.update({min_confidence: len(rules)})
    print(f"Confidence: {min_confidence}, Number of rules: {len(rules)}")

# When several thresholds tie for the fewest rules, the one listed first in
# confidence_values is the one reported.
min_rules_conf = min(rule_counts.items(), key=lambda pair: pair[1])[0]
print(f"\nConfidence value with minimum number of rules: {min_rules_conf} ({rule_counts[min_rules_conf]} rules)")

Confidence: 0.1, Number of rules: 38
Confidence: 0.3, Number of rules: 26
Confidence: 0.5, Number of rules: 13
Confidence: 0.7, Number of rules: 10
Confidence: 0.9, Number of rules: 10

Confidence value with minimum number of rules: 0.7 (10 rules)
