In [429]:
import numpy as np
import itertools as it
from collections import defaultdict

In [430]:
# Read the raw retail file; every line of the file becomes one entry of `baskets`
with open("./retail.csv", mode='r') as file:
    baskets = list(file)

# Small hand-checkable dataset (five baskets) used in place of the file contents.
# To run on the file instead, parse `baskets`:
# transactions = [[int(item) for item in basket.strip().split(' ')] for basket in baskets]
transactions = [
    [1, 3, 4],
    [2, 3, 5],
    [1, 2, 3, 5],
    [2, 5],
    [1, 3, 5],
]

# Later cells append one list of frequent itemsets per itemset size
frequent_sets = list()

# Minimum number of baskets an itemset must appear in; relative alternative:
# support = round(0.01*len(transactions))
support = 2

# Minimum confidence required for an association rule
confidence = 0.60

print(f"support threshold = {support}")
print(f"confidence threshold = {confidence}")

support threshold = 2
confidence threshold = 0.6


In [431]:
# Pass 1: a single scan over all baskets that counts the occurrences of each item.
# The resulting item -> count mapping is C1, the candidate singletons.
C1 = defaultdict(int)
for item in it.chain.from_iterable(transactions):
    C1[item] += 1

print(C1)
print(f"There are {len(C1)} candidate singletons.")

defaultdict(<class 'int'>, {1: 3, 3: 4, 4: 1, 2: 3, 5: 4})
There are 5 candidate singletons.


In [432]:
# Build L1 by discarding every singleton whose count is below the support threshold.
# The deletion happens inside C1 itself, so L1 and C1 end up being the same object.

# snapshot of the keys, because a dict cannot change size while it is being iterated
items = list(C1)

for item in items:
    if C1[item] >= support:
        continue  # frequent: keep the entry
    C1.pop(item)

L1 = C1
print(L1)
print(f"There are {len(L1)} frequent singletons.")
frequent_sets.append(list(L1))

defaultdict(<class 'int'>, {1: 3, 3: 4, 2: 3, 5: 4})
There are 4 frequent singletons.


In [433]:
# Candidate generation for pass 2: every unordered pair of frequent singletons
# becomes a key of C2 (stored as a sorted tuple) with its support counter at zero.

pairs = it.combinations(frequent_sets[0], 2)

C2 = dict.fromkeys((tuple(sorted(pair)) for pair in pairs), 0)

print(C2)
print(f"There are {len(C2)} candidate item pairs.")

{(1, 3): 0, (1, 2): 0, (1, 5): 0, (2, 3): 0, (3, 5): 0, (2, 5): 0}
There are 6 candidate item pairs.


In [434]:
# Pass 2: scan the baskets and count how often each candidate pair of C2 occurs

for basket in transactions:
    for pair in map(sorted, it.combinations(basket, 2)):
        pair = tuple(pair)
        if pair not in C2:
            continue  # not a candidate pair, nothing to count
        C2[pair] = C2[pair] + 1
print(C2)

{(1, 3): 3, (1, 2): 1, (1, 5): 2, (2, 3): 2, (3, 5): 3, (2, 5): 3}


In [435]:
# Build L2 by discarding every candidate pair whose count is below the support threshold.
# The deletion happens inside C2 itself, so L2 and C2 end up being the same object.

# snapshot of the keys, because a dict cannot change size while it is being iterated
pairs = list(C2)

for pair in pairs:
    if C2[pair] >= support:
        continue  # frequent: keep the entry
    C2.pop(pair)

L2 = C2
print(L2)
print(f"There are {len(L2)} frequent item pairs.")
frequent_sets.append(list(L2))

{(1, 3): 3, (1, 5): 2, (2, 3): 2, (3, 5): 3, (2, 5): 3}
There are 5 frequent item pairs.


In [436]:
# Candidate generation for pass 3 (C3): collect the individual items that occur
# in the frequent pairs and enumerate every 3-item combination of them.

# distinct items appearing in at least one frequent pair
L2_items = {item for pair in frequent_sets[1] for item in pair}

# all possible triples over those items (lazy iterator)
triples = it.combinations(L2_items, 3)

# each candidate is stored as a sorted tuple with a zero support counter
C3 = dict.fromkeys((tuple(sorted(t)) for t in triples), 0)

print(C3)
print(f"There are {len(C3)} candidate item triples.")

{(1, 2, 3): 0, (1, 2, 5): 0, (1, 3, 5): 0, (2, 3, 5): 0}
There are 4 candidate item triples.


In [437]:
# First pruning of C3 (before any counting): a triple is dropped as soon as one of
# its 2-item subsets is missing from L2, i.e. is not a frequent pair.

# snapshot of the keys, because a dict cannot change size while it is being iterated
triples = list(C3)
for t in triples:
    sub_pairs = (tuple(sorted(p)) for p in it.combinations(t, 2))
    if any(sub not in L2 for sub in sub_pairs):
        C3.pop(t)

print(C3)
print(f"There are now {len(C3)} candidate item triples after initial pruning.")
    

{(1, 3, 5): 0, (2, 3, 5): 0}
There are now 2 candidate item triples after initial pruning.


In [438]:
# Pass 3: scan the baskets and count how often each surviving candidate triple occurs

for basket in transactions:
    for t in map(sorted, it.combinations(basket, 3)):
        t = tuple(t)
        if t not in C3:
            continue  # not a candidate triple, nothing to count
        C3[t] = C3[t] + 1

print(C3)

{(1, 3, 5): 2, (2, 3, 5): 2}


In [439]:
# Build L3 by discarding every candidate triple whose count is below the support threshold.
# The deletion happens inside C3 itself, so L3 and C3 end up being the same object.

# snapshot of the keys, because a dict cannot change size while it is being iterated
triples = list(C3)

for t in triples:
    if C3[t] >= support:
        continue  # frequent: keep the entry
    C3.pop(t)

L3 = C3

print(L3)
print(f"There are now {len(L3)} candidate item triples after support threshold pruning.")
frequent_sets.append(list(L3))

{(1, 3, 5): 2, (2, 3, 5): 2}
There are now 2 candidate item triples after support threshold pruning.


In [440]:
# Show everything collected so far: one inner list per itemset size (singletons, pairs, triples)
print(frequent_sets)

[[1, 3, 2, 5], [(1, 3), (1, 5), (2, 3), (3, 5), (2, 5)], [(1, 3, 5), (2, 3, 5)]]


In [441]:
# Generating association rules from the frequent pairs and triples.
# A rule X -> Y is kept when support(X and Y together) / support(X) reaches the
# `confidence` threshold set in the configuration cell. A pair rule i -> j is
# stored in `rules` as the tuple (i, j).
rules = []


for p in frequent_sets[1]:

    # confidence of i -> j: pair support divided by the support of the antecedent i
    conf_i_j = L2[p]/L1[p[0]]
    if conf_i_j >= confidence:
        rules.append(p)

    # confidence of j -> i: the antecedent is now j, so the check must use conf_j_i
    # (testing conf_i_j here would accept or reject the reverse rule on the wrong value)
    conf_j_i = L2[p]/L1[p[1]]
    if conf_j_i >= confidence:
        rules.append((p[1],p[0]))


for t in frequent_sets[2]:

    for c in t:
        # a,b -> c : the antecedent is the triple with the consequent c removed
        a_b = sorted(set(t) - {c})
        print(a_b)
        # TODO: the confidence of a,b -> c is not computed yet, so triples add no rules
        
    
    
        


[3, 5]
[1, 5]
[1, 3]
[3, 5]
[2, 5]
[2, 3]
