# Frequent Itemsets and Association Rules

In [2]:
#dev/python3

import numpy as np
import itertools

In [3]:
#read the dataset of baskets

file = open('T10I4D100K.dat', 'r')

data = []
for line in file:
    x = list(map(int, line.split(" ")[:-1]))
    data.append(set(x))

numItems = len(data)


In [4]:
#count number of occurences per item 
#we create a dict with item: occurences (C1)

def generateC1(dataSet):
    productDict = {}
    for basket in dataSet:
        for product in basket:
            if product not in productDict:
                productDict[product] = 1
            else:
                productDict[product] = productDict[product] + 1
    return productDict

In [5]:
#eliminate items that have a support under the threshold (C -> L)

def filterBySupport(d, support):
    return {k:v for k,v in d.items() if d[k]/numItems > support}

In [6]:
#count number of occurences per itemSet 
#we create a dict with itemSet: occurences (Ck)

def generateCk(lk, k, dataset):
    
    countDict = {}
    
    if k > 2:
        #flatten
        lk = {item for t in lk for item in t}

    
    for basket in dataset:
        
        intersection = basket.intersection(lk)
        
        if len(intersection) > k:
            newcandidates = itertools.combinations(sorted(intersection), k)
            
            for c in newcandidates:

                if c not in countDict:
                    countDict[c] = 1
                else:
                    countDict[c] +=1
    
    return countDict
                

In [13]:
#test first 3 rounds

C1 = generateC1(data)
L1 = filterBySupport(C1, 0.005)

C2 = generateCk(set(L1.keys()), 2, data)
L2 = filterBySupport(C2, 0.005)

C3 = generateCk(set(L2.keys()), 3, data)
L3 = filterBySupport(C3, 0.005)
L3

{(4, 373, 716): 510,
 (6, 32, 472): 522,
 (12, 227, 390): 605,
 (12, 227, 722): 607,
 (12, 390, 722): 624,
 (21, 413, 793): 697,
 (21, 413, 857): 686,
 (21, 793, 857): 693,
 (32, 285, 947): 584,
 (33, 217, 283): 792,
 (33, 217, 346): 801,
 (33, 217, 515): 789,
 (33, 283, 346): 802,
 (33, 283, 515): 786,
 (33, 346, 515): 797,
 (39, 704, 825): 991,
 (54, 944, 998): 576,
 (70, 684, 765): 692,
 (70, 684, 819): 697,
 (70, 765, 819): 694,
 (75, 438, 684): 639,
 (105, 494, 815): 551,
 (105, 494, 862): 549,
 (105, 494, 944): 557,
 (105, 815, 862): 547,
 (105, 815, 944): 565,
 (105, 862, 944): 559,
 (120, 593, 862): 571,
 (120, 593, 895): 577,
 (120, 862, 895): 571,
 (158, 354, 583): 593,
 (158, 354, 617): 595,
 (158, 583, 617): 590,
 (185, 471, 678): 656,
 (185, 471, 960): 656,
 (185, 678, 960): 652,
 (192, 638, 935): 504,
 (192, 888, 935): 505,
 (208, 290, 458): 751,
 (208, 290, 888): 764,
 (208, 290, 969): 764,
 (208, 458, 888): 756,
 (208, 458, 969): 755,
 (208, 888, 969): 771,
 (217, 283, 

In [14]:
#given a support recursivly create C and L 
#and append them to two lists
#exit when the last L is empty

def generateLK(dataset, support):
    
    dataset = dataset.copy()
    
    L = []
    C = []
    
    C.append(generateC1(dataset))
    L.append(filterBySupport(C[0], support))
    
    for k in range(1, 20):
        
        #print("C: ", k-1,  len(C[-1].keys()))
        #print("L: ", k-1, len(L[-1].keys()))
        
        C.append(generateCk(set(L[-1].keys()), k+1, dataset))
        L.append(filterBySupport(C[-1], support))
        
                  
        if len(L[len(L)-1]) == 0: 
            break
        
    return L,C
        

In [15]:
#generate frequent ItemSets

L, C = generateLK(data, 0.005)
print("candidates generated")

candidates generated


In [16]:
#print the keys (only first 10) of each round of itemSets
#singletons, pairs, triplets...

for k in range(len(L)):
    
    print( "round ", k+1, " : ",  list(L[k])[0:10], "\n")

round  1  :  [448, 834, 164, 775, 328, 687, 240, 368, 274, 561] 

round  2  :  [(448, 538), (39, 704), (39, 825), (704, 825), (708, 883), (708, 978), (853, 883), (883, 978), (529, 782), (674, 720)] 

round  3  :  [(39, 704, 825), (708, 883, 978), (571, 623, 795), (571, 623, 853), (571, 795, 853), (623, 795, 853), (392, 801, 862), (350, 411, 572), (350, 411, 579), (350, 411, 803)] 

round  4  :  [(350, 411, 572, 579), (350, 411, 572, 803), (350, 411, 572, 842), (350, 411, 579, 803), (350, 411, 579, 842), (350, 411, 803, 842), (350, 572, 579, 803), (350, 572, 579, 842), (350, 572, 803, 842), (350, 579, 803, 842)] 

round  5  :  [] 



In [17]:
#generate all possible association rules 
#from all the frequent itemSets using combinations

def generateAssociationRule(freqSet):
    
    associationRule = []
    
    for item in freqSet:
        item = list(item)
        if len(item) != 0:
            length = len(item) - 1
            while length > 0:
                combinations = list(itertools.combinations(item, length))
                temp = []
                LS = []
                for RS in combinations:
                    LS = set(item) - set(RS)
                    temp.append(list(LS))
                    temp.append(list(RS))
                    associationRule.append(temp)
                    temp = []
                length = length - 1
    return associationRule
   


In [21]:
#compute confidence for any antecedent and consequent rules lenghts

def confidence(rule):
    xAndY = sorted(rule[0] + rule[1])
    
    #access verification for antecendent of lenght 1
    if len(rule[0])==1:
        if rule[0][0] in C[len(rule[0])-1]:
            x = C[len(rule[0])-1][rule[0][0]]
        else:
            return 0
        
    #access verification for antecendent of lenght >=2
    else:
        if tuple(rule[0]) in C[len(rule[0])-1]:
            x = C[len(rule[0])-1][tuple(rule[0])]
        else:
            return 0
        
    #access verification for the consequent 
    if tuple(xAndY) in C[len(xAndY)-1]:
        xy = C[len(xAndY)-1][tuple(xAndY)]
        return (xy / x)
    else:
        return 0
                

In [22]:
#generate association rules and filter them given a confidence
#return a dict: rule (as string) : confidence

def associationRule(freqItem, minConf=0.8):
    
    confDict = dict()

    for k in range(1, len(freqItem)):

        rules = generateAssociationRule(freqItem[k])

        for i in range(len(rules)):

            c = confidence(rules[i])

            if c > minConf: 
                confDict[str(rules[i][0]) + " --> " + str(rules[i][1]) ] = c  
    return confDict


In [23]:
#generate association rules from frequent itemsets found before

AR = associationRule(L, 0.95)
print(len(AR))
AR

41


{'[185, 471] --> [678]': 0.9647058823529412,
 '[185, 471] --> [960]': 0.9647058823529412,
 '[185, 678] --> [471]': 0.9879518072289156,
 '[185, 678] --> [960]': 0.9819277108433735,
 '[192, 638] --> [935]': 0.9527410207939508,
 '[208, 290] --> [888]': 0.9514321295143213,
 '[208, 290] --> [969]': 0.9514321295143213,
 '[208, 458, 888] --> [969]': 0.951058201058201,
 '[208, 969] --> [888]': 0.956575682382134,
 '[217, 515] --> [346]': 0.9596678529062871,
 '[217, 546, 661] --> [923]': 0.9692307692307692,
 '[217, 546, 661] --> [947]': 0.9709401709401709,
 '[217, 546, 923] --> [661]': 0.984375,
 '[217, 546, 923] --> [947]': 0.9756944444444444,
 '[217, 546, 947] --> [661]': 0.9844020797227037,
 '[217, 546, 947] --> [923]': 0.9740034662045061,
 '[217, 546] --> [661]': 0.9766277128547579,
 '[217, 546] --> [923]': 0.9616026711185309,
 '[217, 546] --> [947]': 0.9632721202003339,
 '[217, 923, 947] --> [546]': 0.9639794168096055,
 '[283, 515] --> [217]': 0.9568862275449102,
 '[283, 515] --> [346]': 0.

In [24]:
#create both frequent itemsets and assRules given support and confidence

def outputRulesFromSC(support, confidence):
    
    L, C = generateLK(data, support)
    AR = associationRule(L, confidence)
    
    return AR
    

In [25]:
#run it!

outputRulesFromSC(0.006, 0.96)

{'[185, 471] --> [678]': 0.9647058823529412,
 '[185, 471] --> [960]': 0.9647058823529412,
 '[185, 678] --> [471]': 0.9879518072289156,
 '[185, 678] --> [960]': 0.9819277108433735,
 '[283, 515] --> [346]': 0.9640718562874252,
 '[33, 515] --> [346]': 0.9672330097087378}