# Frequent Itemsets and Association Rules

In [1]:
#dev/python3

import numpy as np
import itertools

In [2]:
#read the dataset of baskets

file = open('T10I4D100K.dat', 'r')

data = []
for line in file:
    x = list(map(int, line.split(" ")[:-1]))
    data.append(set(x))

numItems = len(data)


In [3]:
#count number of occurences per item 
#we create a dict with item: occurences (C1)

def generateC1(dataSet):
    productDict = {}
    for basket in dataSet:
        for product in basket:
            if product not in productDict:
                productDict[product] = 1
            else:
                productDict[product] = productDict[product] + 1
    return productDict

In [4]:
#eliminate items that have a support under the threshold (C -> L)

def filterBySupport(d, support):
    return {k:v for k,v in d.items() if d[k]/numItems > support}

In [5]:
#count number of occurences per itemSet 
#we create a dict with itemSet: occurences (Ck)

def generateCk(lk, k, dataset):
    
    countDict = {}
    
    if k > 2:
        #flatten
        lk = {item for t in lk for item in t}

    
    for basket in dataset:
        
        intersection = basket.intersection(lk)
        
        if len(intersection) > k:
            newcandidates = itertools.combinations(sorted(intersection), k)
            
            for c in newcandidates:

                if c not in countDict:
                    countDict[c] = 1
                else:
                    countDict[c] +=1
    
    return countDict
                

In [6]:
#test first 3 rounds

#C1 = generateC1(data)
#L1 = filterBySupport(C1, 0.01)
#print(L1)
#C2 = generateCk(set(L1.keys()), 2, data)
#L2 = filterBySupport(C2, 0.01)
#print(L2)
#C3 = generateCk(set(L2.keys()), 3, data)
#L3 = filterBySupport(C3, 0.01)
#print(L3)

In [7]:
#given a support recursivly create C and L 
#and append them to two lists
#exit when the last L is empty

def generateLK(dataset, support):
    
    dataset = dataset.copy()
    
    L = []
    C = []
    
    C.append(generateC1(dataset))
    L.append(filterBySupport(C[0], support))
    
    for k in range(1, 20):
        
        #print("C: ", k-1,  len(C[-1].keys()))
        #print("L: ", k-1, len(L[-1].keys()))
        
        C.append(generateCk(set(L[-1].keys()), k+1, dataset))
        L.append(filterBySupport(C[-1], support))
        
                  
        if len(L[len(L)-1]) == 0: 
            break
        
    return L,C
        

In [8]:
#generate frequent ItemSets

L, C = generateLK(data, 0.005)
print("candidates generated")

candidates generated


In [9]:
#print the keys (only first 10) of each round of itemSets
#singletons, pairs, triplets...

for k in range(len(L)):
    
    print( "round ", k+1, " : ",  list(L[k])[0:10], "\n")

round  1  :  [448, 834, 164, 775, 328, 687, 240, 368, 274, 561] 

round  2  :  [(448, 538), (39, 704), (39, 825), (704, 825), (708, 883), (708, 978), (853, 883), (883, 978), (529, 782), (674, 720)] 

round  3  :  [(39, 704, 825), (708, 883, 978), (571, 623, 795), (571, 623, 853), (571, 795, 853), (623, 795, 853), (392, 801, 862), (350, 411, 572), (350, 411, 579), (350, 411, 803)] 

round  4  :  [(350, 411, 572, 579), (350, 411, 572, 803), (350, 411, 572, 842), (350, 411, 579, 803), (350, 411, 579, 842), (350, 411, 803, 842), (350, 572, 579, 803), (350, 572, 579, 842), (350, 572, 803, 842), (350, 579, 803, 842)] 

round  5  :  [] 



In [10]:
#generate all possible association rules 
#from all the frequent itemSets using combinations

def generateAssociationRule(freqSet):
    
    associationRule = []
    
    for item in freqSet:
        item = list(item)
        if len(item) != 0:
            length = len(item) - 1
            while length > 0:
                combinations = list(itertools.combinations(item, length))
                temp = []
                LHS = []
                for RHS in combinations:
                    LHS = set(item) - set(RHS)
                    temp.append(list(LHS))
                    temp.append(list(RHS))
                    associationRule.append(temp)
                    temp = []
                length = length - 1
    return associationRule
   


In [11]:
#compute confidence whatever the rule's antecedent and consequent lenghts are

def confidence(rule):
    xAndY = sorted(rule[0] + rule[1])
    if len(rule[0])==1:
        if rule[0][0] in C[len(rule[0])-1]:
            x = C[len(rule[0])-1][rule[0][0]]
        else:
            return 0
    else:
        if tuple(rule[0]) in C[len(rule[0])-1]:
            x = C[len(rule[0])-1][tuple(rule[0])]
        else:
            return 0
    
    if tuple(xAndY) in C[len(xAndY)-1]:
        xy = C[len(xAndY)-1][tuple(xAndY)]
        return (xy / x)
    else:
        return 0
                

In [12]:
#generate association rules and filter them given a confidence
#return a dict: rule (as string) : confidence

def associationRule(freqItem, minConf=0.8):
    
    confDict = dict()

    for k in range(1, len(freqItem)):

        rules = generateAssociationRule(freqItem[k])

        for i in range(len(rules)):

            c = confidence(rules[i])

            if c > minConf: 
                confDict[str(rules[i][0]) + " --> " + str(rules[i][1]) ] = c  
    return confDict


In [13]:
#generate association rules from frequent itemsets found before

AR = associationRule(L, 0.95)
print(len(AR))
AR

41


{'[969] --> [888]': 0.9540636042402827,
 '[392, 801] --> [862]': 0.9593373493975904,
 '[208, 290] --> [888]': 0.9514321295143213,
 '[208, 290] --> [969]': 0.9514321295143213,
 '[888, 969] --> [208]': 0.9518518518518518,
 '[208, 969] --> [888]': 0.956575682382134,
 '[638, 935] --> [192]': 0.9824561403508771,
 '[192, 638] --> [935]': 0.9527410207939508,
 '[33, 515] --> [217]': 0.9575242718446602,
 '[33, 346] --> [283]': 0.9502369668246445,
 '[33, 515] --> [283]': 0.9538834951456311,
 '[33, 515] --> [346]': 0.9672330097087378,
 '[283, 515] --> [217]': 0.9568862275449102,
 '[346, 515] --> [217]': 0.9528857479387515,
 '[217, 515] --> [346]': 0.9596678529062871,
 '[283, 515] --> [346]': 0.9640718562874252,
 '[546, 661] --> [217]': 0.9590163934426229,
 '[217, 546] --> [661]': 0.9766277128547579,
 '[546, 923] --> [217]': 0.96,
 '[217, 546] --> [923]': 0.9616026711185309,
 '[217, 546] --> [947]': 0.9632721202003339,
 '[546, 923] --> [661]': 0.9866666666666667,
 '[546, 661] --> [923]': 0.9704918

In [14]:
#create both frequent itemsets and assRules given support and confidence

def outputRulesFromSC(support, confidence):
    
    L, C = generateLK(data, support)
    AR = associationRule(L, confidence)
    
    return AR
    

In [15]:
#run it!

outputRulesFromSC(0.006, 0.96)

{'[33, 515] --> [346]': 0.9672330097087378,
 '[283, 515] --> [346]': 0.9640718562874252,
 '[185, 678] --> [471]': 0.9879518072289156,
 '[185, 471] --> [678]': 0.9647058823529412,
 '[185, 471] --> [960]': 0.9647058823529412,
 '[185, 678] --> [960]': 0.9819277108433735}