# CH11 Apriori

In [1]:
import numpy as np
from functools import reduce
import matplotlib.pyplot as plt

- Association Analysis (Association rule learning)
    - frequent item sets : **SUPPORT**
    - association rules : **CONFIDENCE**
- Apriori

## Frequent Item Sets

### Helper

In [2]:
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [3]:
def createC1(dataSet):
    data = reduce(lambda x,y:x+y,dataSet)
    data = set(data)
    data =[[x] for x in data]
    data.sort()
    return list(map(frozenset,data))

In [8]:
dataSet = loadDataSet()

In [9]:
C1 = createC1(dataSet)
C1

[frozenset({1}),
 frozenset({2}),
 frozenset({3}),
 frozenset({4}),
 frozenset({5})]

In [10]:
def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if can not in ssCnt.keys(): 
                    ssCnt[can]=1
                else: 
                    ssCnt[can] += 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]/numItems
        if support >= minSupport:
            retList.insert(0,key)
        supportData[key] = support
    return retList, supportData

In [11]:
D = list(map(set,dataSet))
D

[{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]

In [12]:
L1,suppData0 = scanD(D,C1,0.5)

In [13]:
L1

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

In [14]:
suppData0

{frozenset({1}): 0.5,
 frozenset({3}): 0.75,
 frozenset({4}): 0.25,
 frozenset({2}): 0.75,
 frozenset({5}): 0.75}

In [15]:
def aprioriGen(Lk, k): #creates Ck
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk): 
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
            L1.sort(); L2.sort()
            if L1==L2: #if first k-2 elements are equal
                retList.append(Lk[i] | Lk[j]) #set union
    return retList

In [16]:
def apriori(dataSet, minSupport = 0.5):
    C1 = createC1(dataSet)
    print("C1:",C1)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):
        Ck = aprioriGen(L[k-2], k)
        print("Ck:",Ck)
        Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData


In [17]:
apriori(dataSet)

C1: [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
Ck: [frozenset({2, 5}), frozenset({3, 5}), frozenset({1, 5}), frozenset({2, 3}), frozenset({1, 2}), frozenset({1, 3})]
Ck: [frozenset({2, 3, 5})]
Ck: []


([[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})],
  [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})],
  [frozenset({2, 3, 5})],
  []],
 {frozenset({1}): 0.5,
  frozenset({3}): 0.75,
  frozenset({4}): 0.25,
  frozenset({2}): 0.75,
  frozenset({5}): 0.75,
  frozenset({1, 3}): 0.5,
  frozenset({2, 5}): 0.75,
  frozenset({3, 5}): 0.5,
  frozenset({2, 3}): 0.5,
  frozenset({1, 5}): 0.25,
  frozenset({1, 2}): 0.25,
  frozenset({2, 3, 5}): 0.5})

## Association Rules

In [18]:
def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD
    bigRuleList = []
    for i in range(1, len(L)):#only get the sets with two or more items
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList    

In [19]:
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = [] #create new list to return
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
        if conf >= minConf: 
            print (freqSet-conseq,'-->',conseq,'conf:',conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH


In [20]:
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)): #try further merging
        Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):    #need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)

In [25]:
def pntRules(ruleList, itemMeaning):
    for ruleTup in ruleList:
        for item in ruleTup[0]:
            print (itemMeaning[item])
        print( "           -------->")
        for item in ruleTup[1]:
            print(itemMeaning[item])
        print( "confidence: %f" % ruleTup[2])
        print()       #print a blank line

In [26]:
L,suppData=apriori(dataSet,minSupport=0.5)

C1: [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
Ck: [frozenset({2, 5}), frozenset({3, 5}), frozenset({1, 5}), frozenset({2, 3}), frozenset({1, 2}), frozenset({1, 3})]
Ck: [frozenset({2, 3, 5})]
Ck: []


In [27]:
rules = generateRules(L,suppData,minConf=0.7)

frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0


## EX1 : Congressional Voting Records

In [28]:
import pandas as pd

In [30]:
pd.read_table("./Data/CH11/recent100bills.txt",header=None)

Unnamed: 0,0,1
0,13020,Amending FAA Rulemaking Activities
1,12939,Prohibiting Federal Funding of National Public...
2,12888,Additional Continuing Appropriations
3,12940,Removing Troops from Afghanistan
4,12884,Department of Defense funding and Continuing A...
5,12887,Terminating the Neighborhood Stabilization Pro...
6,12828,Continuing Appropriations Amendments
7,12830,Prioritizing Payment of Public Debt
8,12857,Calling for a Balanced Budget Constitutional A...
9,12885,Terminating the Federal Housing Administration...


## EX2 : Agaricus lepiota

In [None]:
2**100