## 生成候选项集

### 辅助函数

In [1]:
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [2]:
# 构建所有选项集的集合，只包含一个元素的集合组成的集合
def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))    # frozenset表示不可改变集，对C1每个项构建一个不变集合

### 数据集扫描伪代码：  
对数据集中的每条交易记录tran   
&ensp;&ensp;&ensp;&ensp;对每个候选项集can：  
&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;检查一下can是否是tran的子集：  
&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;如果是，则增加can的计数值   
对每个候选项集：  
&ensp;&ensp;&ensp;&ensp;如果其支持度不低于最小值，则保留该项集  
返回所有频繁项集列表

In [3]:
def scanD(dataSet, Ck, minSupport):
    ssCnt = {}    # 创建空字典
    for tid in dataSet:    # 遍历数据集中的交易记录
        for can in Ck:    # 遍历Ck中的所有候选集
            # 记录Ck中集合出现的次数
            if can.issubset(tid):    
                if can not in ssCnt:
                    ssCnt[can] = 1
                else:
                    ssCnt[can] += 1
    numItems = float(len(dataSet))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems    # 计算所有项集的支持度
        # 不满足最小支持度的集合不会输出
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData

## 完整Apriori算法  

In [4]:
def aprioriGen(Lk, k):    # create Ck
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            # 前k-2个项相同时，将两个集合合并
            L1 = list(Lk[i])[: k-2]
            L2 = list(Lk[j])[: k-2]
            L1.sort()
            L2.sort()
            if L1 == L2:
                retList.append(Lk[i] | Lk[j])    # ‘|’操作符表示并集操作
    return retList

伪代码如下：  
当集合中项的个数大于0时  
&ensp;&ensp;&ensp;&ensp;构建一个k个项组成的候选项集的列表  
&ensp;&ensp;&ensp;&ensp;检查数据以确认每个项集都是频繁的  
&ensp;&ensp;&ensp;&ensp;保留频繁项集并构建k+1项组成的候选项集的列表

In [5]:
def apriori(dataSet, minSupport = 0.5):
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k =2
    while len(L[k-2]) > 0:
        Ck =aprioriGen(L[k-2], k)
        Lk, supK = scanD(D, Ck, minSupport)    # 扫描Ck，得到Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData

In [6]:
dataSet = loadDataSet()
L, suppData = apriori(dataSet)
L

[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})],
 [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})],
 [frozenset({2, 3, 5})],
 []]

In [7]:
suppData

{frozenset({1}): 0.5,
 frozenset({3}): 0.75,
 frozenset({4}): 0.25,
 frozenset({2}): 0.75,
 frozenset({5}): 0.75,
 frozenset({1, 3}): 0.5,
 frozenset({2, 5}): 0.75,
 frozenset({3, 5}): 0.5,
 frozenset({2, 3}): 0.5,
 frozenset({1, 5}): 0.25,
 frozenset({1, 2}): 0.25,
 frozenset({2, 3, 5}): 0.5}

In [8]:
aprioriGen(L[0], 2)

[frozenset({2, 5}),
 frozenset({3, 5}),
 frozenset({1, 5}),
 frozenset({2, 3}),
 frozenset({1, 2}),
 frozenset({1, 3})]

In [9]:
L, suppData = apriori(dataSet, minSupport=0.7)
L

[[frozenset({5}), frozenset({2}), frozenset({3})], [frozenset({2, 5})], []]

## 挖掘关联规则

In [10]:
def generateRules(L, supportData, minConf=0.7):
    bigRuleList = []
    for i in range(1, len(L)):    # 只获取有2个或更多元素的集合
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]    # 当前频繁集包含的单元素不变集
            if i > 1:
                # 如果频繁项集元素超过2，需要合并
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList

### 对规则进行评估

In [11]:
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    prunedH = []
    # 遍历H中所有项集，计算可信度
    for conseq in H:
        conf = supportData[freqSet] / supportData[freqSet - conseq]    # 计算可信度
        if conf >= minConf:
            print(freqSet-conseq, '-->', conseq, 'conf:', conf)
            br1.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH

### 生成候选规则集合

In [12]:
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    m = len(H[0])
    if len(freqSet) > (m + 1):    # 尝试进一步合并
        Hmp1 = aprioriGen(H, m + 1)    # 创建m+1条新候选规则
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        if len(Hmp1) > 1:    # 如果不止一条规则，迭代调用rulesFromConseq判断是否可以合并
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)

In [13]:
L, suppData = apriori(dataSet, minSupport=0.5)
rules = generateRules(L, suppData, minConf=0.7)
rules

frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0


[(frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({1}), frozenset({3}), 1.0)]

In [14]:
rules = generateRules(L, suppData, minConf=0.5)
rules

frozenset({3}) --> frozenset({2}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({5}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({3}) --> frozenset({1}) conf: 0.6666666666666666
frozenset({1}) --> frozenset({3}) conf: 1.0
frozenset({5}) --> frozenset({2, 3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({2, 5}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3, 5}) conf: 0.6666666666666666


[(frozenset({3}), frozenset({2}), 0.6666666666666666),
 (frozenset({2}), frozenset({3}), 0.6666666666666666),
 (frozenset({5}), frozenset({3}), 0.6666666666666666),
 (frozenset({3}), frozenset({5}), 0.6666666666666666),
 (frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({3}), frozenset({1}), 0.6666666666666666),
 (frozenset({1}), frozenset({3}), 1.0),
 (frozenset({5}), frozenset({2, 3}), 0.6666666666666666),
 (frozenset({3}), frozenset({2, 5}), 0.6666666666666666),
 (frozenset({2}), frozenset({3, 5}), 0.6666666666666666)]