# Apriori算法
--- 
## 关联分析Association Analysis

关联分析是一种在大规模数据集中寻找有趣关系的任务。 这些关系可以有两种形式:

- 频繁项集（frequent item sets）: 经常出现在一块的物品的集合。
- 关联规则（associational rules）: 暗示两种物品之间可能存在很强的关系。

支持度与可信度
- 支持度(support): 数据集中包含该项集(子集合)的记录所占的比例
- 可信度或置信度(confidence): 一条规则 A -> B 的可信度定义为 support(A | B) / support(A)。

`支持度`和 `可信度` 是用来量化` 关联分析 `是否成功的一个方法. 。 假设想找到支持度大于 0.8 的所有项集，应该如何去做呢？ 一个办法是生成一个物品所有可能组合的清单，然后对每一种组合统计它出现的频繁程度，但是当物品成千上万时，上述做法就非常非常慢了.

## Apriori算法原理
假设有4个商品{0, 1, 2, 3}的所有可能的项集合 2^4 - 1 = 15个.随着物品的增加，计算的次数呈指数的形式增长.

为了降低计算次数和时间，研究人员发现了一种所谓的 `Apriori `原理，即某个项集是频繁的，那么它的所有子集也是频繁的。 例如，如果 {0, 1} 是频繁的，那么 {0}, {1} 也是频繁的。 该原理直观上没有什么帮助，但是如果反过来看就有用了，也就是说如果一个项集是 `非频繁项集`，那么它的所有`超集`也是非频繁项集.
![Apriori](../img/Apriori.png)

In [1]:
from itertools import combinations
from functools import reduce

Apriori 算法优缺点

* 优点：易编码实现
* 缺点：在大数据集上可能较慢
* 适用数据类型：数值型 或者 标称型数据。

In [4]:
def create_c1(data_set):
    # 生成含单个元素的集合的列表
    C1 = []
    for transaction in data_set:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    # frozenset 不可变集合
    return list(map(set, C1))

In [6]:
# 加载数据集
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [7]:
data = loadDataSet()
c1 = create_c1(data)
c1

[{1}, {2}, {3}, {4}, {5}]

In [8]:
reduce(set.union, c1)

{1, 2, 3, 4, 5}

In [9]:
def apriori_gen(Lk, k):
    # 输入频繁项集列表 Lk 与返回的元素个数 k，然后输出所有可能的候选项集 Ck
    # ck = [{1, 2}, {0, 1}, {2, 3}] k=3 -> [{0, 1,2 }, {1, 2, 3}, {0, 2, 3}]
#         union_set = reduce(set.union, map(set, Lk))
#         return list(map(frozenset, combinations(union_set, k)))
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[: k-2]
            L2 = list(Lk[j])[: k-2]
            # print '-----i=', i, k-2, Lk, Lk[i], list(Lk[i])[: k-2]
            # print '-----j=', j, k-2, Lk, Lk[j], list(Lk[j])[: k-2]
            L1.sort()
            L2.sort()
            print(f"i={i}, j={j}: ", L1, L2)
            # 第一次 L1,L2 为空，元素直接进行合并，返回元素两两合并的数据集
            # if first k-2 elements are equal
            # {1, 2} {1, 3} [1] == [1] -> {1, 2, 3}
            # {2, 3} {2, 4} [2] == [2] -> {2, 3, 4}
            if L1 == L2:
                # set union
                # print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
                union_set = Lk[i] | Lk[j]
                print(f"union_set: {union_set}")
                retList.append(union_set)
    return retList

# ck = [[{0}, {1}, {2}, {3}]]
l = apriori_gen(c1, 2)
l

i=0, j=1:  [] []
union_set: {1, 2}
i=0, j=2:  [] []
union_set: {1, 3}
i=0, j=3:  [] []
union_set: {1, 4}
i=0, j=4:  [] []
union_set: {1, 5}
i=1, j=2:  [] []
union_set: {2, 3}
i=1, j=3:  [] []
union_set: {2, 4}
i=1, j=4:  [] []
union_set: {2, 5}
i=2, j=3:  [] []
union_set: {3, 4}
i=2, j=4:  [] []
union_set: {3, 5}
i=3, j=4:  [] []
union_set: {4, 5}


[{1, 2},
 {1, 3},
 {1, 4},
 {1, 5},
 {2, 3},
 {2, 4},
 {2, 5},
 {3, 4},
 {3, 5},
 {4, 5}]

In [10]:
apriori_gen(l, 3)

i=0, j=1:  [1] [1]
union_set: {1, 2, 3}
i=0, j=2:  [1] [1]
union_set: {1, 2, 4}
i=0, j=3:  [1] [1]
union_set: {1, 2, 5}
i=0, j=4:  [1] [2]
i=0, j=5:  [1] [2]
i=0, j=6:  [1] [2]
i=0, j=7:  [1] [3]
i=0, j=8:  [1] [3]
i=0, j=9:  [1] [4]
i=1, j=2:  [1] [1]
union_set: {1, 3, 4}
i=1, j=3:  [1] [1]
union_set: {1, 3, 5}
i=1, j=4:  [1] [2]
i=1, j=5:  [1] [2]
i=1, j=6:  [1] [2]
i=1, j=7:  [1] [3]
i=1, j=8:  [1] [3]
i=1, j=9:  [1] [4]
i=2, j=3:  [1] [1]
union_set: {1, 4, 5}
i=2, j=4:  [1] [2]
i=2, j=5:  [1] [2]
i=2, j=6:  [1] [2]
i=2, j=7:  [1] [3]
i=2, j=8:  [1] [3]
i=2, j=9:  [1] [4]
i=3, j=4:  [1] [2]
i=3, j=5:  [1] [2]
i=3, j=6:  [1] [2]
i=3, j=7:  [1] [3]
i=3, j=8:  [1] [3]
i=3, j=9:  [1] [4]
i=4, j=5:  [2] [2]
union_set: {2, 3, 4}
i=4, j=6:  [2] [2]
union_set: {2, 3, 5}
i=4, j=7:  [2] [3]
i=4, j=8:  [2] [3]
i=4, j=9:  [2] [4]
i=5, j=6:  [2] [2]
union_set: {2, 4, 5}
i=5, j=7:  [2] [3]
i=5, j=8:  [2] [3]
i=5, j=9:  [2] [4]
i=6, j=7:  [2] [3]
i=6, j=8:  [2] [3]
i=6, j=9:  [2] [4]
i=7, j=8:  [3

[{1, 2, 3},
 {1, 2, 4},
 {1, 2, 5},
 {1, 3, 4},
 {1, 3, 5},
 {1, 4, 5},
 {2, 3, 4},
 {2, 3, 5},
 {2, 4, 5},
 {3, 4, 5}]

In [13]:
class Apriori:
    def __init__(self, min_support=0.5):
        # 支持度 >= 最小支持度的候选项集以及它们的支持度。即我们的频繁项集。
        self.min_support = min_support
    @staticmethod
    def create_c1(data_set):
        # 生成含单个元素的集合的列表
        C_1 = []
        for transaction in data_set:
            for item in transaction:
                if not [item] in C_1:
                    C_1.append([item])
        C_1.sort()
        # frozenset 不可变集合  hashable
        return list(map(frozenset, C_1))
    
    @staticmethod
    def apriori_gen(Lk, k):
        # 输入频繁项集列表 Lk 与返回的元素个数 k，然后输出所有可能的候选项集 Ck
        # ck = [{1, 2}, {0, 1}, {2, 3}] k=3 -> [{0, 1,2 }, {1, 2, 3}, {0, 2, 3}]
#         union_set = reduce(set.union, map(set, Lk))
#         return list(map(frozenset, combinations(union_set, k)))
        retList = []
        lenLk = len(Lk)
        for i in range(lenLk):
            for j in range(i+1, lenLk):
                L1 = list(Lk[i])[: k-2]
                L2 = list(Lk[j])[: k-2]
                # print '-----i=', i, k-2, Lk, Lk[i], list(Lk[i])[: k-2]
                # print '-----j=', j, k-2, Lk, Lk[j], list(Lk[j])[: k-2]
                L1.sort()
                L2.sort()
                # 第一次 L1,L2 为空，元素直接进行合并，返回元素两两合并的数据集
                # if first k-2 elements are equal
                if L1 == L2:
                    # set union
                    # print 'union=', Lk[i] | Lk[j], Lk[i], Lk[j]
                    retList.append(Lk[i] | Lk[j])
        return retList
    
    def fit(self, data):
        c_1 = self.create_c1(data)
        D = list(map(set, data))
        
        # 按子集元素个数排列 [[含1个元素的子集,], [2个], [3个]]
        support_list = []
        l, support_dict = self.scan(D, c_1)
        support_list.append(l)
        k = 2
        while True: 
            # 由满足支持度条件的ck 生成的含2, 3 ..个元素组成的集合列表
            c_k = self.apriori_gen(support_list[k-2], k)
            list_, dict_ = self.scan(D, c_k)
            support_dict.update(dict_)
            
            # 子集含元素数目增加过程中 若不满足支持度 超集也不满足
            if not list_:
                break
            support_list.append(list_)
            k += 1
        return support_dict, support_list
            
    def scan(self, D, C_k):
        """scan（计算候选数据集 CK 在数据集 D 中的支持度，并返回支持度大于最小支持度 minSupport 的数据）
        Args:
            D 数据集
            Ck 候选项集列表
            minSupport 最小支持度
        Returns:
            retList 支持度大于 minSupport 的集合
            supportData 候选项集支持度数据
        """
        # 数据集ck在数据集D中的支持度, 
        # 并返回支持度大于最小支持度（minSupport）的数据
        
        # 计算每个集合的频数
        cnt = {}
        for set_ in D:
            for C in C_k:
                # 若C为D中数据集的子集, C频数+1
                if C.issubset(set_):
                    cnt[C] = cnt.get(C, 0) + 1
        num = len(D)
        ret_C = {}
        retList = []
        for key in cnt:
            support = cnt[key] / num
            if support >= self.min_support:
                retList.insert(0, key)
            ret_C.update({key: support})
                
        return retList, ret_C

In [14]:
apriori = Apriori(0.5)
dict_, l1 = apriori.fit(data)

In [16]:
dict_

{frozenset({1}): 0.5,
 frozenset({3}): 0.75,
 frozenset({4}): 0.25,
 frozenset({2}): 0.75,
 frozenset({5}): 0.75,
 frozenset({1, 3}): 0.5,
 frozenset({2, 5}): 0.75,
 frozenset({3, 5}): 0.5,
 frozenset({2, 3}): 0.5,
 frozenset({1, 5}): 0.25,
 frozenset({1, 2}): 0.25,
 frozenset({2, 3, 5}): 0.5}

In [17]:
l1

[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})],
 [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})],
 [frozenset({2, 3, 5})]]

## 从频繁项集中挖掘关联规则
如下图所示，给出的是项集 {0,1,2,3} 产生的所有关联规则:
![2](../img/Apriori2.png)

通过观察，我们可以知道，如果某条规则并不满足 `最小可信度` 要求，那么该规则的所有子集也不会满足 最小可信度 的要求。
如上图所示，假设` 123 -> 3 `并不满足最小可信度要求，那么就知道任何左部为{0,1,2} 子集的规则也不会满足 最小可信度 的要求。 即` 12 -> 03 , 02 -> 13 , 01 -> 23 , 2 -> 013, 1 -> 023, 0 -> 123 `都不满足 最小可信度 要求。

In [24]:
class ApriorRules(Apriori):
    def __init__(self, min_support=0.5, min_confidence=0.5):
        super().__init__(min_support)
        self.min_confidence = min_confidence
    
    def calc_confidence(self, sub_sets, freq_set):
        # 递归计算频繁项集的规则
        # freq_set: 频繁项集中的元素 {1, 2, 3}
        # sub_sets: freq_set子集 组成的list [{1}, {2}, {3}]
        # 规则 freq_set - set_ -> set_ 的置信度
        subs = []  # freq_set的子集
        for set_ in sub_sets:
            # (A U B) / (A U B - B)
            conf = self.support_dict[freq_set] / self.support_dict[freq_set - set_]
            # A --> B
            # print(freq_set - set_, set_, conf)
            if conf > self.min_confidence:
                # {1, 2} ---> {3}, 
                self.rules.append((freq_set - set_, set_, conf))
                subs.append(set_)             
        return subs
   
    def rules_from_conseq(self, freq_set, sub_sets):
        # freq_set 频繁集 {1, 2, 3}
        # 子元素集合 [{1}, {2}, {3}]  [{1, 2}, {2, 3}, {1, 3}]
        sub_len = len(sub_sets[0])  # 每次sub_sets元素长度一致
        if len(freq_set) > sub_len:
            # 计算 置信度, 返回满足条件的
            subs = self.calc_confidence(sub_sets, freq_set)
            # 生成元素数量+1的子集
            subs = self.apriori_gen(subs, sub_len + 1)

            # 例如{0, 1, 2} -> {3} 不符合要求,就不会检查 {0, 2}->{1 , 3} , {0}->{1, 2, 3}了
            if len(subs) > 1:
                self.rules_from_conseq(freq_set, subs)
    
    def fit(self, data):
        self.support_dict, support_list = super().fit(data)
        self.rules = []
        for i in range(1, len(support_list)):
            for freq_set in support_list[i]:
                # {1, 2, 3} -> [{1}, {2}, {3}]
                subs = [frozenset([item]) for item in freq_set]
                if i > 1: 
                    self.rules_from_conseq(freq_set, subs)  # {1, 2, 3} -> [{1}, {2}, {3}]
                else:
                    # 只含2个元素 A -> B, B->A
                    self.calc_confidence(subs, freq_set)  # {2, 3}  [{2}, {3}]
        return self.rules

In [25]:
apriori_rules = ApriorRules()
apriori_rules.fit(data)

[(frozenset({3}), frozenset({2}), 0.6666666666666666),
 (frozenset({2}), frozenset({3}), 0.6666666666666666),
 (frozenset({5}), frozenset({3}), 0.6666666666666666),
 (frozenset({3}), frozenset({5}), 0.6666666666666666),
 (frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({3}), frozenset({1}), 0.6666666666666666),
 (frozenset({1}), frozenset({3}), 1.0),
 (frozenset({3, 5}), frozenset({2}), 1.0),
 (frozenset({2, 5}), frozenset({3}), 0.6666666666666666),
 (frozenset({2, 3}), frozenset({5}), 1.0),
 (frozenset({5}), frozenset({2, 3}), 0.6666666666666666),
 (frozenset({3}), frozenset({2, 5}), 0.6666666666666666),
 (frozenset({2}), frozenset({3, 5}), 0.6666666666666666)]


[(frozenset({3}), frozenset({2}), 0.6666666666666666),
 (frozenset({2}), frozenset({3}), 0.6666666666666666),
 (frozenset({5}), frozenset({3}), 0.6666666666666666),
 (frozenset({3}), frozenset({5}), 0.6666666666666666),
 (frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({3}), frozenset({1}), 0.6666666666666666),
 (frozenset({1}), frozenset({3}), 1.0),
 (frozenset({3, 5}), frozenset({2}), 1.0),
 (frozenset({2, 5}), frozenset({3}), 0.6666666666666666),
 (frozenset({2, 3}), frozenset({5}), 1.0),
 (frozenset({5}), frozenset({2, 3}), 0.6666666666666666),
 (frozenset({3}), frozenset({2, 5}), 0.6666666666666666),
 (frozenset({2}), frozenset({3, 5}), 0.6666666666666666)]

## 项目案例: 发现毒菇的相似特性

In [27]:
import numpy as np

mushroom = np.loadtxt('./mushroom.dat')
mushroom[:2, :]

array([[  1.,   3.,   9.,  13.,  23.,  25.,  34.,  36.,  38.,  40.,  52.,
         54.,  59.,  63.,  67.,  76.,  85.,  86.,  90.,  93.,  98., 107.,
        113.],
       [  2.,   3.,   9.,  14.,  23.,  26.,  34.,  36.,  39.,  40.,  52.,
         55.,  59.,  63.,  67.,  76.,  85.,  86.,  90.,  93.,  99., 108.,
        114.]])

第一个特征表示有毒或可食用. 有毒为2, 可食用为1. 第二个为蘑菇伞的形状, 有六种可能的值3-8.

为了寻找毒蘑菇中存在的公共特征, 寻找包含特征值2的频繁集

In [28]:
mushroom.shape

(8124, 23)

In [29]:
mushroom_apriori= Apriori(min_support=.3)
support_dict, support_list = mushroom_apriori.fit(mushroom)

In [30]:
support_dict

({3.0, 85.0, 86.0}): 0.4441161989167898,
 frozenset({23.0, 38.0, 85.0}): 0.04431314623338257,
 frozenset({23.0, 63.0, 85.0}): 0.3741999015263417,
 frozenset({23.0, 85.0, 86.0}): 0.4155588380108321,
 frozenset({23.0, 85.0, 93.0}): 0.39192516001969474,
 frozenset({34.0, 85.0, 86.0}): 0.9731659281142294,
 frozenset({34.0, 85.0, 93.0}): 0.464795667159035,
 frozenset({36.0, 85.0, 86.0}): 0.8148695224027572,
 frozenset({36.0, 85.0, 93.0}): 0.42836041358936483,
 frozenset({38.0, 63.0, 85.0}): 0.18906942392909898,
 frozenset({38.0, 85.0, 86.0}): 0.3082225504677499,
 frozenset({38.0, 85.0, 93.0}): 0.07976366322008863,
 frozenset({52.0, 85.0, 86.0}): 0.40817331363860165,
 frozenset({59.0, 85.0, 86.0}): 0.6134908911866076,
 frozenset({59.0, 85.0, 93.0}): 0.45100935499753814,
 frozenset({63.0, 85.0, 86.0}): 0.5839487936976858,
 frozenset({63.0, 85.0, 93.0}): 0.4273756770064008,
 frozenset({67.0, 85.0, 86.0}): 0.5494830132939439,
 frozenset({67.0, 85.0, 93.0}): 0.3210241260462826,
 frozenset({76.0,

In [31]:
support_list

{23.0, 34.0, 39.0, 63.0, 85.0, 86.0}),
  frozenset({34.0, 39.0, 59.0, 85.0, 86.0, 93.0}),
  frozenset({23.0, 34.0, 39.0, 85.0, 86.0, 93.0}),
  frozenset({34.0, 39.0, 63.0, 85.0, 86.0, 93.0}),
  frozenset({34.0, 39.0, 59.0, 85.0, 86.0, 90.0}),
  frozenset({23.0, 34.0, 39.0, 85.0, 86.0, 90.0}),
  frozenset({34.0, 39.0, 63.0, 85.0, 86.0, 90.0}),
  frozenset({34.0, 39.0, 85.0, 86.0, 90.0, 93.0}),
  frozenset({23.0, 34.0, 39.0, 59.0, 63.0, 86.0}),
  frozenset({23.0, 34.0, 39.0, 59.0, 86.0, 93.0}),
  frozenset({23.0, 34.0, 39.0, 63.0, 86.0, 93.0}),
  frozenset({23.0, 34.0, 39.0, 59.0, 86.0, 90.0}),
  frozenset({23.0, 34.0, 39.0, 86.0, 90.0, 93.0}),
  frozenset({34.0, 39.0, 59.0, 63.0, 86.0, 90.0}),
  frozenset({23.0, 34.0, 39.0, 59.0, 90.0, 93.0}),
  frozenset({34.0, 39.0, 59.0, 85.0, 90.0, 93.0}),
  frozenset({34.0, 39.0, 59.0, 86.0, 90.0, 93.0}),
  frozenset({23.0, 36.0, 39.0, 59.0, 63.0, 85.0}),
  frozenset({23.0, 36.0, 39.0, 59.0, 85.0, 93.0}),
  frozenset({23.0, 36.0, 39.0, 63.0, 85.0, 