In [1]:
import numpy as np
# Path of the whitespace-delimited text file holding the transaction matrix.
dataset_filename = 'affinity_dataset.txt'
# Parse the file into a numpy array; later cells iterate over its rows.
table = np.loadtxt(fname=dataset_filename)

In [17]:
# Product names indexed by column number of `table`:
# bread, milk, cheese, apples, bananas.
features = [
    u'面包',
    u'牛奶',
    u'奶酪',
    u'苹果',
    u'香蕉',
]
# Show the first five rows of the dataset as a quick sanity check.
print(table[:5])

[[ 0.  1.  0.  0.  0.]
 [ 1.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  1.]
 [ 1.  1.  0.  0.  0.]
 [ 0.  0.  1.  1.  1.]]


In [6]:
# Compute the support and confidence of every rule of the form
# "if a customer buys X, they also buy Y" (e.g. X = apples, Y = bananas).
# Support: the number of samples in the dataset for which the rule holds.
# Confidence: among the samples that satisfy the premise (the "if" part of
#   the rule), the fraction whose outcome also matches the rule's conclusion.
# Rule: a premise plus a conclusion. Rule quality can be measured in several
#   ways; support and confidence are the common ones.
from collections import defaultdict
valid_rules = defaultdict(int)     # (premise, conclusion) -> samples where both were bought
invalid_rules = defaultdict(int)   # (premise, conclusion) -> samples with premise but not conclusion
num_occurances = defaultdict(int)  # premise -> samples in which the premise was bought
# Number of features, taken from the data (one per column of `table`)
# instead of being hard-coded.
n_features = table.shape[1]
for sample in table:
    for premise in range(n_features):
        if not sample[premise]:
            continue
        num_occurances[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:
                # "If X then X" is trivially true; counting it would add
                # meaningless rules with confidence 1.0.
                continue
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1
# Support: number of samples in which each rule held.
support = valid_rules
# Confidence: valid count divided by how often the premise occurred.
# Rules that never held are absent here; looking one up yields 0.0 via the
# defaultdict default.
confidence = defaultdict(float)
for (premise, conclusion), n_valid in valid_rules.items():
    confidence[(premise, conclusion)] = n_valid / num_occurances[premise]

In [13]:
def print_rule(premise, conclusion, support, confidence, features):
    """Print one rule in readable form, with its confidence and support.

    Args:
        premise: index into ``features`` of the item in the "if" part.
        conclusion: index into ``features`` of the item in the "then" part.
        support: mapping from ``(premise, conclusion)`` to the support value.
        confidence: mapping from ``(premise, conclusion)`` to the confidence
            value; it is printed with three decimal places.
        features: sequence of feature names, indexed by feature number.
    """
    rule_key = (premise, conclusion)
    if_name, then_name = features[premise], features[conclusion]
    print("规则：如果一个人买了{}，那么他就会买{}".format(if_name, then_name))
    # Look each metric up only right before printing it, so a missing key
    # surfaces after the preceding lines have already been written.
    rule_confidence = confidence[rule_key]
    print(" - 置信度：{:.3f}".format(rule_confidence))
    rule_support = support[rule_key]
    print(" - 支持度：{}".format(rule_support))

In [14]:
# Report the rule whose premise is feature 1 and whose conclusion is feature 3
# (milk -> apples in the `features` list).
premise, conclusion = 1, 3
print_rule(premise, conclusion, support, confidence, features)

规则：如果一个人买了牛奶，那么他就会买苹果
 - 置信度：0.346
 - 支持度：18
