In [1]:
import numpy as np
# Load the transaction matrix from a text file in the working directory.
dataset_filename = 'affinity_dataset.txt'
X = np.loadtxt(dataset_filename)
# X is treated as 2-D: the first axis counts samples, the second counts features.
n_samples, n_features = X.shape
print("This dataset has {0} samples and {1} features".format(n_samples, n_features))

This dataset has 100 samples and 5 features


In [2]:
# Preview the first five rows of the transaction matrix.
print(X[:5])

[[ 0.  0.  1.  1.  1.]
 [ 1.  1.  0.  1.  0.]
 [ 1.  0.  1.  1.  0.]
 [ 0.  0.  1.  1.  1.]
 [ 0.  1.  0.  0.  1.]]


### 每一行代表一条交易数据，每一列对应一种商品（1 表示购买，0 表示未购买）  
五种商品依次是面包、牛奶、奶酪、苹果和香蕉。

In [3]:
# Display names for the columns of X, in column order (index 0 = bread ... index 4 = bananas).
features = ["bread", "milk", "cheese", "apples", "bananas"]

下面检验这样一条规则：如果顾客购买了苹果，他们也会购买香蕉。

In [4]:
# First count how many transactions include apples (column 3).
num_apples_purchases = sum(1 for sample in X if sample[3] == 1)
print("{0} people bought Apples .".format(num_apples_purchases))

36 people bought Apples .


在这 36 个购买了苹果的案例中，统计有多少人同时购买了香蕉（规则成立），又有多少人没有购买香蕉（规则不成立）。

In [5]:
# Among the transactions that contain apples (column 3), tally how often
# bananas (column 4) were bought as well (rule holds) or not (rule fails).
rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] != 1:
        # Premise not met: the rule says nothing about this transaction.
        continue
    if sample[4] == 1:
        rule_valid += 1
    else:
        rule_invalid += 1
print("{0} cases of the rule being valid were discovered.".format(rule_valid))
print("{0} cases of the rule being invalid were discovered.".format(rule_invalid))

21 cases of the rule being valid were discovered.
15 cases of the rule being invalid were discovered.


In [10]:
# Support is the number of transactions in which the rule held; confidence is
# the fraction of apple purchases for which it held.
support = rule_valid
# float() forces true division even where "/" on two ints would truncate.
confidence = float(rule_valid) / num_apples_purchases
print("The support is {0} and the confidence is {1:.3f}".format(support, confidence))
print("As a percentage, that is {0:.1f}%.".format(confidence * 100))

0.583333333333
The support is 21 and the confidence is 0.583
As a percentage, that is 58.3%.


In [12]:
from collections import defaultdict
# Counters keyed by (premise, conclusion) column-index pairs, plus a per-premise
# counter of how many transactions contain that premise.
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in X:
    # Only columns with a non-zero entry in this transaction can act as a premise.
    purchased = [column for column in range(n_features) if sample[column] != 0]
    for premise in purchased:
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if conclusion == premise:
                # A rule from an item to itself is not considered.
                continue
            # The rule holds when the conclusion item is also in the transaction.
            tally = valid_rules if sample[conclusion] == 1 else invalid_rules
            tally[(premise, conclusion)] += 1

# Support of a rule is the number of transactions in which it held.
support = valid_rules
# Confidence of a rule is its support divided by the occurrences of its premise.
confidence = defaultdict(float)
for (premise, conclusion), hits in valid_rules.items():
    confidence[(premise, conclusion)] = 1.0 * hits / num_occurences[premise]

In [13]:
# Dump the whole mapping from (premise, conclusion) index pairs to confidence.
print(confidence)

defaultdict(<type 'float'>, {(0, 1): 0.5185185185185185, (1, 2): 0.15217391304347827, (3, 2): 0.6944444444444444, (1, 3): 0.1956521739130435, (0, 3): 0.18518518518518517, (3, 0): 0.1388888888888889, (3, 4): 0.5833333333333334, (3, 1): 0.25, (1, 4): 0.41304347826086957, (2, 4): 0.6585365853658537, (2, 0): 0.0975609756097561, (2, 3): 0.6097560975609756, (2, 1): 0.17073170731707318, (4, 3): 0.3559322033898305, (0, 4): 0.6296296296296297, (4, 2): 0.4576271186440678, (1, 0): 0.30434782608695654, (4, 1): 0.3220338983050847, (0, 2): 0.14814814814814814, (4, 0): 0.288135593220339})


In [14]:
# Report every rule that has a confidence entry, using the product names.
for (premise, conclusion), rule_confidence in confidence.items():
    premise_name, conclusion_name = features[premise], features[conclusion]
    print("Rule: if a person buys {0}, they will also buy {1}".format(premise_name, conclusion_name))
    print("- Confidence: {0:.3f}".format(rule_confidence))
    print("- Support: {0}".format(support[(premise, conclusion)]))
    print("")

Rule: if a person buys bread, they will also buy milk
- Confidence: 0.519
- Support: 14

Rule: if a person buys milk, they will also buy cheese
- Confidence: 0.152
- Support: 7

Rule: if a person buys apples, they will also buy cheese
- Confidence: 0.694
- Support: 25

Rule: if a person buys milk, they will also buy apples
- Confidence: 0.196
- Support: 9

Rule: if a person buys bread, they will also buy apples
- Confidence: 0.185
- Support: 5

Rule: if a person buys apples, they will also buy bread
- Confidence: 0.139
- Support: 5

Rule: if a person buys apples, they will also buy bananas
- Confidence: 0.583
- Support: 21

Rule: if a person buys apples, they will also buy milk
- Confidence: 0.250
- Support: 9

Rule: if a person buys milk, they will also buy bananas
- Confidence: 0.413
- Support: 19

Rule: if a person buys cheese, they will also buy bananas
- Confidence: 0.659
- Support: 27

Rule: if a person buys cheese, they will also buy bread
- Confidence: 0.098
- Support: 4

Rule:

In [15]:
def print_rule(premise, conclusion, support, confidence, features):
    """Print a readable summary of one association rule.

    Args:
        premise: index into ``features`` of the item on the "if" side.
        conclusion: index into ``features`` of the item on the "then" side.
        support: mapping from ``(premise, conclusion)`` to the rule's support.
        confidence: mapping from ``(premise, conclusion)`` to the rule's
            confidence; the value is printed with three decimal places.
        features: sequence of item names, indexed by ``premise``/``conclusion``.

    Prints the rule sentence, its confidence and its support on separate
    lines, followed by one blank line.
    """
    rule_key = (premise, conclusion)
    antecedent = features[premise]
    consequent = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(antecedent, consequent))
    print("- Confidence: {0:.3f}".format(confidence[rule_key]))
    print("- Support : {0}".format(support[rule_key]))
    print("")

In [16]:
# Example: report the rule whose premise is column 1 and whose conclusion is
# column 3 (milk -> apples according to the features list).
premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

Rule: If a person buys milk they will also buy apples
- Confidence: 0.196
- Support : 9



In [17]:
# Inspect the raw (rule, support) pairs first; they are NOT sorted here --
# the sorting by support happens in the following cell.
from pprint import pprint
pprint(list(support.items()))

[((0, 1), 14),
 ((1, 2), 7),
 ((3, 2), 25),
 ((1, 3), 9),
 ((3, 0), 5),
 ((4, 1), 19),
 ((3, 1), 9),
 ((1, 4), 19),
 ((0, 2), 4),
 ((2, 0), 4),
 ((2, 3), 25),
 ((2, 1), 7),
 ((4, 3), 21),
 ((0, 4), 17),
 ((1, 0), 14),
 ((4, 2), 27),
 ((0, 3), 5),
 ((3, 4), 21),
 ((2, 4), 27),
 ((4, 0), 17)]


In [18]:
from operator import itemgetter
# Order the (rule, support) pairs by their support value, largest first.
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)

In [19]:
# Print the (up to) five rules with the highest support.
# Iterating over a slice instead of indexing with range(5) avoids an IndexError
# when fewer than five rules are available.
for index, (rule, _support_count) in enumerate(sorted_support[:5]):
    print("Rule : #{0}".format(index + 1))
    (premise, conclusion) = rule
    print_rule(premise, conclusion, support, confidence, features)

Rule : #1
Rule: If a person buys bananas they will also buy cheese
- Confidence: 0.458
- Support : 27

Rule : #2
Rule: If a person buys cheese they will also buy bananas
- Confidence: 0.659
- Support : 27

Rule : #3
Rule: If a person buys apples they will also buy cheese
- Confidence: 0.694
- Support : 25

Rule : #4
Rule: If a person buys cheese they will also buy apples
- Confidence: 0.610
- Support : 25

Rule : #5
Rule: If a person buys bananas they will also buy apples
- Confidence: 0.356
- Support : 21



In [20]:
# Order the (rule, confidence) pairs by their confidence value, largest first.
sorted_confidence = sorted(confidence.items(), key=itemgetter(1),reverse=True)

In [22]:
# Show the complete confidence ranking.
print(sorted_confidence)

[((3, 2), 0.6944444444444444), ((2, 4), 0.6585365853658537), ((0, 4), 0.6296296296296297), ((2, 3), 0.6097560975609756), ((3, 4), 0.5833333333333334), ((0, 1), 0.5185185185185185), ((4, 2), 0.4576271186440678), ((1, 4), 0.41304347826086957), ((4, 3), 0.3559322033898305), ((4, 1), 0.3220338983050847), ((1, 0), 0.30434782608695654), ((4, 0), 0.288135593220339), ((3, 1), 0.25), ((1, 3), 0.1956521739130435), ((0, 3), 0.18518518518518517), ((2, 1), 0.17073170731707318), ((1, 2), 0.15217391304347827), ((0, 2), 0.14814814814814814), ((3, 0), 0.1388888888888889), ((2, 0), 0.0975609756097561)]


In [21]:
# Print the (up to) five rules with the highest confidence.
# Iterating over a slice instead of indexing with range(5) avoids an IndexError
# when fewer than five rules are available.
for index, (rule, _rule_confidence) in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index+1))
    (premise, conclusion) = rule
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys apples they will also buy cheese
- Confidence: 0.694
- Support : 25

Rule #2
Rule: If a person buys cheese they will also buy bananas
- Confidence: 0.659
- Support : 27

Rule #3
Rule: If a person buys bread they will also buy bananas
- Confidence: 0.630
- Support : 17

Rule #4
Rule: If a person buys cheese they will also buy apples
- Confidence: 0.610
- Support : 25

Rule #5
Rule: If a person buys apples they will also buy bananas
- Confidence: 0.583
- Support : 21

