In [1]:
import numpy as np
# Relative path of the dataset file that this notebook analyses.
dataset_filename = 'affinity_dataset.txt'
# Parse the text file into a NumPy array; each row is indexed by column later on.
table = np.loadtxt(dataset_filename)

In [3]:
# Display names, indexed by column number: bread, milk, cheese, apples, bananas
features = [u'面包', u'牛奶', u'奶酪', u'苹果', u'香蕉']
# Preview the first five rows of the dataset.
print(table[: 5])

[[ 0.  1.  0.  0.  0.]
 [ 1.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  1.]
 [ 1.  1.  0.  0.  0.]
 [ 0.  0.  1.  1.  1.]]


In [13]:
# Compute the support and confidence of every rule of the form
# "if a customer buys X, they also buy Y"
# (for example: "if a customer buys apples, they also buy bananas").
# Support: the number of samples in the dataset for which the rule holds.
# Confidence: of all samples that satisfy the premise (the "if" part of the rule),
#             the fraction that also satisfy the conclusion.
# Rule: a premise plus a conclusion. Rule quality can be measured in several ways;
#       support and confidence are two common ones.
from collections import defaultdict
valid_rules = defaultdict(int)     # (premise, conclusion) -> samples where the rule held
invalid_rules = defaultdict(int)   # (premise, conclusion) -> samples where the premise held but the rule did not
num_occurances = defaultdict(int)  # premise -> samples in which the premise held
# Read the number of features from the data instead of hard-coding 5, so the
# scan still covers every column if the dataset is swapped for a wider or narrower one.
n_features = table.shape[1]
for sample in table:
    for premise in range(n_features):
        # Only samples in which the premise item was bought say anything about the rule.
        if not sample[premise]:
            continue
        num_occurances[premise] += 1
        for conclusion in range(n_features):
            # A rule "X -> X" is meaningless, skip it.
            if conclusion == premise:
                continue
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1
# Support: `support` is the very same dict object as `valid_rules`, not a copy.
support = valid_rules
# Confidence: valid count divided by the number of samples matching the premise.
# A defaultdict(float) is used so that looking up an unseen rule yields 0.0.
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    rule = (premise, conclusion)
    confidence[rule] = valid_rules[rule] / num_occurances[premise]

In [14]:
def print_rule(premise, conclusion, support, confidence, features):
    """Print one association rule together with its confidence and support.

    premise, conclusion -- feature indices; used to index `features` and,
                           as the tuple (premise, conclusion), to index
                           `support` and `confidence`.
    support             -- mapping from (premise, conclusion) to a count.
    confidence          -- mapping from (premise, conclusion) to a number,
                           printed with three decimals.
    features            -- sequence of display names, indexed by feature index.
    """
    rule = (premise, conclusion)
    names = features[premise], features[conclusion]
    print("规则：如果一个人买了{}，那么他就会买{}".format(*names))
    print(" - 置信度：{:.3f}".format(confidence[rule]))
    print(" - 支持度：{}".format(support[rule]))

In [15]:
# Show a single rule: index 1 is milk and index 3 is apples in `features`.
premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

规则：如果一个人买了牛奶，那么他就会买苹果
 - 置信度：0.346
 - 支持度：18


In [16]:
# Find the rules with the highest support.
from operator import itemgetter
# support.items() yields ((premise, conclusion), count) pairs; itemgetter(1)
# sorts on the count, and reverse=True puts the largest count first.
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)

In [17]:
# Print the first five entries of sorted_support, i.e. the five best-supported rules.
for index in range(5):
    print("规则 #{0}".format(index + 1))
    # Each entry is ((premise, conclusion), count); element 0 is the rule key.
    premise, conclusion = sorted_support[index][0]
    print_rule(premise, conclusion, support, confidence, features)

规则 #1
规则：如果一个人买了苹果，那么他就会买香蕉
 - 置信度：0.628
 - 支持度：27
规则 #2
规则：如果一个人买了香蕉，那么他就会买苹果
 - 置信度：0.474
 - 支持度：27
规则 #3
规则：如果一个人买了牛奶，那么他就会买香蕉
 - 置信度：0.519
 - 支持度：27
规则 #4
规则：如果一个人买了香蕉，那么他就会买牛奶
 - 置信度：0.474
 - 支持度：27
规则 #5
规则：如果一个人买了奶酪，那么他就会买苹果
 - 置信度：0.564
 - 支持度：22


In [19]:
# Find the rules with the highest confidence.
# confidence.items() yields ((premise, conclusion), value) pairs; sort on the value, largest first.
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
# Print the first five entries, i.e. the five most confident rules.
for index in range(5):
    print("规则 #{0}".format(index + 1))
    # Element 0 of each entry is the (premise, conclusion) rule key.
    premise, conclusion = sorted_confidence[index][0]
    print_rule(premise, conclusion, support, confidence, features)

规则 #1
规则：如果一个人买了苹果，那么他就会买香蕉
 - 置信度：0.628
 - 支持度：27
规则 #2
规则：如果一个人买了面包，那么他就会买香蕉
 - 置信度：0.571
 - 支持度：16
规则 #3
规则：如果一个人买了奶酪，那么他就会买苹果
 - 置信度：0.564
 - 支持度：22
规则 #4
规则：如果一个人买了牛奶，那么他就会买香蕉
 - 置信度：0.519
 - 支持度：27
规则 #5
规则：如果一个人买了奶酪，那么他就会买香蕉
 - 置信度：0.513
 - 支持度：20


In [4]:
# scikit-learn库内置了著名的Iris植物分类数据集
from sklearn.datasets import load_iris
dataset = load_iris()
X = dataset.data
y = dataset.target

In [35]:
# print(dataset.DESCR)  # dataset description
# Per-column mean of the feature matrix, used as the discretisation threshold.
attribute_means = X.mean(axis=0)
# Turn the continuous features into discrete ones:
# 1 where a value is at or above its column mean, 0 otherwise.
X_d = (X >= attribute_means).astype('int')

In [36]:
# OneR
# Procedure: for every feature, add up the errors made over all of its values,
# then rank the features and pick the one with the fewest errors.

# Find the error of a single feature value.
# Arguments: dataset, class labels, feature index, feature value.
def train_feature_value(X, y_true, feature_index, value):
    """Find the majority class among the samples whose feature equals `value`.

    Parameters
    ----------
    X : iterable of indexable samples
        Each sample is indexed with `feature_index`.
    y_true : iterable
        Class label of each sample, in the same order as `X`.
    feature_index : int
        Position of the feature to inspect within each sample.
    value
        The feature value to select samples by (compared with `==`).

    Returns
    -------
    (most_frequent_class, error)
        The class seen most often among the selected samples (the first one
        encountered wins a tie), and the number of selected samples that
        belong to any other class.

    Raises
    ------
    ValueError
        If no sample has `value` at `feature_index`, because there is then
        no class to predict.
    """
    class_counts = defaultdict(int)
    for sample, label in zip(X, y_true):
        if sample[feature_index] == value:
            class_counts[label] += 1
    if not class_counts:
        raise ValueError(
            "no sample has value {!r} for feature {!r}".format(value, feature_index))
    # max() returns the first maximal key in insertion order, which is the same
    # class a stable descending sort would place first.
    most_frequent_class = max(class_counts, key=class_counts.get)
    # Every selected sample outside the majority class is a misprediction.
    error = sum(class_counts.values()) - class_counts[most_frequent_class]
    return most_frequent_class, error

In [37]:
# Compute the total error of one particular feature.
def train_on_feature(X, y_true, feature_index):
    """Build the per-value predictor for one feature and total up its errors.

    X is sliced as `X[:, feature_index]`, so it must support 2-D indexing
    (for example a NumPy array). For every distinct value in that column,
    `train_feature_value` supplies the class to predict and the number of
    errors that prediction makes.

    Returns a `(predictors, total_error)` pair, where `predictors` maps each
    feature value to its predicted class and `total_error` is the sum of the
    per-value errors.
    """
    predictors = {}  # key: feature value, value: predicted class
    total_error = 0
    for current_value in set(X[:, feature_index]):
        best_class, value_error = train_feature_value(
            X, y_true, feature_index, current_value)
        predictors[current_value] = best_class
        total_error += value_error
    return predictors, total_error

In [61]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection. Fall back to the old
# module only when the new one is unavailable (very old installs).
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Split the discretised data into training and test sets.
Xd_train, Xd_test, y_train, y_test = train_test_split(X_d, y, random_state=14)  # random_state: random seed
all_predictors = {}  # feature index -> {feature value: predicted class}
errors = {}          # feature index -> total training error of that feature
for feature_index in range(Xd_train.shape[1]):
    predictors, total_error = train_on_feature(Xd_train, y_train, feature_index)
    all_predictors[feature_index] = predictors
    errors[feature_index] = total_error
# Sort ascending by error; the first entry is the feature with the lowest error.
best_feature, best_error = sorted(errors.items(), key=itemgetter(1))[0]
# The OneR model: which feature to look at, and which class to predict per value.
model = {
    'variable': best_feature,
    'predictor': all_predictors[best_feature]
}

In [55]:
def predict(X_test, model):
    """Predict a class for every sample in `X_test` with a OneR model.

    `model` is a dict with two keys: 'variable', the index of the feature to
    look at, and 'predictor', a mapping from that feature's value to the class
    to predict. Each sample's feature value is converted with `int()` before
    the lookup, so a value absent from the mapping raises KeyError.

    Returns a NumPy array holding one predicted class per sample.
    """
    feature_index = model['variable']
    value_to_class = model['predictor']
    # Run the lookup for each test sample.
    predictions = []
    for sample in X_test:
        predictions.append(value_to_class[int(sample[feature_index])])
    return np.array(predictions)

In [62]:
# Unpack the model parts; neither name is used again in this cell.
variable = model['variable']
predictor = model['predictor']
# Predict the classes of the held-out test samples.
y_predicted = predict(Xd_test, model)
# Fraction of matching predictions, expressed as a percentage.
accuracy = np.mean(y_predicted == y_test) * 100
print("The test accuracy is {:.1f}%".format(accuracy))

[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2
 2]
The test accuracy is 65.8%
