# 6.5 构建决策树解决多类问题

In [None]:
import math

def prob(data, element):
    """
    Return the relative frequency of ``element`` in ``data``.

    Parameters:
        data: a sequence supporting ``len()`` and ``.count()`` (e.g. a list).
        element: the numeric (int or float) value whose frequency is wanted.

    Returns:
        The fraction of entries in ``data`` equal to ``element`` as a float
        in the range [0.0, 1.0], or ``None`` when ``data`` is empty or
        ``element`` is ``None`` or not an int/float.
    """
    # Validate the input: an empty sequence has no frequencies, and only
    # numeric elements are accepted.
    if len(data) == 0 or element is None or not isinstance(element, (int, float)):
        return None
    # float() forces true division, so the result is not truncated to 0
    # when this runs under Python 2 integer-division semantics.
    return float(data.count(element)) / len(data)

def entropy(data):
    """
    Calculate the Shannon entropy (base 2) of the values in ``data``.

    Each distinct value contributes ``-p * log2(p)`` exactly once, where
    ``p`` is that value's relative frequency in ``data``.

    Parameters:
        data: a sized collection of hashable values (e.g. a list of labels).

    Returns:
        ``None`` for empty input, ``0`` for a single-element input, and
        otherwise the entropy in bits as a float.
    """
    # Local import keeps this function usable on its own.
    from collections import Counter

    total = len(data)
    if total == 0:
        return None
    if total == 1:
        return 0

    result = 0.0
    # Iterate over distinct values only; looping over every element would
    # add each value's term once per occurrence and inflate the result.
    for count in Counter(data).values():
        # count >= 1, so p > 0 and the logarithm is always defined.
        p = float(count) / total
        result += -1 * p * math.log(p, 2)

    return result

## 6.5.2 操作方法

In [2]:
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import pprint

def get_data():
    """
    Load the Iris dataset.

    Returns a tuple ``(x, y, label_names)`` taken from the ``'data'``,
    ``'target'`` and ``'target_names'`` entries of ``load_iris()``; the
    label names are converted to a plain list via ``.tolist()``.
    """
    iris = load_iris()
    features, targets = iris['data'], iris['target']
    class_names = iris['target_names'].tolist()
    return features, targets, class_names

def get_train_test(x,y):
    """
    Prepare a stratified train and test split

    The attributes ``x`` and the class labels ``y`` are stacked column-wise
    (labels last), a single stratified shuffle split is drawn with a fixed
    random state, and the selected rows are separated back into attributes
    and labels.

    Returns ``(train_x, train_y, test_x, test_y)``.
    """
    train_fraction = 0.8
    holdout_fraction = 1 - train_fraction
    # Labels ride along as the last column so rows are selected together.
    stacked = np.column_stack([x, y])
    splitter = StratifiedShuffleSplit(
        stacked[:, -1],
        test_size=holdout_fraction,
        n_iter=1,
        random_state=77,
    )

    # n_iter=1 is requested above, so the results come from that one split.
    for train_rows, test_rows in splitter:
        train_x = stacked[train_rows, :-1]
        train_y = stacked[train_rows, -1]
        test_x = stacked[test_rows, :-1]
        test_y = stacked[test_rows, -1]

    return train_x, train_y, test_x, test_y

def build_model(x,y):
    """
    Train a decision tree classifier on the given attributes and labels.

    The tree uses the "entropy" split criterion. Returns whatever the
    classifier's ``fit`` call returns.
    """
    classifier = tree.DecisionTreeClassifier(criterion="entropy")
    return classifier.fit(x, y)

def test_model(x,y,model,label_names):
    """
    Inspect the model for accuracy

    Predicts labels for ``x`` with ``model`` and prints, in order, the
    accuracy as a percentage, the confusion matrix and the classification
    report, each computed against the true labels ``y``.

    Parameters:
        x: attributes to predict on.
        y: true class labels for ``x``.
        model: a fitted estimator exposing ``predict``.
        label_names: class names passed to the report as ``target_names``.

    Returns:
        None; all results are written to standard output.
    """
    y_predicted = model.predict(x)
    print("Model accuracy = %0.2f"%(accuracy_score(y,y_predicted) * 100) + "%\n")
    print("\nConfusion Matrix")
    print("=================")
    # pprint.pprint writes to stdout itself and returns None; wrapping it
    # in print() emitted a stray "None" line after the matrix.
    pprint.pprint(confusion_matrix(y,y_predicted))
    print("\nClassification Report")
    print("=================")

    print(classification_report(y,y_predicted,target_names=label_names))

if __name__ == "__main__":
    # Driver: load the data, split it, fit a tree, then report on both splits.
    # Load the data
    x,y,label_names = get_data()
    # Split the data into a training set and a test set
    train_x,train_y,test_x,test_y = get_train_test(x,y)
    # Build the model
    model = build_model(train_x,train_y)
    # Evaluate the model on the training set
    test_model(train_x,train_y,model,label_names)
    # Evaluate the model on the test set
    test_model(test_x,test_y,model,label_names)



Model accuracy = 100.00%


Confusion Matrix
array([[40,  0,  0],
       [ 0, 40,  0],
       [ 0,  0, 40]])
None

Classification Report
             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        40
 versicolor       1.00      1.00      1.00        40
  virginica       1.00      1.00      1.00        40

avg / total       1.00      1.00      1.00       120

Model accuracy = 96.67%


Confusion Matrix
array([[10,  0,  0],
       [ 0,  9,  1],
       [ 0,  0, 10]])
None

Classification Report
             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        10
 versicolor       1.00      0.90      0.95        10
  virginica       0.91      1.00      0.95        10

avg / total       0.97      0.97      0.97        30



## 6.5.3 工作原理

In [3]:
pwd

'C:\\Users\\Administrator\\Documents\\Python\\MyGit\\Machine Learning\\Python数据科学指南\\第6章 机器学习1'

## 6.5.4 更多内容

In [4]:
def get_feature_names():
    """
    Return the ``'feature_names'`` entry of the Iris dataset.
    """
    return load_iris()['feature_names']

def probe_model(x, y, model, label_names):
    """
    Print each feature's importance and export the tree to ``tree.dot``.

    Feature names come from ``get_feature_names()`` and are paired by
    position with ``model.feature_importances_``. The ``x``, ``y`` and
    ``label_names`` arguments are accepted but not used here.
    """
    names = get_feature_names()
    importances = model.feature_importances_
    print("\nFeature Importance\n")
    print("=====================\n")
    for position, name in enumerate(names):
        line = "%s = %0.3f" % (name, importances[position])
        print(line)

    # Export the decision tree as a graph description file
    tree.export_graphviz(model, out_file='tree.dot')

In [6]:
# Write the fitted `model` out via export_graphviz to the file 'tree.dot'.
tree.export_graphviz(model, out_file='tree.dot')