In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.model_selection import train_test_split


%config InlineBackend.figure_format = 'svg'

plt.rcParams['font.sans-serif'] = ['Kaiti']
plt.rcParams['axes.unicode_minus'] = False


# 决策树——Sklearn  

### 优点：  
####        1、数据的包容性：可以包容缺失值 　　　 2、保留数据原有量纲  
####        2、可以多分类  
####        3、非线性可分数据集  
####        4、可视化  
####        5、异常值不敏感  

### 缺点：  
####        过拟合

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
"""
DT = DecisionTreeClassifier(
    criterion='gini',   # entropy、gini: 两种选择
    max_depth=None,     # 最大深度
    min_samples_split=2,     #　预剪枝：剪枝前最小分割的样本数
    min_samples_leaf=1,　   #  后剪枝：剪枝后节点的最小样本数
    max_features=None,      # 最多可用特征数
    max_leaf_nodes=None,     # 最多叶子节点的个数
    min_impurity_decrease=0.0,    # 不纯度下降的最小域

)
"""


In [None]:
from sklearn.datasets import load_wine

In [None]:
wine = load_wine()

In [None]:
data = pd.DataFrame(data=wine.data, columns=wine.feature_names)
data['target'] = wine.target
y = wine.target


In [None]:
data.head()

In [None]:
# 第一列与第二列的散点图

for i in data.target.unique():
    x_ = data[data.target == i]['alcohol']
    y_ = data[data.target == i]['malic_acid']
    plt.scatter(x_, y_)
plt.show()

In [None]:
import seaborn as sns

In [None]:
# sns.pairplot(data)   # 会自动把各维度的散点图画出来

In [None]:
# 画boxplot

for col in data.columns[:-1]:
    sns.boxplot(x = data.target, y = data[col])
    plt.title(f'{col}')
    plt.show()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.3)


In [None]:
DT = DecisionTreeClassifier().fit(x_train, y_train)

In [None]:
DT.get_depth()   # 获取深度

In [None]:
DT.score(x_test, y_test)   # 测试集分数

In [None]:
DT.score(x_train, y_train)   # 过拟合

In [None]:
# 画决策树的图
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(DT
                                ,out_file = None  # 导出文件
                                ,feature_names= wine.feature_names   # 特征名
                                ,class_names=wine.target_names        # 标签名
                                ,filled=True    # 填充颜色
                                ,rounded=True   # 
                                )
graph = graphviz.Source(dot_data) 
graph

In [None]:
test = []
train = []
for i in range(1,11):
    DT = DecisionTreeClassifier(max_depth=i).fit(x_train, y_train)
    test.append(DT.score(x_test, y_test))
    train.append(DT.score(x_train, y_train))
    
plt.plot(range(1,11), test, label='测试集')
plt.plot(range(1,11), train, label='训练集')
plt.legend()
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score  # 交叉验证

In [None]:
DT = DecisionTreeClassifier(max_depth=3)

In [None]:
cross_val_score(DT, wine.data, wine.target, cv=5)

In [None]:

cross = []
for i in range(1,20):
    DT = DecisionTreeClassifier(max_depth=i).fit(x_train, y_train)
    cross.append(cross_val_score(DT, wine.data, wine.target, cv=5).mean())
plt.plot(range(1,20), cross)
# plt.legend()
plt.show()

# 网格搜索

In [None]:
from sklearn.model_selection import GridSearchCV

In [25]:
DT = DecisionTreeClassifier()

param = {
    'criterion': ['gini','entropy'],   # entropy、gini: 两种选择
    'max_depth':  np.arange(1,20),     # 最大深度
    'min_samples_split': np.arange(2,8),     #　预剪枝：剪枝前最小分割的样本数
    
    'min_impurity_decrease': np.linspace(0, 0.05,10),    # 不纯度下降的最小域
}

GS = GridSearchCV(DT, param, cv=5).fit(wine.data, wine.target)

In [27]:
GS.best_score_

0.9276190476190477

In [28]:
GS.best_params_   # 最优参数

{'criterion': 'entropy',
 'max_depth': 3,
 'min_impurity_decrease': 0.05,
 'min_samples_split': 2}

In [29]:
DT = GS.best_estimator_   # 把最好的参数传给DT,再构建模型

In [30]:
DT.fit(x_train, y_train)   # 训练

DecisionTreeClassifier(criterion='entropy', max_depth=3,
                       min_impurity_decrease=0.05)

In [31]:
DT.score(x_train, y_train)   # 评分：训练集

0.9838709677419355

In [32]:
DT.score(x_test, y_test)　　　# 评分：测试集

0.8333333333333334