# 分类算法-决策树算法

In [1]:
# 信息熵公式

In [18]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [2]:
"""
决策树对泰坦尼克号进行预测生死
:return: None
"""

# 获取数据
titan = pd.read_csv("./data/titanic.txt")
titan.info

<bound method DataFrame.info of       row.names pclass  survived  \
0             1    1st         1   
1             2    1st         0   
2             3    1st         0   
3             4    1st         0   
4             5    1st         1   
...         ...    ...       ...   
1308       1309    3rd         0   
1309       1310    3rd         0   
1310       1311    3rd         0   
1311       1312    3rd         0   
1312       1313    3rd         0   

                                                 name      age     embarked  \
0                        Allen, Miss Elisabeth Walton  29.0000  Southampton   
1                         Allison, Miss Helen Loraine   2.0000  Southampton   
2                 Allison, Mr Hudson Joshua Creighton  30.0000  Southampton   
3     Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)  25.0000  Southampton   
4                       Allison, Master Hudson Trevor   0.9167  Southampton   
...                                               ...      ..

In [5]:
# 处理数据，找出特征值和目标值
x = titan[['pclass', 'age', 'sex']]
y = titan['survived']

# x.info
x.info()  # 用来判断是否有空值-年龄特征有空值

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  1313 non-null   object 
 1   age     633 non-null    float64
 2   sex     1313 non-null   object 
dtypes: float64(1), object(2)
memory usage: 30.9+ KB


In [7]:
y.info

<bound method Series.info of 0       1
1       0
2       0
3       0
4       1
       ..
1308    0
1309    0
1310    0
1311    0
1312    0
Name: survived, Length: 1313, dtype: int64>

In [11]:
# 一定要进行缺失值处理
x['age'].fillna(x['age'].mean(), inplace=True)

# 分割数据集到训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=4)  # 4？
print(x_train.head())

# a = x_train.to_dict(orient="records")
# print(a)

    pclass        age     sex
598    2nd  30.000000    male
246    1st  62.000000    male
905    3rd  31.194181  female
300    1st  31.194181  female
509    2nd  64.000000    male


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['age'].fillna(x['age'].mean(), inplace=True)


In [12]:
# 处理特征（特征工程）->类别->one_hot编码
dict_dc = DictVectorizer(sparse=False)

# 这一步是对字典进行特征抽取（字典中的值变为数值类型）,to_dict可以把df变为字典，records代表列名变为键
x_train = dict_dc.fit_transform(x_train.to_dict(orient="records"))
print(type(x_train))
print(dict_dc.get_feature_names_out())
print('-' * 50)

x_test = dict_dc.transform(x_test.to_dict(orient="records"))
print(x_train)

<class 'numpy.ndarray'>
['age' 'pclass=1st' 'pclass=2nd' 'pclass=3rd' 'sex=female' 'sex=male']
--------------------------------------------------
[[30.          0.          1.          0.          0.          1.        ]
 [62.          1.          0.          0.          0.          1.        ]
 [31.19418104  0.          0.          1.          1.          0.        ]
 ...
 [34.          0.          1.          0.          0.          1.        ]
 [46.          1.          0.          0.          0.          1.        ]
 [31.19418104  0.          0.          1.          0.          1.        ]]


In [17]:
# 用决策树进行预测（可调参数：max_depth）
dec = DecisionTreeClassifier()

# 训练
dec.fit(x_train, y_train)

# 预测准确率
print("预测的准确率：", dec.score(x_test, y_test))

# 导出决策树的结构
export_graphviz(dec, out_file="tree.dot",
                feature_names=['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'female', 'male'])

预测的准确率： 0.8085106382978723


In [19]:
# 调整决策树的参数
# 分割数据集到训练集合测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=4)
print(x_train.head())
# 处理特征（特征工程）->类别->one_hot编码
dict2 = DictVectorizer(sparse=False)

# 这一步是对字典进行特征抽取
x_train = dict2.fit_transform(x_train.to_dict(orient="records"))
print(type(x_train))
print(dict2.get_feature_names_out())
print('-' * 50)
x_test = dict2.transform(x_test.to_dict(orient="records"))

# print(x_train)
# 用决策树进行预测，修改max_depth为10，发现准确率提升了
dec = DecisionTreeClassifier(max_depth=10)

dec.fit(x_train, y_train)
#
# # 预测准确率
print("预测的准确率：", dec.score(x_test, y_test))
#
# # 导出决策树的结构
export_graphviz(dec, out_file="tree.dot",
                feature_names=dict2.get_feature_names_out())



    pclass        age     sex
598    2nd  30.000000    male
246    1st  62.000000    male
905    3rd  31.194181  female
300    1st  31.194181  female
509    2nd  64.000000    male
<class 'numpy.ndarray'>
['age' 'pclass=1st' 'pclass=2nd' 'pclass=3rd' 'sex=female' 'sex=male']
--------------------------------------------------
预测的准确率： 0.817629179331307


In [20]:
# 随机森林进行预测 （超参数调优），n_jobs：充分利用多核的一个参数
rf = RandomForestClassifier(n_jobs=-1)
# 120, 200, 300, 500, 800, 1200/2000, 5000
# n_estimators是森林中决策树的数目，也就是分类器的数目
# max_samples是最大样本数
# bagging类型
param = {"n_estimators": [120, 200, 300, 500, 800, 1200], "max_depth": [2, 3, 5, 8, 15, 25]}

# 网格搜索与交叉验证
gc = GridSearchCV(rf, param_grid=param, cv=3)

gc.fit(x_train, y_train)

print("准确率：", gc.score(x_test, y_test))

print("查看选择的参数模型：", gc.best_params_)

print("选择最好的模型是：", gc.best_estimator_)

print("每个超参数每次交叉验证的结果：", gc.cv_results_)


准确率： 0.8267477203647416
查看选择的参数模型： {'max_depth': 3, 'n_estimators': 120}
选择最好的模型是： RandomForestClassifier(max_depth=3, n_estimators=120, n_jobs=-1)
每个超参数每次交叉验证的结果： {'mean_fit_time': array([1.2481238 , 0.19448686, 0.2665627 , 0.30143571, 0.43689108,
       0.72106886, 0.12555337, 0.18210514, 0.31918391, 0.2985491 ,
       0.44901641, 0.72293742, 0.13137007, 0.18519084, 0.27145584,
       0.30261381, 0.45296939, 0.73783382, 0.13290604, 0.18370851,
       0.27302782, 0.30857436, 0.48158582, 0.72676516, 0.13293282,
       0.1829017 , 0.27346953, 0.31295125, 0.47036195, 0.76679738,
       0.12891452, 0.18938724, 0.2735188 , 0.30766638, 0.45852645,
       0.76352223]), 'std_fit_time': array([1.58262484e+00, 1.31424122e-02, 6.81864854e-03, 3.52052391e-03,
       4.34729457e-03, 8.98209807e-03, 1.43915833e-03, 1.76185895e-03,
       8.13065922e-02, 1.76961911e-03, 8.60159428e-03, 1.26747876e-02,
       8.25674960e-03, 4.03834857e-03, 1.33705976e-03, 4.04388396e-03,
       4.38553266e-03, 3.603