In [56]:
from sklearn.datasets import load_iris, fetch_20newsgroups, load_boston
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

In [57]:
np.log(1/32)
np.sqrt(16)
np.square(4)

16

# 决策树

In [58]:
'''
决策树实例：对泰坦尼克号分析
'''
titan = pd.read_csv('../python_ml/data/titanic.txt')
titan.info

<bound method DataFrame.info of       row.names pclass  survived  \
0             1    1st         1   
1             2    1st         0   
2             3    1st         0   
3             4    1st         0   
4             5    1st         1   
...         ...    ...       ...   
1308       1309    3rd         0   
1309       1310    3rd         0   
1310       1311    3rd         0   
1311       1312    3rd         0   
1312       1313    3rd         0   

                                                 name      age     embarked  \
0                        Allen, Miss Elisabeth Walton  29.0000  Southampton   
1                         Allison, Miss Helen Loraine   2.0000  Southampton   
2                 Allison, Mr Hudson Joshua Creighton  30.0000  Southampton   
3     Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)  25.0000  Southampton   
4                       Allison, Master Hudson Trevor   0.9167  Southampton   
...                                               ...      ..

In [59]:
# 处理数据-找出特征值和目标值
x = titan[['pclass', 'age', 'sex']]  # 特征值
y = titan[['survived']]  # 目标值
x.info  # 有空值，进行缺失值处理

<bound method DataFrame.info of      pclass      age     sex
0       1st  29.0000  female
1       1st   2.0000  female
2       1st  30.0000    male
3       1st  25.0000  female
4       1st   0.9167    male
...     ...      ...     ...
1308    3rd      NaN    male
1309    3rd      NaN    male
1310    3rd      NaN    male
1311    3rd      NaN  female
1312    3rd      NaN    male

[1313 rows x 3 columns]>

In [60]:
x.describe(include=object) # 看字符串的描述：加include=object

Unnamed: 0,pclass,sex
count,1313,1313
unique,3,2
top,3rd,male
freq,711,850


In [61]:
x['age'].fillna(x['age'].mean(), inplace=True)  # 缺失值处理,机器学习和深度学习不能有nan值
# 分割数据集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=4)
print(x_train.head(50))

     pclass        age     sex
598     2nd  30.000000    male
246     1st  62.000000    male
905     3rd  31.194181  female
300     1st  31.194181  female
509     2nd  64.000000    male
313     1st  31.194181  female
779     3rd  24.000000  female
925     3rd  31.194181    male
577     2nd  31.194181    male
949     3rd  31.194181    male
692     3rd  21.000000    male
1059    3rd  31.194181    male
929     3rd  31.194181    male
462     2nd  23.000000  female
1200    3rd  31.194181    male
1109    3rd  31.194181  female
1116    3rd  31.194181  female
141     1st  44.000000  female
1177    3rd  31.194181  female
1208    3rd  31.194181    male
1006    3rd  31.194181  female
112     1st  37.000000    male
1176    3rd  31.194181    male
830     3rd   6.000000    male
1199    3rd  31.194181    male
968     3rd  31.194181    male
898     3rd  31.194181    male
839     3rd  31.194181  female
33      1st  41.000000    male
2       1st  30.000000    male
962     3rd  31.194181    male
1172    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['age'].fillna(x['age'].mean(), inplace=True)  # 缺失值处理,机器学习和深度学习不能有nan值


In [62]:
# 看下性别为女性的数量
x_train[x_train['sex'] == 'female'].count()

pclass    341
age       341
sex       341
dtype: int64

In [63]:
# 查看存活率1(未存活率0)
y_train[y_train == 0].count()

survived    650
dtype: int64

In [64]:
# 特征工程处理
# 机器学习和深度学习不能输入稀疏矩阵=False
dict = DictVectorizer(sparse=False)
# 对字典进行特征抽取，变成字典，records代表列名变键
x_train = dict.fit_transform(x_train.to_dict(orient="records"))
type(x_train)

numpy.ndarray

In [65]:
dict.get_feature_names()  # 查看特征值名



['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']

In [66]:
# 处理测试集特征值>one-hot编码
x_test = dict.transform(x_test.to_dict(orient='records'))
x_test

array([[27.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ],
       [21.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ],
       [39.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       ...,
       [28.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ]])

In [67]:
x_train

array([[30.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ],
       [62.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ],
       ...,
       [34.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ],
       [46.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ]])

In [79]:
# 决策树预测,修改max_depth
dec = DecisionTreeClassifier(max_depth=8)
# 调参后发现max_depth=8时，准确率最大 0.8297872340425532
# 训练
dec.fit(x_train,y_train)
# 预测
# y_predict = dec.predict(x_test)
# 准确率
print('预测准确率:', dec.score(x_test,y_test))

预测准确率: 0.8297872340425532


In [69]:
# 导出决策树的结构
export_graphviz(dec, out_file="tree.dot",
                feature_names=dict.get_feature_names())



In [84]:
# 随机森林预测
rf = RandomForestClassifier(n_jobs=2) # n_jobs充分利用多核
param = {"n_estimators": [1000,2000,2500], "max_depth": [2, 3, 5, 8, 15, 25]}
gc = GridSearchCV(rf,param_grid=param,cv=4)
# 训练
gc.fit(x_train,y_train)
# 预测
y_predict = gc.predict(x_test)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

In [86]:
print('准确率:', gc.score(x_test,y_test))

准确率: 0.8328267477203647


In [87]:
print('查看选择的参数模型:', gc.best_params_)
print('sa:', gc.best_score_)
print('查看最优的估计器算法模型:', gc.best_estimator_)

查看选择的参数模型: {'max_depth': 3, 'n_estimators': 2500}
sa: 0.8252032520325203
查看最优的估计器算法模型: RandomForestClassifier(max_depth=3, n_estimators=2500, n_jobs=2)


ID3:偏向特征值大的特征：g(D,A) = H(D) - H(D|A)  条件熵越小，信息增益越大
C4.5：偏向特征值少的特征：Gainratio(D,A) = Gain(D,A)/HA(D)
基尼系数: