# 第八题
## 加载数据

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

In [2]:
mnist=fetch_openml('mnist_784',version=1)

In [3]:
X=mnist.data
y=mnist.target.astype(np.uint8)

In [4]:
X.shape,y.shape

((70000, 784), (70000,))

## 划分数据集

In [5]:
X_train=X[:50000]
y_train=y[:50000]
X_val=X[50000:60000]
y_val=y[50000:60000]
X_test=X[60000:]
y_test=y[60000:]

In [6]:
X_train.shape,X_val.shape,X_test.shape

((50000, 784), (10000, 784), (10000, 784))

## 建立模型

In [7]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [8]:
random_forest_clf=RandomForestClassifier(n_estimators=100,random_state=42)
extra_trees_clf=ExtraTreesClassifier(n_estimators=100,random_state=42)
svm_clf=LinearSVC(random_state=42)
mlp_clf=MLPClassifier(random_state=42)

## 单个模型预测

In [9]:
for model in (random_forest_clf,extra_trees_clf,svm_clf,mlp_clf):
    print("Training the ",model.__class__)
    model.fit(X_train,y_train)

Training the  <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Training the  <class 'sklearn.ensemble._forest.ExtraTreesClassifier'>
Training the  <class 'sklearn.svm._classes.LinearSVC'>




Training the  <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


In [12]:
for model in (random_forest_clf,extra_trees_clf,svm_clf,mlp_clf):
    print("the score of ",model.__class__.__name__,"is",model.score(X_val,y_val))

the score of  RandomForestClassifier is 0.9736
the score of  ExtraTreesClassifier is 0.9743
the score of  LinearSVC is 0.8578
the score of  MLPClassifier is 0.9646


## 集成模型

In [13]:
from sklearn.ensemble import VotingClassifier

In [14]:
vot_clf = VotingClassifier([('random_forest_clf', random_forest_clf), (
    'extra_trees,clf', extra_trees_clf), ('svm_clf', svm_clf), ('mlp_clf', mlp_clf)])

In [15]:
vot_clf.fit(X_train,y_train)



VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(random_state=42)),
                             ('extra_trees,clf',
                              ExtraTreesClassifier(random_state=42)),
                             ('svm_clf', LinearSVC(random_state=42)),
                             ('mlp_clf', MLPClassifier(random_state=42))])

In [16]:
vot_clf.score(X_val,y_val)

0.9745

In [17]:
[estimator.score(X_val,y_val) for estimator in vot_clf.estimators_]

[0.9736, 0.9743, 0.8578, 0.9646]

Let's remove the SVM to see if performance improves. It is possible to remove an estimator by setting it to `None` using `set_params()` like this:

In [18]:
vot_clf.set_params(svm_clf=None)

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(random_state=42)),
                             ('extra_trees,clf',
                              ExtraTreesClassifier(random_state=42)),
                             ('svm_clf', None),
                             ('mlp_clf', MLPClassifier(random_state=42))])

In [19]:
vot_clf.estimators

[('random_forest_clf', RandomForestClassifier(random_state=42)),
 ('extra_trees,clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', None),
 ('mlp_clf', MLPClassifier(random_state=42))]

In [20]:
vot_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 LinearSVC(random_state=42),
 MLPClassifier(random_state=42)]

In [21]:
del vot_clf.estimators_[2]

In [22]:
vot_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 MLPClassifier(random_state=42)]

In [23]:
# 再来观察得分
vot_clf.score(X_val,y_val)

0.9763

相较于0.9745有所提高。

In [24]:
# 将投票方式改成软投票
vot_clf.voting='soft'

In [25]:
vot_clf.score(X_val,y_val)

0.97

# 第九题