# Voting Classifier

In [1]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)


voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rf", rnd_clf), ("svc", svm_clf)], 
    voting="soft")
voting_clf.fit(X, y)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomF...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.88
SVC 0.888
VotingClassifier 0.92


  if diff:


# Bagging and Pasting

In [3]:
# Using same training method but different training subsets of the training set.
# Sampling with replacement = Bagging
# Sampling without replacement = Pasting

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs = 2)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [4]:
accuracy_score(y_test, y_pred)

0.92

In [5]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)
y_pred = tree_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.872

## Out-of-Bad evaluation

In [6]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, 
    bootstrap=True, n_jobs=2, oob_score=True)

bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.9013333333333333

In [7]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

## Random Forests

In [8]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=2)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [9]:
accuracy_score(y_test, y_pred_rf)

0.92

In [10]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(splitter="random", max_leaf_nodes=16), 
                            n_estimators=500, max_samples=1.0, bootstrap=True, 
                           n_jobs = 2)
bag_clf.fit(X_train, y_train)

y_pred_bag = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred_bag)

0.912

In [11]:
# If you allow trees to be trained with random threshold for feature values instead of searching for the best threshold
# it is called extra-trees (extremely randomized trees)

from sklearn.ensemble import ExtraTreesClassifier
extra_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=2)
extra_clf.fit(X_train, y_train)

y_pred_extra = extra_clf.predict(X_test)
accuracy_score(y_test, y_pred_extra)

0.912

## Feature Importance

In [12]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=2)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.11337356329775969
sepal width (cm) 0.024467013130352967
petal length (cm) 0.43090273185160916
petal width (cm) 0.43125669172027914


# Exercises

## Exercise 8

In [13]:
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata('MNIST original')
X,y = mnist["data"], mnist["target"]

X.shape

(70000, 784)

In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
X = pca.fit_transform(X)
X.shape

(70000, 154)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20000, random_state=42)

In [16]:
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=10000, random_state=42)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators':[50, 100, 200, 500], 'max_depth':[3, 5, 10], 'max_features':[10, 50, 100]}

rf_clf = RandomForestClassifier()

rf_grid = GridSearchCV(rf_clf, param_grid, n_jobs=2, cv=3, verbose=10)

In [18]:
import numpy as np
idx = np.random.permutation(np.arange(len(X_train)))

In [19]:
rf_grid.fit(X_train[idx[:1000]], y_train[idx[:1000]])

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.8s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    1.8s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    4.0s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    7.3s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   13.8s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   26.1s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   30.2s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:   37.3s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:   52.3s
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done 108 out of 108 | elapsed:  2.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'n_estimators': [50, 100, 200, 500], 'max_depth': [3, 5, 10], 'max_features': [10, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [20]:
rf_grid.best_estimator_.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
from sklearn.metrics import accuracy_score

y_pred_rf = rf_grid.best_estimator_.predict(X_val)
accuracy_score(y_val, y_pred_rf)

0.9226

In [22]:
param_grid = {'n_estimators':[50, 100, 200, 500], 'max_depth':[3, 5, 10], 'max_features':[10, 50, 100]}

extr_clf = ExtraTreesClassifier()

extr_grid = GridSearchCV(extr_clf, param_grid, n_jobs=2, cv=3, verbose=10)

In [23]:
extr_grid.fit(X_train[idx[:1000]], y_train[idx[:1000]])

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    1.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    1.8s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    2.7s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    4.0s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    6.1s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    7.5s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:    9.1s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:   11.7s
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:   15.2s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:   19.8s
[Parallel(n_jobs=2)]: Done 108 out of 108 | elapsed:   29.2s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'n_estimators': [50, 100, 200, 500], 'max_depth': [3, 5, 10], 'max_features': [10, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [24]:
extr_grid.best_estimator_.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=10, max_features=50, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [25]:
y_pred_extr = extr_grid.best_estimator_.predict(X_val)
accuracy_score(y_val, y_pred_extr)

0.9169

In [26]:
help(SVC)

Help on class SVC in module sklearn.svm.classes:

class SVC(sklearn.svm.base.BaseSVC)
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time complexity
 |  is more than quadratic with the number of samples which makes it hard
 |  to scale to dataset with more than a couple of 10000 samples.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `gamma`, `coef0` and `degree` affect each
 |  other, see the corresponding section in the narrative documentation:
 |  :ref:`svm_kernels`.
 |  
 |  Read more in the :ref:`User Guide <svm_classification>`.
 |  
 |  Parameters
 |  ----------
 |  C : float, optional (default=1.0)
 |      Penalty parameter C of the error term.
 |  
 |  kernel : string, optional (default='rbf')
 |       Specifies the kernel type to be used in the algorithm.
 |       It must be one of 'linear', 'poly

In [27]:
from scipy.stats import reciprocal, uniform

svc_clf = SVC(probability=True)

param_grid = {'C': uniform(1, 100), 'gamma': reciprocal(0.1, 1)}

svc_grid = RandomizedSearchCV(svc_clf, param_grid, n_iter=100, random_state=42, cv=3, verbose=10, n_jobs=2)

In [28]:
svc_grid.fit(X_train[idx[:1000]], y_train[idx[:1000]])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    2.3s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    4.8s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    6.7s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   10.2s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   13.0s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   17.3s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   21.0s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:   26.4s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:   31.2s
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:   37.6s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:   43.1s
[Parallel(n_jobs=2)]: Done 109 tasks      | elapsed:   50.1s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   56.6s
[Parallel(n_jobs=2)]: Done 141 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Do

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=2,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000291A74E7CC0>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000291A78F2C50>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [29]:
svc_grid.best_estimator_.fit(X_train[idx[:5000]], y_train[idx[:5000]])

0.1108

In [None]:
y_pred_svc = svc_grid.best_estimator_.predict(X_val)
accuracy_score(y_val, y_pred_svc)

In [None]:
y_pred_svc = svc_grid.best_estimator_.predict(X_train)
accuracy_score(y_train, y_pred_svc)

In [30]:
from sklearn.ensemble import VotingClassifier

In [None]:
voting_clf = VotingClassifier([('rf', rf_grid.best_estimator_),
                               ('extr', extr_grid.best_estimator_),
                               ('svc', svc_grid.best_estimator_)], voting="soft")

In [None]:
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_val)