# Hard Voting Classifier

### Using the make_moons datasets and applying the Random Forest, Logistic Regresssion, SVM and ensemble of all three as Voting Classifier.

In [1]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split 

In [2]:
np.random.seed(0)
X,y = datasets.make_moons(n_samples=1500,noise=0.3)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


In [3]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()
voting_clf = VotingClassifier(
                estimators=[('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)],
                voting = 'hard'
)

voting_clf.fit(X_train,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomF...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [4]:
from sklearn.metrics import accuracy_score

for clf in (log_clf,rnd_clf,svm_clf,voting_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

LogisticRegression 0.862222222222
RandomForestClassifier 0.893333333333
SVC 0.915555555556
VotingClassifier 0.911111111111


# Soft Voting Classifier 

### The difference between the hard and soft voting classifier is that in the hard voting classifier, the majority of the predicted class from the different classifier is taken as predicted class whereas in the soft voting classifier , the averaged over all the individual classifiers is taken.

In [5]:
np.random.seed(0)
X,y = datasets.make_moons(n_samples=1500,noise=0.3)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [6]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)
voting_clf = VotingClassifier(
                estimators=[('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)],
                voting = 'soft'
)

voting_clf.fit(X_train,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomF...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [7]:
from sklearn.metrics import accuracy_score

for clf in (log_clf,rnd_clf,svm_clf,voting_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

LogisticRegression 0.862222222222
RandomForestClassifier 0.893333333333
SVC 0.915555555556
VotingClassifier 0.908888888889


# Bagging and Pasting 

### Bagging is done with replacement where Pasting is done without replacement

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# n_jobs means using all the available cores of CPU, n_estimators are the number of decision trees to be modelled, max_samples 
# are the number of samples to be taken for one model.

bag_clf = BaggingClassifier(
            DecisionTreeClassifier(), n_estimators=500,
            max_samples = 100, bootstrap=True,n_jobs=-1
)

bag_clf.fit(X_train,y_train)
y_pred = bag_clf.predict(X_test)

In [10]:
print('Accuracy of Bagging ', accuracy_score(y_test,y_pred))

Accuracy of Bagging  0.913333333333


In [11]:
bag_clf = BaggingClassifier(
            DecisionTreeClassifier(), n_estimators=500,
            max_samples = 100, bootstrap=False,n_jobs=-1
)

bag_clf.fit(X_train,y_train)
y_pred = bag_clf.predict(X_test)
print('Accuracy of Pasting ', accuracy_score(y_test,y_pred))

Accuracy of Pasting  0.911111111111


### Checking the prediction of out of bag instances and theoritically the oob score and accuracy are very similar 

In [12]:
bag_clf = BaggingClassifier(
            DecisionTreeClassifier(), n_estimators=500,
            max_samples = 100, bootstrap=True,n_jobs=-1, oob_score=True)
bag_clf.fit(X_train,y_train)
bag_clf.oob_score_

0.90571428571428569

In [13]:
bag_clf.oob_decision_function_

array([[ 0.88789238,  0.11210762],
       [ 0.95384615,  0.04615385],
       [ 0.96982759,  0.03017241],
       ..., 
       [ 0.93791574,  0.06208426],
       [ 0.68777293,  0.31222707],
       [ 0.00883002,  0.99116998]])

# Random Forest Classifier

In [14]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16,n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)
print('Accuracy Score ',accuracy_score(y_test,y_pred_rf))

Accuracy Score  0.911111111111


## Extremely Randomized Trees (Extra Trees) is much faster than Random Forest as it uses random thresholds for each feature rather than finding the best possible threshold.

In [15]:
from sklearn.ensemble import ExtraTreesClassifier

ext_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16,n_jobs=-1)
ext_clf.fit(X_train, y_train)

y_pred_ext = ext_clf.predict(X_test)
print('Accuracy Score ',accuracy_score(y_test,y_pred_ext))

Accuracy Score  0.906666666667


### Method to find the important features in Random Forest , similarly we can find in Decision Trees

In [16]:
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"],iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)


sepal length (cm) 0.108359487526
sepal width (cm) 0.0256364264132
petal length (cm) 0.461047860704
petal width (cm) 0.404956225356


# AdaBoost

### AdaBoosting also known as adaptive boosting in which first base classifier is trained and used to make prediction on all of the training set , there are few instances which are not predicted well , so relative weights to those points are increased and pobability of choosing those points increases for the next classifier and again the prediction is made and these steps are repeated until we get smallest prediction error.

In [17]:
from sklearn.ensemble import AdaBoostClassifier

# Multiclass version of adaboost is SAMME (Stagewise Adaptive Modelling using a Multiclass Exponential Loss Function), in the
# class of binary class it performs same as adaboost.

ada_clf = AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=1), n_estimators=200,
                algorithm = "SAMME" , learning_rate=0.5
)

ada_clf.fit(X_train,y_train)
y_pred_ada = ada_clf.predict(X_test)
print("Accuracy Score of AdaBoost(SAMME method)", accuracy_score(y_test,y_pred_ada))

Accuracy Score of AdaBoost(SAMME method) 0.902222222222


In [18]:
# SAMME.R (Stagewise Adaptive Modelling using a Multiclass Exponential Loss Function. Real), finds the class probabilities 
# in case of classifier which can predict class probabilities.

ada_clf = AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=1), n_estimators=200,
                algorithm = "SAMME.R" , learning_rate=0.5
)

ada_clf.fit(X_train,y_train)
y_pred_ada = ada_clf.predict(X_test)
print("Accuracy Score of AdaBoost(SAMME.R method)", accuracy_score(y_test,y_pred_ada))

Accuracy Score of AdaBoost(SAMME.R method) 0.884444444444


# Gradient Boosting 

### The gradient boosting works similar to AdaBoost, but it tries to fit the new predictor to the residual errors obtained from the previous predictor.

In [29]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X,y)

y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X,y2)

y3 = y - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X,y3)

y_pred = sum(tree.predict(X) for tree in (tree_reg1,tree_reg2,tree_reg3))


In [30]:
y_pred

array([ 0.06975585,  0.06975585,  0.63768116, ...,  0.62518239,
        0.06975585,  0.05040951])

In [24]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators = 3, learning_rate = 1.0)
gbrt.fit(X,y)
gbrt.predict(X)

array([ 0.00671784,  0.07861027,  0.93049823, ...,  0.9665889 ,
        0.00671784,  0.14284551])

### Early Stopping in gradient boosting using staged_predict()

In [31]:
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X,y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train,y_train)


errors = [mean_squared_error(y_val,y_pred)
         for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train,y_train)

y_pred1 = gbrt.predict(X_val)
y_pred2 = gbrt_best.predict(X_val)

print("MSE of normal Gradient Boosting ", mean_squared_error(y_val,y_pred1))
print("MSE of best Gradient Boosting ", mean_squared_error(y_val,y_pred2))

MSE of normal Gradient Boosting  0.0703353697942
MSE of best Gradient Boosting  0.0697126831512


### In the above code, first all the trees are modeled then we find optimal number of trees to find the best solution. But it takes lot of time in case of large dataset , therefore we can use another approach where we can stop training the model when the validation error stops improving.

In [36]:
gbrt = GradientBoostingRegressor(max_depth=2,warm_start = True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1,120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train,y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val,y_pred)
    # print(val_error)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping
print('MSE : ',val_error)

MSE :  0.0704562158102


### Stochastic Gradient Boosting actually randomly samples the training set and uses small amount of the instances to train the model.

### As in the below code , we have used subsample as 0.25 which means that it take 25% of the training set for training the model in each iteration. It is faster than the normal gradient boosting.

In [37]:
gbrt = GradientBoostingRegressor(max_depth=2,warm_start = True,subsample=0.25)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1,120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train,y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val,y_pred)
    # print(val_error)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping
print('MSE : ',val_error)

MSE :  0.0755182860675
