# Max-Voting

### Getting Ready

In [1]:
import os
import pandas as pd

In [2]:
# Switch the working directory so the relative CSV paths used below resolve.
# NOTE(review): ".../Chapter 2" looks like a placeholder — replace it with the
# folder that holds this chapter's data files before running.
os.chdir(".../Chapter 2")
# Echo the resulting working directory as the cell output.
os.getcwd()

'/Users/Dippies/PACKT - Ensemble ML/Chapter 2'

#### Download the dataset Cryotherapy.csv from the GitHub location and copy it to your working directory. Let's read the dataset.

In [3]:
# Load Cryotherapy.csv (path is relative to the current working directory) into a DataFrame.
cryotherapy_data = pd.read_csv("Cryotherapy.csv")

#### Let's take a glance at the data with the below code:

In [35]:
# Display the first five rows as the cell output.
cryotherapy_data.head(5)

Unnamed: 0,sex,age,Time,Number_of_Warts,Type,Area,Result_of_Treatment
0,1,35,12.0,5,1,100,0
1,1,29,7.0,5,1,96,1
2,1,50,8.0,1,3,132,0
3,1,32,11.75,7,3,750,0
4,1,67,9.25,1,1,42,0


### How to do it...

#### We import the required libraries for building the decision tree, support vector machines and logistic regression models. We also import VotingClassifier for max voting

In [None]:
# Import required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

#### We move on to building our feature set and creating our train & test datasets

In [36]:
# Import the train/test splitting helper.
# It comes from sklearn.model_selection: the older sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20, so importing
# from it fails on current releases.
from sklearn.model_selection import train_test_split

# Predictor columns, selected by name from the cryotherapy DataFrame.
feature_columns = ['sex', 'age', 'Time', 'Number_of_Warts', 'Type', 'Area']
X = cryotherapy_data[feature_columns]

# Response variable: the treatment outcome column.
Y = cryotherapy_data['Result_of_Treatment']

In [37]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=1
)

### Hard Voting

#### We build our models with decision tree, support vector machines and logistic regression algorithms

In [112]:
# Base learners for the hard-voting ensemble, each built with a fixed random_state.
dt_model = DecisionTreeClassifier(random_state=1)
svm_model = SVC(random_state=1)
logit_model = LogisticRegression(random_state=1)

# (name, estimator) pairs that are later handed to VotingClassifier.
estimators = [
    ('DecisionTree', dt_model),
    ('SupportVector', svm_model),
    ('Logistic Regression', logit_model),
]

#### We build individual models with each of the classifiers we have chosen

In [122]:
from sklearn.metrics import accuracy_score

# Train every base learner on its own and print its class name with its
# accuracy on the held-out test split.
for each_estimator in (dt_model, svm_model, logit_model):
    each_estimator.fit(X_train, Y_train)
    Y_pred = each_estimator.predict(X_test)
    print(type(each_estimator).__name__, accuracy_score(Y_test, Y_pred))

DecisionTreeClassifier 0.833333333333
SVC 0.944444444444
LogisticRegression 0.777777777778


#### We proceed to ensemble our models and use VotingClassifier to score accuracy

In [125]:
# Combine the sub-models with VotingClassifier in 'hard' voting mode,
# fit the ensemble on the training split and score it on the test split.
ensemble_model = VotingClassifier(estimators=estimators, voting='hard')
ensemble_model.fit(X_train, Y_train)
predicted_labels = ensemble_model.predict(X_test)
hard_voting_accuracy = accuracy_score(Y_test, predicted_labels)
print("Classifier Accuracy using Hard Voting: ", hard_voting_accuracy)

Classifier Accuracy using Hard Voting:  0.944444444444


### Soft Voting

#### The below code creates an ensemble using soft voting:

In [138]:
# Base learners for the soft-voting ensemble. Unlike the hard-voting cell,
# SVC is created with probability=True here.
dt_model = DecisionTreeClassifier(random_state=1)
svm_model = SVC(random_state=1, probability=True)
logit_model = LogisticRegression(random_state=1)

# (name, estimator) pairs passed to VotingClassifier below.
estimators = [
    ('DecisionTree', dt_model),
    ('SupportVector', svm_model),
    ('Logistic Regression', logit_model),
]

# Fit and score each base learner individually for comparison with the ensemble.
for each_estimator in (dt_model, svm_model, logit_model):
    each_estimator.fit(X_train, Y_train)
    Y_pred = each_estimator.predict(X_test)
    print(type(each_estimator).__name__, accuracy_score(Y_test, Y_pred))

# Combine the same sub-models with VotingClassifier in 'soft' voting mode.
ensemble_model = VotingClassifier(estimators=estimators, voting='soft')
ensemble_model.fit(X_train, Y_train)
predicted_labels = ensemble_model.predict(X_test)
soft_voting_accuracy = accuracy_score(Y_test, predicted_labels)
print("Classifier Accuracy using Soft Voting: ", soft_voting_accuracy)

DecisionTreeClassifier 0.833333333333
SVC 0.944444444444
LogisticRegression 0.777777777778
Classifier Accuracy using Soft Voting:  0.888888888889


# Averaging

#### We download the dataset whitewines.csv from the github location and copy the same to your working directory. Let's read the dataset.

In [206]:
# Load whitewines.csv (path is relative to the current working directory) into a DataFrame.
wine_data = pd.read_csv("whitewines.csv")

#### Let's take a glance at the data with the below code

In [207]:
# Display the first five rows as the cell output.
wine_data.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.7,0.62,0.24,1.1,0.039,6.0,62.0,0.9934,3.41,0.32,10.4,5
1,5.7,0.22,0.2,16.0,0.044,41.0,113.0,0.99862,3.22,0.46,8.9,6
2,5.9,0.19,0.26,7.4,0.034,33.0,123.0,0.995,3.49,0.42,10.1,6
3,5.3,0.47,0.1,1.3,0.036,11.0,74.0,0.99082,3.48,0.54,11.2,4
4,6.4,0.29,0.21,9.65,0.041,36.0,119.0,0.99334,2.99,0.34,10.933333,6


#### We import the required libraries

In [217]:
# Import required libraries
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

#### We create the response and the feature set

In [247]:
# Import the train/test splitting helper.
# It comes from sklearn.model_selection: the older sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20, so importing
# from it fails on current releases.
from sklearn.model_selection import train_test_split

# Predictor columns, selected by name from the wine DataFrame.
# (No line-continuation backslashes are needed inside the brackets.)
feature_columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
                   'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',
                   'density', 'pH', 'sulphates', 'alcohol']
X = wine_data[feature_columns]

# Response variable: the wine quality score column.
Y = wine_data['quality']

#### We split our data into train & test set

In [254]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1
)

#### We build our base regression learners with linear regression, SVR & decision tree

In [255]:
# Build the base regressors with default settings and fit each on the
# training split. fit() returns the estimator itself, so the first two are
# created and fitted in one step.
linreg_model = LinearRegression().fit(X_train, Y_train)
svr_model = SVR().fit(X_train, Y_train)

# The tree is fitted in a bare expression so its repr remains the cell output.
regressiontree_model = DecisionTreeRegressor()
regressiontree_model.fit(X_train, Y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

#### Use the base learners to predict on the test data

In [256]:
# Predict on the test split with each fitted base regressor, in the order
# linear regression, SVR, regression tree.
linreg_predictions, svr_predictions, regtree_predictions = (
    fitted_regressor.predict(X_test)
    for fitted_regressor in (linreg_model, svr_model, regressiontree_model)
)

#### We add the predictions and divide by the number of base learners

In [257]:
# Element-wise mean of the three base learners' predictions (sum divided by 3).
average_predictions=(linreg_predictions + svr_predictions + regtree_predictions)/3

# Weighted Averaging


In [4]:
# Switch the working directory so the relative CSV path used below resolves.
# NOTE(review): ".../Chapter 2" looks like a placeholder — replace it with the
# folder that holds this chapter's data files before running.
os.chdir(".../Chapter 2")
# Echo the resulting working directory as the cell output.
os.getcwd()

'/Users/Dippies/PACKT - Ensemble ML/Chapter 2'

#### We download the Diagnostic Wisconsin Breast Cancer database file wisc_bc_data.csv from the GitHub location and copy it to the working directory. Let's read the dataset.

In [6]:
# Load wisc_bc_data.csv (path is relative to the current working directory) into a DataFrame.
cancer_data = pd.read_csv("wisc_bc_data.csv")

#### Let's take a look at the data with the below code

In [7]:
# Display the first five rows as the cell output.
cancer_data.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


#### We import the required libraries

In [8]:
# Import required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

#### We create the response and the feature set

In [18]:
# Import the train/test splitting helper.
# It comes from sklearn.model_selection: the older sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20, so importing
# from it fails on current releases.
from sklearn.model_selection import train_test_split

# Features: every row, column positions 2 through 31.
# NOTE(review): presumably these are the 30 numeric measurement columns that
# follow 'id' and 'diagnosis' — confirm against the CSV's column order.
X = cancer_data.iloc[:,2:32]

# Response variable: the diagnosis label column.
Y = cancer_data['diagnosis']

In [19]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1
)

#### We build our base classifier models

In [27]:
# Base classifiers for weighted averaging, built with default settings.
# SVC is created with probability=True; predict_proba is called on it later.
dt_model = DecisionTreeClassifier()
svm_model = SVC(probability=True)
logit_model = LogisticRegression()

# (name, estimator) pairs collected alongside the individual model variables.
estimators = [
    ('DecisionTree', dt_model),
    ('SupportVector', svm_model),
    ('Logistic Regression', logit_model),
]

#### We fit our models on the training data

In [28]:
# Fit each base classifier on the training split (X_train, Y_train).
dt_model.fit(X_train, Y_train)
svm_model.fit(X_train, Y_train)
# Last expression in the cell, so its return value is what the cell displays.
logit_model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [5]:
#### We use the predict_proba() function to predict the class probabilities

In [30]:
# Get class-probability predictions on the test split from each fitted
# classifier, in the order decision tree, SVC, logistic regression.
dt_predictions, svm_predictions, logit_predictions = (
    fitted_classifier.predict_proba(X_test)
    for fitted_classifier in (dt_model, svm_model, logit_model)
)

#### We assign different weights to each of the models to get our final predictions

In [31]:
# Weighted sum of the three models' predicted probabilities, with weights
# 0.3 (decision tree), 0.4 (SVC) and 0.3 (logistic regression), which sum to 1.
weighted_average_predictions=(dt_predictions * 0.3 + svm_predictions * 0.4 + logit_predictions * 0.3)