## Import libraries and data

In [1]:
import random
from pandas import read_csv
from sklearn.cross_validation import train_test_split
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn import metrics
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn import clone
from sklearn.datasets import load_iris
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              AdaBoostClassifier)
from sklearn.externals.six.moves import xrange
from sklearn.tree import DecisionTreeClassifier

authorship = read_csv('../../data/authorship.csv')

## EDA

In [2]:
authorship.ix[random.sample(authorship.index, 5)]

Unnamed: 0,a,all,also,an,and,any,are,as,at,be,...,what,when,which,who,will,with,would,your,BookID,Author
354,24,8,1,5,54,1,3,11,12,7,...,1,5,2,7,3,9,5,0,1,London
541,40,6,0,17,53,1,3,10,17,6,...,2,3,2,1,2,19,1,2,5,London
330,29,4,1,2,61,0,14,8,13,6,...,3,9,1,2,8,17,2,30,1,London
202,41,9,1,4,42,7,3,14,6,18,...,7,2,11,1,6,11,10,9,6,Austen
414,25,10,2,4,45,1,4,14,10,13,...,3,6,5,5,12,7,6,0,2,London


In [3]:
authors = list(set(authorship.Author.values))
print authors

['Austen', 'London', 'Shakespeare', 'Milton']


Replace author names with numbers

In [4]:
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship['Author_num'] = le.transform(authorship['Author'])
print authorship['Author_num']

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
...
826    3
827    3
828    3
829    3
830    3
831    3
832    3
833    3
834    3
835    3
836    3
837    3
838    3
839    3
840    3
Name: Author_num, Length: 841, dtype: int64


## Create random forest

In [5]:
# Create a random variable (random forests work best with a random variable)
authorship['random'] = [random.random() for i in range(841)]

In [6]:
#What are some of the stop words we're looking at?
features = list(authorship.columns)
features.remove('Author')
features.remove('Author_num')
features.remove('BookID')
print features

['a', 'all', 'also', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', 'do', 'down', 'even', 'every', 'for', 'from', 'had', 'has', 'have', 'her', 'his', 'if', 'in', 'into', 'is', 'it', 'its', 'may', 'more', 'must', 'my', 'no', 'not', 'now', 'of', 'on', 'one', 'only', 'or', 'our', 'should', 'so', 'some', 'such', 'than', 'that', 'the', 'their', 'then', 'there', 'things', 'this', 'to', 'up', 'upon', 'was', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'would', 'your', 'random']


In [7]:
# create a test and training set
x_train, x_test, y_train, y_test = train_test_split(authorship[features], authorship.Author_num.values, test_size=0.4, random_state=123)    
x, y = authorship[features], authorship.Author_num.values

In [8]:
# Fit Model
from sklearn.cross_validation import cross_val_score

etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

scores = cross_val_score(etclf, x, y)
print scores.mean()

# Print Confusion Matrix
print metrics.confusion_matrix(etclf.predict(x_test), y_test)
print authors

0.976278869386
[[135   2   0   0]
 [  1  96   0   1]
 [  0   0  20   0]
 [  0   1   0  81]]
['Austen', 'London', 'Shakespeare', 'Milton']


## Questions

#### 1. Determine how changing the parameters in the random forest model changes the performance of the model

##### Number of estimators

In [9]:
for estimator in range(1,20):
    etclf = ExtraTreesClassifier(n_estimators=estimator)
    etclf.fit(x_train, y_train)
    scores = cross_val_score(etclf, x, y)
    print "With " + str(estimator) + " estimators, the score is " + str(scores.mean())

With 1 estimators, the score is 0.797870403609
With 2 estimators, the score is 0.771798552678
With 3 estimators, the score is 0.881385933882
With 4 estimators, the score is 0.903763367741
With 5 estimators, the score is 0.919159062089
With 6 estimators, the score is 0.931153676287
With 7 estimators, the score is 0.93947051793
With 8 estimators, the score is 0.937174300841
With 9 estimators, the score is 0.957204857754
With 10 estimators, the score is 0.939516789688
With 11 estimators, the score is 0.956103474909
With 12 estimators, the score is 0.960891043355
With 13 estimators, the score is 0.964407696982
With 14 estimators, the score is 0.958501738021
With 15 estimators, the score is 0.972737158195
With 16 estimators, the score is 0.966758962029
With 17 estimators, the score is 0.973936017386
With 18 estimators, the score is 0.971601760009
With 19 estimators, the score is 0.972741364718


As we can see from the above, the more estimators used the superior the performace of the model

###### Size of test set

In [36]:
for testSize in range(0,10):
    x_train, x_test, y_train, y_test = train_test_split(authorship[features], authorship.Author_num.values, test_size=testSize/10.0, random_state=123)    
    x, y = authorship[features], authorship.Author_num.values
    etclf = ExtraTreesClassifier(n_estimators=20)
    etclf.fit(x_train, y_train)
    scores = cross_val_score(etclf, x, y)
    print "With a test size of " + str(testSize/10.0) + " we get a score of " + str(scores.mean())

With a test size of 0.0 we get a score of 0.969182282704
With a test size of 0.1 we get a score of 0.967996043084
With a test size of 0.2 we get a score of 0.967953614696
With a test size of 0.3 we get a score of 0.968008844231
With a test size of 0.4 we get a score of 0.970381323472
With a test size of 0.5 we get a score of 0.97867256282
With a test size of 0.6 we get a score of 0.971520928181
With a test size of 0.7 we get a score of 0.981045042061
With a test size of 0.8 we get a score of 0.970360290854
With a test size of 0.9 we get a score of 0.975118050483


The size of the test size seems to make very little difference to the performance of the model.  A larger test set seems to marginally improve the performance.

##### Size of random state

In [38]:
for randomSize in range(0,10):
    x_train, x_test, y_train, y_test = train_test_split(authorship[features], authorship.Author_num.values, test_size=0.4, random_state=randomSize*10)    
    x, y = authorship[features], authorship.Author_num.values
    etclf = ExtraTreesClassifier(n_estimators=20)
    etclf.fit(x_train, y_train)
    scores = cross_val_score(etclf, x, y)
    print "With a random size of " + str(randomSize*10.0) + " we get a score of " + str(scores.mean())

With a random size of 0.0 we get a score of 0.973919009715
With a random size of 10.0 we get a score of 0.971563538145
With a random size of 20.0 we get a score of 0.977465109006
With a random size of 30.0 we get a score of 0.978659761674
With a random size of 40.0 we get a score of 0.980977192956
With a random size of 50.0 we get a score of 0.973897795521
With a random size of 60.0 we get a score of 0.968017438854
With a random size of 70.0 we get a score of 0.965572540832
With a random size of 80.0 we get a score of 0.97512646353
With a random size of 90.0 we get a score of 0.971537935851


The size of the random state seems to make no difference to the model performance

#### 2. See how using adaboost does on guess work

In [10]:
ensemble = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
ensemble.fit(x, y)
ensemble.score(x, y)

1.0

Adaboost seems to do remarkably well when using the DecisionTreeClassifier

In [45]:
etclf = RandomForestClassifier(n_estimators=21)
etclf.fit(x_train, y_train)
scores = cross_val_score(etclf, x, y)

ensemble = AdaBoostClassifier(base_estimator=RandomForestClassifier())
ensemble.fit(x, y)
ensemble.score(x, y)

1.0

It also seems to work very well when using the random forest classifer

#### 3. Try timing adaboost in comparison to randomforests to see how performance changes

In [11]:
def testAdaboost():
    ensemble = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    ensemble.fit(x, y)
    ensemble.score(x, y)

testAdaboost()

In [12]:
def testRandomforest():
    etclf = ExtraTreesClassifier(n_estimators=20)
    etclf.fit(x_train, y_train)
    scores = cross_val_score(etclf, x, y)

testRandomforest()

In [13]:
%timeit testAdaboost()
%timeit testRandomforest()

100 loops, best of 3: 11.5 ms per loop
10 loops, best of 3: 87.2 ms per loop


Boosting seems to have a clear speed advantage over randomforests

#### 4. Build a bagging algorithm

In [40]:
from sklearn.ensemble import BaggingClassifier

ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                             bootstrap=True,
                             bootstrap_features=False,
                             n_estimators=21)
ensemble.fit(x, y)
ensemble.score(x, y)

1.0

Bagging algorithms also seem to do amazingly well

#### 5. How can ensemble methods be distributed across a cluster of servers? Can they be?

It is possible to distribute ensemble methods acorss a cluster of servers through techniques such as "clustering".  For example, a ‘two-stage’ course could be used whereby clustering first takes place in local sites and then in a global site. The process runs along the lines of:

* Local clustering results transmitted to server site 
* These local results form an ensemble  
* Use the ensemble to generate global clustering results 

A key consideration would be around how much sharing is required between servers and whether there are sharing restrictions.  It is apparently possible (albeit complex) to have privacy aware computation of a model when instances of the target data are distributed across different servers.

Experimental results show that ensembles distributed across a cluster of servers can provide good classification
accuracies while adhering to data/model sharing constraints.

A rather complex paper explains more here - http://arxiv.org/pdf/1204.4521.pdf