# Ensembles
1. Bagging  
2. Random Subspace 
3. Boosting
4. Feature Importance from Random Forests

In [None]:
import pandas as pd
# Read the hotel review data into a DataFrame and preview its first rows.
hotel_rev_pd = pd.read_csv(filepath_or_buffer='HotelRevHelpfulnessV2.csv')
hotel_rev_pd.head(n=5)

In [None]:
# Separate the target from the features. pop() removes the column from
# hotel_rev_pd in place, so the frame's remaining columns are the features.
# The guard keeps this cell safe to re-run: once the column has been popped,
# a second pop() would raise KeyError.
if 'reviewHelpfulness' in hotel_rev_pd.columns:
    y = hotel_rev_pd.pop('reviewHelpfulness').values
X = hotel_rev_pd.values
X.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.ensemble import BaggingClassifier

# Base learners that the ensembles in this notebook are built from.
dtree = DecisionTreeClassifier(criterion='entropy')  # splits chosen by entropy
kNN = KNeighborsClassifier(n_neighbors=3)            # 3 nearest neighbours

## Bagging
Ensembles based on Bagging. 
- 10 ensemble members are trained, each on a bootstrap resample of the training data
- Improves accuracy for decision trees
- Does not improve accuracy for k-NN

In [None]:
# Both ensembles share one bagging configuration: 10 members, each fitted
# on a bootstrap resample (max_samples = 1.0, bootstrap = True).
bagging_params = dict(n_estimators=10, max_samples=1.0, bootstrap=True)

kNN_bag = BaggingClassifier(kNN, **bagging_params)
tree_bag = BaggingClassifier(dtree, **bagging_params)

In [None]:
folds = 8
reps = 10
v = 10  # verbosity level handed to cross_val_score
# A fixed random_state makes the splitter yield the same repeated folds on
# every split() call. Without it each cross_val_score call below would draw
# different folds, so a base model and its ensemble would not be compared
# on the same train/test partitions.
cv = RepeatedKFold(n_repeats=reps, n_splits=folds, random_state=0)

# k-NN: single model vs. bagged ensemble
scores_kNN = cross_val_score(kNN, X, y, cv=cv, verbose = v, n_jobs = -1)
scores_kNN_bag = cross_val_score(kNN_bag, X, y, cv=cv, verbose = v, n_jobs = -1)

print("Mean for kNN {:.2f}".format(scores_kNN.mean()))
print("Mean for kNN_bag {:.2f}".format(scores_kNN_bag.mean()))

# Decision tree: single model vs. bagged ensemble
scores_tree = cross_val_score(dtree, X, y, cv=cv, verbose = v, n_jobs = -1)
scores_tree_bag = cross_val_score(tree_bag, X, y, cv=cv, verbose = v, n_jobs = -1)

print("Mean for D-Tree {:.2f}".format(scores_tree.mean()))
print("Mean for D_Tree_bag {:.2f}".format(scores_tree_bag.mean()))

---
## Random Subspace
The evaluation above shows that bootstrap resampling works for decision tree ensembles but not for k-NN.   
This is because k-NN is a *stable* classifier, so bootstrap resampling does not produce diversity.  
  
However, a random subspace strategy will produce diversity for k-NN.  
In the examples below we generate an ensemble of 10 classifiers, each trained using a subset of 50% of the features selected at random.


In [None]:
# Random subspace ensembles: every member is trained on all of the training
# samples (max_samples=1.0, bootstrap=False) but sees only a random 50% of
# the features (max_features=0.5).
# bootstrap has to be switched off explicitly: BaggingClassifier defaults to
# bootstrap=True, which would add bootstrap resampling of the samples on top
# of the feature sub-sampling and blur the comparison with plain bagging.
random_SS_kNN = BaggingClassifier(kNN,
                            n_estimators = 10,
                            max_samples=1.0,
                            bootstrap=False,
                            max_features=0.5)

random_SS_tree = BaggingClassifier(dtree,
                            n_estimators = 10,
                            max_samples=1.0,
                            bootstrap=False,
                            max_features=0.5)

In [None]:
folds = 8
reps = 10
v = 0  # verbosity level handed to cross_val_score
# A fixed random_state makes the splitter yield the same repeated folds on
# every split() call. Without it each cross_val_score call below would draw
# different folds, so a base model and its ensemble would not be compared
# on the same train/test partitions.
cv = RepeatedKFold(n_repeats=reps, n_splits=folds, random_state=0)

# k-NN: single model vs. random subspace ensemble
scores_kNN = cross_val_score(kNN, X, y, cv=cv, verbose = v, n_jobs = -1)
scores_kNN_rSS = cross_val_score(random_SS_kNN, X, y, cv=cv, verbose = v, n_jobs = -1)

print("Mean for kNN {:.2f}".format(scores_kNN.mean()))
print("Mean for kNN_rand_SS {:.2f}".format(scores_kNN_rSS.mean()))

# Decision tree: single model vs. random subspace ensemble
scores_tree = cross_val_score(dtree, X, y, cv=cv, verbose = v, n_jobs = -1)
scores_tree_rSS = cross_val_score(random_SS_tree, X, y, cv=cv, verbose = v, n_jobs = -1)

print("Mean for D-Tree {:.2f}".format(scores_tree.mean()))
print("Mean for D_Tree_rand_SS {:.2f}".format(scores_tree_rSS.mean()))

---
## Boosting
The default base classifier used by AdaBoost is a decision tree of depth 1, i.e. a decision stump.

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 100 boosting rounds, scored with `folds`-fold cross-validation.
adaBoost = AdaBoostClassifier(algorithm='SAMME', n_estimators=100)
scores_adaBoost = cross_val_score(
    adaBoost, X, y,
    cv=folds,
    n_jobs=-1,
    verbose=v,
)

scores_adaBoost.mean()

In [None]:
# Fit AdaBoost on the complete data set, then show the weights
# assigned to the first 10 ensemble members.
ab = adaBoost.fit(X, y)
ab.estimator_weights_[0:10]

In [None]:
import matplotlib.pyplot as plt
# Draw the per-iteration error and weight of the fitted AdaBoost members
# on a shared axis.
for _curve, _label in ((ab.estimator_errors_, 'Errors'),
                       (ab.estimator_weights_, 'Weights')):
    plt.plot(_curve, label=_label)
plt.legend()
plt.xlabel("Iteration")
plt.ylabel("Error & Weight")

---
<h1><span style="color:red">Bonus Material</span></h1>

## Random Forest Feature Importance
As a side effect of building so many decision trees, a Random Forest is able to provide an estimate of feature importance. 

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 100 trees limited to depth 2, with a fixed seed, fitted on the
# full data set.
RF = RandomForestClassifier(
    random_state=0,
    max_depth=2,
    n_estimators=100,
)
RF.fit(X, y)

In [None]:
# Importance score of each feature, as estimated by the fitted forest
RF.feature_importances_

In [None]:
# Label every importance score with its feature name and rank the features
# from most to least important.
FI_df = pd.DataFrame(
    {'FI Score': RF.feature_importances_},
    index=hotel_rev_pd.columns,
).sort_values(by='FI Score', ascending=False)
FI_df

In [None]:
%matplotlib inline
# Bar chart of the ranked feature importance scores.
pl = FI_df.plot(kind='bar', figsize=(10, 5))
pl.set_ylabel("Feature Importance")