# Decision Trees & Ensemble Learning tutorial
## Libraries: populating the interactive namespace

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import tree
# Ensemble Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from sklearn import cross_validation
from sklearn.model_selection import cross_val_score
# pip install xgboost #https://pypi.python.org/pypi/xgboost/
# import xgboost  #( Prior: for mac brew tap homebrew/versions; brew install gcc --without-multilib) 
from matplotlib import pyplot as plt
%matplotlib inline

## Load Data  for Classification

In [None]:
# Pima Indians Diabetes data set
# Data set description: https://www.kaggle.com/dssariya/pima-indians-diabetes-data-set
# A data frame with 768 observations on the following 9 variables.
# V1: Number of times pregnant
# V2: Plasma glucose concentration (glucose tolerance test)
# V3: Diastolic blood pressure (mm Hg)
# V4: Triceps skin fold thickness (mm)
# V5: 2-Hour serum insulin (mu U/ml)
# V6: Body mass index (weight in kg/(height in m)^2)
# V7: Diabetes pedigree function
# V8: Age (years)
# V9: Class variable (1: tested positive for diabetes, 0: tested negative for diabetes)

# Former remote location, kept for reference:
#url = "http://mlearn.ics.uci.edu/databases/pima-indians-diabetes/pima-indians-diabetes.data"

# The data file location can be overridden with the PIMA_DIABETES_CSV environment
# variable so the notebook is not tied to one machine; the fallback is the path the
# notebook was originally written against.
url = os.environ.get(
    "PIMA_DIABETES_CSV",
    "/Users/habiboulaye/Projects/OPSOURCES/ml-scalpel/datasets/diabetes/pima-indians-diabetes.data",
)

# Column names are supplied explicitly, so pandas reads every line of the file as data.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv(url, names=names)
print(data.shape)
data.head()

In [None]:
# Class distribution: the count per label, then the same information as a histogram.
# The counts are printed explicitly because a notebook only displays the value of a
# cell's last expression; a bare value_counts() in the middle of the cell shows nothing.
print(data['class'].value_counts())
data.hist(column='class')
# Widen the x-axis a little around the two labels (0 and 1).
plt.xlim(-0.5, 1.5)

Conclusion?

##### Visualise Decision Tree using GraphViz

In [None]:
# GraphViz - Tree visualisation
import pydotplus as pydot #Python interface to Graphviz's Dot language - pip install pydotplus #MacOS: brew install graphviz
from IPython.display import Image
from sklearn.externals.six import StringIO
# Download graphviz
os.environ["PATH"] += os.pathsep + "/usr/local/Cellar/graphviz/2.38.0_1/bin/" # make
#os.environ["PATH"] += os.pathsep + "D:\\path\to\graphviz\\bin" # Windows
#print(os.environ["PATH"])

In [None]:
# Feature matrix: every column except the last one; target: the 'class' column.
X, y = data[names[:-1]], data['class']
# Shallow tree so that the rendered model stays readable
# (other settings to try: max_depth=1, criterion="entropy").
dtc = tree.DecisionTreeClassifier(max_depth=3)
# Train the classifier; fit() is the last expression, so the notebook displays
# the fitted estimator together with its parameters.
dtc.fit(X, y)

Display tree model

In [None]:
# Write the fitted tree as Graphviz DOT text into an in-memory buffer
# (feature_names could be passed to export_graphviz to label the split nodes).
dot_data = StringIO()
tree.export_graphviz(dtc, out_file=dot_data)
dot_source = dot_data.getvalue()
# Parse the DOT text, render it to PNG bytes and show the picture inline.
graph = pydot.graph_from_dot_data(dot_source)
png_bytes = graph.create_png()
Image(png_bytes)

Conclusion?
* Interpretation of values in tree nodes from Root to leaves
* Vary the max_depth parameter and display tree

## CrossValidation Strategy - Model selection

In [None]:
# Show an illustration of the cross-validation scheme.
# Image is IPython.display.Image, imported in the GraphViz cell, so that cell must run first.
# The commented-out address is presumably a hosted copy of the same picture - TODO confirm.
cv_url =  'crossval.png' #'http://i.imgur.com/N9HZktu.png'
Image(url=cv_url)

In [None]:
# Number of cross-validation folds; passed as cv=kfold to the cross_val_score calls in this notebook.
kfold = 10

##### 0- Decision Tree model
Generates decision rules as a simple series of yes/no questions that classify each observation.

In [None]:
# Depth-limited decision tree scored by k-fold cross-validation
# (exercise: what is the default max_depth?).
dtc = tree.DecisionTreeClassifier(max_depth=3)
scores = cross_val_score(dtc, X, y, cv=kfold)
print(scores)
# Summarise the per-fold scores as mean +/- two standard deviations.
mean_score = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, two_sigma))

##### 1- Random Forest Model
Tree-construction strategy that reduces the correlation between trees: each tree is built from a sample of the training dataset drawn with replacement, using a random subset of features.

In [None]:
# Random forest scored by k-fold cross-validation.
# random_state is fixed so the reported accuracy is the same on every run
# (parameters to experiment with: n_estimators, max_features).
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(rfc, X, y, cv=kfold)
# Mean accuracy over the folds, +/- two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

##### 2- ExtraTrees Model
Another modification of bagging in which randomized trees are constructed from the training dataset, with split thresholds drawn at random instead of being optimized.

In [None]:
# Extra-trees ensemble scored by k-fold cross-validation.
# random_state is fixed so the reported accuracy is the same on every run.
etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(etc, X, y, cv=kfold)
# Mean accuracy over the folds, +/- two standard deviations.
print("etc accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

##### 3- Gradient Boosting Model
Combines a sequence of trees that attempt to correct the mistakes of the models before them in the sequence.

In [None]:
# Gradient boosting scored by k-fold cross-validation.
# random_state is fixed so the reported accuracy is the same on every run.
gbc = GradientBoostingClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(gbc, X, y, cv=kfold)
# Mean accuracy over the folds, +/- two standard deviations.
print("gbc accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
##### 4- eXtreme Gradient Boosting
#Very fast and effective version of Gradient boosting
#xgb = xgboost.XGBClassifier()
#scores = cross_val_score(xgb, X, y, cv=kfold)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Conclusion?

### Variables Importance

In [None]:
# Plot the feature importances of the gradient boosting model.
# The model is refitted on the complete data set before its importances are read.
gbc.fit(X, y)
# One row per feature, importance rounded to 3 decimals, most important first.
importances = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': np.round(gbc.feature_importances_, 3),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances.plot(kind='bar', figsize=(10, 5))

Conclusion ?

## TrainTest Strategy - Model evaluation

### Split TrainTest

In [None]:
import sklearn.utils

# Fixed seed: the shuffle and the train/test assignment are identical on every
# fresh run of the notebook, so the metrics reported below can be compared between runs.
SPLIT_SEED = 42
# Written as a float so the threshold never depends on integer-division rules.
TRAIN_FRACTION = 0.75

# Note: `data` is replaced by its shuffled version, so running this cell a second
# time without reloading the data shuffles the already shuffled frame again.
data = sklearn.utils.shuffle(data, random_state=SPLIT_SEED)
# Each row is assigned to the training set independently with probability
# TRAIN_FRACTION, so the resulting sizes are only approximately 75% / 25%.
data['is_train'] = np.random.RandomState(SPLIT_SEED).uniform(0, 1, len(data)) <= TRAIN_FRACTION
train, test = data[data['is_train']], data[~data['is_train']]
# Every row must land in exactly one of the two subsets.
assert len(train) + len(test) == len(data)
print("size train/test: {0}/{1}".format(train.shape[0], test.shape[0]))
X_train, y_train = train[names[:-1]], train['class']
X_test, y_test = test[names[:-1]], test['class']

### Performance evaluation

In [None]:
from sklearn import metrics

In [None]:
# Display name -> ensemble classifier created in the cross-validation cells.
clf_map = {
    'RandomForest': rfc,
    'ExtraTrees': etc,
    'GradientBoosting': gbc,
}
# Refit every classifier on the training subset and keep its test-set predictions
# under the same display name (fit() returns the estimator, so the calls chain).
clf_preds_map = {
    cname: clf.fit(X_train, y_train).predict(X_test)
    for cname, clf in clf_map.items()
}

##### Accuracy

In [None]:
# Test-set accuracy per classifier, computed from the predictions stored earlier.
# Iterating over the predictions directly avoids looking each name up a second time.
for cname, preds in clf_preds_map.items():
    print("=> {0} accuracy = [{1:.2f}]".format(cname, metrics.accuracy_score(y_test, preds)))

Conclusion ?

##### Confusion matrix

In [None]:
# Display name -> confusion matrix of that classifier's test-set predictions.
conf_map = {
    cname: metrics.confusion_matrix(y_test, clf_preds_map[cname])
    for cname in clf_map
}

In [None]:
import seaborn as sns

print('Confusion Matrix of the 3 Classifiers')
# One row with one panel per classifier. The grid is created in a single call and each
# heatmap is drawn on its own Axes; creating a full-figure Axes first and then calling
# plt.subplot() on top of it can leave the first Axes visible behind the panels.
fig, axes = plt.subplots(1, len(conf_map), figsize=(12, 3), squeeze=False)
for ax, (cname, matrix) in zip(axes[0], conf_map.items()):
    ax.set_title(cname)
    # annot=True writes the cell counts into the heatmap; fmt='' prints them unformatted.
    sns.heatmap(matrix, annot=True, fmt='', ax=ax)

##### Area Under Curve - Receiver Operating Characteristic

In [None]:
# ROC curve and area under the curve for every fitted classifier, on the test set.
colors = ['seagreen', 'blue', 'darkorange', 'indigo']
fig, ax = plt.subplots()
# Pair each classifier with a colour instead of maintaining a manual index counter.
for (cname, clf), color in zip(clf_map.items(), colors):
    # Column 1 of predict_proba is the predicted probability of the second class (label 1 here).
    positive_scores = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores)
    roc_auc = metrics.auc(fpr, tpr)
    print("=> {0} auc_roc [{1:.2f}]".format(cname, roc_auc))
    ax.plot(fpr, tpr, color=color, label='ROC {0} (area = {1:.2f})'.format(cname, roc_auc))
# Diagonal reference line: true positive rate equal to false positive rate.
ax.plot([0, 1], [0, 1], linestyle='--', color='k', label='Good Luck')
ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

Conclusion ?