Import libraries

In [24]:
import numpy as np #computations
import pandas as pd #dataframes
import matplotlib.pyplot as plt #plots

from sklearn.datasets import load_iris #dataset

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier #decision tree algorithm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split #dataset splitting for training and testing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

import warnings 
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

from sklearn.metrics import roc_curve, auc

We will use the iris dataset to test Decision Trees and Random Forests characteristics and how they influence the results on accuracy.

In [25]:
iris = load_iris()
df = pd.DataFrame(iris.data,columns=iris.feature_names)
df['species'] = pd.Categorical.from_codes(iris.target,iris.target_names)
df.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


We split the dataset into features(x) and response(y)

In [26]:
y=df.species
x=df.drop('species',axis=1)

Now, we split again but into training and testing samples

In [27]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3,
                                                    random_state=0,
                                                    stratify=y)

This is for calling the decision tree classifier, just to see how it performs out of the box

In [28]:
clf = DecisionTreeClassifier()

In [29]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

The accuracy on the test samples is around 96%

In [30]:
print("Acc: ", clf.score(X_test,y_test))

Acc:  0.9777777777777777


Let's see if we can do better if we use emsemble models and hyperparameter tuning

In [31]:
pipelines = {
    'dt' : make_pipeline(DecisionTreeClassifier(random_state=0)),
    'rf' : make_pipeline(RandomForestClassifier(random_state=0)),
    'gb' : make_pipeline(GradientBoostingClassifier(random_state=0)),
    'svc': make_pipeline(SVC(random_state=0))
 }

In [32]:
dt_hyperparameters = {
    'decisiontreeclassifier__criterion' : ['gini','entropy']
}

In [33]:
rf_hyperparameters = {
    'randomforestclassifier__n_estimators' : [1,2,5,10,20,50,100,200],
    'randomforestclassifier__max_features' : ['auto','sqrt',0.33]
}

In [34]:
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators' : [1,2,5,10,20,50,100,200],
    'gradientboostingclassifier__learning_rate' : [0.05,0.1,0.2],
    'gradientboostingclassifier__max_depth' : [1,3,5]
}

In [35]:
svc_hyperparameters = {
    'svc__kernel' : ['rbf','poly','sigmoid']
}

In [36]:
hyperparameters = {
    'dt' : dt_hyperparameters,
    'rf' : rf_hyperparameters,
    'gb' : gb_hyperparameters,
    'svc': svc_hyperparameters 
}

In [37]:
fitted_models = {}
for name,pipeline in pipelines.items():
    model = GridSearchCV(pipeline,hyperparameters[name],cv=10)
    model.fit(X_train,y_train)
    fitted_models[name] = model
    print(name,"ok")

dt ok
rf ok
gb ok
svc ok


Cross-validated score for the best models

In [38]:
for name,model in fitted_models.items():
    print(name,model.best_score_)

dt 0.9333333333333333
rf 0.9428571428571428
gb 0.9619047619047619
svc 0.9428571428571428


best parameters found by gridsearch

In [39]:
fitted_models['dt'].best_params_

{'decisiontreeclassifier__criterion': 'gini'}

In [40]:
fitted_models['rf'].best_params_

{'randomforestclassifier__max_features': 'auto',
 'randomforestclassifier__n_estimators': 2}

In [41]:
fitted_models['gb'].best_params_

{'gradientboostingclassifier__learning_rate': 0.05,
 'gradientboostingclassifier__max_depth': 1,
 'gradientboostingclassifier__n_estimators': 1}

In [42]:
fitted_models['svc'].best_params_

{'svc__kernel': 'poly'}

Score of the models when predicting

In [43]:
for name,model in fitted_models.items():
    print(name,model.score(X_test,y_test))

dt 0.9777777777777777
rf 0.9333333333333333
gb 0.9555555555555556
svc 1.0


Predictions using support vector classifier

In [44]:
pred = fitted_models['svc'].predict(X_test)
np.column_stack((pred,y_test))

array([['virginica', 'virginica'],
       ['virginica', 'virginica'],
       ['setosa', 'setosa'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['virginica', 'virginica'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['setosa', 'setosa'],
       ['virginica', 'virginica'],
       ['setosa', 'setosa'],
       ['virginica', 'virginica'],
       ['versicolor', 'versicolor'],
       ['virginica', 'virginica'],
       ['versicolor', 'versicolor'],
       ['versicolor', 'versicolor'],
       ['versicolor', 'versicolor'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['virginica', 'virginica'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['virginica', 'virginica'],
       ['virginica', 'virginica'],
       ['virginica', 'virginica'],
       ['virginica', 'virginica'],
       ['versicolor', 'versicolor'],
       ['virg

Support Vector Classifier could predict perfectly the species even if at the cross validation stage was not perfect.