In [1]:
import numpy as np #computations
import pandas as pd #dataframes
import matplotlib.pyplot as plt #plots

from sklearn.datasets import load_iris #dataset

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier #decision tree algorithm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split #dataset splitting for training and testing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

import warnings 
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

from sklearn.metrics import roc_curve, auc

  from numpy.core.umath_tests import inner1d


We will use the iris dataset to test Decision Trees and Random Forests characteristics and how they influence the results on accuracy.

In [2]:
iris = load_iris()
df = pd.DataFrame(iris.data,columns=iris.feature_names)
df['species'] = pd.Categorical.from_codes(iris.target,iris.target_names)
df.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [3]:
y=df.species
x=df.drop('species',axis=1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3,
                                                    random_state=0,
                                                    stratify=y)

In [5]:
clf = DecisionTreeClassifier()

In [6]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [7]:
print("Acc: ", clf.score(X_test,y_test))

Acc:  0.9777777777777777


Let's see if we can do better if we use emsemble models and hyperparameter tuning

In [8]:
pipelines = {
    'dt' : make_pipeline(DecisionTreeClassifier(random_state=0)),
    'rf' : make_pipeline(RandomForestClassifier(random_state=0)),
    'gb' : make_pipeline(GradientBoostingClassifier(random_state=0)),
    'svc': make_pipeline(SVC(random_state=0))
 }

In [9]:
dt_hyperparameters = {
    'decisiontreeclassifier__criterion' : ['gini','entropy']
}

In [10]:
rf_hyperparameters = {
    'randomforestclassifier__n_estimators' : [1,2,5,10,20,50,100,200],
    'randomforestclassifier__max_features' : ['auto','sqrt',0.33]
}

In [11]:
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators' : [1,2,5,10,20,50,100,200],
    'gradientboostingclassifier__learning_rate' : [0.05,0.1,0.2],
    'gradientboostingclassifier__max_depth' : [1,3,5]
}

In [12]:
pipelines['svc'].get_params()

{'memory': None,
 'steps': [('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
     max_iter=-1, probability=False, random_state=0, shrinking=True,
     tol=0.001, verbose=False))],
 'svc': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, probability=False, random_state=0, shrinking=True,
   tol=0.001, verbose=False),
 'svc__C': 1.0,
 'svc__cache_size': 200,
 'svc__class_weight': None,
 'svc__coef0': 0.0,
 'svc__decision_function_shape': 'ovr',
 'svc__degree': 3,
 'svc__gamma': 'auto',
 'svc__kernel': 'rbf',
 'svc__max_iter': -1,
 'svc__probability': False,
 'svc__random_state': 0,
 'svc__shrinking': True,
 'svc__tol': 0.001,
 'svc__verbose': False}

In [13]:
svc_hyperparameters = {
    'svc__kernel' : ['rbf','poly','sigmoid']
}

In [14]:
hyperparameters = {
    'dt' : dt_hyperparameters,
    'rf' : rf_hyperparameters,
    'gb' : gb_hyperparameters,
    'svc': svc_hyperparameters 
}

In [15]:
fitted_models = {}
for name,pipeline in pipelines.items():
    model = GridSearchCV(pipeline,hyperparameters[name],cv=10)
    model.fit(X_train,y_train)
    fitted_models[name] = model
    print(name," has been fitted!")

dt  has been fitted!
rf  has been fitted!
gb  has been fitted!
svc  has been fitted!


Cross-validated score for the best models

In [16]:
for name,model in fitted_models.items():
    print(name,model.best_score_)

dt 0.9333333333333333
rf 0.9428571428571428
gb 0.9619047619047619
svc 0.9428571428571428


features of the best models

In [17]:
fitted_models['dt'].best_params_

{'decisiontreeclassifier__criterion': 'gini'}

In [18]:
fitted_models['rf'].best_params_

{'randomforestclassifier__max_features': 'auto',
 'randomforestclassifier__n_estimators': 2}

In [19]:
fitted_models['gb'].best_params_

{'gradientboostingclassifier__learning_rate': 0.05,
 'gradientboostingclassifier__max_depth': 1,
 'gradientboostingclassifier__n_estimators': 1}

In [20]:
fitted_models['svc'].best_params_

{'svc__kernel': 'poly'}

Score of the models when predicting

In [21]:
for name,model in fitted_models.items():
    print(name,model.score(X_test,y_test))

dt 0.9777777777777777
rf 0.9333333333333333
gb 0.9555555555555556
svc 1.0


Predictions using support vector classifier

In [22]:
pred = fitted_models['svc'].predict(X_test)
np.column_stack((pred,y_test))

array([['virginica', 'virginica'],
       ['virginica', 'virginica'],
       ['setosa', 'setosa'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['virginica', 'virginica'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['setosa', 'setosa'],
       ['virginica', 'virginica'],
       ['setosa', 'setosa'],
       ['virginica', 'virginica'],
       ['versicolor', 'versicolor'],
       ['virginica', 'virginica'],
       ['versicolor', 'versicolor'],
       ['versicolor', 'versicolor'],
       ['versicolor', 'versicolor'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['virginica', 'virginica'],
       ['setosa', 'setosa'],
       ['versicolor', 'versicolor'],
       ['virginica', 'virginica'],
       ['virginica', 'virginica'],
       ['virginica', 'virginica'],
       ['virginica', 'virginica'],
       ['versicolor', 'versicolor'],
       ['virg