In [2]:
!pip install optuna
!wget https://github.com/oreilly-japan/ml-security-jp/raw/master/ch02/dataset.csv

Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 14.3 MB/s 
[?25hCollecting cliff
  Downloading cliff-3.10.0-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 10.0 MB/s 
[?25hCollecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting alembic
  Downloading alembic-1.7.5-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 89.5 MB/s 
Collecting Mako
  Downloading Mako-1.1.6-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 5.2 MB/s 
[?25hCollecting cmd2>=1.0.0
  Downloading cmd2-2.3.3-py3-none-any.whl (149 kB)
[K     |████████████████████████████████| 149 kB 111.4 MB/s 
[?25hCollecting stevedore>=2.0.1
  Downloading stevedore-3.5.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 5.4 MB/s 
Collecting pbr!=2.1.

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import optuna
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

training_data = np.genfromtxt('dataset.csv', delimiter=',', dtype=np.int32)
X = training_data[:,:-1]
y = training_data[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=101
)

class Objective_DTC:
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __call__(self, trial):
        params = {
            'criterion': trial.suggest_categorical('criterion', {'gini', 'entropy'}),
            'splitter': trial.suggest_categorical('splitter', {'best', 'random'}),
            'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 64),
            'max_depth': trial.suggest_int('max_depth', 2, 64)
        }
        model = DecisionTreeClassifier(**params)
        scores = cross_validate(model, 
                                            X=self.X,
                                            y=self.y,
                                            scoring='accuracy',
                                            n_jobs=-1)
        return scores['test_score'].mean()

objective = Objective_DTC(X_train, y_train)
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=60)
print('params:', study.best_params)

[32m[I 2021-12-05 01:14:02,142][0m A new study created in memory with name: no-name-d97af2f1-7ac9-45e6-9caa-41bf72d7786d[0m
[32m[I 2021-12-05 01:14:02,967][0m Trial 0 finished with value: 0.9050209234452575 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_features': 'log2', 'min_samples_split': 57, 'max_depth': 50}. Best is trial 0 with value: 0.9050209234452575.[0m
[32m[I 2021-12-05 01:14:03,092][0m Trial 1 finished with value: 0.7072581717819972 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_features': 'auto', 'min_samples_split': 29, 'max_depth': 2}. Best is trial 0 with value: 0.9050209234452575.[0m
[32m[I 2021-12-05 01:14:03,228][0m Trial 2 finished with value: 0.9137285809658037 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_features': 'auto', 'min_samples_split': 50, 'max_depth': 51}. Best is trial 2 with value: 0.9137285809658037.[0m
[32m[I 2021-12-05 01:14:03,363][0m Trial 3 finished with value: 0.9252616070126795 a

params: {'criterion': 'entropy', 'splitter': 'best', 'max_features': 'auto', 'min_samples_split': 2, 'max_depth': 39}
