# Hyperparameter tuning

In [1]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets.methods import dataset_information, delete_dataset_features, fill_dataset_samples, extract_samples_labels
from predictors.treepredictors import TreePredictor
from commons.stopping_criteria import NodeImpurityLevel, NodeMinimumSamples
from commons.splitting_criteria import ThresholdCondition, MembershipCondition
from commons.splitting_criteria import information_gain
from commons.losses import zero_one_loss, samples_error, create_folds, cross_validation
from commons.plotting import plot_train_test_error, plot_confusion_matrices

## Dataset & preprocessing

In [2]:
## CONSTANTS

# Path of the CSV file handed to pd.read_csv when the dataset is loaded.
DATASET_FILENAME = 'datasets/mushroom_secondary.csv'
# Maximum number of rows read from the CSV (passed to pd.read_csv as `nrows`).
SAMPLES_NUMBER = 10_000
# Threshold forwarded to delete_dataset_features.
# NOTE(review): presumably the tolerated fraction of missing values per feature - confirm against the helper.
DELETION_THRESHOLD = 0.20
# Value forwarded to fill_dataset_samples.
# NOTE(review): presumably the placeholder written into missing entries - confirm against the helper.
IMPUTATION_VALUE = 'u'
# NOTE(review): not referenced by any cell visible in this part of the notebook - confirm it is still needed.
NORMALIZATION = True

In [3]:
## Loading dataset

# Read at most SAMPLES_NUMBER rows of the semicolon-separated CSV file.
raw_dataset = pd.read_csv(
    DATASET_FILENAME,
    sep=';',
    nrows=SAMPLES_NUMBER,
)

## Dataset preprocessing

# Each stage gets its own name so the lineage raw -> pruned -> filled is
# explicit; `mushroom_dataset` ends up bound to the fully preprocessed data.
pruned_dataset = delete_dataset_features(raw_dataset, DELETION_THRESHOLD)
mushroom_dataset = fill_dataset_samples(pruned_dataset, IMPUTATION_VALUE)

# Separate the preprocessed data into samples and labels.
samples_set, labels_set = extract_samples_labels(mushroom_dataset)

## Train and test split

In [4]:
## CONSTANTS

# Seed passed as `random_state` to train_test_split (and reused by later cells for NumPy seeding).
RANDOM_SEED = 1234
# Value passed as `test_size` to train_test_split (0.2 -> 20% of the samples held out).
TEST_SIZE = 0.2
# Value passed as `shuffle` to train_test_split.
SHUFFLE = True

In [5]:
## Train and test split

# Split samples and labels together so that rows stay aligned; the seed makes
# the partition reproducible across runs.
split_parts = train_test_split(
    samples_set,
    labels_set,
    test_size=TEST_SIZE,
    shuffle=SHUFFLE,
    random_state=RANDOM_SEED,
)
train_sample_set, test_sample_set, train_labels_set, test_labels_set = split_parts

# Report the resulting partition sizes.
print(f'Original dataset size: {len(samples_set)}')
print(f'Train set size:        {len(train_sample_set)}')
print(f'Test set size:         {len(test_sample_set)}')

Original dataset size: 10000
Train set size:        8000
Test set size:         2000


In [6]:
## Hyperparameters (no limitations)

# Condition class handed to TreePredictor as `continuous_condition`.
continuous_condition = ThresholdCondition
# Condition class handed to TreePredictor as `categorical_condition`.
categorical_condition = MembershipCondition
# Both stopping-criteria lists are left empty: no stopping rule is supplied
# to the predictor ("no limitations").
node_stopping_criteria = []
tree_stopping_criteria = []
# Function handed to TreePredictor as `decision_metric`.
decision_metric = information_gain

In [7]:
## Model definition

# Collect the constructor arguments in one mapping, then build the predictor
# from it; the keyword names are exactly those TreePredictor is called with.
predictor_settings = {
    'continuous_condition': continuous_condition,
    'categorical_condition': categorical_condition,
    'decision_metric': decision_metric,
    'tree_stopping_criteria': tree_stopping_criteria,
    'node_stopping_criteria': node_stopping_criteria,
}
tree_predictor = TreePredictor(**predictor_settings)

In [8]:
## Model training

# Fit the predictor on the training partition only; the test partition is
# kept aside for the evaluation cell.
tree_predictor.fit(train_sample_set, train_labels_set)

In [9]:
## Model evaluation

# The fitted tree's bound `predict` method is what gets scored.
predictor = tree_predictor.predict

# Error under the zero-one loss on the data the tree was fitted on ...
misclassification_train_error = samples_error(
    predictor,
    zero_one_loss,
    train_labels_set,
    train_sample_set,
)
# ... and on the held-out test partition.
misclassification_test_error = samples_error(
    predictor,
    zero_one_loss,
    test_labels_set,
    test_sample_set,
)

print(f'Train error: {misclassification_train_error}')
print(f'Test error:  {misclassification_test_error}')

Train error: 0.0
Test error:  0.003


## Cross-validation

In [15]:
## CONSTANTS

# Number of folds requested from create_folds.
FOLDS = 5
# Flag passed as the last argument to cross_validation.
# NOTE(review): presumably enables the per-fold progress log - confirm against the helper.
VERBOSE = True

In [16]:
## Folds split

# Shuffle samples and labels with ONE shared permutation instead of shuffling
# each container in place after re-seeding. The in-place version mutated
# `samples_set` / `labels_set`, so re-running this cell reshuffled already
# shuffled data and produced different folds; it also relied on the two
# containers consuming the random stream identically to stay aligned.
# Indexing copies with a single permutation keeps the inputs untouched, makes
# the cell idempotent and guarantees sample/label alignment by construction.
assert len(samples_set) == len(labels_set), 'samples and labels must have the same length'

np.random.seed(RANDOM_SEED)
shuffle_order = np.random.permutation(len(samples_set))

# NOTE(review): assumes samples_set / labels_set are NumPy arrays (or
# array-likes that np.asarray converts losslessly) - confirm against
# extract_samples_labels.
shuffled_samples = np.asarray(samples_set)[shuffle_order]
shuffled_labels = np.asarray(labels_set)[shuffle_order]

data_folds = create_folds(shuffled_samples, FOLDS)
labels_folds = create_folds(shuffled_labels, FOLDS)

print(f'Original dataset size: {len(samples_set)}')
print(f'Folds number:          {len(data_folds)}')
print(f'folds size:            {len(data_folds[0])}')

Original dataset size: 10000
Folds number:          5
folds size:            2000


In [17]:
## Hyperparameters (no limitations)

# Same unrestricted configuration as the single train/test run, re-declared
# for the cross-validation experiment.
# Condition class handed to TreePredictor as `continuous_condition`.
continuous_condition = ThresholdCondition
# Condition class handed to TreePredictor as `categorical_condition`.
categorical_condition = MembershipCondition
# Both stopping-criteria lists are left empty: no stopping rule is supplied.
node_stopping_criteria = []
tree_stopping_criteria = []
# Function handed to TreePredictor as `decision_metric`.
decision_metric = information_gain

In [18]:
## Model definition

# Rebind `tree_predictor` to a brand-new instance so cross-validation does
# not reuse the object that was fitted on the train split earlier.
predictor_settings = {
    'continuous_condition': continuous_condition,
    'categorical_condition': categorical_condition,
    'decision_metric': decision_metric,
    'tree_stopping_criteria': tree_stopping_criteria,
    'node_stopping_criteria': node_stopping_criteria,
}
tree_predictor = TreePredictor(**predictor_settings)

In [19]:
## Cross validation

# Run cross-validation over the prepared sample/label folds with the
# zero-one loss and keep the returned estimate.
cross_validation_value = cross_validation(
    tree_predictor,
    zero_one_loss,
    data_folds,
    labels_folds,
    VERBOSE,
)

print(f'final cross validation estimation: {cross_validation_value}')

Running cross validation on 5 folds of size 2000
Fold 0 iteration:
Training model on folds [-0]...
Testing model on fold [0]...
> fold [0] error: 0.003
Fold 1 iteration:
Training model on folds [-1]...
Testing model on fold [1]...
> fold [1] error: 0.002
Fold 2 iteration:
Training model on folds [-2]...
Testing model on fold [2]...
> fold [2] error: 0.0015
Fold 3 iteration:
Training model on folds [-3]...
Testing model on fold [3]...
> fold [3] error: 0.0015
Fold 4 iteration:
Training model on folds [-4]...
Testing model on fold [4]...
> fold [4] error: 0.0025
>> final cross validation value: 0.0021000000000000003
final cross validation estimation: 0.0021000000000000003
