## Test notebook for running the BinOCT implementation

In [1]:
import sys
sys.path.append("..")
import preprocessing.Datasets as DS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from imblearn.metrics import specificity_score
import os
from modelling.binOCText.binoct import binOptimalDecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# Manually loading binary dataset
problem = DS.banknote
pname = problem.__name__.upper()
print(f'---{pname}---')

# Locate the CSV relative to the project root rather than through a
# user-specific absolute path, so the notebook runs on any checkout.
# The project root is the parent directory, the same assumption the
# `sys.path.append("..")` in the import section already makes; this requires
# the working directory to be the notebook's own folder.
data_dir = os.path.join(os.pardir, 'datasets', 'binary')
# Build the file name from `pname` so the printed label and the loaded data
# always refer to the same dataset (yields BANKNOTE_binary.csv here).
csv_path = os.path.join(data_dir, f'{pname}_binary.csv')
df = pd.read_csv(csv_path)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df.drop('y', axis=1), df['y'], test_size=0.2, random_state=42)

---BANKNOTE---


### Testing implementation

In [2]:
# Hyper-parameter grid searched by GridSearchCV for the reference model
params = dict(
    max_depth=range(2, 5),
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2],
)

# Fit the reference decision tree, selecting by 3-fold cross-validated accuracy
clf = DecisionTreeClassifier(random_state=4)
gcv = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring='accuracy',
    cv=3,
)
gcv.fit(X_train, y_train)

# Keep the best depth and split size; min_samples_leaf is searched above
# but is not forwarded to the optimal tree below
best_params = gcv.best_params_
optimal_depth = best_params['max_depth']
optimal_min_samples = best_params['min_samples_split']

# Train the optimal tree with the selected settings, on plain NumPy arrays
octree = binOptimalDecisionTreeClassifier(
    max_depth=optimal_depth,
    min_samples_split=optimal_min_samples,
    warmstart=True,
    timelimit=100,
    output=True,
)
octree.fit(X_train.values, y_train.values)

Training data include 1097 instances, 72 features.
Set parameter Username
Academic license - for non-commercial use only - expires 2025-05-13
Set parameter TimeLimit to value 100
Gurobi Optimizer version 11.0.2 build v11.0.2rc0 (win64 - Windows 11.0 (22631.2))

CPU model: AMD Ryzen 5 3500U with Radeon Vega Mobile Gfx, instruction set [SSE2|AVX|AVX2]
Thread count: 4 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 19808 rows, 18712 columns and 2617520 nonzeros
Model fingerprint: 0x9a354d38
Variable types: 17584 continuous, 1128 integer (1128 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+03]
  Objective range  [2e-03, 2e-03]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+03]

User MIP start did not produce a new incumbent solution

Found heuristic solution: objective 0.6856678
Presolve removed 16 rows and 16 columns (presolve time = 5s) ...
Presolve removed 16 rows and 16 columns
Presolve time: 6.15s
Presolved: 19792 rows

### Extracting metrics

In [3]:
# Predict using optimal tree. The model was fitted on NumPy arrays
# (`X_train.values`), so pass the test features in the same form instead of
# relying on the estimator's handling of a pandas DataFrame.
y_test_pred = octree.predict(X_test.values)

# Compute accuracy as the share of test rows whose prediction equals the label
accuracy = np.mean(y_test == y_test_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.96


In [4]:
# Number of distinct features the tree splits on: the first element of each
# entry in `octree.split` is taken to be the feature identifier
splits = octree.split
split_features = [splits[node][0] for node in splits]
features_used = len(np.unique(split_features))

# Tree size: all nodes, leaf nodes, and rules (length of `octree.b_index`)
num_nodes = len(octree.n_index)
num_leaves = len(octree.l_index)
num_rules = len(octree.b_index)

# Average and maximum depth are both set to the tree's max_depth (equal in this case)
max_depth = avg_depth = octree.max_depth

# Tree is always balanced
is_imbalanced_tree = False

# Aliases matching the variable names of the metrics class used later
y_true, y_pred = y_test, y_test_pred

# Predictive performance on the held-out test set
performance = {
    'accuracy': accuracy_score(y_true, y_pred),
    'f1': f1_score(y_true, y_pred, zero_division=0),
    'auc': roc_auc_score(y_true, y_pred),
    'precision': precision_score(y_true, y_pred, zero_division=0),
    'recall': recall_score(y_true, y_pred, zero_division=0),
    'specificity': specificity_score(y_true, y_pred),
}

# Structural description of the fitted tree
structure = {
    '#leaves': num_leaves,
    '#nodes': num_nodes,
    '#rules': num_nodes - num_leaves,
    'max_depth': max_depth,
    'avg_depth': avg_depth,
    'binary_features_used': features_used,
    'imbalanced': is_imbalanced_tree,
}

# Merge while keeping the key order: performance first, then structure
metrics = {**performance, **structure}

print(metrics)

{'accuracy': 0.96, 'f1': 0.9571984435797666, 'auc': 0.9606033198552884, 'precision': 0.9461538461538461, 'recall': 0.968503937007874, 'specificity': 0.9527027027027027, '#leaves': 16, '#nodes': 31, '#rules': 15, 'max_depth': 4, 'avg_depth': 4, 'binary_features_used': 12, 'imbalanced': False}
