# Introduction

A notebook for the training and testing analysis of ORCT (_Optimal Randomized Classification Trees_, Blanquero et al., 2018).

### Remark
* No class is defined for ORCT in this notebook.
* We use the Iris dataset; for a generalized version that handles any kind of dataset, see the notebook 'Analysis with Class'.

In [None]:
# dataframe management
import pandas as pd
import math
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import json
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from functools import reduce # Valid in Python 2.6+, required in Python 3
import operator
from pyomo.environ import *
from pyomo.opt import SolverFactory

# Preprocessing of dataset
Let's load the Iris dataset:

In [2]:
# Load the raw data; the path below is a placeholder and must point to the Iris CSV file.
iris = pd.read_csv('... .csv') #IrisCategorical.csv
# Remove the 'Id' column so that only the measurements and 'Species' remain.
iris = iris.drop('Id', axis=1)
# Work on a copy: `iris_std` is encoded and scaled later, `iris` keeps the raw values.
iris_std = iris.copy()
iris.head(5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Scaler applied to the feature columns in the preprocessing cell below.
scaler = MinMaxScaler() # also MaxAbsScaler()

In [4]:
# Preprocessing: the last column ('Species') is the label, every other column
# is a feature that has to be scaled.
columns_names = list(iris)
feature_columns = columns_names[:-1]
index_features = list(range(len(feature_columns)))

# The names of the classes K and their integer codes 0..|K|-1
classes = iris_std['Species'].unique().tolist()
classes_en = list(range(len(classes)))

# Encode the class labels as integers (the fitted encoder is kept in `le`)
le = preprocessing.LabelEncoder()
iris_std['Species'] = le.fit_transform(iris_std['Species'])

# Scaling phase: scale every feature column. The columns are derived from the
# frame instead of being hard-coded to the first four.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed later in the notebook.
iris_std[feature_columns] = scaler.fit_transform(iris_std[feature_columns])

iris_std.head(1)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0.222222,0.625,0.067797,0.041667,0


Splitting the dataset between __Training & Testing Sets__

In [5]:
# Feature matrix (every column but the last) and label vector (last column)
df = iris_std[columns_names[:-1]]
y = iris_std[columns_names[-1]]
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25)
# Training frame = features + label, with rows in the order of X_train.
# The `join_axes` argument was removed from pd.concat in pandas 1.0;
# reindexing the concatenated frame is the documented replacement.
df_train = pd.concat([X_train, y_train], axis=1).reindex(X_train.index)

Objects useful to deal with trees (of depth 2) and their topology

In [6]:
# Topology of the depth-2 tree: keys are the leaf nodes 4-7, values are branch
# nodes among 1-3. The same tables are hard-coded in B_in_NR / B_in_NL below.
BF_in_NL_R = {4:[],5:[2],6:[1],7:[1,3]}
BF_in_NL_L = {4:[1,2],5:[1],6:[3],7:[]}
# Training-instance indices grouped by encoded class label
I_in_k = {k: list(df_train[df_train['Species'] == k].index) for k in classes_en}
# Misclassification cost: 0.5 for every pair of distinct classes, 0 on the diagonal
my_W = {(i,j): 0.5 if i != j else 0 for i in classes_en for j in classes_en}
index_instances = list(X_train.index)
# Value of feature j for training instance i. The feature is addressed by its
# column name: indexing a row with the integer j would rely on the positional
# fallback of label-based Series indexing, which pandas has deprecated.
my_x = {(i, j): df_train.at[i, columns_names[j]] for i in index_instances for j in index_features}

In [7]:
def B_in_NR(model, i):
    """Return the branch nodes listed for leaf ``i`` in the N_L_R table.

    Used as the initializer of the indexed set ``model.N_L_R``. ``model`` is
    part of the rule signature but is not read. The table is hard-coded for
    leaves 4-7; any other ``i`` yields ``None``.
    """
    for leaf, branch_nodes in ((4, []), (5, [2]), (6, [1]), (7, [1, 3])):
        if i == leaf:
            return branch_nodes
    return None
def B_in_NL(model, i):
    """Return the branch nodes listed for leaf ``i`` in the N_L_L table.

    Used as the initializer of the indexed set ``model.N_L_L``. ``model`` is
    part of the rule signature but is not read. The table is hard-coded for
    leaves 4-7; any other ``i`` yields ``None``.
    """
    for leaf, branch_nodes in ((4, [1, 2]), (5, [1]), (6, [3]), (7, [])):
        if i == leaf:
            return branch_nodes
    return None

def I_k(model,i):
    """Return the instance indices that belong to class ``i``.

    Used as the initializer of the indexed set ``model.I_k``. The lookup goes
    through the module-level ``I_in_k`` mapping, so it works for any number of
    classes rather than only for the classes 0, 1 and 2. ``model`` is part of
    the rule signature but is not read.

    Returns ``None`` when ``i`` is not a key of ``I_in_k``.
    """
    return I_in_k.get(i)

# Model definition
We initialize the __model__ (a Pyomo `ConcreteModel`) and declare the sets K, N_L, N_B, I, I_k, N_L_L, N_L_R and f_s using the __Set__ component:

In [8]:
model = ConcreteModel()

# Instances & Classes
# I_in_k is a dict with the classes k as keys and, as values, the list of the
# instances I that belong to k.
# The sets are initialized from sorted lists instead of Python `set` objects:
# Pyomo Sets are ordered, and an unordered initializer makes their order
# (and therefore the generated model) non-deterministic.
model.I = Set(initialize=sorted({i for k in I_in_k for i in I_in_k[k]}))
model.K = Set(initialize=list(I_in_k.keys()))
model.I_k = Set(model.K, initialize=I_k)

# Features
model.f_s = Set(initialize=index_features)

# Leaf nodes N_L & branch nodes N_B
model.N_B = Set(initialize=sorted({t for k in BF_in_NL_R for t in BF_in_NL_R[k]}))
model.N_L = Set(initialize=list(BF_in_NL_R.keys()))
model.N_L_R = Set(model.N_L, initialize=B_in_NR)
model.N_L_L = Set(model.N_L, initialize=B_in_NL)

Similarly, the model parameters are defined using the __Param__ component:

In [9]:
# Cost of misclassification, indexed by a pair of classes; the values come from
# my_W (0.5 for two distinct classes, 0 for a class paired with itself)
model.W = Param(model.K, model.K, within=NonNegativeReals, initialize=my_W)

# Value of the j-th feature for the i-th instance, taken from my_x; the
# PercentFraction domain restricts every value to the interval [0, 1]
model.x = Param(model.I, model.f_s, within=PercentFraction, initialize=my_x)

The __Var__ component is used to define the decision variables:

In [10]:
# Random initialization.
# Every Var is initialized through a rule, so each indexed component gets its
# own independent uniform draw. A single scalar passed to `initialize` would
# instead start all components of a Var from the same value.
def uniform_init(low, high):
    """Return a Pyomo ``initialize`` rule that draws from U(low, high) per index."""
    def rule(model, *index):
        return float(np.random.uniform(low=low, high=high))
    return rule

# The weights of the j-th feature in the t-th branch node
model.a = Var(model.f_s, model.N_B, within=Reals, bounds=(-1.0, 1.0), initialize=uniform_init(-1.0, 1.0))

# The intercepts of the linear combinations, one per branch node
model.mu = Var(model.N_B, within=Reals, bounds=(-1.0, 1.0), initialize=uniform_init(-1.0, 1.0))

# The variables that take into account whether leaf node t is labeled with class k
model.C = Var(model.K, model.N_L, within=PercentFraction, initialize=uniform_init(0.0, 1.0))

# Auxiliary variables: P is indexed by (instance, leaf node), p by (instance, branch node)
model.P = Var(model.I, model.N_L, within=PercentFraction, initialize=uniform_init(0.0, 1.0))
model.p = Var(model.I, model.N_B, within=PercentFraction, initialize=uniform_init(0.0, 1.0))

Definitions of several functions used to characterize the objective function:

In [11]:
# Objective: total misclassification cost over the training instances
def cost_rule(model):
    """Build the objective expression.

    For every class k, every instance i in ``model.I_k[k]`` and every leaf t,
    the term ``P[i, t]`` is multiplied by the sum of ``W[k, j] * C[j, t]``
    over all classes j different from k; all terms are added up.
    """
    def leaf_cost(k, t):
        # Weighted label mass of leaf t over the classes other than k
        return sum(model.W[k, j] * model.C[j, t] for j in model.K if k != j)

    def instance_cost(k, i):
        return sum(model.P[i, t] * leaf_cost(k, t) for t in model.N_L)

    def class_cost(k):
        return sum(instance_cost(k, i) for i in model.I_k[k])

    return sum(class_cost(k) for k in model.K)
# Register cost_rule as the objective of the model, to be minimized.
model.cost = Objective(rule=cost_rule, sense=minimize)

In [12]:
# Link P[i, tl] to the branch-node variables p along the path to leaf tl
def Pr(model,i,tl):
    """Constraint rule tying ``P[i, tl]`` to the branch-node variables ``p``.

    The left-hand side is the product of ``p[i, t]`` over the nodes t in
    ``model.N_L_L[tl]`` times the product of ``1 - p[i, tr]`` over the nodes
    tr in ``model.N_L_R[tl]``; it is required to equal ``P[i, tl]``.
    """
    left_product = 1
    for t in model.N_L_L[tl]:
        left_product = left_product * model.p[i, t]

    right_product = 1
    for tr in model.N_L_R[tl]:
        right_product = right_product * (1 - model.p[i, tr])

    return left_product * right_product == model.P[i, tl]
# One Pr constraint for every (instance, leaf node) pair.
model.Pr = Constraint(model.I,model.N_L, rule=Pr)

def pr(model, i , tb):
    """Constraint rule defining ``p[i, tb]`` as a logistic function.

    The argument of the logistic function is 512 times the difference between
    the feature-averaged linear combination ``sum_j x[i, j] * a[j, tb]`` and
    the intercept ``mu[tb]``.

    The average is taken over ``len(model.f_s)`` features instead of a
    hard-coded 4, which matches ``my_sigmoid`` (it divides by ``len(x)``) and
    keeps the constraint correct for datasets with another number of features.
    """
    n_features = len(model.f_s)
    averaged = sum(model.x[i, j] * model.a[j, tb] for j in model.f_s) / n_features
    return 1 / (1 + exp(-512 * (averaged - model.mu[tb]))) == model.p[i, tb]
# One pr constraint for every (instance, branch node) pair.
model.pr = Constraint(model.I,model.N_B, rule=pr)

Similarly, rule functions are used to define constraint expressions in the __Constraint__ component:

In [13]:
# Constraints for making a single class prediction at each leaf node:
def class_in_leaf(model, tl):
    """Constraint rule: the variables ``C[k, tl]`` of leaf ``tl`` sum to 1 over all classes k."""
    label_total = sum(model.C[k, tl] for k in model.K)
    return label_total == 1
# One class_in_leaf constraint for every leaf node.
model.class_in_leaf = Constraint(model.N_L, rule=class_in_leaf)

# Each class k must be identified by at least one terminal node:
def leaf_in_class(model,k):
    """Constraint rule: the variables ``C[k, tl]`` of class ``k`` sum to at least 1 over all leaves."""
    leaf_total = sum(model.C[k, tl] for tl in model.N_L)
    return leaf_total >= 1
# One leaf_in_class constraint for every class.
model.leaf_in_class = Constraint(model.K, rule=leaf_in_class)

In [14]:
# Ipopt solver; the `executable` path is a placeholder to be replaced with the location of the local binary
opt = SolverFactory('ipopt',executable='C:/.../ipopt.exe')# in executable the directory path of ipopt.exe
# `model` is a ConcreteModel, so it is passed to the solver directly (no instance is created);
# tee=True shows the solver log in the cell output
#instance = model.create_instance()
results = opt.solve(model,tee=True)
#instance.display()

Ipopt 3.11.1: 

******************************************************************************
This program contains Ipopt, a library for large-scale nonlinear optimization.
 Ipopt is released as open source code under the Eclipse Public License (EPL).
         For more information visit http://projects.coin-or.org/Ipopt
******************************************************************************

NOTE: You are using Ipopt by default with the MUMPS linear solver.
      Other linear solvers might be more efficient (see Ipopt documentation).


This is Ipopt version 3.11.1, running with linear solver mumps.

Number of nonzeros in equality constraint Jacobian...:     3357
Number of nonzeros in inequality constraint Jacobian.:       12
Number of nonzeros in Lagrangian Hessian.............:     1165

Total number of variables............................:      811
                     variables with only lower bounds:        0
                variables with lower and upper bounds:      811


  72 2.7597845e+001 2.77e-001 4.79e+003  -1.7 4.89e-001   1.7 1.98e-001 4.33e-001h  1
  73 2.7547199e+001 2.71e-001 4.69e+003  -1.7 3.01e-001   2.1 1.42e-001 2.34e-002h  1
  74 2.7546670e+001 2.71e-001 1.67e+004  -1.7 2.71e-001   2.5 2.27e-001 2.52e-004h  1
  75r2.7546670e+001 2.71e-001 1.00e+003  -0.6 0.00e+000   2.1 0.00e+000 3.23e-007R  4
  76r2.7336396e+001 1.93e-001 9.99e+002  -0.6 1.27e+002    -  1.36e-003 1.41e-003f  1
  77 2.7336309e+001 1.93e-001 1.09e+004  -1.7 4.26e-001    -  1.35e-001 2.36e-005h  1
  78 2.7340174e+001 1.93e-001 1.12e+004  -1.7 1.50e+002    -  9.84e-005 9.59e-005f  1
  79 2.7323596e+001 1.93e-001 6.72e+003  -1.7 1.09e+002    -  1.82e-004 3.50e-005H  1
iter    objective    inf_pr   inf_du lg(mu)  ||d||  lg(rg) alpha_du alpha_pr  ls
  80 2.7323522e+001 1.93e-001 6.72e+003  -1.7 1.84e+002    -  2.46e-006 2.49e-006f  2
  81 2.7402324e+001 2.80e-001 1.08e+006  -1.7 1.60e+002    -  9.17e-006 1.97e-003f  2
  82 2.7405152e+001 2.77e-001 1.06e+006  -1.7 6.72e-001    

iter    objective    inf_pr   inf_du lg(mu)  ||d||  lg(rg) alpha_du alpha_pr  ls
 160 5.3934343e+000 1.52e-003 1.73e+005  -3.8 1.72e+000   0.4 5.89e-005 6.07e-003f  1
 161 5.3716457e+000 1.52e-003 1.23e+005  -3.8 9.74e-001  -0.1 1.69e-005 6.79e-003f  1
 162 5.3681149e+000 1.55e-003 1.48e+005  -3.8 3.36e+001   0.4 1.48e-007 1.41e-004f  1
 163 5.2408709e+000 1.65e-003 1.17e+005  -3.8 1.46e+000  -0.1 7.19e-005 1.22e-002f  1
 164 5.2317243e+000 1.65e-003 1.26e+005  -3.8 7.06e-001   0.3 5.98e-004 3.70e-003f  1
 165 5.2264148e+000 1.70e-003 1.22e+005  -3.8 3.15e+000  -0.2 4.15e-006 8.44e-004f  1
 166 5.2215856e+000 1.70e-003 1.25e+005  -3.8 2.66e-001   0.3 1.14e-004 2.29e-003f  1
 167 5.2150052e+000 1.73e-003 1.24e+005  -3.8 5.15e+000  -0.2 5.83e-005 3.98e-004f  1
 168 5.2151564e+000 1.73e-003 1.25e+005  -3.8 8.17e+000   0.2 5.23e-006 2.29e-005f  2
 169 5.2093803e+000 1.73e-003 1.28e+005  -3.8 4.94e+000  -0.3 1.40e-002 2.09e-004f  1
iter    objective    inf_pr   inf_du lg(mu)  ||d||  lg(rg) 

 247r9.1362970e+000 8.82e-002 7.70e+004  -0.0 7.17e+000    -  3.57e-003 9.97e-003f  1
 248r9.2163648e+000 5.79e-002 7.65e+004  -0.0 1.01e+001    -  1.23e-002 7.60e-003f  1
 249r9.3796570e+000 5.40e-002 7.51e+004  -0.0 1.71e+000    -  3.21e-002 1.82e-002f  1
iter    objective    inf_pr   inf_du lg(mu)  ||d||  lg(rg) alpha_du alpha_pr  ls
 250r9.4486325e+000 4.62e-002 7.78e+004  -0.0 4.11e+000    -  1.78e-002 1.25e-002f  1
 251r9.8448699e+000 3.71e-002 7.51e+004  -0.0 8.20e-001    -  7.51e-002 3.38e-002f  1
 252r1.0272391e+001 2.91e-002 6.69e+004  -0.0 2.25e-001   2.6 1.56e-001 1.11e-001f  1
 253r1.1492624e+001 4.01e-002 5.52e+004  -0.0 4.62e-001   2.1 1.46e-001 1.61e-001f  1
 254r1.6099665e+001 1.07e-001 1.41e+005  -0.0 5.34e-001   1.7 1.58e-001 3.78e-001f  1
 255r2.0058169e+001 3.33e-002 7.60e+004  -0.0 2.15e-001    -  2.67e-001 7.12e-001f  1
 256r2.5338910e+001 3.96e-002 2.29e+004  -0.0 1.91e-001    -  6.30e-001 1.00e+000f  1
 257r2.9715590e+001 1.34e-001 2.24e+004  -0.0 5.06e-001    

 334 3.2825841e+000 8.44e-003 2.22e+001  -3.8 8.19e-002    -  1.00e+000 1.00e+000f  1
 335 3.2181152e+000 6.09e-003 1.67e+006  -3.8 7.04e-002    -  1.00e+000 4.46e-001h  1
 336 3.0581530e+000 1.27e-002 2.73e+001  -3.8 8.63e-002    -  1.00e+000 1.00e+000f  1
 337 2.9889423e+000 1.18e-002 2.05e+007  -3.8 1.65e-001    -  9.17e-001 2.40e-001h  1
 338 2.7778432e+000 8.37e-002 1.78e+007  -3.8 7.10e-001    -  2.40e-001 5.45e-001f  1
 339 2.7769026e+000 8.36e-002 6.30e+006  -3.8 1.48e+000    -  3.74e-003 7.05e-004h  1
iter    objective    inf_pr   inf_du lg(mu)  ||d||  lg(rg) alpha_du alpha_pr  ls
 340 2.7342139e+000 6.65e-002 9.95e+006  -3.8 6.85e-002    -  3.53e-003 2.03e-001f  1
 341 2.6973923e+000 5.15e-002 7.66e+006  -3.8 4.80e-002    -  5.37e-002 2.24e-001h  1
 342 2.6497186e+000 3.89e-002 1.22e+007  -3.8 8.62e-002    -  1.78e-002 2.44e-001f  1
 343 2.5438152e+000 2.74e-002 1.55e+007  -3.8 4.39e-002    -  6.80e-001 2.95e-001f  1
 344 2.4624692e+000 2.29e-002 3.40e+007  -3.8 2.22e-001    

 421 7.8265310e-001 5.75e-003 1.40e+006  -3.8 1.21e-001    -  7.31e-001 1.85e-001h  1
 422 7.7986658e-001 5.98e-003 1.26e+006  -3.8 1.56e-001    -  8.63e-002 1.01e-001f  2
 423 7.7827143e-001 5.38e-003 1.03e+006  -3.8 7.72e-002    -  1.14e-001 1.78e-001h  1
 424 7.8181287e-001 1.60e-003 3.73e+005  -3.8 4.60e-002    -  6.01e-002 1.00e+000f  1
 425 7.8343599e-001 5.55e-004 4.41e-001  -3.8 1.73e-002    -  1.00e+000 1.00e+000h  1
 426 7.8381783e-001 2.22e-005 8.61e-003  -3.8 1.10e-003    -  1.00e+000 1.00e+000h  1
 427 7.5216022e-001 8.53e-004 8.88e+003  -5.7 4.55e-002    -  9.38e-001 3.43e-001f  1
 428 7.4208607e-001 6.86e-004 6.35e+003  -5.7 4.60e-002    -  7.85e-001 3.32e-001h  1
 429 7.3294250e-001 5.61e-004 6.55e+003  -5.7 2.97e-002    -  1.17e-001 4.70e-001h  1
iter    objective    inf_pr   inf_du lg(mu)  ||d||  lg(rg) alpha_du alpha_pr  ls
 430 7.3027613e-001 4.88e-004 1.71e+004  -5.7 2.30e-002    -  6.69e-001 1.60e-001h  1
 431 7.2616871e-001 4.04e-004 1.18e+004  -5.7 2.10e-002    

In iteration 500, 1 Slack too small, adjusting variable bound
 501 6.8744559e-001 1.27e-005 1.17e+003  -8.6 2.21e-002    -  8.87e-002 3.23e-001h  1
In iteration 501, 2 Slacks too small, adjusting variable bounds
 502 6.8744292e-001 1.24e-005 1.13e+003  -8.6 5.01e-002    -  5.70e-003 3.50e-002h  1
In iteration 502, 1 Slack too small, adjusting variable bound
 503 6.8743936e-001 1.14e-005 1.04e+003  -8.6 7.11e-002    -  2.04e-002 8.03e-002h  1
In iteration 503, 1 Slack too small, adjusting variable bound
 504 6.8743629e-001 1.09e-005 9.90e+002  -8.6 2.85e-002    -  1.94e-002 4.48e-002h  1
In iteration 504, 2 Slacks too small, adjusting variable bounds
 505 6.8741974e-001 9.10e-006 7.60e+002  -8.6 2.62e-002    -  1.15e-002 2.32e-001h  1
In iteration 505, 1 Slack too small, adjusting variable bound
 506 6.8739796e-001 7.61e-006 4.81e+002  -8.6 2.63e-002    -  3.31e-001 3.67e-001h  1
 507 6.8737994e-001 6.10e-006 2.96e+002  -8.6 3.14e-002    -  3.31e-002 3.85e-001h  1
In iteration 507, 2 Sl

In [15]:
# Solver report followed by the value of the objective at the returned solution
print(results)
print(value(model.cost))


Problem: 
- Lower bound: -inf
  Upper bound: inf
  Number of objectives: 1
  Number of constraints: 791
  Number of variables: 811
  Sense: unknown
Solver: 
- Status: ok
  Message: Ipopt 3.11.1\x3a Optimal Solution Found
  Termination condition: optimal
  Id: 0
  Error rc: 0
  Time: 9.170470476150513
Solution: 
- number of solutions: 0
  number of solutions displayed: 0

0.6873381562439903


Definitions of several functions used for the __Test Analysis__:

In [16]:
# Function to store the values of the variables a, mu and C
def extraction_va(model):
    """Collect the current values of ``model.mu``, ``model.a`` and ``model.C``.

    Returns a dict with the keys 'mu', 'a' and 'C'. Each entry maps the string
    form of every component of that variable to its ``.value``.
    """
    def snapshot(component):
        return {str(component[idx]): component[idx].value for idx in component}

    return {'mu': snapshot(model.mu), 'a': snapshot(model.a), 'C': snapshot(model.C)}

In [17]:
def my_sigmoid(a,x,mu,scale=512):
    """Logistic function of the scaled, feature-averaged linear score.

    Computes ``1 / (1 + exp(-scale * (sum_j a[j] * x[j] / len(x) - mu)))``.

    Parameters:
        a: coefficients, indexed by position (one per element of ``x``).
        x: feature values of a single instance.
        mu: intercept subtracted from the averaged linear combination.
        scale: steepness of the logistic function; the default value is 512
            as suggested in Blanquero et al.

    Returns:
        A float in [0, 1].

    The value is computed in a numerically stable way. ``math.exp`` raises
    OverflowError for large positive arguments, which the naive formula
    reaches whenever ``scale * (score - mu)`` is strongly negative.
    """
    n_features = len(x)
    score = sum(a[j] * x_j for j, x_j in enumerate(x)) / n_features - mu
    z = scale * score
    if z >= 0:
        # exp(-z) <= 1 here, so it cannot overflow
        return 1 / (1 + math.exp(-z))
    # For negative z use the equivalent form exp(z) / (1 + exp(z)); exp(z) < 1
    exp_z = math.exp(z)
    return exp_z / (1 + exp_z)

# Product of the elements of an iterable object
def multiply_numpy(iterable):
    """Return the product of all elements of ``iterable`` after converting it to a NumPy array."""
    factors = np.array(iterable)
    return factors.prod()

# Probability of an individual falling into a given leaf node
def Prob(model,var,x, leaf_idx):
    """Return the probability that instance ``x`` reaches leaf ``leaf_idx``.

    ``var`` is the dict produced by ``extraction_va``; coefficients and
    intercepts are looked up by their string names ('a[j,t]', 'mu[t]').
    The result is the product of the sigmoid values of the nodes in
    ``model.N_L_L[leaf_idx]`` times the product of one minus the sigmoid
    values of the nodes in ``model.N_L_R[leaf_idx]``.
    """
    def branch_probability(node):
        # Sigmoid of branch node `node`, evaluated at the instance x
        coefficients = list(var['a']['a['+str(i)+','+str(node)+']'] for i in index_features)
        return my_sigmoid(coefficients, x, var['mu']['mu['+str(node)+']'])

    left = [branch_probability(tl) for tl in model.N_L_L[leaf_idx]]
    right = [1 - branch_probability(tr) for tr in model.N_L_R[leaf_idx]]
    return multiply_numpy(left) * multiply_numpy(right)

#Calculate the predicted label of a single instance
def comp_label(model,x,var):
    """Return the predicted class (as an int) of the instance ``x``.

    For every class k the score is the sum over the leaves t of
    ``Prob(model, var, x, t) * C[k, t]``, where the C values are read from
    ``var['C']`` by their string names. The class with the highest score is
    returned; on ties the first such class in the order of ``model.K`` wins.

    The leaf probabilities do not depend on the class, so they are computed
    once per leaf instead of once per (class, leaf) pair.
    """
    leaf_prob = {t: Prob(model, var, x, t) for t in model.N_L}
    prob = {
        k: sum(leaf_prob[t] * var['C']['C['+str(k)+','+str(t)+']'] for t in model.N_L)
        for k in model.K
    }
    return int(max(prob, key=prob.get))

#Generate the list of predicted labels for the test set
def predicted_lab(model,X_test,var):
    """Return the predicted label of every row of ``X_test``, in row order.

    Rows are taken by position through ``X_test.iloc`` and passed to
    ``comp_label`` as plain lists.
    """
    return [comp_label(model, list(X_test.iloc[row]), var) for row in range(len(X_test))]

#Calculate the out-of-sample accuracy
def accuracy(y,y_pred):
    """Return the fraction of positions at which ``y`` and ``y_pred`` agree.

    Both arguments are compared position by position. They are materialized
    with ``list`` first, so a pandas Series whose index is not 0..n-1 (such as
    the labels of a shuffled test split) is handled correctly rather than
    being looked up by label.

    Raises:
        IndexError: if ``y_pred`` has fewer elements than ``y``.
        ZeroDivisionError: if ``y`` is empty.
    """
    actual = list(y)
    predicted = list(y_pred)
    if len(predicted) < len(actual):
        raise IndexError('y_pred has fewer elements than y')
    hits = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    return hits / len(actual)

In [18]:
# Read the values of the solved variables and predict the label of every test instance
var = extraction_va(model)
y_pred = predicted_lab(model,X_test,var)
# Plain list of the true test labels, in the same row order as y_pred
yy= list(y_test)
print(accuracy(yy,y_pred))
# Confusion matrix of true vs. predicted labels (displayed as the cell output)
confusion_matrix(y_test,y_pred)

0.9736842105263158


array([[10,  0,  0],
       [ 0, 10,  0],
       [ 0,  1, 17]], dtype=int64)