In [1]:
from gurobipy import *
import pandas as pd
import numpy as np
import requests
from io import StringIO
m = Model('mip1')

Academic license - for non-commercial use only


### Data Wrangling

In [31]:
url2 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
names2 = ('Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
          'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 
          'Mitoses', 'class')
remote2 = requests.get(url2).content
breast_cancer_df = pd.read_csv(StringIO(remote2.decode('utf-8')), names = names2)


In [32]:
breast_cancer_df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
def nullify(x):
    if x == '?':
        return np.nan
    else:
        return x

In [4]:
breast_cancer_df = breast_cancer_df.applymap(nullify)
breast_cancer_df = breast_cancer_df.drop(columns = 'Sample code number')
breast_cancer_df = breast_cancer_df.dropna(axis = 0,how = 'any').reset_index(drop = True)

In [5]:
breast_cancer_df = breast_cancer_df.astype(int)
breast_cancer_df = breast_cancer_df / breast_cancer_df.max()
breast_cancer_df['class'] = breast_cancer_df['class'].replace(0.5, 0).astype(int)
breast_cancer_df.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,class
0,0.5,0.1,0.1,0.1,0.2,0.1,0.3,0.1,0.1,0
1,0.5,0.4,0.4,0.5,0.7,1.0,0.3,0.2,0.1,0
2,0.3,0.1,0.1,0.1,0.2,0.2,0.3,0.1,0.1,0
3,0.6,0.8,0.8,0.1,0.3,0.4,0.3,0.7,0.1,0
4,0.4,0.1,0.1,0.3,0.2,0.1,0.3,0.1,0.1,0


In [6]:
breast_cancer_df1 = breast_cancer_df[0:543]
breast_cancer_df2 = breast_cancer_df[543:]

In [7]:
breast_cancer_df.shape

(683, 10)

In [8]:
breast_cancer = breast_cancer_df.values
breast_cancer

array([[0.5, 0.1, 0.1, ..., 0.1, 0.1, 0. ],
       [0.5, 0.4, 0.4, ..., 0.2, 0.1, 0. ],
       [0.3, 0.1, 0.1, ..., 0.1, 0.1, 0. ],
       ...,
       [0.5, 1. , 1. , ..., 1. , 0.2, 1. ],
       [0.4, 0.8, 0.6, ..., 0.6, 0.1, 1. ],
       [0.4, 0.8, 0.8, ..., 0.4, 0.1, 1. ]])

In [9]:
training_data = breast_cancer[:543,:]
X_train = training_data[:,:-1]

In [10]:
rows, cols = X_train.shape

### Determine the depth

In [11]:
depth = 3

### CART Tree

In [12]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth = depth, max_leaf_nodes = 2**depth)
clf.fit(X_train, breast_cancer_df[0:543].values[:, -1].reshape((-1, 1)))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [13]:
total_nodes = clf.tree_.node_count
leaf_nodes = round(total_nodes / 2)
branch_nodes = total_nodes // 2

In [14]:
left_nodes = clf.tree_.children_left
right_nodes = clf.tree_.children_right
left_nodes = left_nodes[0:branch_nodes]
right_nodes = right_nodes[0:branch_nodes]

In [15]:
feature = clf.tree_.feature
initial_a = []
initial_a_tmp = []
for i in range(len(feature)):
    if feature[i] != -2 and feature[i+1] != -2:
        initial_a.append(feature[i])
    elif feature[i] != -2 and feature[i+1] == -2:
        initial_a_tmp.append(feature[i])
    else:
        continue
initial_a.extend(initial_a_tmp)
initial_a

[2, 0, 5, 8, 5, 4, 6]

In [16]:
threshold = clf.tree_.threshold
initial_b = []
initial_b_tmp = []
for i in range(len(threshold)):
    if threshold[i] != -2 and threshold[i+1] != -2:
        initial_b.append(threshold[i])
    elif threshold[i] != -2 and threshold[i+1] == -2:
        initial_b_tmp.append(threshold[i])
    else:
        continue
initial_b.extend(initial_b_tmp)
initial_b

[0.2500000074505806,
 0.6500000059604645,
 0.15000000223517418,
 0.8499999940395355,
 0.15000000223517418,
 0.45000000298023224,
 0.45000000298023224]

In [17]:
test_data = breast_cancer[543:,:]
X_test = test_data[:,:-1]
Y_test = breast_cancer_df2['class']

In [18]:
clf.score(X_test, Y_test)

0.95

### OCT

### Determine the alpha and K

In [19]:
alpha = 0.5
K = 2

In [20]:
max_diff = np.max(X_train, 0) - np.min(X_train, 0)
epsilon = np.ones([1, cols], dtype = int) * max_diff
for i in range(cols):
    old = 0
    for j in range(1, rows):
        diff = abs(X_train[j, i] - X_train[old ,i])
        old = j
        if diff < epsilon[0, i] and diff != 0:
            epsilon[0, i] = diff

epsilon = abs(epsilon)
epsilon

array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]])

In [21]:
max_epsilon = np.max(epsilon)

n = rows
num_feature = cols

### Determine the Y matrix

In [22]:
Y = np.zeros([rows,K], dtype = int) - 1

Y[breast_cancer_df1.index, breast_cancer_df1['class'].astype(int)] = 1

Y

array([[ 1, -1],
       [ 1, -1],
       [ 1, -1],
       ...,
       [ 1, -1],
       [ 1, -1],
       [ 1, -1]])

In [23]:
l = m.addVars(leaf_nodes, vtype = GRB.BINARY, name = "l")

z = m.addVars(n, leaf_nodes, vtype = GRB.BINARY, name = "z")

N_kt = m.addVars(K, leaf_nodes, vtype = GRB.INTEGER, name = "N_kt")

N_t = m.addVars(leaf_nodes, vtype = GRB.INTEGER, name = "N_t")

c_kt = m.addVars(K, leaf_nodes, vtype = GRB.BINARY, name = "c")

L = m.addVars(leaf_nodes, vtype = GRB.INTEGER, name = "L")

a = m.addVars(branch_nodes, num_feature, vtype = GRB.BINARY, name = 'a')

b = m.addVars(branch_nodes ,vtype = GRB.CONTINUOUS, name = "b")

d = m.addVars(branch_nodes ,vtype = GRB.BINARY, name = "d")

### Warm start

In [24]:
# warm start using the results of CART algorithm

for i in range(branch_nodes):
    a[i, initial_a[i]].start = 1
    b[i].start = initial_b[i]

In [25]:
m.update()

In [26]:
m.setObjective(L.sum() + alpha * d.sum(), GRB.MINIMIZE)

### OCT Constraints

In [28]:
for i in range(branch_nodes):
    b[i].setAttr(GRB.Attr.LB, 0)
    m.addConstr(b[i] <= d[i])
    m.addConstr(a.sum(i, '*') == d[i])
    m.addConstr(d[i] == 1)
    m.addConstr(l[i] == 1)
    
for i in range(leaf_nodes):
    m.addConstr(L[i] >= 0)
    m.addConstr(N_t[i] == z.sum('*', i))
    m.addConstr(l[i] == c_kt.sum('*', i))
    m.addConstr(z.sum('*', i) >= l[i])
    for j in range(K):
        m.addConstr(L[i] >= N_t[i] - N_kt[j,i] - n * (1 - c_kt[j,i]))
        m.addConstr(L[i] <= N_t[i] - N_kt[j,i] + n * c_kt[j,i])
        m.addConstr(N_kt[j,i] == 1/2 * sum(z.select('*', i) * (Y[:,j] + 1)))

for i in range(n):
    m.addConstr(z.sum(i, '*') == 1)
    
for i in range (leaf_nodes):
    for j in range(n):
        m.addConstr(z[j,i] <= l[i])

all_branch_nodes = list(reversed(range(branch_nodes)))
depth_dict = {}
for i in range(depth):
    depth_dict[i] = sorted(all_branch_nodes[-2**i:])
    for j in range(2**i):
        all_branch_nodes.pop()

all_leaf_nodes = list(range(leaf_nodes))
branch_dict = {}
for i in range(branch_nodes):
    for k in range(depth):
        if i in depth_dict[k]:
            floor_len = len(depth_dict[k])
            step = 2**depth // floor_len
            sliced_leaf = [all_leaf_nodes[i:i+step] for i in range(0, 2**depth, step)]
            idx = depth_dict[k].index(i)
            branch_dict[i] = sliced_leaf[idx]
        else:
            continue
            
for j in range(n):
    for i in range(leaf_nodes):
        for k in range(branch_nodes):
            if i in branch_dict[k]:
                length = len(branch_dict[k])
                idx = branch_dict[k].index(i)
                if idx+1 <= length//2:
                    m.addConstr(sum(a.select(k, '*') * (X_train[j,:] + epsilon[0,:])) <= b[k] + (1 + max_epsilon) * (1 - z[j,i]))
                elif idx+1 > length//2:
                    m.addConstr(sum(a.select(k, '*') * X_train[j,:]) >= b[k] - (1 - z[j,i]))
            else:
                continue

In [29]:
m.Params.timelimit = 600

In [30]:
m.optimize()

Optimize a model with 18664 rows, 4477 columns and 187358 nonzeros
Variable types: 7 continuous, 4470 integer (4438 binary)
Coefficient statistics:
  Matrix range     [1e-01, 5e+02]
  Objective range  [5e-01, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 5e+02]

MIP start did not produce a new incumbent solution

Presolve removed 674 rows and 15 columns
Presolve time: 0.31s
Presolved: 17990 rows, 4462 columns, 160967 nonzeros
Variable types: 0 continuous, 4462 integer (4431 binary)

Root relaxation: objective 3.500000e+00, 2512 iterations, 0.33 seconds

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

     0     0    3.50000    0  212          -    3.50000      -     -    2s
Another try with MIP start
H    0     0                     145.5000000    3.50000  97.6%     -    3s
     0     0    3.50000    0 1044  145.50000    3.50000  97.6%     -    3s
     0     0   

In [38]:
for v in m.getVars():
    print(v.varName, v.x)

l[0] 1.0
l[1] 1.0
l[2] 1.0
l[3] 1.0
z[0,0] 1.0
z[0,1] -0.0
z[0,2] 0.0
z[0,3] -0.0
z[1,0] -0.0
z[1,1] 0.0
z[1,2] -0.0
z[1,3] 1.0
z[2,0] 1.0
z[2,1] -0.0
z[2,2] 0.0
z[2,3] -0.0
z[3,0] 0.0
z[3,1] -0.0
z[3,2] 0.0
z[3,3] 1.0
z[4,0] 1.0
z[4,1] -0.0
z[4,2] 0.0
z[4,3] -0.0
z[5,0] -0.0
z[5,1] -0.0
z[5,2] -0.0
z[5,3] 1.0
z[6,0] -0.0
z[6,1] 0.0
z[6,2] 1.0
z[6,3] 0.0
z[7,0] 1.0
z[7,1] -0.0
z[7,2] 0.0
z[7,3] -0.0
z[8,0] 1.0
z[8,1] -0.0
z[8,2] -0.0
z[8,3] -0.0
z[9,0] 1.0
z[9,1] 0.0
z[9,2] 0.0
z[9,3] -0.0
z[10,0] 1.0
z[10,1] 0.0
z[10,2] 0.0
z[10,3] -0.0
z[11,0] 1.0
z[11,1] -0.0
z[11,2] 0.0
z[11,3] -0.0
z[12,0] -0.0
z[12,1] -0.0
z[12,2] -0.0
z[12,3] 1.0
z[13,0] 0.0
z[13,1] -0.0
z[13,2] 1.0
z[13,3] -0.0
z[14,0] -0.0
z[14,1] -0.0
z[14,2] -0.0
z[14,3] 1.0
z[15,0] -0.0
z[15,1] 1.0
z[15,2] -0.0
z[15,3] 0.0
z[16,0] 1.0
z[16,1] -0.0
z[16,2] 0.0
z[16,3] -0.0
z[17,0] 1.0
z[17,1] 0.0
z[17,2] 0.0
z[17,3] -0.0
z[18,0] -0.0
z[18,1] -0.0
z[18,2] -0.0
z[18,3] 1.0
z[19,0] 1.0
z[19,1] -0.0
z[19,2] 0.0
z[19,3] -0.0
z[20

z[360,1] -0.0
z[360,2] -0.0
z[360,3] -0.0
z[361,0] 1.0
z[361,1] -0.0
z[361,2] 0.0
z[361,3] -0.0
z[362,0] 1.0
z[362,1] -0.0
z[362,2] 0.0
z[362,3] 0.0
z[363,0] 1.0
z[363,1] -0.0
z[363,2] 0.0
z[363,3] -0.0
z[364,0] 1.0
z[364,1] -0.0
z[364,2] 0.0
z[364,3] 0.0
z[365,0] 1.0
z[365,1] -0.0
z[365,2] 0.0
z[365,3] 0.0
z[366,0] 1.0
z[366,1] -0.0
z[366,2] 0.0
z[366,3] -0.0
z[367,0] 0.0
z[367,1] -0.0
z[367,2] -0.0
z[367,3] 1.0
z[368,0] 1.0
z[368,1] -0.0
z[368,2] 0.0
z[368,3] -0.0
z[369,0] 1.0
z[369,1] -0.0
z[369,2] 0.0
z[369,3] -0.0
z[370,0] 1.0
z[370,1] -0.0
z[370,2] 0.0
z[370,3] -0.0
z[371,0] 1.0
z[371,1] -0.0
z[371,2] 0.0
z[371,3] -0.0
z[372,0] -0.0
z[372,1] -0.0
z[372,2] -0.0
z[372,3] 1.0
z[373,0] 1.0
z[373,1] -0.0
z[373,2] 0.0
z[373,3] -0.0
z[374,0] 1.0
z[374,1] -0.0
z[374,2] 0.0
z[374,3] -0.0
z[375,0] 1.0
z[375,1] -0.0
z[375,2] -0.0
z[375,3] -0.0
z[376,0] 1.0
z[376,1] -0.0
z[376,2] 0.0
z[376,3] -0.0
z[377,0] -0.0
z[377,1] 0.0
z[377,2] -0.0
z[377,3] 1.0
z[378,0] 1.0
z[378,1] -0.0
z[378,2] 0.0
z

In [39]:
print('Obj:', m.objVal)

Obj: 20.5


### Obtain the Tree Structure

In [40]:
# Obtain the coefficients of a and b
coff_a = np.zeros([branch_nodes, num_feature], dtype = int)
# coff_b = np.zeros(branch_nodes, dtype = int)
coff_b = np.zeros(branch_nodes)

for i in range(branch_nodes):
    tmp1 = m.getVarByName('b' + '[' + str(i) + ']')
#     coff_b[i] = int(tmp1.x)
    coff_b[i] = tmp1.x
    for j in range(num_feature):
        tmp2 = m.getVarByName('a' + '[' + str(i) + ',' + str (j) + ']')
        coff_a[i,j] = int(tmp2.x)


In [41]:
# Obtain the labels of leaf nodes
labels = np.zeros(leaf_nodes, dtype = int) - 1
coff_c = np.zeros([K, leaf_nodes], dtype = int)

for i in range(K):
    for j in range(leaf_nodes):
        tmp3 = m.getVarByName('c' + '[' + str(i) + ',' + str (j) + ']')
        coff_c[i,j] = int(tmp3.x)

k_idx, t_idx = np.where(coff_c == 1)
# for i in range(leaf_nodes):
for i in range(len(k_idx)):
    labels[t_idx[i]] = k_idx[i]

### Test Data

In [42]:
test_data = breast_cancer[543:,:]
X_test = test_data[:,:-1]
Y_test = breast_cancer_df2['class']
# Y_test = Y_test.astype(int)
# Y_test = Y_test // 2 - 1

In [43]:
t_rows, t_cols = X_test.shape

In [44]:
tmp = np.zeros([t_rows, 1], dtype = int)
Y_predict = np.hstack((np.reshape(Y_test.values, (t_rows, 1)), tmp))

In [45]:
num_nodes = 0
for i in range(branch_nodes): 
    tmp4 = m.getVarByName('d' + '[' + str(i) + ']')
    num_nodes += int(tmp4.x)

In [46]:
init = np.array([], dtype = int).reshape(0, num_feature)
nodes = {}
for i in range(num_nodes * 2):
    nodes[i] = init

# split
for i in range(t_rows):
    if np.dot(coff_a[0,:], np.transpose(X_test[i,:])) < coff_b[0]:
        nodes[0] = np.vstack([X_test[i,:], nodes[0]])
        if np.dot(coff_a[1,:], np.transpose(X_test[i,:])) < coff_b[1]:
            nodes[2] = np.vstack([X_test[i,:], nodes[2]])
            Y_predict[i,1] = labels[0]
            
        elif np.dot(coff_a[1,:], np.transpose(X_test[i,:])) >= coff_b[1]:
            nodes[3] = np.vstack([X_test[i,:], nodes[3]])
            Y_predict[i,1] = labels[1]
            
    elif np.dot(coff_a[0,:], np.transpose(X_test[i,:])) >= coff_b[0]:
        nodes[1] = np.vstack([X_test[i,:], nodes[1]])
        if np.dot(coff_a[2,:], np.transpose(X_test[i,:])) < coff_b[2]:
            nodes[4] = np.vstack([X_test[i,:], nodes[4]])
            Y_predict[i,1] = labels[2]
            
        elif np.dot(coff_a[2,:], np.transpose(X_test[i,:])) >= coff_b[2]:
            nodes[5] = np.vstack([X_test[i,:], nodes[5]])
            Y_predict[i,1] = labels[3]


In [47]:
for i in range(len(nodes)):
    if i > 1:
        print('node' + ' ' + str(i+2) + ' (predicted label is ' + str(labels[i-2]) + ')')
    else:
        print('node' + ' ' + str(i+2))
    
    print(nodes[i])

node 2
[[0.5 1.  1.  ... 0.8 1.  0.2]
 [0.2 0.1 0.1 ... 0.1 0.1 0.1]
 [0.3 0.1 0.1 ... 0.1 0.1 0.1]
 ...
 [0.5 0.1 0.1 ... 0.3 0.1 0.1]
 [0.5 0.1 0.1 ... 0.2 0.1 0.1]
 [0.2 0.1 0.1 ... 0.2 0.1 0.1]]
node 3
[[0.4 0.8 0.8 0.5 0.4 0.5 1.  0.4 0.1]
 [0.4 0.8 0.6 0.4 0.3 0.4 1.  0.6 0.1]
 [0.5 1.  1.  0.5 0.4 0.5 0.4 0.4 0.1]
 [0.5 1.  1.  1.  0.4 1.  0.5 0.6 0.3]
 [1.  1.  1.  1.  0.5 1.  1.  1.  0.7]
 [0.3 1.  0.7 0.8 0.5 0.8 0.7 0.4 0.1]
 [0.5 1.  1.  0.8 0.5 0.5 0.7 1.  0.1]
 [0.7 0.8 0.8 0.7 0.3 1.  0.7 0.2 0.3]
 [0.3 0.1 0.1 0.2 0.3 0.4 0.1 0.1 0.1]
 [0.2 0.1 0.1 0.1 0.2 0.5 0.1 0.1 0.1]
 [0.4 0.6 0.6 0.5 0.7 0.6 0.7 0.7 0.3]
 [0.3 0.1 0.3 0.1 0.3 0.4 0.1 0.1 0.1]
 [0.8 1.  1.  1.  0.6 1.  1.  1.  1. ]
 [1.  0.4 0.3 1.  0.3 1.  0.7 0.1 0.2]
 [0.5 1.  1.  1.  1.  1.  1.  0.1 0.1]
 [1.  0.5 1.  0.3 0.5 0.8 0.7 0.8 0.3]
 [0.5 0.3 0.2 0.8 0.5 1.  0.8 0.1 0.2]
 [0.4 0.8 0.6 0.3 0.4 1.  0.7 0.1 0.1]
 [1.  0.3 0.4 0.5 0.3 1.  0.4 0.1 0.1]
 [0.2 0.5 0.7 0.6 0.4 1.  0.7 0.6 0.1]
 [0.8 1.  1.  

### Test Accuracy

In [48]:
1 - sum(np.minimum(abs(Y_predict[:,1] - Y_predict[:,0]), 1)) / t_rows

0.9785714285714285