In [1]:
import sys
import os

# Put the parent of the current working directory on the import path so that
# project-local packages (presumably `lusi`) can be imported from this notebook.
# Source: https://stackoverflow.com/questions/16780014/import-file-from-parent-directory
sys.path.append(os.path.dirname(os.getcwd()))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

import category_encoders as ce

from lusi.ecoc import SVMRandomInvariantsECOC
from lusi.types import InvariantTypes
from utils import run_experiment

In [2]:
# The CSV has no header row, so pandas labels the columns with integers (0..4).
df = pd.read_csv(filepath_or_buffer="../data/balance_scale.csv", header=None)
df

Unnamed: 0,0,1,2,3,4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
...,...,...,...,...,...
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4


In [3]:
# Column 0 carries the class label; every remaining column is a feature.
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values
X

array([[1, 1, 1, 1],
       [1, 1, 1, 2],
       [1, 1, 1, 3],
       ...,
       [5, 5, 5, 3],
       [5, 5, 5, 4],
       [5, 5, 5, 5]])

In [4]:
# Transform labels to numerical values.
# `le` keeps the fitted label <-> code mapping (e.g. for `inverse_transform`).
le = LabelEncoder().fit(y)
y = le.transform(y)

In [5]:
# Feature count is the width of X; class count is the number of distinct labels.
num_dimensions = X.shape[-1]
num_classes = np.unique(y).size

print("Num dimensions: ", num_dimensions)
print("Num classes: ", num_classes)

Num dimensions:  4
Num classes:  3


In [6]:
def run_single_experiment(X_train, X_test, y_train, y_test, clf, model_parameters,
                          cv=5, scoring='accuracy', n_jobs=4):
    """Tune ``clf`` with a cross-validated grid search and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; the grid search is fitted on these.
    X_test, y_test
        Held-out features and labels used only for the final accuracy.
    clf
        Estimator handed to ``GridSearchCV``.
    model_parameters
        Parameter grid handed to ``GridSearchCV``.
    cv : default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : default 'accuracy'
        Metric used by the grid search to rank parameter combinations.
    n_jobs : default 4
        Number of parallel workers forwarded to ``GridSearchCV``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)`` for the selected model.

    The best estimator and the test accuracy are also printed.
    """
    search = GridSearchCV(clf, model_parameters, cv=cv, scoring=scoring, n_jobs=n_jobs)
    search.fit(X_train, y_train)

    print('Best estimator: ', search.best_estimator_)

    # With GridSearchCV's default refit behaviour, predict() delegates to the
    # best estimator refitted on the whole training split.
    y_hat = search.predict(X_test)
    accuracy = accuracy_score(y_test, y_hat)

    print('Accuracy: ', accuracy)

    return accuracy

In [7]:
# Identity code matrix of size num_classes x num_classes.
# Presumably each class gets its own binary problem (one-vs-rest) — confirm against lusi.ecoc.
encoding = np.identity(num_classes)
encoding

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [8]:
# Define models: a baseline with default settings plus one model per invariant
# type, all sharing the same encoding and tolerance.
models = [('Baseline', SVMRandomInvariantsECOC(encoding))] + [
    (label, SVMRandomInvariantsECOC(encoding, invariant_type=kind, tolerance=20))
    for label, kind in (
        ('Vapnik', InvariantTypes.VAPNIK),
        ('Projections', InvariantTypes.PROJECTION),
        ('Hyperplanes', InvariantTypes.HYPERPLANE),
    )
]

In [9]:
# Define models parameters.
# One grid per entry of `models`, in the same order (Baseline, Vapnik,
# Projections, Hyperplanes). The grids share kernel/C/gamma/delta and differ
# only in the `num_invariants` values that are searched.
# NOTE(review): `delta` is also searched for the baseline, which uses
# num_invariants=0 — confirm it has any effect there.
models_parameters = [
    {
        'kernel': ['rbf'],
        'C': [0.001, 0.01, 0.1, 1.0, 10.0],
        'gamma': [0.001, 0.01, 0.1, 1.0, 'auto'],
        'delta': [0.001, 0.01, 0.1, 1.0],
        'num_invariants': num_invariants_grid,
    }
    for num_invariants_grid in (
        [0],
        np.arange(1, num_dimensions + 2),
        np.arange(1, num_dimensions + 1),
        np.arange(1, num_dimensions + 1),
    )
]

In [10]:
# Define seeds and train sizes.
# Each seed is one repetition (it shows up as `random_state` in the run log);
# each train size is the fraction of the training data that is used.
seeds, train_sizes = [47, 4, 81], [1.0, 0.5, 0.1]

In [11]:
# Create the output directory before the long-running experiment:
# DataFrame.to_csv does not create missing parent directories, so a missing
# `results/` folder would otherwise only fail after all the grid searches ran.
os.makedirs('results', exist_ok=True)

results_df = run_experiment(X, y, seeds, train_sizes, models, models_parameters)
results_df.to_csv('results/balance_scale.csv', index=False)

-------------------- EXPERIMENT 1 --------------------




------ Using 1.0 of the training data ------


Training Baseline model

Running experiment with random_state=47
Best estimator:  SVMRandomInvariantsECOC(C=0.001,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.001, num_invariants=0, random_state=47)
Accuracy:  0.912
Running experiment with random_state=4
Best estimator:  SVMRandomInvariantsECOC(C=0.001,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.001, num_invariants=0, random_state=4)
Accuracy:  0.912
Running experiment with random_state=81
Best estimator:  SVMRandomInvariantsECOC(C=0.001,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.001, num_invariants=0, random_state=81)
Accuracy:  0.912



Training Vapnik model

Running experi

Best estimator:  SVMRandomInvariantsECOC(C=0.1, delta=1.0,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.1,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=1, random_state=81, tolerance=20)
Accuracy:  0.864





------ Using 0.1 of the training data ------


Training Baseline model

Running experiment with random_state=47
Best estimator:  SVMRandomInvariantsECOC(C=0.01,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.001, num_invariants=0, random_state=47)
Accuracy:  0.864
Running experiment with random_state=4
Best estimator:  SVMRandomInvariantsECOC(C=0.01,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.001, num_invariants=0, random_state=4)
Accuracy:  0.864
Running e

Best estimator:  SVMRandomInvariantsECOC(C=1.0,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.1,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=3, random_state=4, tolerance=20)
Accuracy:  0.904
Running experiment with random_state=81
Best estimator:  SVMRandomInvariantsECOC(C=0.1,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.1,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=2, random_state=81, tolerance=20)
Accuracy:  0.896





------ Using 0.5 of the training data ------


Training Baseline model

Running experiment with random_state=47
Best estimator:  SVMRandomInvariantsECOC(C=0.01,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0.

Best estimator:  SVMRandomInvariantsECOC(C=0.1,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.1,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=3, random_state=4, tolerance=20)
Accuracy:  0.832
Running experiment with random_state=81
Best estimator:  SVMRandomInvariantsECOC(C=0.1,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.001,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=4, random_state=81, tolerance=20)
Accuracy:  0.528



-------------------- EXPERIMENT 3 --------------------




------ Using 1.0 of the training data ------


Training Baseline model

Running experiment with random_state=47
Best estimator:  SVMRandomInvariantsECOC(C=0.001,
                        encod

Best estimator:  SVMRandomInvariantsECOC(C=0.01,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.01,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=2, random_state=47, tolerance=20)
Accuracy:  0.576
Running experiment with random_state=4
Best estimator:  SVMRandomInvariantsECOC(C=0.1,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        gamma=0.1,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=2, random_state=4, tolerance=20)
Accuracy:  0.856
Running experiment with random_state=81
Best estimator:  SVMRandomInvariantsECOC(C=10.0, delta=1.0,
                        encoding=array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]),
                        invariant_type=<InvariantTypes.H