In [1]:
import sys
import os

# Source: https://stackoverflow.com/questions/16780014/import-file-from-parent-directory
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__name__))))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

import category_encoders as ce

from lusi.ecoc import SVMRandomInvariantsECOC
from lusi.types import InvariantTypes

In [2]:
df = pd.read_csv("../data/dermatology.data", header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,2,2,0,3,0,0,0,0,1,0,...,0,0,3,0,0,0,1,0,55,2
1,3,3,3,2,1,0,0,0,1,1,...,0,0,0,0,0,0,1,0,8,1
2,2,1,2,3,1,3,0,3,0,0,...,0,2,3,2,0,0,2,3,26,3
3,2,2,2,0,0,0,0,0,3,2,...,3,0,0,0,0,0,3,0,40,1
4,2,3,2,2,2,2,0,2,0,0,...,2,3,2,3,0,0,2,3,45,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2,1,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,2,0,25,4
362,3,2,1,0,1,0,0,0,0,0,...,1,0,1,0,0,0,2,0,36,4
363,3,2,2,2,3,2,0,2,0,0,...,0,3,0,3,0,0,2,3,28,3
364,2,1,3,1,2,3,0,2,0,0,...,0,2,0,1,0,0,2,3,50,3


In [3]:
# Process last attribute column
df.iloc[:, -2] = df.iloc[:, -2].replace('?', np.nan)
df.iloc[:, -2] = df.iloc[:, -2].fillna(df.iloc[:, -2].median()).astype(int)

In [4]:
df.iloc[:, -1] = df.iloc[:, -1].apply(lambda x: x - 1)
np.unique(df.iloc[:, -1].values)

array([0, 1, 2, 3, 4, 5])

In [5]:
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
X

array([[ 2,  2,  0, ...,  1,  0, 55],
       [ 3,  3,  3, ...,  1,  0,  8],
       [ 2,  1,  2, ...,  2,  3, 26],
       ...,
       [ 3,  2,  2, ...,  2,  3, 28],
       [ 2,  1,  3, ...,  2,  3, 50],
       [ 3,  2,  2, ...,  3,  0, 35]])

In [6]:
num_dimensions = X.shape[1]
num_classes = len(np.unique(y))

print("Num dimensions: ", num_dimensions)
print("Num classes: ", num_classes)

Num dimensions:  34
Num classes:  6


In [7]:
def run_single_experiment(X_train, X_test, y_train, y_test, clf, model_parameters):
    model = GridSearchCV(clf, model_parameters, cv=5, scoring='accuracy', n_jobs=4)
    model.fit(X_train, y_train)
    
    print('Best estimator: ', model.best_estimator_)
    
    y_hat = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_hat)
    
    print('Accuracy: ', accuracy)
    
    return accuracy

## Experiment 1: Using 80% of the data

### Baseline model

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=47)

In [9]:
# Define hyperparameters that will be used in the Grid Search
baseline_parameters = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.01, 0.1, 1.0, 10.0],
    'gamma': [0.01, 0.1, 1.0, 'auto', 'scale'],
    'degree': np.arange(10),
}

In [10]:
svm_baseline = SVC(random_state=47)

In [11]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    svm_baseline,
    baseline_parameters
)

Best estimator:  SVC(C=0.1, degree=0, gamma=0.01, kernel='linear', random_state=47)
Accuracy:  0.9459459459459459


0.9459459459459459

### Random projections ECOC

In [12]:
encoding = np.eye(num_classes)
encoding

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [13]:
ecoc_proj_params = {
    'kernel': ['rbf'],
    'C': [0.001, 0.01, 0.1, 1.0, 10.0],
    'gamma': [0.001, 0.01, 0.1, 1.0, 'auto'],
    'delta': [0.001, 0.01, 0.1, 1.0],
    'num_invariants': np.arange(1, num_dimensions + 1),
    'normalize_projections': [False, True],
}

In [14]:
ecoc_projections = SVMRandomInvariantsECOC(
    encoding,
    invariant_type=InvariantTypes.PROJECTION,
    tolerance=20,
    random_state=47,
)

In [15]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_projections,
    ecoc_proj_params
)

Best estimator:  SVMRandomInvariantsECOC(C=0.001,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001, num_invariants=8, random_state=47,
                        tolerance=20)
Accuracy:  0.9594594594594594


0.9594594594594594

### Random hyperplanes ECOC

In [16]:
ecoc_hyper_params = {
    'kernel': ['rbf'],
    'C': [0.001, 0.01, 0.1, 1.0, 10.0],
    'gamma': [0.001, 0.01, 0.1, 1.0, 'auto'],
    'delta': [0.001, 0.01, 0.1, 1.0],
    'num_invariants': np.arange(1, num_dimensions + 1),
}

In [17]:
ecoc_hyperplanes = SVMRandomInvariantsECOC(
    encoding,
    invariant_type=InvariantTypes.HYPERPLANE,
    tolerance=20,
    random_state=47,
)

In [18]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_hyperplanes,
    ecoc_hyper_params
)

Best estimator:  SVMRandomInvariantsECOC(C=0.001, delta=1.0,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=1, random_state=47, tolerance=20)
Accuracy:  0.9594594594594594


0.9594594594594594

## Experiment 2: Using 50% of the data

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=47)

In [20]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    svm_baseline,
    baseline_parameters
)

Best estimator:  SVC(C=0.1, degree=0, gamma=0.01, kernel='linear', random_state=47)
Accuracy:  0.9672131147540983


0.9672131147540983

In [21]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_projections,
    ecoc_proj_params
)

Best estimator:  SVMRandomInvariantsECOC(C=1.0, delta=0.1,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001, num_invariants=16, random_state=47,
                        tolerance=20)
Accuracy:  0.9289617486338798


0.9289617486338798

In [22]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_hyperplanes,
    ecoc_hyper_params
)

Best estimator:  SVMRandomInvariantsECOC(C=0.001, delta=1.0,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=1, random_state=47, tolerance=20)
Accuracy:  0.9508196721311475


0.9508196721311475

## Experiment 3: Using 30% of the data

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=47)

In [24]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    svm_baseline,
    baseline_parameters
)

Best estimator:  SVC(C=0.01, degree=2, gamma=0.1, kernel='poly', random_state=47)
Accuracy:  0.9221789883268483


0.9221789883268483

In [25]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_projections,
    ecoc_proj_params
)

Best estimator:  SVMRandomInvariantsECOC(C=10.0,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001, normalize_projections=True,
                        num_invariants=16, random_state=47, tolerance=20)
Accuracy:  0.9221789883268483


0.9221789883268483

In [26]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_hyperplanes,
    ecoc_hyper_params
)

Best estimator:  SVMRandomInvariantsECOC(C=10.0,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=19, random_state=47, tolerance=20)
Accuracy:  0.933852140077821


0.933852140077821

## Experiment 4: Using 20% of the data

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=47)

In [28]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    svm_baseline,
    baseline_parameters
)



Best estimator:  SVC(C=10.0, degree=1, gamma='auto', kernel='poly', random_state=47)
Accuracy:  0.9215017064846417


0.9215017064846417

In [29]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_projections,
    ecoc_proj_params
)



Best estimator:  SVMRandomInvariantsECOC(C=0.1,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001, num_invariants=15, random_state=47,
                        tolerance=20)
Accuracy:  0.9180887372013652


0.9180887372013652

In [30]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_hyperplanes,
    ecoc_hyper_params
)



Best estimator:  SVMRandomInvariantsECOC(C=0.1,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=21, random_state=47, tolerance=20)
Accuracy:  0.8976109215017065


0.8976109215017065

## Experiment 5: Using 10% of the data

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1, random_state=47)

In [32]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    svm_baseline,
    baseline_parameters
)



Best estimator:  SVC(C=0.1, degree=0, gamma=0.01, kernel='linear', random_state=47)
Accuracy:  0.9212121212121213


0.9212121212121213

In [33]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_projections,
    ecoc_proj_params
)



Best estimator:  SVMRandomInvariantsECOC(C=1.0,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001, num_invariants=17, random_state=47,
                        tolerance=20)
Accuracy:  0.8909090909090909


0.8909090909090909

In [34]:
run_single_experiment(
    X_train,
    X_test,
    y_train,
    y_test,
    ecoc_hyperplanes,
    ecoc_hyper_params
)



Best estimator:  SVMRandomInvariantsECOC(C=0.001,
                        encoding=array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]]),
                        gamma=0.001,
                        invariant_type=<InvariantTypes.HYPERPLANE: 'HYPERPLANE'>,
                        num_invariants=29, random_state=47, tolerance=20)
Accuracy:  0.8757575757575757


0.8757575757575757