In [1]:
import utils
import numpy as np
from svm import MySVM
from dataset import Dataset
from kernels import compute_kernel_matrix
from voting import Voting

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Index of the data split to load; it is substituted into every file name below.
dataset_id = 2

# Dataset receives five positional arguments, each a one-element list holding
# a CSV path. The order of the templates is the order of those arguments.
path_templates = (
    'data/Xtr{}.csv',
    'data/Xtr{}_mat50.csv',
    'data/Ytr{}.csv',
    'data/Xte{}.csv',
    'data/Xte{}_mat50.csv',
)
dataset = Dataset(*([template.format(dataset_id)] for template in path_templates))

# Report the split sizes and the share of each label value in the training set.
train_labels = dataset.dataset['train']['labels']
val_labels = dataset.dataset['val']['labels']
print('train: {}, val: {}'.format(len(train_labels), len(val_labels)))

label_share = np.bincount(train_labels).astype(float) / len(train_labels)
print('train 0/1 proportion: {}'.format(label_share))

train: 2000, val: 0
train 0/1 proportion: [0.5 0.5]


In [4]:
# Kernel settings shared by every configuration evaluated in this cell.
feature_extractor = 'spectrum'
kernel = 'rbf'
normalization = True

# Hyper-parameter grids. The loops below are the only place `spectrum_size`
# and `C` receive their values, so the values to try are listed here rather
# than as scalar assignments that the loops would immediately overwrite.
spectrum_sizes = [6]
C_values = [10.]

for spectrum_size in spectrum_sizes:
    print('spectrum size: {}'.format(spectrum_size))
    print("Computing Kernels...")
    # The kernel matrix depends only on the spectrum size, so it is computed
    # once per spectrum size and reused for every value of C.
    K_train_r = compute_kernel_matrix(dataset.dataset['train']['sequences'],
                                      spectrum_size=spectrum_size,
                                      feature_extractor=feature_extractor,
                                      kernel=kernel,
                                      normalization=normalization)
    for C in C_values:
        print('C: {}'.format(C))
        print("Training SVM...")
        my_svm_r = MySVM(C=C, dual=True, verbose=False)
        # NOTE: `K_train_r`, `scores` and `models` are rebound on every
        # iteration, so only the last configuration's results remain available
        # to later cells.
        scores, models = utils.cross_val(my_svm_r, K_train_r,
                                         dataset.dataset['train']['labels'])
        # The reported spread is twice the standard deviation of the fold scores.
        print("Accuracy: {} (+/- {})".format(scores.mean(), scores.std() * 2))

spectrum size: 6
Computing Kernels...
C: 10.0
Training SVM...
Fold 0: 0.62
Fold 1: 0.6425
Fold 2: 0.63
Fold 3: 0.6625
Fold 4: 0.625
Accuracy: 0.636 (+/- 0.0304302481094)


In [5]:
# Rebuild by hand the train/validation split of one fold (fold `i`) of a
# k-fold partition made of contiguous index ranges.
Y = dataset.dataset['train']['labels']
k = 5
nb_samples = len(Y)
fold_size = int(float(len(Y)) / k)
nb_train = nb_samples - fold_size
nb_val = fold_size
# NOTE(review): these two assignments overwrite the `scores` / `models`
# returned by utils.cross_val in the previous cell. The later Voting(models)
# cell receives this empty list unless another cell fills it first — confirm
# that this is intended.
scores = np.zeros((k,))
models = []
i = 0

# Fold i validates on [i * fold_size, (i + 1) * fold_size) and trains on
# every other index. list(...) keeps the concatenation valid when range()
# returns a lazy object instead of a list.
idx_val = list(range(i * fold_size, (i + 1) * fold_size))
idx_train = list(range(i * fold_size)) + list(range((i + 1) * fold_size, nb_samples))

Y_train = [Y[j] for j in idx_train]
Y_val = [Y[j] for j in idx_val]

# np.ix_ selects the rows x columns cross-product in a single indexing
# operation, replacing one Python-level scalar lookup per kernel entry.
# The results have shapes (nb_train, nb_train) and (nb_val, nb_train).
# Assumes K_train_r is a dense 2-D array — TODO confirm.
K_full = np.asarray(K_train_r)
K_train = K_full[np.ix_(idx_train, idx_train)]
K_val = K_full[np.ix_(idx_val, idx_train)]


In [8]:
# Wrap the collected models in a Voting ensemble and predict labels for the
# validation kernel block K_val.
# NOTE(review): the visible In [5] cell sets `models = []` and the execution
# counter jumps from 5 to 8, so whatever populated `models` is not part of the
# notebook any more; on a fresh "Restart & Run All" this ensemble is built
# from an empty list — confirm and restore the missing training step.
# NOTE(review): the second positional argument (0.5) is presumably a decision
# threshold — verify against voting.Voting.predict.
voting = Voting(models)
# voting.score(K_val, Y_val, 0.5)
Y_pred = voting.predict(K_val,0.5)

In [9]:
# Display the ensemble's predictions for the validation fold.
# NOTE(review): the visible part of the saved output contains only zeros,
# which suggests a degenerate ensemble rather than a useful classifier.
Y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,