In [2]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.fftpack import fft
from sklearn import svm
from sklearn.metrics import accuracy_score

In [28]:
# Seeded generator used for the row shuffles below.
random = np.random.RandomState(0)
orgn_N, test_N, K = 10000, 2000, 5


def _load_split(data_path, label_path):
    """Return [rescaled features | label column] for one split.

    Features are shifted by -10 and divided by 20; the labels are appended
    as one extra trailing column.
    """
    features = (np.load(data_path) - 10) / 20
    labels = np.load(label_path)
    return np.hstack([features, np.expand_dims(labels, axis=1)])


def _gender_column(flags, block_len=3397):
    """Expand one 0/1 flag per block of `block_len` rows into a float column.

    The result has shape (len(flags) * block_len, 1).
    NOTE(review): each block presumably corresponds to one subject's
    recordings -- confirm against the dataset description.
    """
    return np.repeat(np.asarray(flags, dtype=float), block_len).reshape(-1, 1)


# Row layout after this step: features, label, gender flag.
orgn_data = np.hstack([
    _load_split('data_hw2/train_data.npy', 'data_hw2/train_label.npy'),
    _gender_column([1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1]),
])
genders = _gender_column([1, 0, 1, 0])
test_data = np.hstack([
    _load_split('data_hw2/test_data.npy', 'data_hw2/test_label.npy'),
    genders,
])

# Shuffle rows in place (training pool first, then test pool), then subsample.
random.shuffle(orgn_data)
random.shuffle(test_data)
orgn_data, test_data = orgn_data[:orgn_N], test_data[:test_N]

# K contiguous folds; the last fold absorbs any remainder rows.
fold_size = orgn_N // K
fold_starts = [fold_idx * fold_size for fold_idx in range(K)]
fold_stops = fold_starts[1:] + [None]
fold_data = [orgn_data[start:stop] for start, stop in zip(fold_starts, fold_stops)]

In [32]:
# Number of feature columns per data row. The code below slices [:, :DIM] as the
# model input, reads the class label from column 310 and the gender flag from
# column 311.
DIM = 310

def decision2proba(x):
    """Squash decision values into [0, 1] with the logistic sigmoid.

    Parameters
    ----------
    x : array_like or scalar
        Signed decision values (e.g. the output of ``decision_function``).

    Returns
    -------
    ndarray or scalar
        ``1 / (1 + exp(-x))`` evaluated element-wise.
    """
    # expit evaluates the sigmoid in a numerically stable way; the naive
    # 1 / (1 + np.exp(-x)) overflows inside exp for large negative x and
    # emits a RuntimeWarning.
    from scipy.special import expit
    return expit(x)

def decision2class(x):
    """Turn decision values into hard labels: 1 where x > 0, else 0 (int8)."""
    is_positive = x > 0
    return is_positive.astype(np.int8)

# Column layout of every data row: DIM feature columns, then the class label,
# then the gender flag.
LABEL_COL, GENDER_COL = DIM, DIM + 1
# Sub-problem j separates CLASS_LABELS[j] from the other two classes:
# 0 vs {1, -1}; 1 vs {0, -1}; -1 vs {0, 1}.
# NOTE(review): assumes labels take values in {-1, 0, 1} -- confirm against the data.
CLASS_LABELS = np.array([0, 1, -1])
# RBF kernel width used for sub-problem 0, 1 and 2 respectively.
GAMMAS = [0.001, 1, 0.01]


def _min_max_combine(module_scores):
    """Fuse the four module outputs of one sub-problem into one score column.

    `module_scores` has one row per module (4 rows, one column per sample).
    Modules 0 and 2 were trained with the gender-0 positives, modules 1 and 3
    with the gender-1 positives. Each pair is reduced with an element-wise
    minimum, and the two results are merged with an element-wise maximum.

    Returns an array of shape (n_samples, 1).
    """
    module_scores = np.asarray(module_scores)
    unit_0 = np.min(module_scores[[0, 2], :], axis=0)
    unit_1 = np.min(module_scores[[1, 3], :], axis=0)
    return np.maximum(unit_0, unit_1).reshape(-1, 1)


def _binary_accuracy(y_true, scores):
    """Accuracy of thresholding `scores` at 0.5 (a score of exactly 0.5 counts as 0)."""
    return accuracy_score(y_true, (scores.ravel() > 0.5).astype(np.int8))


# Dedicated seeded generator so the subset shuffles give the same order on every
# run of this cell (the global np.random state is unseeded).
shuffle_rng = np.random.RandomState(0)

# The test split does not change between folds.
tests_X, tests_y = test_data[:, :DIM], test_data[:, LABEL_COL]

for i in range(K):
    # One list of four SVMs (one per training subset) for each sub-problem.
    models = [
        [svm.SVC(kernel='rbf', gamma=gamma, C=1) for k in range(4)]
        for gamma in GAMMAS
    ]

    valid_data = fold_data[i]
    train_data = np.vstack(fold_data[:i] + fold_data[i+1:])
    train_X, train_y = train_data[:, :DIM], train_data[:, LABEL_COL]
    train_gender = train_data[:, GENDER_COL]
    valid_X, valid_y = valid_data[:, :DIM], valid_data[:, LABEL_COL]

    subclass_valid, subclass_tests = [], []

    for j in range(3):
        # Row j of the identity matrix acts as a lookup table label -> {0, 1};
        # label -1 reaches index 2 through negative indexing.
        is_class_j = np.eye(3)[j]
        _valid_y = is_class_j[valid_y.astype(np.int8)]
        _tests_y = is_class_j[tests_y.astype(np.int8)]
        # Features plus the binary target in column LABEL_COL (only the label
        # column is projected; the gender flag is kept separately).
        _train_data = np.column_stack([train_X, is_class_j[train_y.astype(np.int8)]])

        # Split by gender, then cross the positives of one gender with the
        # negatives of the other to obtain the two remaining modules.
        subset_0 = _train_data[train_gender == 0]
        subset_1 = _train_data[train_gender == 1]
        subset_2 = np.vstack([subset_0[subset_0[:, LABEL_COL] == 1],
                              subset_1[subset_1[:, LABEL_COL] == 0]])
        subset_3 = np.vstack([subset_0[subset_0[:, LABEL_COL] == 0],
                              subset_1[subset_1[:, LABEL_COL] == 1]])
        shuffle_rng.shuffle(subset_2)
        shuffle_rng.shuffle(subset_3)
        subsets = [subset_0, subset_1, subset_2, subset_3]

        predict_valid, predict_tests = [], []
        for k in range(4):
            models[j][k].fit(subsets[k][:, :DIM], subsets[k][:, LABEL_COL])
            predict_valid.append(decision2proba(models[j][k].decision_function(valid_X)))
            predict_tests.append(decision2proba(models[j][k].decision_function(tests_X)))

        y_hat_valid = _min_max_combine(predict_valid)
        y_hat_tests = _min_max_combine(predict_tests)
        subclass_valid.append(y_hat_valid)
        subclass_tests.append(y_hat_tests)

        valid_acc = _binary_accuracy(_valid_y, y_hat_valid)
        tests_acc = _binary_accuracy(_tests_y, y_hat_tests)
        print("model_%d valid_acc=%.4f tests_acc=%.4f" % (j, valid_acc, tests_acc))

    # Three-class decision: pick the sub-problem with the highest fused score.
    valid_pred = CLASS_LABELS[np.argmax(np.hstack(subclass_valid), axis=1)]
    tests_pred = CLASS_LABELS[np.argmax(np.hstack(subclass_tests), axis=1)]
    valid_acc = accuracy_score(valid_y, valid_pred)
    tests_acc = accuracy_score(tests_y, tests_pred)
    print("Fold %d: valid=%.4f test=%.4f\n" % (i, valid_acc, tests_acc))

model_0 valid_acc=0.6725 tests_acc=0.6660
model_1 valid_acc=1.0000 tests_acc=0.6765
model_2 valid_acc=0.6825 tests_acc=0.6575
Fold 0: valid=0.8000 test=0.5710

model_0 valid_acc=0.6825 tests_acc=0.6660
model_1 valid_acc=1.0000 tests_acc=0.6765
model_2 valid_acc=0.6890 tests_acc=0.6575
Fold 1: valid=0.8190 test=0.5870

model_0 valid_acc=0.6745 tests_acc=0.6660
model_1 valid_acc=1.0000 tests_acc=0.6765
model_2 valid_acc=0.6900 tests_acc=0.6575
Fold 2: valid=0.8020 test=0.5625

model_0 valid_acc=0.6935 tests_acc=0.6660
model_1 valid_acc=1.0000 tests_acc=0.6765
model_2 valid_acc=0.6740 tests_acc=0.6575
Fold 3: valid=0.8245 test=0.5750

model_0 valid_acc=0.6725 tests_acc=0.6660
model_1 valid_acc=1.0000 tests_acc=0.6765
model_2 valid_acc=0.6750 tests_acc=0.6575
Fold 4: valid=0.7975 test=0.5515

