In [12]:
from __future__ import division

import numpy as np
import pickle, random, time
import pandas as pd

from sklearn import svm
from itertools import chain, combinations
from copy import deepcopy
from sklearn.decomposition import PCA

In [13]:
# Greedy SVM for Regional Statistic
def greedy_forward_SVM(F, y, num_feature, num_selected, num_repeats, num_test, C_val, Gamma_val, eps_val):
    '''
    Args:
        F: data
        y: label
        num_feature: number of feature in total. should be 120 but only 100
        num_selected: number of feature selected
        num_repeats: iter of repeat to avoid noise
        num_test: number of test sample
        C_val: c value as SVM hypter parameter
        Gamma_val: gamma value as SVM hyper parameter
        eps_val: eplison value as SVM hyper parameter
    '''
    best_acc_featsize = np.zeros((num_feature,))  ### accuracy with best subset of different sizes
    all_feat_remained = np.arange(num_feature)
    feat_order = list()

    num_samples = F.shape[0]

    for i in range(num_selected):  ## adds one feature per step

        train_acc_cur = np.zeros((len(all_feat_remained), num_repeats))
        test_acc_cur = np.zeros((len(all_feat_remained), num_repeats))
        train_acc_avg = np.zeros((len(all_feat_remained),))
        test_acc_avg = np.zeros((len(all_feat_remained),))
        # print("%d-th feature selection" % ( i+1) )

        for j in range(len(all_feat_remained)):  ## selects one feature out of the remaining ones

            cur_feat_list = deepcopy(feat_order)
            cur_feat_list.append(all_feat_remained[j])
            X = F[:, cur_feat_list]
            # print("%d-th feature selection and feature list= [%s]" % ( i+1, ', '.join(map(str, cur_feat_list))))

            for iter in range(num_repeats):
                # print("%d-th feature selection and %d-th iteration and feature list= [%s]" % ( i+1, iter+1,', '.join(map(str, cur_feat_list))))

                np.random.seed(3 * iter + 10)
                inds = np.random.choice(len(y), num_test)

                X_test = X[inds, :]
                y_test = y[inds]
                X_train = np.delete(X, inds, 0)
                y_train = np.delete(y, inds)

                clf = svm.SVR(C=C_val, epsilon=eps_val, kernel='rbf', degree=4, gamma=Gamma_val, tol=0.001,
                              cache_size=200, max_iter=-1)
                clf.fit(X_train, y_train)
                predicted_train_labels = clf.predict(X_train)
                train_score = clf.score(X_train, y_train)
                # train_error= sum( [1. for k in pred_diff_train if k != 0])/len(predicted_train_labels)
                train_acc_cur[j, iter] = train_score
                predicted_test_labels = clf.predict(X_test)
                test_score = clf.score(X_test, y_test)
                # test_error= sum( [1. for k in pred_diff_test if k != 0])/len(predicted_test_labels)
                test_acc_cur[j, iter] = test_score
                # print("%d-th feature, current list [%s], and its acc= %f" % ( j+1, ', '.join(map(str, cur_feat_list)), test_acc_cur[ j, iter] ))

        for k in range(len(all_feat_remained)):
            train_acc_avg[k] = np.mean(train_acc_cur[k, :])
            test_acc_avg[k] = np.mean(test_acc_cur[k, :])

        best_acc_featsize[i] = np.max(test_acc_avg)
        best_testacc_ind = np.unravel_index(test_acc_avg.argmax(), test_acc_avg.shape)
        feat_order.append(all_feat_remained[best_testacc_ind])
        # print(" current best feature index= %d" %  (all_feat_remained[best_testacc_ind]) )

        all_feat_remained = np.delete(all_feat_remained, best_testacc_ind, 0)

    return feat_order, best_acc_featsize

In [14]:
start_time = time.time()

F3 = pickle.load(open("./data/stats_data.p", "rb"))
# Reading stats_data

labels_meso = np.zeros((45,)).astype(int)
labels_old = np.hstack((np.ones((42,)), np.zeros((23,)))).astype(int)
labels_117 = np.hstack((np.ones((67,)), np.zeros((50,)))).astype(int)

F3_117 = F3[65:]
F3_old = F3[:65]

d117_delete_data = [2, 7, 14, 15, 30, 46, 47, 50, 54, 59, 72, 111]
dold_delete_data = [1, 5, 6, 8, 10, 12, 14, 16, 21, 23, 26, 29, 33, 35, 37, 43]
F3_117 = np.delete(F3_117, d117_delete_data, 0)
print(F3_117.shape)
F3_old = np.delete(F3_old, dold_delete_data, 0)
print(F3_old.shape)


(105, 100)
(49, 100)


In [15]:
data1 = pd.read_excel('./data/LUI_mTBI_NPwithZscores June 2018 cleaned up.xlsx')
d117 = data1['WAIS_DigitSpanBack_z'].get_values()
data2 = pd.read_excel(
    './data/mTBI data first cycle Lui Grossman Miles 2018 review and update for combining.xlsx')
dold = data2['Digit Span Backward z'].get_values()

In [16]:
using_data = 'positive'

delete_np_117 = [6, 13, 14, 15, 25, 29, 76, 77, 78]
delete_np_old = [2, 6, 8, 18, 31, 32, 35, 37, 38]
d117 = np.delete(d117, delete_np_117, 0)
dold = np.delete(dold, delete_np_old, 0)

print(d117.shape)
print(dold.shape)

labels_old = np.delete(labels_old, dold_delete_data, 0)
labels_117 = np.delete(labels_117, d117_delete_data, 0)

val = np.concatenate((d117, dold))

L = np.hstack((labels_117, labels_old))

print(np.count_nonzero(L))

y = L.tolist()
num_samples = len(L)

# F2= np.delete( F2, unavailable_Npysch_ind, 0)
# F3= np.delete( F3, unavailable_Npysch_ind, 0)
# L= np.delete( L, unavailable_Npysch_ind, 0)

F = np.concatenate((F3_117, F3_old))

if using_data == 'postitive':
    print("using positive data for training")
    val = val[L == 1]
    F = F[L == 1, :]
    L = L[L == 1]
elif using_data== 'negative':
    print("using negative data for training")
    val = val[L == 0]
    F = F[L == 0, :]
    L = L[L == 0]
    print(len(L))
    print(F.shape)
    print(len(val))
else:
    print('using combined data for training')

num_feature = F.shape[1]
num_test = 20
num_repeats = 25  ## shows number of times we shuffle the data and test on testing
train_accuracies = [0] * num_repeats
test_accuracies = [0] * num_repeats

tot_num = F.shape[0]

pos_ind = np.nonzero(L)[0]
neg_ind = np.where(L == 0)[0]

num_pos_training = np.int(np.floor((len(L) - num_test) * len(pos_ind) / (len(pos_ind) + len(neg_ind))))
num_neg_training = np.int(len(L) - num_test - num_pos_training)

# Best feature set in negative

num_selected = 10
eps = 0.05

C_range = np.logspace(-3, 3.0, num=5)
gamma_range = np.logspace(-5, 4, 5)

test_acc_all = np.zeros((C_range.shape[0], gamma_range.shape[0], F.shape[1]))
selected_feat = np.zeros((C_range.shape[0], gamma_range.shape[0], num_selected))

print("Start")
for i in range(len(C_range)):
    for j in range(len(gamma_range)):
        a1, a2 = greedy_forward_SVM(F, val, num_feature, num_selected, num_repeats, num_test, C_range[i], gamma_range[j],
                                    eps)
        a3 = np.asarray(a1)
        test_acc_all[i, j, :] = a2
        selected_feat[i, j, :] = a3
        print("(", C_range[i], ",", gamma_range[j], ") value of (C, Gamma), and best feat indices=", a3)

max_ind = np.unravel_index(test_acc_all.argmax(), test_acc_all.shape)
print("maximum test accuracy= %.3f, achieved by using %d features and ( C, Gamma)= ( %.3f, %.3f) from 14 metrics" % (
test_acc_all[max_ind], max_ind[2] + 1, C_range[max_ind[0]], gamma_range[max_ind[1]]))
print("Selected feature indices for 3xiter=", selected_feat[max_ind[0], max_ind[1], 0:max_ind[2] + 1])

# print("\nMaximum Train Accuracy with 100 iteration: %f " % np.max(train_acc_all), "%")
# print("\nMaximum Test Accuracy: %f, by feature indices= %s " % ( np.max(test_acc_avg), subset_indices[best_testacc_ind[0]] ) )

# print("\nall accuracies:", test_acc_avg)

end_time = time.time()
tot_time = end_time - start_time
print("total time=", tot_time)

(105,)
(49,)
84
using combined data for training
Start
( 0.001 , 1e-05 ) value of (C, Gamma), and best feat indices= [68 96 94 66 50 95 67 49 48  2]
( 0.001 , 0.0017782794100389228 ) value of (C, Gamma), and best feat indices= [68 96 94 66 50 95 67 49 48  2]
( 0.001 , 0.31622776601683794 ) value of (C, Gamma), and best feat indices= [68 67 22 49  0 21 93 16 19 17]
( 0.001 , 56.23413251903491 ) value of (C, Gamma), and best feat indices= [92 15 16 31 93 17 19 18 70 69]
( 0.001 , 10000.0 ) value of (C, Gamma), and best feat indices= [ 8  6  5  9  7  0  2 71 72  3]
( 0.03162277660168379 , 1e-05 ) value of (C, Gamma), and best feat indices= [68 96 94 66 50 95 67 49 48  2]
( 0.03162277660168379 , 0.0017782794100389228 ) value of (C, Gamma), and best feat indices= [68 96 94 66 50 95 67 48 49  2]
( 0.03162277660168379 , 0.31622776601683794 ) value of (C, Gamma), and best feat indices= [68 21 96 20 95 22 67  5 23 74]
( 0.03162277660168379 , 56.23413251903491 ) value of (C, Gamma), and best fea