In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import pandas as pd
from tqdm import tqdm
import time

from copy import deepcopy
from sklearn.feature_selection import RFECV
from sklearn import svm
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold

In [2]:
data_path = 'data_154.xlsx'
label_path = 'label_154.xlsx'

data = pd.read_excel(data_path, index_col=0, sheet_name=0)
label = pd.read_excel(label_path, index_col=0, sheet_name=0)
features = np.array (list(data.columns))

predict = [ 'T1 Letter Number']

negative_idx = np.array ((np.arange(22) + 27).tolist() + (np.arange(48) + 106).tolist())

print(data.head(), label.head())

       ak-1_L_thal-mean  ak-1_L_thal-std  ak-1_L_thal-skew  ak-1_L_thal-kurt  \
HT102          0.605042         0.128671         -0.345832          0.060403   
HT103          0.649086         0.133304         -0.238727         -0.237111   
HT105          0.661849         0.170823         -0.002355         -0.595842   
HT106          0.667329         0.128948         -0.183180         -0.034880   
HT107          0.693921         0.155623          0.443730          0.196212   

       ak-1_L_thal-etrp  ak-2_R_thal-mean  ak-2_R_thal-std  ak-2_R_thal-skew  \
HT102          8.750292          0.606492         0.120894         -0.525165   
HT103          8.752230          0.700264         0.114282         -0.033532   
HT105          8.739804          0.653673         0.146263         -0.039633   
HT106          8.754848          0.657183         0.140268         -0.318437   
HT107          8.749220          0.684795         0.139449         -0.211015   

       ak-2_R_thal-kurt  ak-2_R_thal-e

In [None]:
for p in predict:

    X = data.iloc[negative_idx]
    y = (label.iloc[negative_idx])[p]
        
    features = np.array (list(X.columns))
    
    drop = y.notnull()

    X = X[drop]
    y = y[drop]
    
    X = (X.values)
    y = (y.values)
    
    nfold = 5
    nrepeats=25
    kf = RepeatedKFold(n_splits=nfold, n_repeats=nrepeats)
    
    scores = np.zeros([nfold*nrepeats])
    
    a = 1
    
    for isplit, Ind in enumerate(tqdm(kf.split(X))):

        # Get the training data in the split
        Itr, Its = Ind

        Xtr = X[Itr]
        ytr = y[Itr]
        Xts = X[Its]
        yts = y[Its]

        regr = Lasso(alpha=a, max_iter=50000)
        regr.fit(Xtr, ytr)
        yhat = regr.predict(Xts)
        scores[isplit] = r2_score(yts, yhat)

    print('For np {0}, the mean r2 scores for prediction is {1:.6f} with std = {2:.6f}'.format(p, np.mean(scores), np.std(scores)))
    
    regr = Lasso(alpha=a, max_iter=50000)
    regr.fit(X, y)

    lasso_coeff = regr.coef_
    # showing the stem graph of lasso coeff
    plt.stem(np.arange(*lasso_coeff.shape), lasso_coeff)
    plt.show()
    
    print('For np {0}, the related features are {1}'.format (p, features[np.nonzero(lasso_coeff)]))

In [None]:
for p in predict:

    X = data
    y = label[p]

    features = np.array (list(X.columns))
    
    drop = y.notnull()

    X = X[drop]
    y = y[drop]
    
    X = (X.values)
    y = (y.values)
    
    nfold = 5
    nrepeats = 2
    
    kf = RepeatedKFold(n_splits=nfold, n_repeats=nrepeats)
    
    model = svm.SVR(C=25, epsilon=0.1, kernel='linear', degree=4, gamma=10, tol=0.001, cache_size=200, max_iter=-1)
    
    rfecv = RFECV(estimator=model, step=10, cv=kf, scoring='r2', verbose=1, n_jobs=-1)
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

In [None]:
scores = np.zeros([nfold*nrepeats])

for isplit, Ind in enumerate(tqdm(kf.split(X))):
    
    # Get the training data in the split
    Itr, Its = Ind
    
    Xtr = X[Itr]
    ytr = y[Itr]
    Xts = X[Its]
    yts = y[Its]
    
    regr = Lasso(alpha=0.005, max_iter=10000)
    regr.fit(Xtr, ytr)
    scores[isplit] = regr.score(Xts, yts)

print('the mean r2 scores for prediction is {0:.6f} with std = {1:.6f}'.format(np.mean(scores), np.std(scores)))

In [None]:
regr = Lasso(alpha=0.005, max_iter=10000)
regr.fit(X, y)

lasso_coeff = regr.coef_

In [None]:
plt.stem(np.arange(*lasso_coeff.shape), lasso_coeff)
plt.show()

In [None]:
print('the only related features are {}'.format (features[np.nonzero(lasso_coeff)]))

In [3]:
def greedy_forward_SVM(F, y, num_feature, num_selected, num_repeats, num_test, C_val, Gamma_val, eps_val):
    '''
    Greedy Forward SVM by Dr. Alp
    Args:
        F: data
        y: label
        num_feature: number of feature in total. should be 120 but only 100
        num_selected: number of feature selected
        num_repeats: iter of repeat to avoid noise
        num_test: number of test sample
        C_val: c value as SVM hypter parameter
        Gamma_val: gamma value as SVM hyper parameter
        eps_val: eplison value as SVM hyper parameter
    '''
    best_acc_featsize = np.zeros((num_feature,))  ### accuracy with best subset of different sizes
    all_feat_remained = np.arange(num_feature)
    feat_order = list()

    num_samples = F.shape[0]

    for i in range(num_selected):  ## adds one feature per step
        # iterating through all possible i

        train_acc_cur = np.zeros((len(all_feat_remained), num_repeats))
        test_acc_cur = np.zeros((len(all_feat_remained), num_repeats))
        train_acc_avg = np.zeros((len(all_feat_remained),))
        test_acc_avg = np.zeros((len(all_feat_remained),))
        # print("%d-th feature selection" % ( i+1) )

        for j in range(len(all_feat_remained)):  ## selects one feature out of the remaining ones

            cur_feat_list = deepcopy(feat_order)
            cur_feat_list.append(all_feat_remained[j])
            X = F[:, cur_feat_list]
            # print("%d-th feature selection and feature list= [%s]" % ( i+1, ', '.join(map(str, cur_feat_list))))

            for iter in range(num_repeats):
                # print("%d-th feature selection and %d-th iteration and feature list= [%s]" % ( i+1, iter+1,', '.join(map(str, cur_feat_list))))

                np.random.seed(3 * iter + 10)
                inds = np.random.choice(len(y), num_test)

                X_test = X[inds, :]
                y_test = y[inds]
                X_train = np.delete(X, inds, 0)
                y_train = np.delete(y, inds)

                clf = svm.SVR(C=C_val, epsilon=eps_val, kernel='rbf', degree=4, gamma=Gamma_val, tol=0.001,
                              cache_size=200, max_iter=-1)
                clf.fit(X_train, y_train)
                predicted_train_labels = clf.predict(X_train)
                train_score = clf.score(X_train, y_train)
                # train_error= sum( [1. for k in pred_diff_train if k != 0])/len(predicted_train_labels)
                train_acc_cur[j, iter] = train_score
                predicted_test_labels = clf.predict(X_test)
                test_score = clf.score(X_test, y_test)
                # test_error= sum( [1. for k in pred_diff_test if k != 0])/len(predicted_test_labels)
                test_acc_cur[j, iter] = test_score
                # print("%d-th feature, current list [%s], and its acc= %f" % ( j+1, ', '.join(map(str, cur_feat_list)), test_acc_cur[ j, iter] ))

        for k in range(len(all_feat_remained)):
            train_acc_avg[k] = np.mean(train_acc_cur[k, :])
            test_acc_avg[k] = np.mean(test_acc_cur[k, :])

        best_acc_featsize[i] = np.max(test_acc_avg)
        best_testacc_ind = np.unravel_index(test_acc_avg.argmax(), test_acc_avg.shape)
        feat_order.append(all_feat_remained[best_testacc_ind])
        # print(" current best feature index= %d" %  (all_feat_remained[best_testacc_ind]) )

        all_feat_remained = np.delete(all_feat_remained, best_testacc_ind, 0)

    return feat_order, best_acc_featsize

In [18]:
X = data
y = label['Digit Span Backward T1']

features = np.array (list(X.columns))

drop = y.notnull()

X = X[drop]
y = y[drop]

X = (X.values)
y = (y.values)

In [19]:
start_time = time.time()

num_feature = X.shape[1]
num_test = 20
num_repeats = 10  ## shows number of times we shuffle the data and test on testing
train_accuracies = [0] * num_repeats
test_accuracies = [0] * num_repeats

tot_num = X.shape[0]

num_selected = 10
eps = 0.01

C_range = np.logspace(-3, 3, num=5)
gamma_range = np.logspace(-5, 4, 5)

# C_range = np.array([31.62278])
# gamma_range = np.array([0.31623])

test_acc_all = np.zeros((C_range.shape[0], gamma_range.shape[0], num_feature))
selected_feat = np.zeros((C_range.shape[0], gamma_range.shape[0], num_selected))

print("Start")
for i in range(len(C_range)):
    for j in range(len(gamma_range)):
        a1, a2 = greedy_forward_SVM(X, y, num_feature, num_selected, num_repeats, num_test, C_range[i], gamma_range[j],
                                    eps)
        a3 = np.asarray(a1)
        test_acc_all[i, j, :] = a2
        selected_feat[i, j, :] = a3
        print("(", C_range[i], ",", gamma_range[j], ") value of (C, Gamma), and best feat indices=", a3)

max_ind = np.unravel_index(test_acc_all.argmax(), test_acc_all.shape)
print("maximum test accuracy= %.3f, achieved by using %d features and ( C, Gamma, eps)= ( %.5f, %.5f, %.5f) from 14 metrics" % (
test_acc_all[max_ind], max_ind[2] + 1, C_range[max_ind[0]], gamma_range[max_ind[1]], eps))
print("Selected feature for 3xiter=", selected_feat[max_ind[0], max_ind[1], 0:max_ind[2] + 1])
print(features[np.array(selected_feat[max_ind[0], max_ind[1], 0:max_ind[2] + 1], dtype=np.int)])
# print("\nMaximum Train Accuracy with 100 iteration: %f " % np.max(train_acc_all), "%")
# print("\nMaximum Test Accuracy: %f, by feature indices= %s " % ( np.max(test_acc_avg), subset_indices[best_testacc_ind[0]] ) )

# print("\nall accuracies:", test_acc_avg)

end_time = time.time()
tot_time = end_time - start_time
print("total time=", tot_time)


Start
( 31.62278 , 0.31623 ) value of (C, Gamma), and best feat indices= [ 29 242  12 258  25   5  19  24 164 259]
maximum test accuracy= 0.412, achieved by using 8 features and ( C, Gamma, eps)= ( 31.62278, 0.31623, 0.00500) from 14 metrics
Selected feature for 3xiter= [ 29. 242.  12. 258.  25.   5.  19.  24.]
['ak-L_Pref-etrp' 'md-R_Pref-skew' 'ak-CC_Body_mask-skew'
 'mk-CC_Body_mask-kurt' 'ak-L_Pref-mean' 'ak-2_R_thal-mean'
 'ak-CC_Genu_mask-etrp' 'ak-CC_Splenium_mask-etrp']
total time= 112.98960018157959
