Imports:

In [1]:
import numpy as np
from scipy.stats import pearsonr
from heapq import heappush, heappop, heappushpop
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import itertools
import pickle

Load the presaved channel data and word/category data:

In [2]:
all_data = np.load("all_data.npy") #holds all the data from channels
category_info = np.load("words_in_categories.npy") #category_info[cat][ptr] returns the number of the word(0...62) of the ptr'th word in the category cat
lengths = np.load("category_lengths.npy") #lengths[cat] is the number of words in category cat

Define some constants:

In [3]:
total_words = 63 

tStart = 0 #start time
tEnd = 650 #end time
tWidth = 100 #width of time slice
tIncr = 50 #increment in start time
tEx = 10 #number of examples to downsample to

training_amt = 8 #8 examples for training, 2 for testing
testing_amt = 10 - training_amt

np.random.seed(63)

Create and build TrainingData and TestingData matrices:

In [4]:
TrainingData = np.zeros((total_words,5,training_amt,256,650))#gives the pertinent data from all_data for the two categories
TestingData = np.zeros( (total_words,5,testing_amt,256,650)) #^
wordptr = -1 #the index of the current word, iterates from 0...total_words

for i in range(63):
    wordptr+=1

    excl = [-1]*10 #excl[j] = the j'th presentation number which should be saved for testing (e.g. excl[0] = 0 means the first presentation of the wordptr'th word should be saved for testing). Ignore -1's.
    
    for pres in range(testing_amt):
        while(1): #this loop repeatedly generates a random presentation until one which hasn't been reserved for testing has been found, and then breaks it
            nxtrand = np.random.randint(0,10)
            if(excl[nxtrand]==-1):
                excl[nxtrand]=nxtrand
                break
    for bandnum in range(5):
        ptr2 = 0 #points to which presentation(0...9) of wordptr'th word we are currently copying to TrainingData
        for pres in range(10):
            if(excl[pres]!=-1): #if reserved for testing, don't include in training data
                continue
           
            TrainingData[wordptr][bandnum][ptr2]=all_data[bandnum][i][pres] #sets the channel x time matrix for TrainingData[bandnum][wordptr][ptr2]
            ptr2+=1 #move to next presentation

    for bandnum in range(5): #this loop is same as above, except now we only want the testing presentations
        ptr2=0
        for pres in range(10):
            if(excl[pres]==-1):
                continue
            TestingData[wordptr][bandnum][ptr2] = all_data[bandnum][i][excl[pres]]
            ptr2+=1
            

Create vectors to hold best feature information based on Pearson Coefficients:

In [5]:
toSelect = 5 #number of top features to select

best_feature_vectors = np.zeros((total_words, training_amt,toSelect * tEx))
test_feature_vectors = np.zeros((total_words, testing_amt, toSelect * tEx))
timeSequences = np.zeros((total_words,5,12,training_amt,256,tEx))

Pick top feature vectors:

In [6]:
fixedc = int(tWidth/tEx)
ptrr = 0
for t in range(tStart, tEnd-tWidth+1, tIncr):
    ptrppp = 0
    for tEStart in range(t,t+tWidth-tEx+1,tEx):
        timeSequences[:,:,ptrr,:,:,ptrppp] = np.average(TrainingData[:,:,:,:,tEStart:tEStart+fixedc], axis = 4)
        ptrppp+=1
    ptrr+=1
print(str(timeSequences.shape))

for wordnum in range(total_words):
    SHheap = [] #heap of BTC + featurevector information used to select top 400
    
    for band_num in range(5): #frequency bands
        ptrr=0
        for t in range(tStart, tEnd-tWidth+1, tIncr): #various starts of time slice
            for channel in range(256): #eeg channels

                #pairwise correlations
                avg_p = 0
                avg_p2 = 0
                #print(str(wordnum) + " " + str(band_num) + " " + str(ptrr) + " " + str(channel))
                for i in range(training_amt-1):
                    for j in range(i+1,training_amt):
                        #if(wordnum == 1):
                       #     print(str(pearsonr(timeSequences[wordnum][band_num][ptrr][channel][i],timeSequences[wordnum][band_num][ptrr][channel][j])))
                        avg_p += pearsonr(timeSequences[wordnum][band_num][ptrr][i][channel],timeSequences[wordnum][band_num][ptrr][j][channel])[0]

                '''
                for word2 in range(total_words):
                    if(wordnum==word2):
                        continue
                    avg_p2 += pearsonr(AverageWord[wordnum][band_num][ptrr][channel], AverageWord[word2][band_num][ptrr][channel])[0]
                '''
                avg_p /= training_amt*(training_amt-1)/2 #want to maximize
                #avg_p2 /= (total_words-1) #want to minimize
                #ranking_measure = (avg_p - avg_p2)/2 #want to maximize
                if(len(SHheap)<400):
                    heappush(SHheap, (avg_p,band_num,t,channel, timeSequences[wordnum,band_num,ptrr,:,channel]))
                else:
                    heappushpop(SHheap, (avg_p,band_num,t,channel, timeSequences[wordnum,band_num,ptrr,:,channel]))
            ptrr+=1
    #pick top 5
    
    #f.write("Word " + str(wordnum) +"\n")
    print("Word " + str(wordnum))

    
    current_matrix = np.zeros( (training_amt,0))
    test_matrix = np.zeros( (testing_amt,0))
    
    for i in range(400):
        (avg_p,band_num,t,channel, timeSequenc) = heappop(SHheap)
        if(i>=400-toSelect):
            #this is da guy
            #f.write(str(400-i) + ". " + str(band_num) + "   " + str(t) + "   " + str(channel) + "   " + str(avg_p) + "\n")
            print(str(400-i) + ". " + str(band_num) + "   " + str(t) + "   " + str(channel) + "   " + str(avg_p))
            current_matrix = np.hstack( (current_matrix,timeSequenc))

            #construct testing matrix
            tmpo = np.zeros( (testing_amt,tEx))
            for itero in range(testing_amt):
                pp = 0
                for tEStart in range(t,t+tWidth-tEx+1,tEx):
                    tmpo[itero][pp] = np.average(TestingData[wordnum,band_num,itero,channel,tEStart:tEStart+int(tWidth/tEx)])
                    pp+=1
            test_matrix = np.hstack( (test_matrix,tmpo) )
            
    best_feature_vectors[wordnum] = current_matrix
    test_feature_vectors[wordnum] = test_matrix

(63, 5, 12, 8, 256, 10)
Word 0
5. 0   100   73   0.980905737259
4. 0   50   73   0.982876164639
3. 0   250   29   0.983602753254
2. 0   250   23   0.985362276105
1. 0   450   39   0.989386948271
Word 1
5. 1   200   158   0.923113386023
4. 0   100   63   0.940962237861
3. 0   300   73   0.947802080593
2. 0   400   247   0.948459312784
1. 0   450   58   0.982858827875
Word 2
5. 0   150   233   0.937545952033
4. 0   450   243   0.939538575863
3. 0   150   130   0.941791634019
2. 0   150   44   0.943095765008
1. 0   150   249   0.944888191215
Word 3
5. 0   200   45   0.981380590646
4. 0   200   46   0.983422325627
3. 0   400   35   0.986773318596
2. 0   200   28   0.986870773309
1. 0   200   29   0.991008854051
Word 4
5. 0   200   49   0.984954036831
4. 0   150   49   0.985370956584
3. 0   150   40   0.986526650694
2. 0   150   239   0.990905589816
1. 0   200   35   0.993451314616
Word 5
5. 0   100   32   0.938634059393
4. 0   350   207   0.942155076079
3. 0   100   190   0.947669602492
2.

Word 48
5. 0   350   215   0.984495981964
4. 0   400   38   0.984505871716
3. 0   150   46   0.986778719337
2. 0   400   215   0.986947705121
1. 0   150   37   0.990796430052
Word 49
5. 0   200   37   0.989707617466
4. 0   100   46   0.990840706879
3. 0   150   45   0.995121277596
2. 0   150   47   0.995692662117
1. 0   150   46   0.997297567227
Word 50
5. 0   500   88   0.98881384389
4. 0   500   138   0.989818558602
3. 0   450   29   0.990503726252
2. 0   500   99   0.990801200378
1. 0   450   40   0.992383190638
Word 51
5. 0   400   40   0.971584922528
4. 0   550   240   0.977337167019
3. 0   550   241   0.978894179878
2. 0   500   132   0.984275444831
1. 0   550   242   0.988054996974
Word 52
5. 0   400   20   0.975819582823
4. 0   150   29   0.976889340752
3. 0   150   39   0.982771202754
2. 0   150   35   0.989278698883
1. 0   150   40   0.992189993597
Word 53
5. 0   0   73   0.927126873051
4. 1   100   9   0.93140959772
3. 0   200   45   0.936670773643
2. 0   0   194   0.9558219

Set-up for running the actual training and testing between every pair of categories: 

In [8]:
#dictionaries for storing all data
save_trainx = {}
save_trainy = {}
save_testx = {}
save_testy = {}

#initialize variable for overall average accuracy
avgacc = 0 

#initialize space to search for optimal C values
clist = np.logspace(-3,2,100)

In [9]:
best_feature_vectors.shape

(63, 8, 50)

In [10]:
test_feature_vectors.shape

(63, 2, 50)

In [20]:
trainx=np.concatenate((best_feature_vectors[0], best_feature_vectors[1]), axis=0)
trainx.shape

testx=np.concatenate((test_feature_vectors[0], test_feature_vectors[1]), axis=0)
testx.shape

np.concatenate((np.full((8),0), np.full((8),1)), axis=0)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

In [26]:
for word1 in range(63):
    for word2 in range(word1+1, 63):
        
        tot_words = 16
        
        trainx = np.concatenate((best_feature_vectors[word1], best_feature_vectors[word2]), axis=0)
        trainy = np.concatenate((np.full((training_amt),0), np.full((training_amt),1)), axis=0)
        
        testx = np.concatenate((test_feature_vectors[0], test_feature_vectors[1]), axis=0)
        testy = np.concatenate((np.full((testing_amt),0), np.full((testing_amt),1)), axis=0)
        
        '''
        save_trainx[(cat1,cat2)] = trainx
        save_trainy[(cat1,cat2)] = trainy
        save_testx[(cat1,cat2)] = testx
        save_testy[(cat1,cat2)] = testy
        '''

        #run kfold cross validation for parameters
        bst_acc = 0
        bst_c = 100
        '''
        for c in clist:
                avg_acc = 0
                for fold in range(4):
                    fold_sz = int(training_amt*tot_words/4)
                    valid_x = trainx[(fold_sz*fold):((fold_sz)*(fold+1))]
                    valid_y = trainy[(fold_sz*fold):((fold_sz)*(fold+1))]
                    tr_x = np.concatenate( (trainx[0:(fold_sz*fold)],trainx[(fold+1)*fold_sz:(training_amt*tot_words)]), axis = 0)
                    tr_y = np.concatenate( (trainy[0:(fold_sz*fold)],trainy[(fold+1)*fold_sz:(training_amt*tot_words)]), axis = 0)

                    scaler = StandardScaler()
                    tr_x = scaler.fit_transform(tr_x)
                    valid_x = scaler.transform(valid_x)
                    tr_y = np.ravel(tr_y)
                    valid_y = np.ravel(valid_y)

                    classifier = LinearSVC(C=c)
                    classifier.fit(tr_x,tr_y)
                    avg_acc += (classifier.score(valid_x,valid_y) )/4.0
                if(avg_acc > bst_acc):
                    bst_acc = avg_acc
                    bst_c = c
        '''            
        print("For " + str(word1) + " and " + str(word2) + " we picked C = " + str(bst_c))
        
        #create final classifier with best C value
        classifier = LinearSVC(C=bst_c)
        
        #adjust data to the classifier
        scaler = StandardScaler()
        trainx = scaler.fit_transform(trainx)
        testx = scaler.transform(testx)
        trainy = np.ravel(trainy)
        testy = np.ravel(testy)
        
        #fit training data to classifier
        classifier.fit(trainx, trainy)
        #test on testing data
        myscore = classifier.score(testx,testy)
        avgacc+=myscore
        print("Has accuracy " + str(myscore))
        print("======")

#pickle.dump( (save_trainx,save_trainy,save_testx,save_testy), open("created_data.p","wb") )
avgacc/=(63*62/2)
print("Average score was " + str(avgacc))

print("Finished!")

For 0 and 1 we picked C = 100
Has accuracy 0.75
For 0 and 2 we picked C = 100
Has accuracy 0.5
For 0 and 3 we picked C = 100
Has accuracy 0.75
For 0 and 4 we picked C = 100
Has accuracy 1.0
For 0 and 5 we picked C = 100
Has accuracy 0.5
For 0 and 6 we picked C = 100
Has accuracy 0.5
For 0 and 7 we picked C = 100
Has accuracy 0.5
For 0 and 8 we picked C = 100
Has accuracy 0.25
For 0 and 9 we picked C = 100
Has accuracy 1.0
For 0 and 10 we picked C = 100
Has accuracy 0.5
For 0 and 11 we picked C = 100
Has accuracy 1.0
For 0 and 12 we picked C = 100
Has accuracy 0.5
For 0 and 13 we picked C = 100
Has accuracy 0.5
For 0 and 14 we picked C = 100
Has accuracy 0.5
For 0 and 15 we picked C = 100
Has accuracy 1.0
For 0 and 16 we picked C = 100
Has accuracy 0.75
For 0 and 17 we picked C = 100
Has accuracy 0.75
For 0 and 18 we picked C = 100
Has accuracy 0.5
For 0 and 19 we picked C = 100
Has accuracy 1.0
For 0 and 20 we picked C = 100
Has accuracy 1.0
For 0 and 21 we picked C = 100
Has accuracy 

Has accuracy 0.75
For 4 and 31 we picked C = 100
Has accuracy 0.25
For 4 and 32 we picked C = 100
Has accuracy 0.25
For 4 and 33 we picked C = 100
Has accuracy 0.75
For 4 and 34 we picked C = 100
Has accuracy 0.25
For 4 and 35 we picked C = 100
Has accuracy 0.0
For 4 and 36 we picked C = 100
Has accuracy 0.5
For 4 and 37 we picked C = 100
Has accuracy 0.5
For 4 and 38 we picked C = 100
Has accuracy 0.0
For 4 and 39 we picked C = 100
Has accuracy 0.25
For 4 and 40 we picked C = 100
Has accuracy 0.5
For 4 and 41 we picked C = 100
Has accuracy 0.5
For 4 and 42 we picked C = 100
Has accuracy 1.0
For 4 and 43 we picked C = 100
Has accuracy 0.0
For 4 and 44 we picked C = 100
Has accuracy 0.25
For 4 and 45 we picked C = 100
Has accuracy 0.25
For 4 and 46 we picked C = 100
Has accuracy 0.0
For 4 and 47 we picked C = 100
Has accuracy 0.75
For 4 and 48 we picked C = 100
Has accuracy 0.25
For 4 and 49 we picked C = 100
Has accuracy 0.5
For 4 and 50 we picked C = 100
Has accuracy 0.0
For 4 and 51 

Has accuracy 0.25
For 8 and 49 we picked C = 100
Has accuracy 0.5
For 8 and 50 we picked C = 100
Has accuracy 1.0
For 8 and 51 we picked C = 100
Has accuracy 0.75
For 8 and 52 we picked C = 100
Has accuracy 0.5
For 8 and 53 we picked C = 100
Has accuracy 1.0
For 8 and 54 we picked C = 100
Has accuracy 0.75
For 8 and 55 we picked C = 100
Has accuracy 0.75
For 8 and 56 we picked C = 100
Has accuracy 0.75
For 8 and 57 we picked C = 100
Has accuracy 1.0
For 8 and 58 we picked C = 100
Has accuracy 0.25
For 8 and 59 we picked C = 100
Has accuracy 0.25
For 8 and 60 we picked C = 100
Has accuracy 0.75
For 8 and 61 we picked C = 100
Has accuracy 0.5
For 8 and 62 we picked C = 100
Has accuracy 0.75
For 9 and 10 we picked C = 100
Has accuracy 0.25
For 9 and 11 we picked C = 100
Has accuracy 0.5
For 9 and 12 we picked C = 100
Has accuracy 0.25
For 9 and 13 we picked C = 100
Has accuracy 0.0
For 9 and 14 we picked C = 100
Has accuracy 0.0
For 9 and 15 we picked C = 100
Has accuracy 0.0
For 9 and 16

Has accuracy 0.25
For 13 and 46 we picked C = 100
Has accuracy 0.0
For 13 and 47 we picked C = 100
Has accuracy 0.75
For 13 and 48 we picked C = 100
Has accuracy 0.5
For 13 and 49 we picked C = 100
Has accuracy 0.5
For 13 and 50 we picked C = 100
Has accuracy 0.75
For 13 and 51 we picked C = 100
Has accuracy 0.5
For 13 and 52 we picked C = 100
Has accuracy 0.5
For 13 and 53 we picked C = 100
Has accuracy 0.75
For 13 and 54 we picked C = 100
Has accuracy 0.5
For 13 and 55 we picked C = 100
Has accuracy 0.5
For 13 and 56 we picked C = 100
Has accuracy 1.0
For 13 and 57 we picked C = 100
Has accuracy 0.75
For 13 and 58 we picked C = 100
Has accuracy 0.5
For 13 and 59 we picked C = 100
Has accuracy 0.25
For 13 and 60 we picked C = 100
Has accuracy 0.75
For 13 and 61 we picked C = 100
Has accuracy 0.25
For 13 and 62 we picked C = 100
Has accuracy 0.75
For 14 and 15 we picked C = 100
Has accuracy 1.0
For 14 and 16 we picked C = 100
Has accuracy 1.0
For 14 and 17 we picked C = 100
Has accurac

For 16 and 56 we picked C = 100
Has accuracy 0.75
For 16 and 57 we picked C = 100
Has accuracy 0.0
For 16 and 58 we picked C = 100
Has accuracy 0.0
For 16 and 59 we picked C = 100
Has accuracy 0.0
For 16 and 60 we picked C = 100
Has accuracy 0.75
For 16 and 61 we picked C = 100
Has accuracy 0.0
For 16 and 62 we picked C = 100
Has accuracy 0.0
For 17 and 18 we picked C = 100
Has accuracy 0.5
For 17 and 19 we picked C = 100
Has accuracy 0.5
For 17 and 20 we picked C = 100
Has accuracy 0.75
For 17 and 21 we picked C = 100
Has accuracy 0.5
For 17 and 22 we picked C = 100
Has accuracy 0.25
For 17 and 23 we picked C = 100
Has accuracy 0.75
For 17 and 24 we picked C = 100
Has accuracy 0.5
For 17 and 25 we picked C = 100
Has accuracy 0.5
For 17 and 26 we picked C = 100
Has accuracy 0.25
For 17 and 27 we picked C = 100
Has accuracy 0.5
For 17 and 28 we picked C = 100
Has accuracy 1.0
For 17 and 29 we picked C = 100
Has accuracy 0.5
For 17 and 30 we picked C = 100
Has accuracy 0.75
For 17 and 31

For 22 and 54 we picked C = 100
Has accuracy 0.25
For 22 and 55 we picked C = 100
Has accuracy 0.75
For 22 and 56 we picked C = 100
Has accuracy 0.75
For 22 and 57 we picked C = 100
Has accuracy 0.25
For 22 and 58 we picked C = 100
Has accuracy 0.25
For 22 and 59 we picked C = 100
Has accuracy 0.25
For 22 and 60 we picked C = 100
Has accuracy 0.75
For 22 and 61 we picked C = 100
Has accuracy 0.25
For 22 and 62 we picked C = 100
Has accuracy 0.75
For 23 and 24 we picked C = 100
Has accuracy 0.0
For 23 and 25 we picked C = 100
Has accuracy 0.25
For 23 and 26 we picked C = 100
Has accuracy 0.0
For 23 and 27 we picked C = 100
Has accuracy 0.5
For 23 and 28 we picked C = 100
Has accuracy 0.5
For 23 and 29 we picked C = 100
Has accuracy 0.0
For 23 and 30 we picked C = 100
Has accuracy 0.75
For 23 and 31 we picked C = 100
Has accuracy 0.0
For 23 and 32 we picked C = 100
Has accuracy 0.75
For 23 and 33 we picked C = 100
Has accuracy 0.5
For 23 and 34 we picked C = 100
Has accuracy 0.5
For 23 a

Has accuracy 0.0
For 30 and 46 we picked C = 100
Has accuracy 0.0
For 30 and 47 we picked C = 100
Has accuracy 0.25
For 30 and 48 we picked C = 100
Has accuracy 0.0
For 30 and 49 we picked C = 100
Has accuracy 0.5
For 30 and 50 we picked C = 100
Has accuracy 0.25
For 30 and 51 we picked C = 100
Has accuracy 0.5
For 30 and 52 we picked C = 100
Has accuracy 0.25
For 30 and 53 we picked C = 100
Has accuracy 0.0
For 30 and 54 we picked C = 100
Has accuracy 0.25
For 30 and 55 we picked C = 100
Has accuracy 0.5
For 30 and 56 we picked C = 100
Has accuracy 0.5
For 30 and 57 we picked C = 100
Has accuracy 0.25
For 30 and 58 we picked C = 100
Has accuracy 0.0
For 30 and 59 we picked C = 100
Has accuracy 0.0
For 30 and 60 we picked C = 100
Has accuracy 0.75
For 30 and 61 we picked C = 100
Has accuracy 0.0
For 30 and 62 we picked C = 100
Has accuracy 0.0
For 31 and 32 we picked C = 100
Has accuracy 0.25
For 31 and 33 we picked C = 100
Has accuracy 1.0
For 31 and 34 we picked C = 100
Has accuracy 

Has accuracy 0.75
For 40 and 56 we picked C = 100
Has accuracy 0.75
For 40 and 57 we picked C = 100
Has accuracy 0.25
For 40 and 58 we picked C = 100
Has accuracy 0.25
For 40 and 59 we picked C = 100
Has accuracy 0.0
For 40 and 60 we picked C = 100
Has accuracy 0.75
For 40 and 61 we picked C = 100
Has accuracy 0.25
For 40 and 62 we picked C = 100
Has accuracy 0.25
For 41 and 42 we picked C = 100
Has accuracy 0.5
For 41 and 43 we picked C = 100
Has accuracy 0.25
For 41 and 44 we picked C = 100
Has accuracy 0.25
For 41 and 45 we picked C = 100
Has accuracy 0.25
For 41 and 46 we picked C = 100
Has accuracy 0.0
For 41 and 47 we picked C = 100
Has accuracy 0.75
For 41 and 48 we picked C = 100
Has accuracy 0.5
For 41 and 49 we picked C = 100
Has accuracy 0.5
For 41 and 50 we picked C = 100
Has accuracy 0.75
For 41 and 51 we picked C = 100
Has accuracy 0.75
For 41 and 52 we picked C = 100
Has accuracy 0.25
For 41 and 53 we picked C = 100
Has accuracy 0.5
For 41 and 54 we picked C = 100
Has ac