In [106]:
import numpy as np
from scipy.stats import pearsonr
import heapq
from heapq import heappush, heappop, heappushpop
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import itertools
import pickle

In [107]:
all_data = np.load("all_data.npy") #holds all the data from channels
category_info = np.load("words_in_categories.npy") #category_info[cat][ptr] returns the number of the word(0...62) of the ptr'th word in the category cat
lengths = np.load("category_lengths.npy") #lengths[cat] is the number of words in category cat

In [108]:
total_words = 63 

tStart = 0 #start time
tEnd = 650 #end time
tWidth = 100 #width of time slice
tIncr = 50 #increment in start time
tEx = 10 #number of examples to downsample to
tNtoAvg = int(tWidth/tEx) #number of timestep values to average to form one example

training_amt = 8 #8 examples for training, 2 for testing
testing_amt = 10 - training_amt

np.random.seed(63)

In [109]:
TrainingData = np.zeros((total_words,5,training_amt,256,650))#gives the pertinent data from all_data for the two categories
TestingData = np.zeros( (total_words,5,testing_amt,256,650)) #^
wordptr = -1 #the index of the current word, iterates from 0...total_words

for i in range(63):
    wordptr+=1

    excl = [-1]*10 #excl[j] = the j'th presentation number which should be saved for testing (e.g. excl[0] = 0 means the first presentation of the wordptr'th word should be saved for testing). Ignore -1's.
    
    for pres in range(testing_amt):
        while(1): #this loop repeatedly generates a random presentation until one which hasn't been reserved for testing has been found, and then breaks it
            nxtrand = np.random.randint(0,10)
            if(excl[nxtrand]==-1):
                excl[nxtrand]=nxtrand
                break
    for bandnum in range(5):
        ptr2 = 0 #points to which presentation(0...9) of wordptr'th word we are currently copying to TrainingData
        for pres in range(10):
            if(excl[pres]!=-1): #if reserved for testing, don't include in training data
                continue
           
            TrainingData[wordptr][bandnum][ptr2]=all_data[bandnum][i][pres] #sets the channel x time matrix for TrainingData[bandnum][wordptr][ptr2]
            ptr2+=1 #move to next presentation

    for bandnum in range(5): #this loop is same as above, except now we only want the testing presentations
        ptr2=0
        for pres in range(10):
            if(excl[pres]==-1):
                continue
            TestingData[wordptr][bandnum][ptr2] = all_data[bandnum][i][excl[pres]]
            ptr2+=1

In [110]:
toSelect = 5 #number of top features to select

train_feature_vectors = np.zeros((total_words, training_amt,toSelect * tEx))
test_feature_vectors = np.zeros((total_words, testing_amt, toSelect * tEx))
timeSequences = np.zeros((total_words,5,12,training_amt,256,tEx))

In [111]:
time_pointer = 0
for t in range(tStart, tEnd-tWidth+1, tIncr):
    tEx_pointer = 0
    for tEStart in range(t,t+tWidth-tEx+1,tNtoAvg):
        timeSequences[:,:,time_pointer,:,:,tEx_pointer] = np.average(TrainingData[:,:,:,:,tEStart:tEStart+tNtoAvg], axis = 4)
        tEx_pointer+=1
    time_pointer+=1

btcwpv_matrix = np.transpose(timeSequences, (1, 2, 4, 0, 3, 5)) #band,time,channel,word,pres,value matrix in that order
print(btcwpv_matrix.shape)

(5, 12, 256, 63, 8, 10)


In [112]:
index_to_cat = ["Tools","Animals","Buildings","Body Parts","Furniture","Vehicles","Kitchen Utensils", 
"Building Parts", "Clothing", "Insects", "Vegetables","Man-made objects"]

_b_fix = 0
_t_fix = 2
_c_fix = 100

def word_averaged(word, btcwpv_matrix):
    word_averaged = np.zeros((10)) 
    for pres_counter in range(8):
        word_averaged += btcwpv_matrix[_b_fix, _t_fix, _c_fix, word, pres_counter]
    return word_averaged
            
for cat in range(12):
    M1 = 0
    for word in category_info[cat]:
        if word!=-1:
            for pres in range(8):
                for other_word in category_info[cat]:
                        if other_word!=-1:
                            M1 += np.linalg.norm(btcwpv_matrix[_b_fix, _t_fix, _c_fix, word, pres] 
                                                 - word_averaged(other_word, btcwpv_matrix))
    print("\t" + index_to_cat[cat] + " " + str(M1/(lengths[cat][0]*8*lengths[cat][0])))

print("=====================================================")


for cat1 in range(12):
    print(index_to_cat[cat1])
    for cat2 in range(12):
        M2 = 0
        for word in category_info[cat1]:
            if word!=-1:
                for pres in range(8):
                    for other_word in category_info[cat2]:
                        if other_word!=-1:
                            M2 += np.linalg.norm(btcwpv_matrix[_b_fix, _t_fix, _c_fix, word, pres] 
                                                 - word_averaged(other_word, btcwpv_matrix))
        print("\t" + index_to_cat[cat2] + " " + str(M2/(lengths[cat1][0]*8*lengths[cat2][0])))


	Tools 29.951406328
	Animals 31.5223048562
	Buildings 25.3300220892
	Body Parts 20.282115635
	Furniture 29.9131665937
	Vehicles 30.3536025496
	Kitchen Utensils 19.770727642
	Building Parts 28.2914487204
	Clothing 35.0384274227
	Insects 57.5985880093
	Vegetables 26.6776789374
	Man-made objects 30.3252509962
Tools
	Tools 29.951406328
	Animals 32.6344233015
	Buildings 26.9983456565
	Body Parts 19.7452904928
	Furniture 30.418159969
	Vehicles 30.8741838788
	Kitchen Utensils 19.0480165789
	Building Parts 28.4404169875
	Clothing 35.3989675172
	Insects 64.1922018265
	Vegetables 29.1278673491
	Man-made objects 30.207744081
Animals
	Tools 29.8103551773
	Animals 31.5223048562
	Buildings 26.0374081308
	Body Parts 20.2700329637
	Furniture 29.5408058501
	Vehicles 30.5271059493
	Kitchen Utensils 19.3174109158
	Building Parts 28.4644898801
	Clothing 34.8436647652
	Insects 62.3108959825
	Vegetables 27.6314792585
	Man-made objects 30.2742084909
Buildings
	Tools 29.3181951256
	Animals 31.2671112937
	Buil

In [113]:
tools = 0
animals = 1

#store_values_arr = np.zeros((5*12*256, 4))

store_values_counter = 0

store_vals_heap = []
for _b_fix in range(5):
    for _t_fix in range(12):
        for _c_fix in range(256):
            
            TvT = 0
            TvA = 0 
            for word in category_info[tools]:
                if word!=-1:
                    for pres in range(8):
                        for other_word in category_info[tools]:
                            if other_word!=-1:
                                TvT += np.linalg.norm(btcwpv_matrix[_b_fix, _t_fix, _c_fix, word, pres] 
                                                     - word_averaged(other_word, btcwpv_matrix))
                                
                        for other_word in category_info[animals]:
                            if other_word!=-1:
                                TvA += np.linalg.norm(btcwpv_matrix[_b_fix, _t_fix, _c_fix, word, pres] 
                                                        - word_averaged(other_word, btcwpv_matrix))
            
            TvT = TvT/(lengths[tools][0]*8*lengths[tools][0])
            TvA = TvA/(lengths[tools][0]*8*lengths[animals][0])
            
            #store_values_arr[store_values_counter, 0] = TvA - TvT
            #store_values_arr[store_values_counter, 1] = _b_fix
            #store_values_arr[store_values_counter, 2] = _t_fix
            #store_values_arr[store_values_counter, 3] = _c_fix
            
            store_values_counter+=1
            
            if store_values_counter%150==0:
                print(store_values_counter)
            
            store_vals_heap.append((TvA-TvT, _b_fix, _t_fix, _c_fix))

heapq._heapify_max(store_vals_heap)
for i in range(5):
    print(heappop(store_vals_heap))

            
            

150
300
450
600
750
900
1050
1200
1350
1500
1650
1800
1950
2100
2250
2400
2550
2700
2850
3000
3150
3300
3450
3600
3750
3900
4050
4200
4350
4500
4650
4800
4950
5100
5250
5400
5550
5700
5850
6000
6150
6300
6450
6600
6750
6900
7050
7200
7350
7500
7650
7800
7950
8100
8250
8400
8550
8700
8850
9000
9150
9300
9450
9600
9750
9900
10050
10200
10350
10500
10650
10800
10950
11100
11250
11400
11550
11700
11850
12000
12150
12300
12450
12600
12750
12900
13050
13200
13350
13500
13650
13800
13950
14100
14250
14400
14550
14700
14850
15000
15150
15300
(99.223998464555663, 0, 10, 254)
(-9.467752802818147, 0, 1, 223)
(-3.9894980054611944, 4, 11, 254)
(76.425304476261232, 0, 3, 243)
(37.765130176222961, 0, 6, 39)


In [101]:
max1 = 0
_btc1 = (0,0,0)
for i in store_values_arr:
    if i[0] > max1:
        max1 = i[0]
        _btc1 = (i[1], i[2], i[3])
        
max2 = 0
_btc2 = (0,0,0)
for i in store_values_arr:
    if i[0] > max2:
        if i[0] < max1:
            max2 = i[0]
            _btc1 = (i[1], i[2], i[3])
        
max3 = 0
_btc3 = (0,0,0)
for i in store_values_arr:
    if i[0] > max3:
        if i[0] < max2:
            max3 = i[0]
            _btc1 = (i[1], i[2], i[3])

print(max1, max2,max2)
print(_btc1)
print(_btc2)
print(_btc3)

99.2239984646 93.7660544526 93.7660544526
(0.0, 2.0, 243.0)
(0, 0, 0)
(0, 0, 0)


In [None]:
num words
num bands
num time sections
num presentations
num channels
num timesteps

In [None]:
useless = ''' 
for wordnum in range(total_words):
    SHheap = [] #heap of BTC + featurevector information used to select top 400
    
    for band_num in range(5): #frequency bands
        time_pointer=0
        for t in range(tStart, tEnd-tWidth+1, tIncr): #various starts of time slice
            for channel in range(256): #eeg channels

                #pairwise correlations
                avg_p = 0
    
                for i in range(training_amt-1):
                    for j in range(i+1,training_amt):

                        avg_p += pearsonr(timeSequences[wordnum][band_num][time_pointer][i][channel],timeSequences[wordnum][band_num][time_pointer][j][channel])[0]

                avg_p /= training_amt*(training_amt-1)/2 #want to maximize
                
                if(len(SHheap)<400):
                    heappush(SHheap, (avg_p,band_num,t,channel, timeSequences[wordnum,band_num,time_pointer,:,channel]))
                else:
                    heappushpop(SHheap, (avg_p,band_num,t,channel, timeSequences[wordnum,band_num,time_pointer,:,channel]))
            time_pointer+=1

    print("Word " + str(wordnum))

    
    current_matrix = np.zeros( (training_amt,0))
    test_matrix = np.zeros( (testing_amt,0))
    
    for i in range(400):
        (avg_p,band_num,t,channel, timeSequenc) = heappop(SHheap)
        if(i>=400-toSelect):
            print(str(400-i) + ". " + str(band_num) + "   " + str(t) + "   " + str(channel) + "   " + str(avg_p))
            current_matrix = np.hstack( (current_matrix,timeSequenc))

            #construct testing matrix
            tmpo = np.zeros( (testing_amt,tEx))
            for itero in range(testing_amt):
                pp = 0
                for tEStart in range(t,t+tWidth-tEx+1,tEx):
                    tmpo[itero][pp] = np.average(TestingData[wordnum,band_num,itero,channel,tEStart:tEStart+int(tWidth/tEx)])
                    pp+=1
            test_matrix = np.hstack( (test_matrix,tmpo) )
            
    train_feature_vectors[wordnum] = current_matrix
    test_feature_vectors[wordnum] = test_matrix 
    ''' 