In [3]:
import numpy as np
from scipy.stats import pearsonr
import heapq
from heapq import heappush, heappop, heappushpop
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import itertools
import pickle

In [4]:
all_data = np.load("all_data.npy") #holds all the data from channels
category_info = np.load("words_in_categories.npy") #category_info[cat][ptr] returns the number of the word(0...62) of the ptr'th word in the category cat
lengths = np.load("category_lengths.npy") #lengths[cat] is the number of words in category cat

In [5]:
total_words = 63 

tStart = 0 #start time
tEnd = 650 #end time
tWidth = 100 #width of time slice
tIncr = 50 #increment in start time
tEx = 10 #number of examples to downsample to
tNtoAvg = int(tWidth/tEx) #number of timestep values to average to form one example

training_amt = 8 #8 examples for training, 2 for testing
testing_amt = 10 - training_amt

np.random.seed(63)

In [6]:
TrainingData = np.zeros((total_words,5,training_amt,256,650))#gives the pertinent data from all_data for the two categories
TestingData = np.zeros( (total_words,5,testing_amt,256,650)) #^
wordptr = -1 #the index of the current word, iterates from 0...total_words
for i in range(63):
    wordptr+=1

    excl = [-1]*10 #excl[j] = the j'th presentation number which should be saved for testing (e.g. excl[0] = 0 means the first presentation of the wordptr'th word should be saved for testing). Ignore -1's.
    
    for pres in range(testing_amt):
        while(1): #this loop repeatedly generates a random presentation until one which hasn't been reserved for testing has been found, and then breaks it
            nxtrand = np.random.randint(0,10)
            if(excl[nxtrand]==-1):
                excl[nxtrand]=nxtrand
                break
    for bandnum in range(5):
        ptr2 = 0 #points to which presentation(0...9) of wordptr'th word we are currently copying to TrainingData
        for pres in range(10):
            if(excl[pres]!=-1): #if reserved for testing, don't include in training data
                continue
           
            TrainingData[wordptr][bandnum][ptr2]=all_data[bandnum][i][pres] #sets the channel x time matrix for TrainingData[bandnum][wordptr][ptr2]
            ptr2+=1 #move to next presentation

    for bandnum in range(5): #this loop is same as above, except now we only want the testing presentations
        ptr2=0
        for pres in range(10):
            if(excl[pres]==-1):
                continue
            TestingData[wordptr][bandnum][ptr2] = all_data[bandnum][i][excl[pres]]
            ptr2+=1

In [7]:
toSelect = 5 #number of top features to select

train_feature_vectors = np.zeros((total_words, training_amt,toSelect * tEx))
test_feature_vectors = np.zeros((total_words, testing_amt, toSelect * tEx))
timeSequences = np.zeros((total_words,5,12,training_amt,256,tEx))
testTimeSequences = np.zeros((total_words,5,12,testing_amt,256,tEx))

In [8]:
time_pointer = 0
for t in range(tStart, tEnd-tWidth+1, tIncr):
    tEx_pointer = 0
    for tEStart in range(t,t+tWidth-tEx+1,tNtoAvg):
        timeSequences[:,:,time_pointer,:,:,tEx_pointer] = np.average(TrainingData[:,:,:,:,tEStart:tEStart+tNtoAvg], axis = 4)
        testTimeSequences[:,:,time_pointer,:,:,tEx_pointer] = np.average(TestingData[:,:,:,:,tEStart:tEStart+tNtoAvg], axis=4)
        tEx_pointer+=1
    time_pointer+=1

btcwpv_matrix = np.transpose(timeSequences, (1, 2, 4, 0, 3, 5)) #band,time,channel,word,pres,value matrix in that order
btcwpv_matrix_test = np.transpose(testTimeSequences, (1,2,4,0,3,5))
print(btcwpv_matrix.shape)

(5, 12, 256, 63, 8, 10)


In [9]:
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

index_to_cat = ["Tools","Animals","Buildings","Body Parts","Furniture","Vehicles","Kitchen Utensils", 
"Building Parts", "Clothing", "Insects", "Vegetables","Man-made objects"]

tools = 0
animals = 1

def word_averaged(word, btcwpv_matrix, _b, _t, _c):
    word_averaged = np.zeros((10)) 
    for pres_counter in range(8):
        word_averaged += btcwpv_matrix[_b, _t, _c, word, pres_counter]
    word_averaged/=8
    return word_averaged

for tool in category_info[tools]:
    if tool!=-1:
        
        BEST = [0]*4
        
        for _b in range(5):
            for _t in range(12):
                for _c in range(256):
                    
                    averaged_tool = word_averaged(tool, btcwpv_matrix, _b, _t, _c)

                    rmse = 0
                    for pres in range(8):
                        #calculate rmse
                        #rmse += sqrt(mean_squared_error(averaged_tool, btcwpv_matrix[_b,_t,_c,tool,pres]))
                        rmse += np.linalg.norm(btcwpv_matrix[_b, _t, _c, tool, pres]-averaged_tool)**2
                    rmse/=8
                    dist = 0
                    c1 = 0
                    for animal in category_info[animals]:
                        if animal!=-1:
                            tmpdis = 0
                            averaged_animal = word_averaged(animal, btcwpv_matrix, _b, _t, _c)
                            tmpdis = np.linalg.norm(averaged_tool-averaged_animal) #* np.linalg.norm(averaged_tool-averaged_animal)
                            
                            averaged_animal = word_averaged(animal,btcwpv_matrix,_b,_t,_c)
                            rmseanimal = 0
                            for pres in range(8):
                                rmseanimal += np.linalg.norm(btcwpv_matrix[_b,_t,_c,animal,pres]-averaged_animal)
                            rmseanimal/=8
                            dist += (max(0,tmpdis-rmseanimal))**2
                            
                            c1+=1
                    dist/=c1
        
                    MEASURE = dist/rmse
                    
                    if (MEASURE > BEST[0]):
                        BEST[0] = dist/rmse
                        BEST[1] = _b
                        BEST[2] = _t
                        BEST[3] = _c
        print("\n")
        print("BTC for " + str(tool) + ": " + str(BEST))
        averaged_tool = word_averaged(tool, btcwpv_matrix, BEST[1], BEST[2], BEST[3])
        for tool2 in category_info[tools]:
            if(tool2==-1):
                continue
            averaged_tool2 = word_averaged(tool2, btcwpv_matrix, BEST[1], BEST[2], BEST[3])
            
            rmse = 0
            dist = np.linalg.norm(averaged_tool2 - averaged_tool)
            for pres in range(8):
                rmse += np.linalg.norm(btcwpv_matrix[BEST[1],BEST[2],BEST[3], tool2, pres] - averaged_tool2)
            rmse/=8
            print("For tool " + str(tool2) + " rmse is : " + str(rmse))
            print("For tool " + str(tool2) + " distance is : " + str(dist))
        
        for animal2 in category_info[animals]:
            if(animal2==-1):
                continue
            averaged_animal = word_averaged(animal2, btcwpv_matrix, BEST[1], BEST[2], BEST[3])
            dist = np.linalg.norm(averaged_animal - averaged_tool)
            rmse = 0
            for pres in range(8):
                rmse += np.linalg.norm(btcwpv_matrix[BEST[1],BEST[2],BEST[3], animal2, pres] - averaged_animal)
            rmse/=8
            print("For animal " + str(animal2) + " rmse is : " + str(rmse))
            print("For animal " + str(animal2) + " distance is : " + str(dist))
        
        '''
        print("\t", "RMSE FROM OTHER TOOL PRESENTATIONS:", rmse/8)
        print("-")
        print("\t", "DISTANCE FROM ANIMALS PRESENTATIONS:")
        for animal in category_info[animals]:
            if animal!=-1:
                averaged_animal = word_averaged(animal, btcwpv_matrix, _b, _t, _c)
                print("\t", np.linalg.norm(word_averaged(tool, btcwpv_matrix, BEST[1], BEST[2], BEST[3])-averaged_animal))
        '''




BTC for 5: [2.362288198878256, 0, 6, 42]
For tool 5 rmse is : 3.17086360204
For tool 5 distance is : 0.0
For tool 17 rmse is : 3.74814177225
For tool 17 distance is : 12.6018709198
For tool 29 rmse is : 5.96903215152
For tool 29 distance is : 9.19007138229
For tool 41 rmse is : 4.53503485058
For tool 41 distance is : 3.56336135948
For tool 53 rmse is : 2.58485173923
For tool 53 distance is : 8.46253100275
For tool 62 rmse is : 5.00278473586
For tool 62 distance is : 6.31269972888
For animal 3 rmse is : 5.29008127436
For animal 3 distance is : 9.46861402866
For animal 15 rmse is : 8.02744391842
For animal 15 distance is : 5.94210547411
For animal 27 rmse is : 4.77363073372
For animal 27 distance is : 11.8530980158
For animal 39 rmse is : 3.41226375392
For animal 39 distance is : 6.00143050641
For animal 51 rmse is : 3.57606405071
For animal 51 distance is : 11.541820683


BTC for 17: [0.96546277072147357, 0, 7, 237]
For tool 5 rmse is : 28.8637369551
For tool 5 distance is : 37.142794

TypeError: slice indices must be integers or None or have an __index__ method

In [54]:
from sklearn.preprocessing import MinMaxScaler

def btc_acc(band, time, channel, word1, word2): #returns the percent accuracy after training a model on the btc's data
    trainx_matrix = np.zeros( (2*training_amt, tEx))
    trainy_matrix = np.zeros( (2*training_amt, ))
    testx_matrix = np.zeros( (2*testing_amt, tEx))
    testy_matrix = np.zeros( (2*testing_amt, ))
    for i in range(training_amt):
        trainx_matrix[2*i,:] = btcwpv_matrix[band,time,channel,word1,i,:]
        trainx_matrix[2*i+1,:] = btcwpv_matrix[band,time,channel,word2,i,:]
        trainy_matrix[2*i] = 0
        trainy_matrix[2*i+1] = 1
    for i in range(testing_amt):
        testx_matrix[2*i,:] = btcwpv_matrix_test[band,time,channel,word1,i,:]
        testx_matrix[2*i+1,:] = btcwpv_matrix_test[band,time,channel,word2,i,:]
        testy_matrix[2*i] = 0
        testy_matrix[2*i+1] = 1
    clist = np.logspace(start=-3,stop=4,num=100)
    bestsc = -1
    bestc = -1
    for c in clist:
        foldlen = 2*training_amt/4
        avgscore = 0
        for fold in range(4):
            mytrainx = np.concatenate((trainx_matrix[:int(fold*foldlen)], trainx_matrix[int((fold+1)*foldlen):]),axis=0)
            mytrainy = np.concatenate((trainy_matrix[:int(fold*foldlen)], trainy_matrix[int((fold+1)*foldlen):]),axis=0)
            mytestx = trainx_matrix[int(fold*foldlen):int((fold+1)*foldlen)]
            mytesty = trainy_matrix[int(fold*foldlen):int((fold+1)*foldlen)]
            
            
            scaler = MinMaxScaler(feature_range=(0,1))
            mytrainx = scaler.fit_transform(mytrainx)
            mytestx = scaler.transform(mytestx)
            
            clf = LinearSVC(C = c, random_state = 63)
            clf.fit(mytrainx, mytrainy)
            score = clf.score(mytestx,mytesty)
            avgscore+=score
        avgscore/=4
        if(avgscore>bestsc):
            bestsc=avgscore
            bestc = c
    scaler = MinMaxScaler(feature_range=(0,1))
    print(testx_matrix[0])
    print(testx_matrix[1])
    print(testx_matrix[2])
    print(testx_matrix[3])

    trainx_matrix = scaler.fit_transform(trainx_matrix)
    testx_matrix = scaler.transform(testx_matrix)
    print(bestc)
    print(bestsc)
    bestc = 100000
    clf = LinearSVC(C=bestc, random_state = 63)
    
    clf.fit(trainx_matrix, trainy_matrix)
    print(clf.score(trainx_matrix, trainy_matrix))
    print(clf.predict(testx_matrix))
    print(testy_matrix)

    return clf.score(testx_matrix, testy_matrix)
        
print(btc_acc(0,6,42,5,51))

[-3.22227433 -3.11685929 -3.0008584  -2.8658186  -2.7218807  -2.56755664
 -2.40145986 -2.23296673 -2.05304937 -1.87083862]
[-4.02166452 -4.38038621 -4.72099624 -5.0273376  -5.30267973 -5.53654289
 -5.72490177 -5.87152667 -5.96616912 -6.01825261]
[ 5.88960052  6.01219745  6.10028796  6.15786991  6.16900582  6.15433083
  6.09979978  6.01512775  5.90984693  5.77335048]
[ 0.87819385  0.77463113  0.67344813  0.58772972  0.5122684   0.45735884
  0.42487273  0.41258237  0.43054756  0.46974561]
1.78864952906
0.9375
1.0
[ 1.  1.  0.  0.]
[ 0.  1.  0.  1.]
0.5


In [None]:
def get_train_matrices(band, time, channel, cat1, cat2):
    #btcwpv_matrix
    #btcwpv_matrix_test
    final_matrix_x = np.zeros((2*training_amt*(lengths[cat1]+lengths[cat2]),10))
    final_matrix_y = np.zeros((2*training_amt*(lengths[cat1]+lengths[cat2]),))
    rowptr = 0
    for pres in range(training_amt):
        for word in category_info[cat1]:
            if(word!=-1):
                final_matrix_x[rowptr,:] = btcwpv_matrix[band,time,channel,word,pres,:]
                final_matrix_y[rowptr,:] = 0
                rowptr+=1
        for word in category_info[cat2]:
            if(word!=-1):
                final_matrix_x[rowptr,:] = btcwpv_matrix[band,time,channel,word,pres,:]
                final_matrix_y[rowptr,:] = 1
                rowptr+=1
    return (final_matrix_x,final_matrix_y)

def get_test_matrices(band, time, channel, cat1, cat2):
    #btcwpv_matrix
    #btcwpv_matrix_test
    final_matrix_x = np.zeros((2*testing_amt*(lengths[cat1]+lengths[cat2]),10))
    final_matrix_y = np.zeros((2*testing_amt*(lengths[cat1]+lengths[cat2]),))
    rowptr = 0
    for pres in range(testing_amt):
        for word in category_info[cat1]:
            if(word!=-1):
                final_matrix_x[rowptr,:] = btcwpv_matrix_test[band,time,channel,word,pres,:]
                final_matrix_y[rowptr,:] = 0
                rowptr+=1
        for word in category_info[cat2]:
            if(word!=-1):
                final_matrix_x[rowptr,:] = btcwpv_matrix_test[band,time,channel,word,pres,:]
                final_matrix_y[rowptr,:] = 1
                rowptr+=1
    return (final_matrix_x,final_matrix_y)


def get_acc(trainx, trainy):
    #ADD CODE HERE
    #run 4 fold cross validation to get accuracy
    #C = 1 should be fixed
    #for each fold, apply MinMaxScaler to training data 
    #fit linear svm
    

def hill_climb(cat1, cat2):
    current_train_matrix = np.zeros(2*training_amt*(lengths[cat1]+lengths[cat2]),0)
    current_test_matrix = np.zeros(2*testing_amt*(lengths[cat1]+lengths[cat2]),0)

    
    trainy = get_train_matrices(0,0,0,cat1,cat2)[1]
    testy = get_test_matrices(0,0,0,cat1,cat2)[1]
    
    btc_count = 6
    seen = {}
    for iteration in range(btc_count):
        besto = (0,0,0,0)
        for band in range(5):
            for time in range(12):
                for channel in range(256):
                    if(seen[(band,time,channel)]):
                        continue
                    newmatrixtrain = np.concatenate((current_train_matrix,get_train_matrices(band,time,channel,cat1,cat2)[0]), axis = 1)
                    newmatrixtest = np.concatenate((current_test_matrix,get_test_matrices(band,time,channel,cat1,cat2)[0]), axis = 1)
                    
                    thisacc = get_acc(newmatrixtrain,trainy)
                    besto = max(besto, (thisacc,band,time,channel))
        
        seen[(besto[1],besto[2],besto[3])]=True
        current_train_matrix = np.concatenate((current_train_matrix,get_train_matrices(besto[1],besto[2],besto[3],cat1,cat2)[0]), axis = 1)
        current_test_matrix = np.concatenate((current_test_matrix,get_test_matrices(besto[1],besto[2],besto[3],cat1,cat2)[0]), axis = 1)
        print("chose " + str(band) + " " + str(time) + " " + str(channel))
        return (current_train_matrix,trainy,current_test_matrix,testy)
    
    
(trainx,trainy,testx,testy) = hill_climb(0,1)
#ADD CODE HERE to scale data, fit linear svm to training data, test on test data, and output accuracy
