In [1]:
import numpy as np
from scipy.stats import pearsonr
import heapq
from heapq import heappush, heappop, heappushpop
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import itertools
import pickle

In [2]:
all_data = np.load("all_data.npy") #holds all the data from channels
category_info = np.load("words_in_categories.npy") #category_info[cat][ptr] returns the number of the word(0...62) of the ptr'th word in the category cat
lengths = np.load("category_lengths.npy") #lengths[cat] is the number of words in category cat

In [3]:
total_words = 63 

tStart = 0 #start time
tEnd = 650 #end time
tWidth = 100 #width of time slice
tIncr = 50 #increment in start time
tEx = 10 #number of examples to downsample to
tNtoAvg = int(tWidth/tEx) #number of timestep values to average to form one example

training_amt = 8 #8 examples for training, 2 for testing
testing_amt = 10 - training_amt

np.random.seed(63)

In [4]:
TrainingData = np.zeros((total_words,5,training_amt,256,650))#gives the pertinent data from all_data for the two categories
TestingData = np.zeros( (total_words,5,testing_amt,256,650)) #^
wordptr = -1 #the index of the current word, iterates from 0...total_words
for i in range(63):
    wordptr+=1

    excl = [-1]*10 #excl[j] = the j'th presentation number which should be saved for testing (e.g. excl[0] = 0 means the first presentation of the wordptr'th word should be saved for testing). Ignore -1's.
    
    for pres in range(testing_amt):
        while(1): #this loop repeatedly generates a random presentation until one which hasn't been reserved for testing has been found, and then breaks it
            nxtrand = np.random.randint(0,10)
            if(excl[nxtrand]==-1):
                excl[nxtrand]=nxtrand
                break
    for bandnum in range(5):
        ptr2 = 0 #points to which presentation(0...9) of wordptr'th word we are currently copying to TrainingData
        for pres in range(10):
            if(excl[pres]!=-1): #if reserved for testing, don't include in training data
                continue
           
            TrainingData[wordptr][bandnum][ptr2]=all_data[bandnum][i][pres] #sets the channel x time matrix for TrainingData[bandnum][wordptr][ptr2]
            ptr2+=1 #move to next presentation

    for bandnum in range(5): #this loop is same as above, except now we only want the testing presentations
        ptr2=0
        for pres in range(10):
            if(excl[pres]==-1):
                continue
            TestingData[wordptr][bandnum][ptr2] = all_data[bandnum][i][excl[pres]]
            ptr2+=1

In [5]:
toSelect = 5 #number of top features to select

train_feature_vectors = np.zeros((total_words, training_amt,toSelect * tEx))
test_feature_vectors = np.zeros((total_words, testing_amt, toSelect * tEx))
timeSequences = np.zeros((total_words,5,12,training_amt,256,tEx))
testTimeSequences = np.zeros((total_words,5,12,testing_amt,256,tEx))

In [6]:
time_pointer = 0
for t in range(tStart, tEnd-tWidth+1, tIncr):
    tEx_pointer = 0
    for tEStart in range(t,t+tWidth-tEx+1,tNtoAvg):
        timeSequences[:,:,time_pointer,:,:,tEx_pointer] = np.average(TrainingData[:,:,:,:,tEStart:tEStart+tNtoAvg], axis = 4)
        testTimeSequences[:,:,time_pointer,:,:,tEx_pointer] = np.average(TestingData[:,:,:,:,tEStart:tEStart+tNtoAvg], axis=4)
        tEx_pointer+=1
    time_pointer+=1

btcwpv_matrix = np.transpose(timeSequences, (1, 2, 4, 0, 3, 5)) #band,time,channel,word,pres,value matrix in that order
btcwpv_matrix_test = np.transpose(testTimeSequences, (1,2,4,0,3,5))
print(btcwpv_matrix.shape)

(5, 12, 256, 63, 8, 10)


In [7]:
from sklearn.preprocessing import MinMaxScaler

def get_train_matrices(band, time, channel, cat1, cat2):
    #btcwpv_matrix
    #btcwpv_matrix_test
    final_matrix_x = np.zeros((int(2*training_amt*(lengths[cat1][0]+lengths[cat2][0])),10))
    final_matrix_y = np.zeros((int(2*training_amt*(lengths[cat1][0]+lengths[cat2][0])),))
    rowptr = 0
    for pres in range(training_amt):
        for word in category_info[cat1]:
            if(word!=-1):
                final_matrix_x[rowptr,:] = btcwpv_matrix[band,time,channel,word,pres,:]
                final_matrix_y[rowptr] = 0
                rowptr+=1
        for word in category_info[cat2]:
            if(word!=-1):
                final_matrix_x[rowptr,:] = btcwpv_matrix[band,time,channel,word,pres,:]
                final_matrix_y[rowptr] = 1
                rowptr+=1
    return (final_matrix_x,final_matrix_y)

def get_test_matrices(band, time, channel, cat1, cat2):
    #btcwpv_matrix
    #btcwpv_matrix_test
    final_matrix_x = np.zeros((int(2*testing_amt*(lengths[cat1][0]+lengths[cat2][0])),10))
    final_matrix_y = np.zeros((int(2*testing_amt*(lengths[cat1][0]+lengths[cat2][0])),))
    rowptr = 0
    for pres in range(testing_amt):
        for word in category_info[cat1]:
            if(word!=-1):
                final_matrix_x[rowptr,:] = btcwpv_matrix_test[band,time,channel,word,pres,:]
                final_matrix_y[rowptr] = 0
                rowptr+=1
        for word in category_info[cat2]:
            if(word!=-1):
                final_matrix_x[rowptr,:] = btcwpv_matrix_test[band,time,channel,word,pres,:]
                final_matrix_y[rowptr] = 1
                rowptr+=1
    return (final_matrix_x,final_matrix_y)


def get_acc(trainx_matrix, trainy_matrix):
    #ADD CODE HERE
    #run 4 fold cross validation to get accuracy
    #C = 1 should be fixed
    #for each fold, apply MinMaxScaler to training data 
    #fit linear svm
    foldlen = 2*training_amt/4
    avgscore = 0
    for fold in range(4):
        mytrainx = np.concatenate((trainx_matrix[:int(fold*foldlen)], trainx_matrix[int((fold+1)*foldlen):]),axis=0)
        mytrainy = np.concatenate((trainy_matrix[:int(fold*foldlen)], trainy_matrix[int((fold+1)*foldlen):]),axis=0)
        mytestx = trainx_matrix[int(fold*foldlen):int((fold+1)*foldlen)]
        mytesty = trainy_matrix[int(fold*foldlen):int((fold+1)*foldlen)]

        scaler = MinMaxScaler(feature_range=(0,1))
        mytrainx = scaler.fit_transform(mytrainx)
        mytestx = scaler.transform(mytestx)

        clf = LinearSVC(C = 1, random_state = 63)
        clf.fit(mytrainx, mytrainy)
        score = clf.score(mytestx,mytesty)
        avgscore+=score
    avgscore/=4
    
    return avgscore
    

def hill_climb(cat1, cat2):
#     print((int((2*training_amt*(lengths[cat1][0]+lengths[cat2][0])))))
#     print(np.zeros((176,0)))
    current_train_matrix = np.zeros((int(2*training_amt*(lengths[cat1][0]+lengths[cat2][0])),0))
    current_test_matrix = np.zeros((int(2*testing_amt*(lengths[cat1][0]+lengths[cat2][0])),0))

    
    
    trainy = get_train_matrices(0,0,0,cat1,cat2)[1]
    testy = get_test_matrices(0,0,0,cat1,cat2)[1]
    
    btc_count = 6
    seen = {}
#     for iteration in range(btc_count):
    for iteration in range(1):
        besto = (0,0,0,0)
        for band in range(5):
            for time in range(12):
                for channel in range(256): 
                    if channel==0:
                        print(seen)
                    if (band, time, channel) in seen: 
                        print("already picked", band, time, channel)
                    else:
                        newmatrixtrain = np.concatenate((current_train_matrix,get_train_matrices(band,time,channel,cat1,cat2)[0]), axis = 1)
                        newmatrixtest = np.concatenate((current_test_matrix,get_test_matrices(band,time,channel,cat1,cat2)[0]), axis = 1)

                        thisacc = get_acc(newmatrixtrain,trainy)
                        besto = max(besto, (thisacc,band,time,channel))
        print(besto)

        seen[(besto[1],besto[2],besto[3])]=True
        current_train_matrix = np.concatenate((current_train_matrix,get_train_matrices(besto[1],besto[2],besto[3],cat1,cat2)[0]), axis = 1)
        current_test_matrix = np.concatenate((current_test_matrix,get_test_matrices(besto[1],besto[2],besto[3],cat1,cat2)[0]), axis = 1)
        print("chose " + str(besto[1]) + " " + str(besto[2]) + " " + str(besto[3]))
    return (current_train_matrix,trainy,current_test_matrix,testy)
    
    

#ADD CODE HERE to scale data, fit linear svm to training data, test on test data, and output accuracy





In [8]:
(trainx,trainy,testx,testy) = hill_climb(0,1)

{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
(0.875, 3, 6, 223)
chose 3 6 223


In [28]:
np.count_nonzero(np.concatenate((trainy,testy)))
np.concatenate((trainy,testy)).shape

(220,)

In [87]:
scaler = MinMaxScaler(feature_range=(0,1))
trainx = scaler.fit_transform(trainx)
testx = scaler.transform(testx)
clf = LinearSVC(C = 1, random_state = 63)
clf.fit(trainx, trainy)
final_score = clf.score(testx,testy)

In [88]:
final_score

0.79545454545454541