In [11]:
import numpy as np
from scipy.stats import pearsonr
from heapq import heappush, heappop, heappushpop
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import itertools
import pickle

In [12]:
## cat vs cat
# Build one binary-classification dataset per unordered pair of categories:
# a train matrix / label vector and a test matrix / label vector, where the
# first category of the pair is labelled 0 and the second is labelled 1.
# All datasets are pickled together at the end of the cell.

#import data
best_feature_vectors = np.load("features_train.npy")
test_feature_vectors = np.load("features_test.npy")
lengths = np.load("category_lengths.npy")
category_info = np.load("words_in_categories.npy")

#define constants
toSelect = 5
tEx = 10
training_amt = 8    #presentations per word used for training
testing_amt = 2     #presentations per word used for testing
num_categories = 12
n_features = toSelect * tEx   #width of one flattened feature vector

#dictionaries to pickle data later, keyed by (cat1, cat2)
save_trainx = {}
save_trainy = {}
save_testx = {}
save_testy = {}

#loop through all pairs of categories
for cat1 in range(num_categories):
    for cat2 in range(cat1+1, num_categories):

        # Word count of the pair as recorded in `lengths`; it sizes the label
        # vectors. NOTE(review): assumes lengths[cat][0] equals the number of
        # non -1 entries in category_info[cat] - confirm against the data files.
        tot_words = int(lengths[cat1][0])+int(lengths[cat2][0])

        # Rows are collected in lists and concatenated once after the loops;
        # growing the matrix with np.concatenate per row copies the whole
        # matrix every time. The leading empty (0, n_features) array keeps the
        # result shape and dtype the same as incremental concatenation would
        # give, including the case where no row is added at all.
        train_rows = [np.zeros((0, n_features))]
        test_rows = [np.zeros((0, n_features))]
        trainy = np.zeros((training_amt * tot_words))
        testy = np.zeros((testing_amt * tot_words))

        ytraincnt = 0
        ytestcnt = 0

        #loop through every presentation (training_amt = 8)
        for pres in range(training_amt):
            #first category gets label 0, second category gets label 1
            for label, cat in ((0, cat1), (1, cat2)):
                for word in category_info[cat]:
                    #entries equal to -1 are skipped (presumably padding)
                    if word == -1:
                        continue

                    #add the feature vector of that word to the train rows
                    train_rows.append(np.reshape(best_feature_vectors[word][pres], (1, n_features)))
                    trainy[ytraincnt] = label
                    ytraincnt += 1

                    #do the same thing for test, but only for the first
                    #testing_amt = 2 presentations
                    if pres < testing_amt:
                        test_rows.append(np.reshape(test_feature_vectors[word][pres], (1, n_features)))
                        testy[ytestcnt] = label
                        ytestcnt += 1

        trainx = np.concatenate(train_rows, axis=0)
        testx = np.concatenate(test_rows, axis=0)

        #create dictionaries of the matrices
        save_trainx[(cat1,cat2)] = trainx
        save_trainy[(cat1,cat2)] = trainy
        save_testx[(cat1,cat2)] = testx
        save_testy[(cat1,cat2)] = testy

#save data as CategoryXCategory; the with-block guarantees the file is flushed and closed
with open("CategoryXCategory.p","wb") as pickle_file:
    pickle.dump((save_trainx, save_trainy, save_testx, save_testy), pickle_file)

In [13]:
## cat vs cat
# For every category pair: choose the LinearSVC regularisation strength C by
# 4-fold cross-validation on the training data, refit on the full training
# set with that C, and report the accuracy on the held-out test set. The mean
# test accuracy over all pairs is printed at the end.

index_to_cat = ["Tools","Animals","Buildings","Body Parts","Furniture","Vehicles","Kitchen Utensils", 
"Building Parts", "Clothing", "Insects", "Vegetables","Man-made objects"]

#load in matrices and extract data
file_name = 'CategoryXCategory.p' #change this appropriately
# NOTE: unpickling can execute arbitrary code - only load a file you produced yourself.
with open(file_name,"rb") as pickle_file:
    loaded_data = pickle.load(pickle_file)

dict_trainx = loaded_data[0]
dict_trainy = loaded_data[1]
dict_testx = loaded_data[2]
dict_testy = loaded_data[3]

#candidate C values, log-spaced between 1e-4 and 1e2
clist = np.logspace(-4,2,100)

#restating important constants, in case the two programs are not run together
training_amt = 8 #num of presentations used for training
testing_amt = 2 #num of presentations used for testing
toSelect = 5 #num of btc's selected
tEx = 10 #number of features per BTC vector

n_folds = 4      #number of cross-validation folds
random_seed = 0  #fixed seed so LinearSVC gives the same result on every run

#loop through each pair of categories
avgacc = 0
for pair in dict_trainx:
    trainx = dict_trainx[pair]
    trainy = dict_trainy[pair]
    testx = dict_testx[pair]
    testy = dict_testy[pair]

    # The fold split and its scaling do not depend on C, so they are prepared
    # once per pair instead of once per candidate C. Folds are contiguous
    # slices; rows beyond n_folds * fold_sz are never used for validation.
    fold_sz = int(trainx.shape[0]/n_folds)
    folds = []
    for fold in range(n_folds):
        lo = fold_sz*fold
        hi = fold_sz*(fold+1)
        valid_x = trainx[lo:hi]
        valid_y = np.ravel(trainy[lo:hi])
        tr_x = np.concatenate((trainx[:lo], trainx[hi:]), axis = 0)
        tr_y = np.ravel(np.concatenate((trainy[:lo], trainy[hi:]), axis = 0))

        #normalize data by having 0 mean and unit variance; the scaler is fit
        #on the training part only and then applied to the validation part
        scaler = StandardScaler()
        tr_x = scaler.fit_transform(tr_x)
        valid_x = scaler.transform(valid_x)
        folds.append((tr_x, tr_y, valid_x, valid_y))

    # Start from the first candidate with an accuracy below any reachable
    # value, so bst_c is always a valid positive C even when every candidate
    # scores 0 (a C of 0 is not accepted by LinearSVC).
    bst_acc = -1.0
    bst_c = clist[0]

    #cross validation to find the best C value (stored as bst_c)
    for c in clist:
        cross_avg_acc = 0
        for tr_x, tr_y, valid_x, valid_y in folds:
            classifier = LinearSVC(C = c, random_state = random_seed)
            classifier.fit(tr_x,tr_y)
            cross_avg_acc += (classifier.score(valid_x, valid_y))/float(n_folds)
        #strict '>' keeps the smallest C among ties
        if(cross_avg_acc > bst_acc):
            bst_acc = cross_avg_acc
            bst_c = c

    #use the C value that worked best (bst_c) for the final testing classifier
    clf = LinearSVC(C=bst_c, random_state=random_seed)

    scaler = StandardScaler()
    trainx = scaler.fit_transform(trainx)
    testx = scaler.transform(testx)
    trainy = np.ravel(trainy)
    testy = np.ravel(testy)

    clf.fit(trainx, trainy)
    myscore = clf.score(testx, testy)
    avgacc+=myscore

    print("For " + index_to_cat[pair[0]] + " and " + index_to_cat[pair[1]] + " we picked C = " + str(bst_c))
    print("Has accuracy " + str(myscore))
    print("=========")

#average over the pairs actually evaluated (66 for 12 categories)
print(avgacc/max(len(dict_trainx), 1))

For Vehicles and Insects we picked C = 2.00923300257
Has accuracy 0.65
For Furniture and Building Parts we picked C = 14.1747416293
Has accuracy 0.75
For Animals and Body Parts we picked C = 18.7381742286
Has accuracy 0.7
For Vegetables and Man-made objects we picked C = 24.7707635599
Has accuracy 0.8
For Furniture and Clothing we picked C = 6.13590727341
Has accuracy 0.5
For Vehicles and Kitchen Utensils we picked C = 0.657933224658
Has accuracy 0.75
For Buildings and Clothing we picked C = 4.64158883361
Has accuracy 0.708333333333
For Body Parts and Man-made objects we picked C = 1.1497569954
Has accuracy 0.8
For Tools and Building Parts we picked C = 0.756463327555
Has accuracy 0.772727272727
For Furniture and Kitchen Utensils we picked C = 0.376493580679
Has accuracy 0.65
For Clothing and Insects we picked C = 7.05480231072
Has accuracy 0.65
For Animals and Kitchen Utensils we picked C = 100.0
Has accuracy 0.5
For Tools and Vegetables we picked C = 0.0533669923121
Has accuracy 0.77

0.674421664194