## Sift Evaluator

The objectives of this notebook are to:

- Define functions that query the data based on different parameters (distance metric, transformations?)
- Define functions to evaluate the truth of each returned query parameter
- Define functions to calculate mAP and precision@k for the above output
- Investigate the effects of different parameters on mAP, p@k.

______
# Functions
____
### Imports

In [52]:
import numpy as np
import unittest
import sklearn.metrics.pairwise
import pandas as pd
import os

____
### Query Functions

In [12]:
def basic_query(query_image, query_target):
    """Return the indexes of the query_target images, arranged in ascending euclidean distance as compared to the query image"""
    
    # copy pasted code to investigate
    query = query_image.reshape((1, -1))
    D = sklearn.metrics.pairwise.euclidean_distances(query_target, query).squeeze()
    index = np.argsort(D)

    return(index)

_____
### Query to Truth Value Functions

In [43]:
def query_results_to_truth_values(query_image_building, query_results, image_names):
    """Convert the index results of a query to an array of booleans corresponding to whether the correct image was retrieved."""
    return([query_image_building == image_names[index] for index in query_results])

_______
### Truth Value Metrics Functions

In [40]:
def precision_at_k(truth_values, k, warnings=True):
    """Return proportions of true values in the first k elements.
    If warnings=True and all true values occur before the kth element, raise an error and print the value at k and the value at the last true value."""
    p_at_k = truth_values[:k].count(True) / k

    ## Complete code to output p at k, p at last value
    if warnings:
        if k < len(truth_values):
            if truth_values[k:].count(True) == 0:
                raise ValueError("All true values are before the first k values")
    
    return(p_at_k)


## Create a more computationally efficient version using true positives, false positives?
def average_precision(truth_values):
    """Given a boolean input of whether returned query values are correct or false, return the average precision.
    e.g. average_precision([True, True, False, True]) ~ 0.85
    """
    precisions = []
    for (index, val) in enumerate(truth_values):
        if val: # == True
            precisions.append(truth_values[:index + 1].count(True) / (index + 1))      

    return(np.mean(precisions))

#tbd
def mean_average_precision():
    pass

___________
### Tests

In [44]:
class TestQuery(unittest.TestCase):

    def test_basicquery(self):
        test_query = np.array([0,0,0,0])
        test_query_target = pd.DataFrame(data={"a" : [4,0,0,0],
                             "b" : [0,1,2,0],
                             "c" : [0,0,0,0],
                             "d" : [0,0,0,6]})
        self.assertTrue(np.allclose(basic_query(test_query, test_query_target), [1,2,0,3]))

class TestTruthValues(unittest.TestCase):

    def test_queryresultstotruthvalues(self):
        test_query_image_building = "A"
        test_query_results = [0,4,3,2,1,5,6]
        test_image_names = ["A", "B", "C", "D", "A", "A", "D"]
        results = query_results_to_truth_values(test_query_image_building, test_query_results, test_image_names)
        self.assertTrue(np.array_equal(results, [True, True, False, False, False, True, False]))

class TestMetrics(unittest.TestCase):

    def test_precision_at_k(self):
        self.assertEqual(precision_at_k([True,False,True, True],4), 0.75)
        self.assertEqual(precision_at_k([True, False, True, False, True], 2), 0.5)

    def test_averageprecision(self):
        self.assertAlmostEqual(average_precision([True, False, True, True]), np.mean([1,2/3,3/4]))
        self.assertAlmostEqual(average_precision([False, False, True, False, True]), np.mean([1/3,2/5]))
    
    # def test_meanaverageprecision(self)


unittest.main(argv=[''], verbosity=2, exit=False)

test_averageprecision (__main__.TestMetrics) ... ok
test_precision_at_k (__main__.TestMetrics) ... ok
test_basicquery (__main__.TestQuery) ... ok
test_queryresultstotruthvalues (__main__.TestTruthValues) ... ok

----------------------------------------------------------------------
Ran 4 tests in 0.002s

OK


<unittest.main.TestProgram at 0x7fe3389009a0>

___________
# Evaluate Data

_____________
## Load Data

In [93]:
BASE_DIR = "/home/sean/Code/Pawsey/oxford_data/"

def load_data(name):
    """Returns a dictionary of attributes and 1 of features for train or test data"""

    attributes = {}
    
    for img_att in ["names", "pixels", "images"]:
        attributes[img_att] = np.load(BASE_DIR + name + "_" + img_att + ".npy", allow_pickle=True)

    features = {}

    os.chdir(BASE_DIR + "NPY files for BoVW/")
    for bovw_size in os.listdir():
        features[bovw_size] = pd.DataFrame(np.load(bovw_size + "/BoW_" + name.capitalize() + ".npy"))
    
    return(attributes, features)

(test_attributes, test_features) = load_data("test")
(train_attributes, train_features) = load_data("train")
train_features["bovw files for 10 Words"]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.314357,0.166189,0.288328,0.203231,0.247281,0.408464,0.269306,0.321365,0.408464,0.422480
1,0.323421,0.171378,0.285630,0.188076,0.267174,0.565107,0.249596,0.313753,0.304965,0.323421
2,0.382649,0.202579,0.280109,0.296366,0.217585,0.437671,0.257600,0.335131,0.290113,0.380148
3,0.298072,0.195457,0.276083,0.195457,0.305401,0.559495,0.309066,0.285856,0.273640,0.316396
4,0.336860,0.148814,0.304391,0.257042,0.228632,0.274629,0.247572,0.403149,0.347683,0.482968
...,...,...,...,...,...,...,...,...,...,...
562,0.321308,0.251836,0.334334,0.238810,0.258349,0.492817,0.240981,0.329992,0.240981,0.360386
563,0.312773,0.237406,0.221256,0.224486,0.179266,0.216949,0.197031,0.332153,0.299315,0.654617
564,0.352977,0.186982,0.301461,0.268072,0.233728,0.417848,0.262348,0.374919,0.237544,0.427388
565,0.222150,0.391848,0.283858,0.212894,0.235520,0.376421,0.280773,0.266374,0.401104,0.404190


____
### Apply Transformations

In [98]:
train_transformed = {}
test_transformed = {}

for bovw_size in train_features.keys():
    full_df = pd.concat([train_features[bovw_size], test_features[bovw_size]], ignore_index = True)

    transformed_dfs = {"Vector normalised" : sklearn.preprocessing.normalize(full_df, axis = 1),
                           "Feature normalised" : sklearn.preprocessing.normalize(full_df, axis = 0)}
    
    for transformation_name in transformed_dfs.keys():
        train_transformed[transformation_name] = {}
        test_transformed[transformation_name] = {}

    for (transformation_name, transformed_df) in transformed_dfs.items():
        train_transformed[transformation_name][bovw_size] = transformed_df[:567,:]
        test_transformed[transformation_name][bovw_size] = transformed_df[567:,:]

In [103]:
for bovw_size in train_transformed["Vector normalised"].keys():
    print(bovw_size)

bovw files for 50000 Words
