## Sift Evaluator

The objectives of this notebook are to:

- Define functions that query the data based on different parameters (distance metric, transformations?)
- Define functions to evaluate the truth of each returned query parameter
- Define functions to calculate mAP and precision@k for the above output
- Create a pipeline for evaluating the effects of different parameter set ups / transformations on mAP and p@k

______
# Evaluation Functions
____
### Imports

In [1]:
import numpy as np
import unittest
import sklearn.metrics.pairwise
import sklearn.preprocessing
import pandas as pd
import os

____
### Query Functions

In [2]:
def basic_query(query_image_features, query_target_features, metric_function = sklearn.metrics.pairwise.euclidean_distances):
    """Return the indexes of the query_target images, arranged in ascending euclidean distance as compared to the query image"""
    
    query = query_image_features.reshape((1, -1))
    D = metric_function(query_target_features, query).squeeze()
    index = np.argsort(D)

    return(index)

def reranked_query():
    """reranking query goes here"""
    pass

_____
### Query to Truth Value Functions

In [3]:
def query_results_to_truth_values(query_image_building, query_results, image_names):
    """Convert the index results of a query to an array of booleans corresponding to whether the correct image was retrieved."""
    return([query_image_building == image_names[index] for index in query_results])

_______
### Truth Value Metrics Functions

In [4]:
## Potential Improvements
# 1. Precision_at_k to print precision at last true value?
# 2. Create a more computationally efficient (/combined?) version.
# 3. Add recall_at_k

def precision_at_k(truth_values, k, warnings=True):
    """Return proportions of true values in the first k elements.
    If warnings=True and all true values occur before the kth element, raise an error"""
    p_at_k = truth_values[:k].count(True) / k

    if warnings:
        if k < len(truth_values):
            if truth_values[k:].count(True) == 0:
                raise ValueError("All true values are before the first k values")
    
    return(p_at_k)


def average_precision(truth_values):
    """Given a boolean input of whether returned query values are correct or false, return the average precision.
    e.g. average_precision([True, True, False, True]) ~ 0.85
    """
    precisions = []
    for (index, val) in enumerate(truth_values):
        if val: # == True
            precisions.append(truth_values[:index + 1].count(True) / (index + 1))      

    return(np.mean(precisions))

____
### Total Metrics


In [18]:
precisions_at_k = {}

In [36]:
help([].append)

Help on built-in function append:

append(object, /) method of builtins.list instance
    Append object to the end of the list.



In [26]:
a = precisions_at_k.get(1, 2)
print(a)

None


In [40]:
def compute_metrics(train_features, test_features, train_names, test_names, query_function, metric_function, average_mean_precision = True, k_values = [5,10,20]):
    """Run each test feature against the train features,"""
    average_precisions = []
    precisions_at_k = {}
    for k in k_values:
        precisions_at_k[k] = []
    
    for (test_feature, test_feature_name) in zip(test_features.iterrows(), test_names):
        query_image_features = test_feature[1].values # extract the values for the iterrows row object
        query_results = query_function(query_image_features = query_image_features, query_target_features = train_features, metric_function = metric_function)
        truth_values = query_results_to_truth_values(test_feature_name, query_results, train_names)
        
        average_precisions.append(average_precision(truth_values))
        for k in precisions_at_k:
            p_at_k = precision_at_k(truth_values, k, warnings=False)
            precisions_at_k[k].append(p_at_k)
    
    for (k_value, list_of_precisions) in precisions_at_k.items():
        precisions_at_k[k_value] = np.mean(list_of_precisions)
    
    return(np.mean(average_precisions), precisions_at_k)


## Outdated version

# def mean_average_precision(train_features, test_features, train_names, test_names, distance_metric):
#     """descriptive docstring do map yeah"""
#     average_precisions = []
    
#     for (test_feature, test_feature_name) in zip(test_features.iterrows(), test_names):
#         features_as_array = test_feature[1].values # extract the values for the iterrows row object
#         query_results = basic_query(features_as_array, train_features, distance_metric)
#         truth_values = query_results_to_truth_values(test_feature_name, query_results, train_names)
#         average_precisions.append(average_precision(truth_values))
    
#     return(np.mean(average_precisions))

___________
### Tests

In [6]:
class TestQuery(unittest.TestCase):

    def test_basicquery(self):
        test_query = np.array([0,0,0,0])
        test_query_target = pd.DataFrame(data={"a" : [4,0,0,0],
                             "b" : [0,1,2,0],
                             "c" : [0,0,0,0],
                             "d" : [0,0,0,6]})
        self.assertTrue(np.allclose(basic_query(test_query, test_query_target), [1,2,0,3]))

class TestTruthValues(unittest.TestCase):

    def test_queryresultstotruthvalues(self):
        test_query_image_building = "A"
        test_query_results = [0,4,3,2,1,5,6]
        test_image_names = ["A", "B", "C", "D", "A", "A", "D"]
        results = query_results_to_truth_values(test_query_image_building, test_query_results, test_image_names)
        self.assertTrue(np.array_equal(results, [True, True, False, False, False, True, False]))

class TestMetrics(unittest.TestCase):

    def test_precision_at_k(self):
        self.assertEqual(precision_at_k([True,False,True, True],4), 0.75)
        self.assertEqual(precision_at_k([True, False, True, False, True], 2), 0.5)

    def test_averageprecision(self):
        self.assertAlmostEqual(average_precision([True, False, True, True]), np.mean([1,2/3,3/4]))
        self.assertAlmostEqual(average_precision([False, False, True, False, True]), np.mean([1/3,2/5]))
    
    # def test_meanaverageprecision(self)


unittest.main(argv=[''], verbosity=2, exit=False)

test_averageprecision (__main__.TestMetrics) ... ok
test_precision_at_k (__main__.TestMetrics) ... ok
test_basicquery (__main__.TestQuery) ... ok
test_queryresultstotruthvalues (__main__.TestTruthValues) ... ok

----------------------------------------------------------------------
Ran 4 tests in 0.002s

OK


<unittest.main.TestProgram at 0x7fe3fd90a580>

_______
# Data Analysis

_____________
## Load Data

In [7]:
BASE_DIR = "/home/sean/Code/Pawsey/oxford_data/"

def load_oxford_5k_data(name):
    """Returns a dictionary of attributes and 1 of features for train or test data"""

    attributes = {}
    
    for img_att in ["names", "pixels", "images"]:
        attributes[img_att] = np.load(BASE_DIR + name + "_" + img_att + ".npy", allow_pickle=True)

    features = {}

    os.chdir(BASE_DIR + "NPY files for BoVW/")
    for bovw_size in os.listdir():
        features[bovw_size] = pd.DataFrame(np.load(bovw_size + "/BoW_" + name.capitalize() + ".npy"))
    
    return(attributes, features)

oxford5k = {"raw features" : {}, "attributes" : {}}
(oxford5k["attributes"]["test"], oxford5k["raw features"]["test"]) = load_oxford_5k_data("test")
(oxford5k["attributes"]["train"], oxford5k["raw features"]["train"]) = load_oxford_5k_data("train")

oxford5k["raw features"]["test"]["bovw files for 10 Words"].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.37925,0.124875,0.319125,0.205812,0.183844,0.361906,0.250906,0.449781,0.333,0.394281
1,0.323421,0.171378,0.28563,0.188076,0.267174,0.565107,0.249596,0.313753,0.304965,0.323421
2,0.298072,0.195457,0.276083,0.195457,0.305401,0.559495,0.309066,0.285856,0.27364,0.316396
3,0.406106,0.192947,0.2738,0.302589,0.258487,0.199684,0.336891,0.485123,0.294014,0.298914
4,0.382649,0.202579,0.280109,0.296366,0.217585,0.437671,0.2576,0.335131,0.290113,0.380148


____
### Apply Vertical and Horizontal Normalisation

In [8]:
oxford5k["vector norm"] = {"test" : {}, "train": {}}
oxford5k["feature norm"] = {"test" : {}, "train": {}}

for bovw_size in oxford5k["raw features"]["train"].keys():
    full_df = pd.concat([oxford5k["raw features"]["train"][bovw_size],
                         oxford5k["raw features"]["test"][bovw_size]],
                         ignore_index=True)

    feature_norm = sklearn.preprocessing.normalize(full_df, axis=0)
    oxford5k["feature norm"]["train"][bovw_size] = pd.DataFrame(feature_norm[:567,:])
    oxford5k["feature norm"]["test"][bovw_size] = pd.DataFrame(feature_norm[567:,:])

    vector_norm = sklearn.preprocessing.normalize(full_df, axis=1)
    oxford5k["vector norm"]["train"][bovw_size] = pd.DataFrame(vector_norm[:567,:])
    oxford5k["vector norm"]["test"][bovw_size] = pd.DataFrame(vector_norm[567:,:])

oxford5k["feature norm"]["test"]["bovw files for 100 Words"].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.033827,0.035713,0.030498,0.018938,0.022437,0.022456,0.077555,0.051392,0.013825,0.056622,...,0.04686,0.058236,0.041051,0.047852,0.024786,0.018312,0.036336,0.060646,0.023192,0.043599
1,0.01127,0.065442,0.036901,0.039666,0.030566,0.033093,0.029715,0.048888,0.045024,0.027695,...,0.039898,0.036126,0.046472,0.051336,0.035612,0.022452,0.031476,0.024603,0.034499,0.037122
2,0.010666,0.061128,0.029933,0.061474,0.016788,0.028634,0.026782,0.047735,0.044757,0.029954,...,0.016182,0.048841,0.028997,0.037181,0.025678,0.025295,0.022914,0.040654,0.039574,0.035131
3,0.02723,0.038022,0.045059,0.039964,0.046902,0.053649,0.048526,0.089966,0.048509,0.033818,...,0.038093,0.073704,0.023961,0.02858,0.029607,0.038643,0.034912,0.041547,0.030148,0.044364
4,0.007698,0.048766,0.026887,0.038329,0.011744,0.04521,0.044462,0.070499,0.050655,0.037836,...,0.040881,0.033386,0.017581,0.030057,0.020542,0.017892,0.02977,0.063489,0.031421,0.020286


In [9]:
# ## actual correct way to do transformations, currently not working?

# # normalising the columns
# train_feature_norm = {}
# test_feature_norm = {}

# # normalising the rows
# train_vector_norm = {}
# test_vector_norm = {}

# for bovw_size in train_features.keys():
#     train = train_features[bovw_size]
#     test = test_features[bovw_size]

#     row_normaliser = sklearn.preprocessing.Normalizer().fit(train)
#     train_vector_norm[bovw_size] = row_normaliser.transform(train)
#     test_vector_norm[bovw_size] = row_normaliser.transform(test)

#     # rotate dataframes to normalise by column
#     temp_train = train.T
#     temp_test = test.T
#     column_normaliser = sklearn.preprocessing.Normalizer().fit(temp_train)
#     train_feature_norm[bovw_size] = column_normaliser.transform(temp_train).T
#     test_feature_norm[bovw_size] = column_normaliser.transform(temp_test).T

# train_feature_norm(["bovw files for 10 Words"]) == sklearn.preprocessing.normalize(train_features["bovw files for 10 Words"], axis=0)

# dat = pd.DataFrame({1:[10,-10, 5], 2:[4,-1, 4]})

# def normalise_vector(vector, mean=None, stdev=None):
#     if not mean:
#         mean = np.mean(vector)
#     if not stdev:
#         stdev = np.std(vector)
#     return((vector-mean)/stdev)

# dat.apply(normalise_vector, axis=0)

_____
### Sample Query

In [41]:
# Sample query validated against Sean Oldenburger's method

compute_metrics(train_features = oxford5k["raw features"]["train"]["bovw files for 10 Words"],
                test_features = oxford5k["raw features"]["test"]["bovw files for 10 Words"],
                train_names = oxford5k["attributes"]["train"]["names"],
                test_names = oxford5k["attributes"]["test"]["names"],
                query_function=basic_query,
                metric_function = sklearn.metrics.pairwise.euclidean_distances)

(0.2846602906055916,
 {5: 0.4763636363636364, 10: 0.3636363636363636, 20: 0.2818181818181818})

____
### Test all experimental setups

In [50]:
test_distance_metrics = {"euclidean" : sklearn.metrics.pairwise.euclidean_distances,
           "cosine" : sklearn.metrics.pairwise.cosine_distances,
           "manhattan" :sklearn.metrics.pairwise.manhattan_distances,
           "nan_euclidean" : sklearn.metrics.pairwise.nan_euclidean_distances}

In [53]:
results = {"distance_metrics" : [], 'transformations' : [], 'bovw_sizes' : [], 'mean_aps' : []}
k_vals = [5, 10, 20]
for k in k_vals:
    results["precision at {}".format(k)] = []

for (metric_name, metric_function) in test_distance_metrics.items():
    for  transformation in ["raw features", "vector norm", "feature norm"]:
        train = oxford5k[transformation]["train"]
        test = oxford5k[transformation]["test"]
        
        for bovw_size in train.keys():
            (mean_ap, p_at_k) = compute_metrics(train_features = train[bovw_size],
                                                test_features = test[bovw_size],
                                                train_names = oxford5k["attributes"]["train"]["names"],
                                                test_names = oxford5k["attributes"]["test"]["names"],
                                                query_function=basic_query,
                                                metric_function = metric_function,
                                                k_values=k_vals)
            
            print("{} {} {} : {}".format(metric_name, transformation, bovw_size, mean_ap))
            results['distance_metrics'].append(metric_name)
            results['transformations'].append(transformation)
            results['bovw_sizes'].append(bovw_size)
            results['mean_aps'].append(mean_ap)
            for (k, p_at_k) in p_at_k.items():
                results["precision at {}".format(k)] = p_at_k


results_df = pd.DataFrame(results)
results_df


euclidean raw features bovw files for 10000 Words : 0.5160431683671248
euclidean raw features bovw files for 10 Words : 0.2846602906055916
euclidean raw features bovw files for 100 Words : 0.3836962289612868
euclidean raw features bovw files for 20000 Words : 0.5679781603977279
euclidean raw features bovw files for 100000 Words : 0.6682471101801445
euclidean raw features bovw files for 1000 Words : 0.4096111116568395
euclidean raw features bovw files for 50000 Words : 0.6198841133826765
euclidean vector norm bovw files for 10000 Words : 0.5160431683671248
euclidean vector norm bovw files for 10 Words : 0.2846602906055916
euclidean vector norm bovw files for 100 Words : 0.3836962289612868
euclidean vector norm bovw files for 20000 Words : 0.5679781603977279
euclidean vector norm bovw files for 100000 Words : 0.6682471101801445
euclidean vector norm bovw files for 1000 Words : 0.4096111116568395
euclidean vector norm bovw files for 50000 Words : 0.6198841133826765
euclidean feature norm 

Unnamed: 0,distance_metrics,transformations,bovw_sizes,mean_aps,precision at 5,precision at 10,precision at 20
0,euclidean,raw features,bovw files for 10000 Words,0.516043,0.301818,0.238182,0.184545
1,euclidean,raw features,bovw files for 10 Words,0.284660,0.301818,0.238182,0.184545
2,euclidean,raw features,bovw files for 100 Words,0.383696,0.301818,0.238182,0.184545
3,euclidean,raw features,bovw files for 20000 Words,0.567978,0.301818,0.238182,0.184545
4,euclidean,raw features,bovw files for 100000 Words,0.668247,0.301818,0.238182,0.184545
...,...,...,...,...,...,...,...
79,nan_euclidean,feature norm,bovw files for 100 Words,0.413200,0.301818,0.238182,0.184545
80,nan_euclidean,feature norm,bovw files for 20000 Words,0.212736,0.301818,0.238182,0.184545
81,nan_euclidean,feature norm,bovw files for 100000 Words,0.204114,0.301818,0.238182,0.184545
82,nan_euclidean,feature norm,bovw files for 1000 Words,0.284265,0.301818,0.238182,0.184545


____
### Visualising results