## Sift Evaluator

The objectives of this notebook are to:

- Define functions that query the data based on different parameters (distance metric, transformations?)
- Define functions to evaluate whether each result returned by a query is correct
- Define functions to calculate mAP and precision@k from those truth values
- Create a pipeline for evaluating the effects of different parameter setups / transformations on mAP and p@k

______
# Evaluation Functions
____
### Imports

In [1]:
import numpy as np
import unittest
import sklearn.metrics.pairwise
import sklearn.preprocessing
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import glob

____
### Query Functions

In [2]:
def basic_query(query_image_features, query_target_features, metric_function = sklearn.metrics.pairwise.euclidean_distances):
    """Rank the target images by their metric value against the query image.

    Parameters
    ----------
    query_image_features : array with a ``reshape`` method
        Feature vector of the query image; it is flattened to a single row.
    query_target_features : 2-D array-like
        One feature row per target image.
    metric_function : callable
        Pairwise metric called as ``metric_function(targets, query_row)``.
        Defaults to sklearn's euclidean distance.

    Returns
    -------
    numpy.ndarray
        Target row indexes ordered by ascending metric value (for a distance
        metric, the closest target comes first).
    """
    # The pairwise metric expects 2-D input, so present the query as one row.
    query_row = query_image_features.reshape((1, -1))
    distances = metric_function(query_target_features, query_row)
    ranking = np.argsort(np.squeeze(distances))
    return ranking

def get_inliers(img1, img2, min_match_count=10):
    """Return the proportion of good SIFT matches that are RANSAC homography inliers.

    SIFT descriptors of both images are matched with FLANN (k=2), filtered
    with Lowe's ratio test (0.7), and a homography is fitted with RANSAC.
    The score is ``inliers / good matches``.

    Parameters
    ----------
    img1, img2 : image arrays accepted by ``cv2.SIFT_create().detectAndCompute``
        Neither image is modified.
    min_match_count : int
        More than this many ratio-test survivors are required before a
        homography is attempted.

    Returns
    -------
    float or int
        Inlier proportion in [0, 1], or 0 when there are too few descriptors
        or matches, or when no homography could be estimated.
    """
    sift = cv2.SIFT_create()
    # find the keypoints and descriptors with SIFT
    kp1, des1 = sift.detectAndCompute(img1, None)
    kp2, des2 = sift.detectAndCompute(img2, None)

    # detectAndCompute yields None descriptors for featureless images, and the
    # ratio test needs two neighbours per query descriptor.
    if des1 is None or des2 is None or len(des2) < 2:
        return 0

    FLANN_INDEX_KDTREE = 1
    index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
    search_params = dict(checks = 50)
    flann = cv2.FlannBasedMatcher(index_params, search_params)
    matches = flann.knnMatch(des1, des2, k=2)

    # store all the good matches as per Lowe's ratio test.
    good = []
    for pair in matches:
        if len(pair) < 2:
            # fewer than two neighbours returned: ratio test not applicable
            continue
        best, second = pair
        if best.distance < 0.7 * second.distance:
            good.append(best)

    if len(good) <= min_match_count:
        return 0

    src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
    dst_pts = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
    _, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
    if mask is None:
        # RANSAC could not estimate a homography
        return 0

    # Count the inliers first, THEN divide by the number of good matches.
    # (The previous code compared the mask against 1/len(mask), which is never
    # equal to 0 or 1, so the score was always 0.)
    inlier_mask = mask.ravel()
    inlier_prop = np.sum(inlier_mask == 1) / inlier_mask.size
    return inlier_prop

def ransac_query(query_image, query_target_images, metric_functin="unused"):
    """Rank target images by their get_inliers score against the query, best first.

    ``metric_functin`` is accepted only so the signature lines up with the
    other query functions; it is never used.

    Returns a numpy array of target indexes ordered by descending score.
    """
    scores = []
    for candidate in query_target_images:
        scores.append(get_inliers(query_image, candidate))
    # argsort is ascending; reverse it so the highest score comes first
    return np.argsort(scores)[::-1]


_____
### Query to Truth Value Functions

In [3]:
def query_results_to_truth_values(query_image_building, query_results, image_names):
    """Map ranked result indexes to booleans marking a correct retrieval.

    For every index in ``query_results`` (in order), the entry is the result of
    comparing ``image_names[index]`` with ``query_image_building``.
    Returns a list with one entry per result.
    """
    truth_values = []
    for result_index in query_results:
        retrieved_building = image_names[result_index]
        truth_values.append(query_image_building == retrieved_building)
    return truth_values

_______
### Truth Value Metrics Functions

In [4]:
## Potential Improvements
# 1. Precision_at_k to print precision at last true value?
# 2. Create a more computationally efficient (/combined?) version.
# 3. Add recall_at_k

def precision_at_k(truth_values, k, warnings=True):
    """Return the proportion of true values among the first k elements.

    Parameters
    ----------
    truth_values : iterable of bool
        Ranked relevance flags (list, tuple, numpy array, ...).
    k : int
        Cut-off rank; must be positive. If k exceeds the number of results the
        missing positions count as misses (the divisor is always k).
    warnings : bool
        If True and there are results beyond rank k but none of them is true
        (i.e. every true value already occurs within the first k), raise.

    Raises
    ------
    ValueError
        If k is not positive, or for the ``warnings`` condition above.
    """
    if k <= 0:
        # k == 0 would divide by zero; a negative k would slice from the end
        # and yield a meaningless negative "precision".
        raise ValueError("k must be a positive integer")

    # Materialise as a list so slicing and .count work for any iterable.
    truth_values = list(truth_values)
    p_at_k = truth_values[:k].count(True) / k

    if warnings and k < len(truth_values) and truth_values[k:].count(True) == 0:
        raise ValueError("All true values are before the first k values")

    return p_at_k


def average_precision(truth_values):
    """Return the average precision of a ranked list of relevance flags.

    The precision is evaluated at the rank of every true value and the results
    are averaged, e.g. average_precision([True, True, False, True])
    = mean(1/1, 2/2, 3/4) ~ 0.917.

    Parameters
    ----------
    truth_values : iterable of bool
        Whether each returned query result is correct, in rank order.

    Returns
    -------
    float
        The average precision, or 0.0 when no value is true (including empty
        input) so that one query without hits cannot turn a mean over
        queries into NaN.
    """
    hits = 0
    precisions = []
    # Keep a running hit count instead of recounting the prefix at every hit.
    for rank, is_relevant in enumerate(truth_values, start=1):
        if is_relevant:
            hits += 1
            precisions.append(hits / rank)

    if not precisions:
        return 0.0

    return np.mean(precisions)

____
### Total Metrics


In [5]:
def compute_metrics(train_features, test_features, train_names, test_names, query_function, metric_function, average_mean_precision = True, k_values = [5,10,20]):
    """Query the train set with every test row and summarise retrieval quality.

    Each row of ``test_features`` (iterated with ``.iterrows()``) is passed to
    ``query_function(row_values, train_features, metric_function)``. The
    returned ranking is converted to truth values by comparing ``train_names``
    against the matching entry of ``test_names``.

    ``average_mean_precision`` is currently unused.

    Returns
    -------
    tuple
        ``(mean average precision, {k: mean precision@k for k in k_values})``.
    """
    per_query_ap = []
    per_query_p_at_k = {k: [] for k in k_values}

    for (_, row), query_name in zip(test_features.iterrows(), test_names):
        # row.values is the plain feature array of this query
        ranking = query_function(row.values, train_features, metric_function)
        hits = query_results_to_truth_values(query_name, ranking, train_names)

        per_query_ap.append(average_precision(hits))
        for k, scores in per_query_p_at_k.items():
            scores.append(precision_at_k(hits, k, warnings=False))

    mean_p_at_k = {k: np.mean(scores) for k, scores in per_query_p_at_k.items()}
    return np.mean(per_query_ap), mean_p_at_k


## Outdated version

# def mean_average_precision(train_features, test_features, train_names, test_names, distance_metric):
#     """descriptive docstring do map yeah"""
#     average_precisions = []
    
#     for (test_feature, test_feature_name) in zip(test_features.iterrows(), test_names):
#         features_as_array = test_feature[1].values # extract the values for the iterrows row object
#         query_results = basic_query(features_as_array, train_features, distance_metric)
#         truth_values = query_results_to_truth_values(test_feature_name, query_results, train_names)
#         average_precisions.append(average_precision(truth_values))
    
#     return(np.mean(average_precisions))

___________
### Tests

In [6]:
class TestQuery(unittest.TestCase):
    """Checks for the query functions."""

    def test_basicquery(self):
        # Rows lie at distances 4, 1, 2 and 6 from the all-zero query,
        # so the expected ranking is rows 1, 2, 0, 3.
        query_vector = np.zeros(4, dtype=int)
        targets = pd.DataFrame(data={
            "a": [4, 0, 0, 0],
            "b": [0, 1, 2, 0],
            "c": [0, 0, 0, 0],
            "d": [0, 0, 0, 6],
        })
        ranking = basic_query(query_vector, targets)
        self.assertTrue(np.allclose(ranking, [1, 2, 0, 3]))

class TestTruthValues(unittest.TestCase):
    """Checks for converting ranked indexes into truth values."""

    def test_queryresultstotruthvalues(self):
        building = "A"
        ranked_indexes = [0, 4, 3, 2, 1, 5, 6]
        names = ["A", "B", "C", "D", "A", "A", "D"]
        expected = [True, True, False, False, False, True, False]

        actual = query_results_to_truth_values(building, ranked_indexes, names)
        self.assertTrue(np.array_equal(actual, expected))

class TestMetrics(unittest.TestCase):
    """Checks for the metrics computed from boolean truth-value lists."""

    def test_precision_at_k(self):
        cases = [
            ([True, False, True, True], 4, 0.75),
            ([True, False, True, False, True], 2, 0.5),
        ]
        for truth_values, k, expected in cases:
            self.assertEqual(precision_at_k(truth_values, k), expected)

    def test_averageprecision(self):
        cases = [
            ([True, False, True, True], np.mean([1, 2/3, 3/4])),
            ([False, False, True, False, True], np.mean([1/3, 2/5])),
        ]
        for truth_values, expected in cases:
            self.assertAlmostEqual(average_precision(truth_values), expected)

    # def test_meanaverageprecision(self)


# Run the test cases defined above inside the notebook.
# argv=[''] keeps unittest from parsing the kernel's own command-line arguments;
# exit=False keeps it from calling sys.exit() once the tests have run.
unittest.main(argv=[''], verbosity=2, exit=False)

test_averageprecision (__main__.TestMetrics) ... ok
test_precision_at_k (__main__.TestMetrics) ... ok
test_basicquery (__main__.TestQuery) ... ok
test_queryresultstotruthvalues (__main__.TestTruthValues) ... ok

----------------------------------------------------------------------
Ran 4 tests in 0.004s

OK


<unittest.main.TestProgram at 0x7fb72e788b20>

_______
# Data Analysis

_____________
## Load Data

In [7]:
def _load_image_folder(folder):
    """Read every readable image directly inside ``folder``.

    Returns ``(images, names)`` as parallel lists: RGB image arrays and, for
    each, the part of its file name before the first "-".
    """
    import warnings

    images = []
    names = []
    for img_path in glob.glob(os.path.join(folder, '*')):
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None for files it cannot decode; skip the file
            # (and its name, so both lists stay aligned) instead of crashing
            # in cvtColor.
            warnings.warn("Skipping unreadable image file: {}".format(img_path))
            continue
        # OpenCV loads images as BGR; convert to RGB
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        # basename handles the platform's path separator
        names.append(os.path.basename(img_path).split("-")[0])
    return images, names


def load_data(train_path, query_path):
    """Load the train and query images together with their class names.

    Every file directly inside each folder is read with OpenCV and converted
    to RGB. The class name of an image is the part of its file name before the
    first "-". Files that OpenCV cannot decode are skipped with a warning.
    Images are returned in the order glob yields them.

    Parameters
    ----------
    train_path, query_path : str
        Directories holding the train and query images.

    Returns
    -------
    tuple
        ``(train_images, train_names, query_images, query_names)``, each a list.
    """
    train_images, train_names = _load_image_folder(train_path)
    query_images, query_names = _load_image_folder(query_path)

    print("Train Images: {} | Query Images: {}".format(len(train_images), len(query_images)))
    return train_images, train_names, query_images, query_names

# `option` selects which subfolder of the roxford5k data is used as the train set.
# NOTE(review): these are absolute, machine-specific paths; they need to be
# changed before the notebook can run anywhere else.
option = 'easy'
train_path = "/home/sean/Code/Pawsey/3. Data/Revised and Sorted/roxford5k/{}".format(option)
query_path = "/home/sean/Code/Pawsey/3. Data/Revised and Sorted/roxford5k/query"

# Load the train and query images plus the names derived from their file names.
ox_easy_images, ox_easy_names, ox_query_images, ox_query_names = load_data(train_path, query_path)

Train Images: 516 | Query Images: 70


In [8]:
# Load name arrays from saved .npy files. This rebinds ox_easy_names and
# ox_query_names, replacing the lists returned by load_data above.
# NOTE(review): presumably these match the row order of the saved feature files
# loaded in the next cell — confirm.
ox_easy_names = np.load("/home/sean/Code/Pawsey/2. Revised Data Analysis/Oldenburger Data Files/names/ox_easy_names.npy")
ox_query_names = np.load("/home/sean/Code/Pawsey/2. Revised Data Analysis/Oldenburger Data Files/names/ox_query_names.npy")

In [9]:
# Load saved feature arrays and wrap them in DataFrames (compute_metrics walks
# the query frame with .iterrows()).
# NOTE(review): the files are named "-1000", yet the CSV preview further down
# shows feature columns 0-499 — confirm the feature dimensionality.
ox_easy = pd.DataFrame(np.load("/home/sean/Code/Pawsey/2. Revised Data Analysis/Oldenburger Data Files/SIFT/ox-easy-1000.npy"))
ox_query = pd.DataFrame(np.load("/home/sean/Code/Pawsey/2. Revised Data Analysis/Oldenburger Data Files/SIFT/ox-query-1000.npy"))

In [12]:
# Sample query validated against Sean Oldenburger's method
# The call returns (mean average precision, {k: mean precision@k}) using the
# default k values 5, 10 and 20.

compute_metrics(train_features = ox_easy,
                test_features = ox_query,
                train_names = ox_easy_names,
                test_names = ox_query_names,
                query_function=basic_query,
                metric_function = sklearn.metrics.pairwise.euclidean_distances)

(0.14765216960160718,
 {5: 0.14, 10: 0.13857142857142862, 20: 0.1464285714285714})

In [22]:
def bundle_and_save(features, buildings, filename, dir = "default_dir"):
    """Write the features plus a "building" column to a CSV file.

    Parameters
    ----------
    features : DataFrame or array-like
        Feature matrix, one row per image. The caller's object is not modified.
    buildings : sequence
        Building name for each feature row.
    filename : str
        Name of the output file, written without the index column.
    dir : str
        Directory that receives the file. It must already exist.
    """
    # Copy first: pd.DataFrame(existing_frame) may share data with the input,
    # so adding the column directly could alter the caller's frame.
    features_df = pd.DataFrame(features).copy()
    features_df["building"] = buildings

    # Build the target path instead of os.chdir(dir), which would permanently
    # change the working directory for every later cell.
    features_df.to_csv(os.path.join(dir, filename), index = False)

# Write the easy-set features together with their names to a file called "test".
bundle_and_save(ox_easy, ox_easy_names, "test", "/home/sean/Code/Pawsey/2. Revised Data Analysis/")

In [23]:
# Read the "test" file back from the directory used in the previous cell to inspect what was saved.
pd.read_csv("/home/sean/Code/Pawsey/2. Revised Data Analysis/test")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,building
0,-0.222562,0.107324,-0.096406,-0.042793,-0.166953,0.115800,0.023228,0.046255,-0.052713,0.037768,...,0.002296,-0.000992,0.004535,-0.000660,-0.002533,-0.000543,0.005779,0.004640,-0.001291,radcliffe_camera
1,-0.143654,-0.243126,-0.146313,0.148875,-0.114526,-0.113641,-0.064977,0.112074,-0.148714,0.028721,...,-0.000611,-0.001790,-0.004445,-0.005078,0.002574,0.002198,-0.003624,0.000931,-0.002079,all_souls
2,-0.182766,0.112678,-0.072608,-0.066618,-0.151153,0.145077,0.016828,-0.000937,-0.058925,0.030486,...,0.006297,0.002810,0.007026,-0.003762,-0.001334,0.003043,0.003580,0.000501,-0.001501,radcliffe_camera
3,-0.162958,-0.195413,-0.201296,-0.007927,0.006130,-0.013056,-0.074896,0.119956,-0.042482,0.024529,...,-0.000959,-0.000534,0.001421,0.004909,-0.002227,-0.001403,0.003236,-0.001630,0.001046,christ_church
4,-0.156370,-0.175493,-0.233327,0.003510,0.057135,0.071303,-0.063093,0.068845,0.037527,0.048635,...,-0.000473,-0.002242,0.002499,-0.001076,0.000346,-0.000612,0.001374,0.001420,0.002370,all_souls
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,0.008675,-0.364543,-0.034069,0.030281,-0.073821,-0.109432,0.161836,-0.136757,0.110383,0.058745,...,-0.001491,-0.001246,-0.001753,0.001646,0.000618,0.003733,0.000092,-0.000043,-0.000278,radcliffe_camera
512,-0.002103,-0.361036,0.056904,-0.049549,0.061208,-0.070501,0.100968,-0.003870,0.064122,-0.011988,...,-0.002792,-0.003951,-0.003102,-0.001206,-0.002207,-0.003269,0.003815,-0.003498,-0.001653,radcliffe_camera
513,0.010216,-0.358101,0.091496,0.008245,0.023822,-0.127268,0.205493,0.021845,0.175781,-0.181827,...,-0.001939,0.000826,0.004236,-0.005485,0.002993,-0.002058,0.002255,0.006579,0.004411,radcliffe_camera
514,0.165959,-0.310917,-0.216677,-0.067044,0.173877,0.053991,0.198769,-0.001055,-0.021094,0.125117,...,-0.000165,0.000492,0.000051,0.001506,0.001901,-0.000668,-0.000263,0.000772,0.000626,radcliffe_camera


In [1]:
# NOTE(review): this cell carries execution count 1 and its recorded output is a
# NameError, i.e. it was run in a session where ox_easy_names was not yet defined.
# Re-run after the loading cells, or remove this cell.
ox_easy_names

NameError: name 'ox_easy_names' is not defined