# Reranking Test Notebook

__Goals__
- Implement nqe
- Implement alpha query expansion
- Implement diffusion

____
## Imports

In [3]:
import numpy as np
import pandas as pd
import sklearn.metrics
import unittest
import sklearn.preprocessing
import os
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import PIL
import time
import glob
import copy

_____
## Metrics Functions

In [2]:
def basic_query(query_image_features, query_target_features, metric_function = sklearn.metrics.pairwise.euclidean_distances):
    """Rank the query targets by their distance to a single query image.

    Parameters:
        query_image_features: feature vector of the query image. Anything np.asarray can
            convert is accepted (ndarray, list, pandas Series or one-row DataFrame); it is
            flattened into a single row before being compared.
        query_target_features: one row of features per target image, handed to
            metric_function unchanged.
        metric_function: callable(targets, query) returning one distance per target row;
            defaults to pairwise euclidean distance.

    Returns:
        Array of positional target indexes, ordered by ascending distance to the query.
    """
    # pandas objects have no .reshape of their own, and the query expansion helpers in this
    # notebook can hand back a one-row DataFrame, so coerce to an ndarray first.
    query = np.asarray(query_image_features).reshape((1, -1))
    # ravel (rather than squeeze) keeps the distances 1-d even when there is a single target.
    D = np.ravel(metric_function(query_target_features, query))
    index = np.argsort(D)

    return(index)

In [3]:
def query_results_to_truth_values(query_image_building, query_results, image_names):
    """Flag each retrieved result as a correct or incorrect match.

    For every index in `query_results` (kept in the same order), the entry of `image_names`
    at that index is compared with `query_image_building` using `==`.

    Returns:
        A list with one comparison outcome per entry of `query_results`.
    """
    truth_values = []
    for result_index in query_results:
        retrieved_name = image_names[result_index]
        truth_values.append(query_image_building == retrieved_name)
    return truth_values

In [4]:
def precision_at_k(truth_values, k, warnings=True):
    """Return the fraction of the first k entries of `truth_values` that are True.

    The count is always divided by k, even when fewer than k entries are available.

    Raises:
        ValueError: when `warnings` is truthy, the list is longer than k, and no True value
            is found from position k onwards (i.e. every hit already sits in the first k).
    """
    hits_in_top_k = truth_values[:k].count(True)
    p_at_k = hits_in_top_k / k

    # Same short-circuit order as a nested check: flag first, then length, then the tail.
    if warnings and k < len(truth_values) and truth_values[k:].count(True) == 0:
        raise ValueError("All true values are before the first k values")

    return p_at_k


def average_precision(truth_values):
    """Return the average precision of a ranked sequence of hit/miss flags.

    The precision is taken at every position that holds a hit (hits so far / position,
    positions counted from 1) and those precisions are averaged.
    e.g. average_precision([True, True, False, True]) = (1/1 + 2/2 + 3/4) / 3 ~ 0.92

    Parameters:
        truth_values: iterable of booleans, in ranking order (list, tuple or array).

    Returns:
        The mean of the per-hit precisions, or nan when there is no hit at all
        (including an empty input).
    """
    precisions = []
    hits_so_far = 0  # running count, so each prefix is not recounted from scratch
    for (rank, val) in enumerate(truth_values, start=1):
        if val:
            hits_so_far += 1
            precisions.append(hits_so_far / rank)

    if not precisions:
        # Average precision is undefined without hits; return nan explicitly instead of
        # letting np.mean([]) emit a RuntimeWarning on the way to the same value.
        return(np.float64(np.nan))

    return(np.mean(precisions))

In [5]:
def compute_metrics(train_features, test_features, train_names, test_names, query_function, metric_function, average_mean_precision = True, k_values = [5,10,20]):
    """Query the train features with every test row and summarise retrieval quality.

    Each row of `test_features` (paired with the matching entry of `test_names`) is run
    through `query_function(row_values, train_features, metric_function)`. The returned
    indexes are converted to truth values against `train_names`, from which the average
    precision and the precision at each k in `k_values` are computed.

    `average_mean_precision` is accepted but not used by this function.

    Returns:
        (mean of the per-query average precisions, {k: mean precision at k}).
    """
    per_query_ap = []
    per_query_p_at_k = {k: [] for k in k_values}

    for (_, feature_row), building_name in zip(test_features.iterrows(), test_names):
        # iterrows yields (label, Series); only the raw values of the row are needed
        ranked_indexes = query_function(feature_row.values, train_features, metric_function)
        hits = query_results_to_truth_values(building_name, ranked_indexes, train_names)

        per_query_ap.append(average_precision(hits))
        for k, scores in per_query_p_at_k.items():
            scores.append(precision_at_k(hits, k, warnings=False))

    mean_p_at_k = {k: np.mean(scores) for k, scores in per_query_p_at_k.items()}
    return np.mean(per_query_ap), mean_p_at_k

____
## Query Expansion

In [6]:
def new_expanded_query(original_query_results, query_target_features, type = "n", n = 5):
    """Return an expanded query built from the top n results of an initial query.

    Parameters:
        original_query_results: positional indexes into `query_target_features`, best match
            first (the output of a ranking query).
        query_target_features: one row of features per target (DataFrame or 2-d array).
        type: how the top results are combined:
            "n"          - plain mean of the top n feature rows
            "linear1"    - weights (n - i) / n for the i-th result (i counted from 0)
            "linear2"    - the "linear1" weights plus 1
            "fractional" - weights 1 / rank (rank counted from 1)
            "alpha"      - not implemented
        n: number of top results to combine; when fewer results are available, the
            available ones are used with the leading weights.

    Returns:
        ndarray of shape (1, number of features) for every supported type.

    Raises:
        ValueError: for type "alpha", an unknown type, n < 1, or no results to combine.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    if type == "alpha":
        raise ValueError("Alpha query expansion not implemented")
    elif type == "n":
        weights = None  # np.average without weights is the plain mean
    elif type == "linear1":
        weights = [(n-i)/n for i in range(n)]
    elif type == "linear2":
        weights = [(n-i)/n + 1 for i in range(n)]
    elif type == "fractional":
        weights = [1/i for i in range(1, n+1)]
    else:
        raise ValueError("Unknown query expansion type: {!r}".format(type))

    # The results are positions (argsort output), so select rows by position and in ranking
    # order; a label slice such as .loc[:n] would take rows 0..n regardless of the ranking.
    top_n_indexes = np.asarray(original_query_results)[:n]
    top_n_features = np.asarray(query_target_features)[top_n_indexes]
    if len(top_n_features) == 0:
        raise ValueError("No query results available to build an expanded query")

    if weights is not None:
        weights = weights[:len(top_n_features)]

    expanded = np.average(top_n_features, axis=0, weights=weights)
    return(expanded.reshape((1, -1)))


def qe_query(query, query_target, type = "n", n=5, metric_function=sklearn.metrics.pairwise.euclidean_distances):
    """Query once, expand the query from that ranking, and return the re-query ranking.

    Steps: rank `query_target` against `query` with basic_query, build an expanded query
    with new_expanded_query(ranking, query_target, type, n), then rank `query_target`
    again against the expanded query. Only the second ranking is returned.

    NOTE: a later cell of this notebook defines another `qe_query` with a different
    parameter order; whichever cell was executed last is the one in effect.
    """
    first_pass = basic_query(query, query_target, metric_function)
    expanded = new_expanded_query(first_pass, query_target, type, n)
    return basic_query(expanded, query_target, metric_function)

____
## Query Expansion Take 2

In [11]:
# def basic_query(query_image_features, query_target_features, metric_function = sklearn.metrics.pairwise.euclidean_distances):
#     """Return the indexes of the query_target images, arranged in ascending euclidean distance as compared to the query image"""
    
#     query = query_image_features.reshape((1, -1))
#     D = metric_function(query_target_features, query).squeeze()
#     index = np.argsort(D)

#     return(index)

def new_query(features, weights):
    """Collapse a set of feature rows into one weighted-average row.

    Every column of `features` is reduced with np.average using `weights` (one weight per
    row), and the resulting per-column values are wrapped in a single-row DataFrame.
    """
    weighted_columns = features.apply(np.average, weights=weights)
    single_row = [weighted_columns]
    return pd.DataFrame(single_row)


def qe_query(query, query_target, metric_function=sklearn.metrics.pairwise.euclidean_distances, type="qe baseline", n=5, alpha=1):
    """Run a query with query expansion and return the re-ranked target indexes.

    Supported methods:
       - "qe baseline" : described in Total Recall (2007). The top n results of the original
                         query are kept as they are; the remaining results come from a second
                         query built from the mean of the top n feature rows. `alpha` is the
                         proportion of that mean in the second query, the rest being the
                         original query (alpha = 1: results after the top n are completely
                         determined by the top n).

    Parameters:
        query: feature vector of the query image (1-d array-like).
        query_target: one row of features per target (DataFrame or 2-d array).
        metric_function: distance function forwarded to basic_query.
        type: expansion method, see above.
        n: number of top results kept and combined into the second query.
        alpha: weight of the expanded query when blending with the original query.

    Returns:
        Array of positional target indexes: the original top n followed by the second
        query's ranking with those top n removed, so every target appears exactly once.

    Raises:
        ValueError: if `type` is not a supported method.
    """
    if type != "qe baseline":
        raise ValueError("Unsupported query expansion type: {!r}".format(type))

    original_results = basic_query(query, query_target, metric_function)

    # find top n results, combine top n into a new query, append results of new query to top n
    top_n = original_results[:n]  # positional indexes from argsort, not labels
    top_n_features = np.asarray(query_target)[top_n]
    second_query = top_n_features.mean(axis=0)

    if alpha != 1:
        original_query = np.asarray(query, dtype=float).ravel()
        second_query = (1 - alpha) * original_query + alpha * second_query

    new_results = basic_query(second_query, query_target, metric_function)
    pruned_new_results = new_results[np.logical_not(np.isin(new_results, top_n))]
    return(np.concatenate([top_n, pruned_new_results]))
    

In [4]:
# Two toy 5-element integer vectors (all ones / all zeros) for trying out averaging.
# NOTE: `new_query` is also the name of a function defined in another cell of this notebook;
# running this cell after that one rebinds the name to this array.
query = np.ones(5, dtype=int)
new_query = np.zeros(5, dtype=int)

In [9]:
# Stack the two vectors as rows and take the unweighted np.average of every column.
pd.DataFrame([query, new_query]).apply(np.average)

0    0.5
1    0.5
2    0.5
3    0.5
4    0.5
dtype: float64

In [18]:
# Scratch check of np.concatenate joining two arrays end to end.
# NOTE(review): `a` and `b` come from notebook state not defined in this cell — confirm their values.
np.concatenate([a, b])

array([1, 2, 3, 1, 2, 3])

_____
## Test Space

In [3]:
# Toy feature table: 4 rows (labels 0-3) and 3 identical columns, so row i holds i + 1 everywhere.
test_features = pd.DataFrame({1 : [1,2,3,4],
                              2 : [1,2,3,4],
                              3 : [1,2,3,4]})
# Pretend ranking returned by a query, best match first.
results = [2,0,1,3]

n = 2

# Pick the rows of the top n results, in ranking order (uses n rather than a hard-coded 2).
a = test_features.loc[results[:n]]
a

Unnamed: 0,1,2,3
2,3,3,3
0,1,1,1


In [1]:
def func(a, b, c):
    """Print the three arguments, one per line, in the order a, b, c."""
    print(a, b, c, sep="\n")

func("1", "2", "3")

1
2
3


In [9]:
def do_func(fn, **kwargs):
    """Call `fn` with a=1 and b=2 plus whatever extra keyword arguments were supplied."""
    fixed_arguments = {"a": 1, "b": 2}
    fn(**fixed_arguments, **kwargs)

# Forward c through **kwargs; do_func itself supplies a=1 and b=2.
do_func(func, c = 3)

1
2
3


In [4]:
# Column-wise mean of the rows in `a`, wrapped as a one-row frame and unwrapped to a 2-d array.
b = pd.DataFrame([a.apply(np.mean)]).values
b

array([[2., 2., 2.]])

In [14]:
# Same as the unweighted version, but a weighted column average: first row of `a` counts twice.
b2 = pd.DataFrame([a.apply(np.average, weights = [2,1])]).values
b2

array([[2.33333333, 2.33333333, 2.33333333]])

In [77]:
# Convert the per-column means of `a` straight into an array and check the resulting type.
c = np.array(a.apply(np.mean))
type(c)

numpy.ndarray

In [78]:
# Scratch check of the type returned by `.values`.
# NOTE(review): elsewhere in this notebook `b` is assigned from `....values`, i.e. already an
# ndarray (which has no `.values`); this output presumably predates that definition — confirm.
type(b.values)

numpy.ndarray

In [81]:
# Scratch check of the shape of `b.values`.
# NOTE(review): relies on `b` still being a pandas object at the time this cell ran — confirm.
np.shape(b.values)

(1, 3)

___
## Test AQE

In [17]:
# Load every SIFT feature CSV into sift_raw[dataset][num_features][difficulty].
# File names are split on "-" into exactly four parts: dataset, feature count, difficulty
# and a trailing part that is ignored.
basedir = "/home/sean/Code/Pawsey/3. Data/New_SIFT"

sift_raw = {"roxford5k" : {}, "rparis6k" : {}}

# The working directory is changed so the bare file names from os.listdir() can be read directly.
os.chdir(basedir + "/SIFT")
for file_name in os.listdir():
    split_name = file_name.split("-")
    dataset, num_features, difficulty, _ = split_name
    num_features = int(num_features)

    # setdefault creates the per-feature-count dict the first time that count is seen
    sift_raw[dataset].setdefault(num_features, {})[difficulty] = pd.read_csv(file_name)

sift_raw["roxford5k"][1000]["easy"].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.0,0.008328,0.0,0.213189,0.041077,0.0,0.011529,0.016828,0.016603,...,0.0,0.0,0.0,0.0,0.0,0.0,0.016982,0.0,0.0,0.011505
1,0.0,0.026717,0.024817,0.038176,0.018326,0.085688,0.042927,0.0,0.0,0.012369,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025303,0.0,0.019041,0.008571
2,0.0,0.002802,0.036436,0.016014,0.046126,0.005135,0.033442,0.010809,0.010518,0.005189,...,0.007394,0.0,0.0,0.0,0.0,0.0,0.018575,0.0,0.007988,0.003596
3,0.0,0.017057,0.015844,0.032496,0.0546,0.039075,0.023491,0.021934,0.0,0.0,...,0.022507,0.0,0.0,0.0,0.0,0.0,0.016154,0.0,0.0,0.021888
4,0.0,0.015522,0.050464,0.025876,0.088728,0.046228,0.103324,0.00998,0.0,0.0,...,0.0,0.0,0.011005,0.0,0.0,0.0,0.022051,0.0,0.0,0.0


In [18]:
# Load the saved name arrays into names[dataset][difficulty].
# Each file name loses its last 4 characters and is split on "-" into exactly four parts;
# only the first (dataset) and third (difficulty) are used.
names = {dataset_name: {} for dataset_name in ("roxford5k", "rparis6k")}

# Work from inside the names directory so os.listdir() entries can be loaded as-is.
os.chdir(basedir + "/names")
for data_file in os.listdir():
    split_name = data_file[:-4].split("-")
    dataset, _, difficulty, _ = split_name
    names[dataset][difficulty] = np.load(data_file)

names["rparis6k"]["easy"]

array(['invalides', 'moulinrouge', 'louvre', ..., 'eiffel', 'triomphe',
       'eiffel'], dtype='<U11')