In [5]:
import os
import numpy as np
import joblib
import pickle
from diffusion import Diffusion
from sklearn import preprocessing
from dataset import Dataset
from knn import KNN
from tqdm import tqdm
import unittest
import sklearn.metrics.pairwise
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import PIL
import time
import glob
import copy
import sklearn.decomposition
import rank

NOTEBOOK_DIR = "/home/sean/Code/Pawsey/2. Revised Data Analysis/diffusion"

In [6]:
def query_results_to_truth_values(query_image_building, query_results, image_names):
    """Map ranked result indices to per-result match flags.

    Each index in ``query_results`` is looked up in ``image_names`` and the
    name found there is compared for equality with ``query_image_building``.

    Returns:
        A list with one comparison result per entry of ``query_results``,
        in the same order.
    """
    truth_values = []
    for result_index in query_results:
        retrieved_name = image_names[result_index]
        truth_values.append(query_image_building == retrieved_name)
    return truth_values

## Potential Improvements
# 1. Should precision_at_k also report the precision at the position of the last true value?
# 2. Create a more computationally efficient (possibly combined) version of these metrics.
# 3. Add recall_at_k.

def precision_at_k(truth_values, k, warnings=True):
    """Return the proportion of true values among the first ``k`` elements.

    Args:
        truth_values: Ordered, sliceable sequence of booleans (list, tuple or
            1-D array), one entry per retrieved result.
        k: Positive cutoff. The denominator is always ``k``, even when fewer
            than ``k`` results are supplied.
        warnings: Accepted for backward compatibility only; it currently has
            no effect and no error is raised when all true values fall
            within the first ``k`` elements.

    Returns:
        The precision at ``k`` as a float in ``[0, 1]``.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    # Counting by truthiness (rather than list.count) also accepts tuples and
    # numpy boolean arrays.
    hits = sum(bool(value) for value in truth_values[:k])
    return hits / k


def average_precision(truth_values):
    """Return the average precision of a ranked list of boolean relevance flags.

    The precision is evaluated at every position that holds a true value and
    those precisions are averaged, so the result is normalised by the number
    of true values present in ``truth_values``.

    e.g. average_precision([True, True, False, True]) ~ 0.917
         (precisions 1/1, 2/2 and 3/4 averaged)

    Returns 0.0 when ``truth_values`` is empty or contains no true value, so
    that a query without any correct result does not turn a mean over many
    queries into NaN.
    """
    hits = 0
    precisions = []
    # A running hit counter avoids re-counting the prefix at every true value.
    for (rank, is_relevant) in enumerate(truth_values, start=1):
        if is_relevant:
            hits += 1
            precisions.append(hits / rank)

    if not precisions:
        return 0.0

    return np.mean(precisions)

def compute_metrics(train_features, test_features, train_names, test_names, query_function, metric_function, average_mean_precision = True, k_values = (5, 10, 20), **kwargs):
    """Run each test feature as a query against the train features and summarise retrieval quality.

    Args:
        train_features: Collection searched by ``query_function``.
        test_features: Object exposing ``iterrows()`` (presumably a pandas
            DataFrame with one query per row - confirm against callers).
        train_names: Names indexed by the positions that ``query_function`` returns.
        test_names: One expected name per row of ``test_features``.
        query_function: Called as ``query_function(row_values, train_features,
            metric_function, **kwargs)``; its result is used as an ordered
            sequence of indices into ``train_names``.
        metric_function: Passed through to ``query_function`` unchanged.
        average_mean_precision: Currently unused; kept for backward compatibility.
        k_values: Cutoffs at which precision@k is computed.
        **kwargs: Extra keyword arguments forwarded to ``query_function``.

    Returns:
        A tuple ``(mean_average_precision, precisions_at_k)`` where
        ``precisions_at_k`` maps each k to the mean precision@k over all queries.
    """
    average_precisions = []
    precisions_at_k = {k: [] for k in k_values}

    for ((_, test_row), test_feature_name) in zip(test_features.iterrows(), test_names):
        query_image_features = test_row.values  # iterrows yields (index, Series) pairs
        query_results = query_function(query_image_features, train_features, metric_function, **kwargs)
        truth_values = query_results_to_truth_values(test_feature_name, query_results, train_names)

        average_precisions.append(average_precision(truth_values))
        for (k, per_query_precisions) in precisions_at_k.items():
            per_query_precisions.append(precision_at_k(truth_values, k, warnings=False))

    # Collapse the per-query lists into a single mean per cutoff.
    mean_precisions_at_k = {k: np.mean(values) for (k, values) in precisions_at_k.items()}

    return (np.mean(average_precisions), mean_precisions_at_k)

____
## Convert Features to NumPy `.npy` Files

In [7]:
def conv_file_to_npy(input_fname, output_fname, input_type = "csv"):
    """Load an input file, convert the data to a numpy array and save it with ``np.save``.

    Args:
        input_fname: Path of the file to read.
        output_fname: Path of the file to write, passed to ``np.save``.
        input_type: Format of the input file. Only "csv" is supported.

    Raises:
        ValueError: If ``input_type`` is not a supported format. The check
            runs before any file is read or written.
    """
    if input_type != "csv":
        raise ValueError("Unsupported input_type {!r}; only 'csv' is supported".format(input_type))

    data = pd.read_csv(input_fname).to_numpy()
    np.save(output_fname, data)

In [8]:
basedir = "/home/sean/Code/Pawsey/3. Data/New_SIFT"

# Convert both SIFT CSV exports to .npy files under the notebook's data
# directory: the "easy" file becomes gallery.npy, the "querye" file queries.npy.
for (csv_path, npy_path) in [
    ("/home/sean/Code/Pawsey/3. Data/New_SIFT/SIFT/roxford5k-1000-easy-SIFT.csv", NOTEBOOK_DIR + "/data/gallery.npy"),
    ("/home/sean/Code/Pawsey/3. Data/New_SIFT/SIFT/roxford5k-1000-querye-SIFT.csv", NOTEBOOK_DIR + "/data/queries.npy"),
]:
    conv_file_to_npy(csv_path, npy_path)

____
## Run Diffusion

In [9]:
def return_ranks(queries, gallery):
    """Rank every gallery item for every query using offline diffusion results.

    Queries and gallery are stacked (queries first) and handed to ``Diffusion``;
    the offline results are L2-normalised per row, query rows are scored
    against gallery rows with a dot product, and gallery indices are returned
    sorted by descending score.

    Reads the module-level names ``cache_dir``, ``truncation_size`` and ``kd``,
    which must be defined before this function is called.
    """
    query_count = len(queries)
    stacked = np.vstack([queries, gallery])
    offline_results = Diffusion(stacked, cache_dir).get_offline_results(truncation_size, kd)
    normalized = preprocessing.normalize(offline_results, norm="l2", axis=1)

    # The first query_count rows belong to the queries, the rest to the gallery.
    query_rows = normalized[:query_count]
    gallery_rows = normalized[query_count:]
    similarity = query_rows @ gallery_rows.T

    # Negating the scores makes argsort return the best match first.
    return np.argsort(-similarity.todense())


In [36]:
DATA_DIR = "./data"
# directory to cache files
TMP_DIR = "./tmp"
# oxford5k, oxford105k, paris6k, paris106k
DATASET = "oxford5k"
# resnet or siamac
FEATURE_TYPE = "resnet"

# Settings read by return_ranks (cache_dir, truncation_size, kd) and by the
# cells that build the Dataset (gallery_path, query_path).
cache_dir = "./temp/roxford5k_easy_SIFT"
gallery_path = "/home/sean/Downloads/diffusion/data/gallery/oxford5k_resnet_glob.npy" # "./data/gallery.npy"
query_path = "/home/sean/Downloads/diffusion/data/query/oxford5k_resnet_glob.npy" # "./data/queries.npy"
dataset_name = "roxford5k_easy"
truncation_size = 500
kq = 10
kd = 50

In [11]:
# Show the kernel's working directory; the relative "./data" and "./temp"
# paths used in this notebook resolve against it.
os.getcwd()

'/home/sean/Code/Pawsey/2. Revised Data Analysis/diffusion'

In [33]:
# Make sure the cache directory exists before Diffusion writes to it.
os.makedirs(cache_dir, exist_ok=True)

# Rank the gallery for every query loaded from query_path / gallery_path.
dataset = Dataset(query_path, gallery_path)
queries = dataset.queries
gallery = dataset.gallery
ranks = return_ranks(queries, gallery)
ranks

[offline] starting offline diffusion
[offline] 1) prepare Laplacian and initial state
[offline] 2) gallery-side diffusion


[offline] diffusion: 100%|██████████| 5118/5118 [00:06<00:00, 754.49it/s]


[offline] 3) merge offline results
[cache] obtaining ./temp/roxford5k_easy_SIFT/offline.jbl costs 7.97s


matrix([[2989,  587, 4045, ..., 3086, 2752, 4465],
        [2989,  587, 4045, ..., 3086, 2752, 4465],
        [4178, 1334,  587, ..., 2603,  971, 2296],
        ...,
        [2349, 4773,  808, ..., 4759,  600, 4465],
        [3634,  544, 2794, ...,  971, 2752, 4759],
        [4773, 4055,  808, ..., 3086, 2752, 1304]])

In [38]:
# Paths of the locally converted SIFT features. They are assigned here because
# this cell comes before the later cell that also sets them; without these
# lines a fresh top-to-bottom run stops with a NameError.
g_path_2 = "./data/gallery.npy"
q_path_2 = "./data/queries.npy"

# Make sure the cache directory exists before Diffusion uses it.
os.makedirs(cache_dir, exist_ok=True)

dataset = Dataset(q_path_2, g_path_2)
queries, gallery = dataset.queries, dataset.gallery
# NOTE(review): return_ranks uses the same cache_dir as the previous run and the
# log shows offline.jbl being loaded from cache here - confirm the cached
# results belong to this dataset and not to the previous one.
ranks = return_ranks(queries, gallery)
ranks

[cache] loading ./temp/roxford5k_easy_SIFT/offline.jbl costs 0.00s


matrix([[  0, 352, 351, ..., 162, 192, 515],
        [382, 344, 313, ..., 121, 497, 393],
        [  0, 352, 351, ..., 162, 192, 515],
        ...,
        [458,   0, 350, ..., 162, 192, 515],
        [  5,  57, 302, ..., 497, 206, 357],
        [382, 344, 313, ...,  25, 357, 494]])

In [22]:
# Locally converted feature files ("my_*") to compare against the arrays at
# gallery_path / query_path ("their_*").
g_path_2 = "./data/gallery.npy"
q_path_2 = "./data/queries.npy"

my_g, my_q = np.load(g_path_2), np.load(q_path_2)
their_g, their_q = np.load(gallery_path), np.load(query_path)

In [20]:
my_g.shape  # dimensions of the locally converted gallery array

(516, 1000)

In [21]:
their_g.shape  # dimensions of the gallery array loaded from gallery_path

(5063, 2048)

In [23]:
my_q.shape  # dimensions of the locally converted query array

(70, 1000)

In [24]:
their_q.shape  # dimensions of the query array loaded from query_path

(55, 2048)

In [26]:
my_g[0].shape  # dimensions of a single row of the locally converted gallery

(1000,)

In [27]:
their_g[0].shape  # dimensions of a single row of the gallery from gallery_path

(2048,)

____
## Evaluation

In [39]:
# Names used to judge whether a retrieved gallery item matches its query.
gallery_names, query_names = (
    np.load(names_path)
    for names_path in (
        "/home/sean/Code/Pawsey/3. Data/New_SIFT/names/roxford5k-1000-easy-names.npy",
        "/home/sean/Code/Pawsey/3. Data/New_SIFT/names/roxford5k-1000-querye-names.npy",
    )
)

In [77]:
# ranks is a numpy.matrix; convert it to a plain ndarray so that indexing a
# row yields a 1-D array.
ranks2 = np.asarray(ranks)
ranks2

array([[  0, 352, 351, ..., 162, 192, 515],
       [382, 344, 313, ..., 121, 497, 393],
       [  0, 352, 351, ..., 162, 192, 515],
       ...,
       [458,   0, 350, ..., 162, 192, 515],
       [  5,  57, 302, ..., 497, 206, 357],
       [382, 344, 313, ...,  25, 357, 494]])

In [86]:
# Spot check on the first query only: flag each ranked gallery item whose name
# is "ashmolean" and compute the average precision of that ranking.
row = ranks2[0]
truth_values = []
for index in row:
    truth_values.append(gallery_names[index] == "ashmolean")
average_precision(truth_values)

0.02553419210531353

In [88]:
# One average precision per query (ranking row paired with its query name);
# the mean over all queries is displayed as the cell result.
mean_aps = [
    average_precision(query_results_to_truth_values(query_building, query_ranks, gallery_names))
    for (query_ranks, query_building) in zip(ranks2, query_names)
]

np.mean(mean_aps)

0.29292679643549335

In [51]:
# Inspect the container type returned by return_ranks.
type(ranks)

numpy.matrix