In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('..')

import os
import numpy as np
import pandas as pd

from sklearn import cluster
from wildlife_datasets import datasets, loader, metrics

In [None]:
# Locations handed to loader.load_dataset in the evaluation loop below.
# root_dataset: root folder of the datasets.
root_dataset = '../data/'
# root_dataframe: folder for the per-dataset dataframes
# (presumably a cache written/read by the loader -- TODO confirm).
root_dataframe = '../data/_dataframes/'

In [None]:
def ratio(x, y, digits=1):
    """Format ``x / y`` as a percentage string.

    Args:
        x: Numerator.
        y: Denominator.
        digits: Number of decimals the percentage is rounded to.

    Returns:
        The rounded percentage followed by ``'%'``, or the literal string
        ``'inf'`` when ``y`` equals zero.
    """
    # Guard first: a zero denominator is reported as 'inf' instead of dividing.
    if y == 0:
        return 'inf'
    percentage = np.round(100*x/y, digits)
    return str(percentage) + '%'

def get_metrics(results, identity, clusters, prefix=''):
    """Compare cluster labels with identities and store summary ratios.

    Every cluster with a non-negative label is represented by its most
    frequent identity. Samples of that identity count as correctly
    clustered, the remaining samples of the cluster as wrongly clustered.
    Samples labelled ``-1`` are counted as not belonging to any cluster.

    Args:
        results: Mapping that is updated in place with the computed entries.
        identity: Identity of each sample.
        clusters: Cluster label of each sample, aligned with ``identity``.
        prefix: String prepended to every key written into ``results``.

    Returns:
        None. The following keys (each prefixed with ``prefix``) are written
        into ``results`` as strings produced by ``ratio``:

        * ``missed_identities``: identities that are not the most frequent
          identity of any cluster, relative to all identities.
        * ``multiple_identities``: number of non-negative clusters relative
          to the number of distinct most-frequent identities.
        * ``no_cluster``: samples labelled ``-1``, relative to all samples.
        * ``correct_cluster``: correctly clustered samples, relative to all
          samples.
        * ``wrong_cluster``: wrongly clustered samples, relative to all
          samples.
    """
    df = pd.DataFrame({'identity': identity, 'clusters': clusters})

    n_correct_cluster = 0
    n_wrong_cluster = 0
    dominant_identities = []
    for label, members in df.groupby('clusters'):
        # Negative labels are not treated as clusters.
        if label < 0:
            continue
        counts = members['identity'].value_counts().sort_values(ascending=False)
        # Head of the sorted counts is the dominant identity; the tail holds
        # every other identity that ended up in the same cluster.
        n_correct_cluster += counts.iloc[0]
        n_wrong_cluster += counts.iloc[1:].sum()
        dominant_identities.append(counts.index[0])

    n_samples = len(df)
    n_no_cluster = (df['clusters'] == -1).sum()
    n_identities = df['identity'].nunique()
    n_identities_in_clusters = pd.Series(dominant_identities).nunique()

    # (key, numerator, denominator) in the order the entries are written.
    summary = (
        ('missed_identities', n_identities-n_identities_in_clusters, n_identities),
        ('multiple_identities', len(dominant_identities), n_identities_in_clusters),
        ('no_cluster', n_no_cluster, n_samples),
        ('correct_cluster', n_correct_cluster, n_samples),
        ('wrong_cluster', n_wrong_cluster, n_samples),
    )
    for key, numerator, denominator in summary:
        results[prefix + key] = ratio(numerator, denominator)

In [None]:
# TODO: batch the computation of the similarity matrix. Until that is done the
# dense (n x n) matrix is only built for datasets with at most `max_samples`
# rows; larger datasets are skipped.

k = 10               # number of most similar samples retrieved per sample
max_samples = 10000  # size limit for building the dense similarity matrix
names = []
results_all = []
for d_name in datasets.names_all:
    print(d_name)
    file_name = f'../data/_features/features_{d_name.__name__}.npy'
    # Cheap check first: datasets without a feature file are skipped without
    # being loaded at all.
    if not os.path.exists(file_name):
        continue
    d = loader.load_dataset(d_name, root_dataset, root_dataframe)
    if len(d.df) > max_samples:
        continue

    output = np.load(file_name)
    if len(output) != len(d.df):
        raise ValueError(
            f'{file_name} holds {len(output)} feature rows '
            f'but the dataframe has {len(d.df)} rows'
        )

    # Scale every feature row to unit L2 norm so that the dot product below
    # is the cosine similarity. Zero rows are left untouched instead of
    # being turned into NaN by a 0/0 division.
    norms = np.linalg.norm(output, axis=1, keepdims=True)
    norms[norms == 0] = 1
    output /= norms

    similarity = output @ output.T
    # Exclude each sample from its own list of neighbours.
    np.fill_diagonal(similarity, -1)
    # Indices of the k most similar samples, most similar first.
    idx = (-similarity).argsort(axis=-1)[:, :k]

    identity = d.df['identity'].to_numpy()
    pred = [identity[neighbours] for neighbours in idx]
    # Not called `map`: at notebook top level that would rebind the builtin
    # for every cell executed afterwards.
    mean_ap = metrics.mean_average_precision(identity, pred)

    results = {}
    results['map'] = ratio(mean_ap, 1)

    # Same clustering evaluation for both distance metrics; 'euclidean' is
    # the DBSCAN default that the 'l2_' prefix refers to.
    for prefix, metric in (('l2_', 'euclidean'), ('cos_', 'cosine')):
        db = cluster.DBSCAN(metric=metric).fit(output)
        get_metrics(results, identity, db.labels_, prefix=prefix)

    # Appended together so that `names` and `results_all` stay aligned even
    # if a dataset fails part-way through.
    names.append(d_name.__name__)
    results_all.append(results)

In [None]:
# One row per evaluated dataset, labelled with the dataset name.
results_all = pd.DataFrame(results_all, index=names)
results_all