In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.spatial.distance import cdist
from scipy.stats import ks_2samp, cramervonmises_2samp

from datasets import load_dataset
from transformers import AutoImageProcessor, AutoModel

tiny_imagenet = load_dataset('zh-plus/tiny-imagenet', split='valid')
processor = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
model = AutoModel.from_pretrained('facebook/dinov2-small')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def embed(x):
    inputs = processor(images=x, return_tensors="pt")
    outputs = model(**inputs)
    retval = (
        outputs
        .last_hidden_state
        .flatten()
        .detach()
        .numpy()
    )
    retval = retval[np.newaxis, :]
    return retval

In [3]:
def compute_distances(A, B):
    return np.array(
        [
            cdist(a, b)
            for a in A
            for b in B
        ]
    )

In [4]:
def compute_metrics(data, func):
    pvalues = defaultdict(dict)
    statistics = defaultdict(dict)
    
    def cast_and_round(x):
        x = float(x)
        return round(x,3)

    for i, query in enumerate(data):
        for j, reference in enumerate(data):
            reference_distance = compute_distances(reference, reference)
            if i == j:
                # split the set in two and measure how similarly distributed it is.
                split_idx = len(reference_distance) // 2
                metric = func(reference_distance[:split_idx], reference_distance[split_idx:])
            else:
                query_distance = compute_distances(reference, query)
                metric = func(reference_distance, query_distance)
            pvalues[i][j] = cast_and_round(metric.pvalue)
            statistics[i][j] = cast_and_round(metric.statistic)
    return (pvalues, statistics)

def compute_cvm(data):
    return compute_metrics(data, func=cramervonmises_2samp)

def compute_ks(data):
    return compute_metrics(data, func=ks_2samp)

In [9]:
desired_labels = [0,1,2,3,4,5]

embeddings = []
for label in desired_labels:
    embeddings.append(
        [
            embed(image['image'])
            for image in tiny_imagenet 
            if image['label'] == label
        ]
    )

In [10]:
embeddings[0][0].shape

(1, 98688)

In [11]:
cvm_pvalues, cvm_statistics = compute_cvm(embeddings)
ks_pvalues, ks_statistics = compute_ks(embeddings)

  x = float(x)


In [12]:
col_ix = pd.MultiIndex.from_product([['Reference'], desired_labels]) 
row_ix = pd.MultiIndex.from_product([['Query'], desired_labels])

query_cvm_statistic_df = pd.DataFrame(cvm_statistics)
query_cvm_statistic_df = query_cvm_statistic_df.set_index(row_ix)
query_cvm_statistic_df.columns = col_ix

query_cvm_pvalue_df = pd.DataFrame(cvm_pvalues)
query_cvm_pvalue_df = query_cvm_pvalue_df.set_index(row_ix)
query_cvm_pvalue_df.columns = col_ix

print("Cramer-Von Mises")
print(" === statistic ===")
print(query_cvm_statistic_df)
print()
print(" === p-value ===")
print(query_cvm_pvalue_df)

ValueError: Length mismatch: Expected 6 rows, received array of length 3

In [None]:
col_ix = pd.MultiIndex.from_product([['Reference'], desired_labels]) 
row_ix = pd.MultiIndex.from_product([['Query'], desired_labels])

query_cvm_statistic_df = pd.DataFrame(ks_statistics)
query_cvm_statistic_df = query_cvm_statistic_df.set_index(row_ix)
query_cvm_statistic_df.columns = col_ix

query_cvm_pvalue_df = pd.DataFrame(ks_statistics)
query_cvm_pvalue_df = query_cvm_pvalue_df.set_index(row_ix)
query_cvm_pvalue_df.columns = col_ix

print("Kolmgorov-Smirnov")
print(" === statistic ===")
print(query_cvm_statistic_df)
print()
print(" === p-value ===")
print(query_cvm_pvalue_df)