In [None]:
!pip install git+https://github.com/WildlifeDatasets/wildlife-datasets@develop --quiet
!pip install git+https://github.com/WildlifeDatasets/wildlife-tools --quiet

In [None]:
from wildlife_tools.similarity.pairwise.collectors import CollectCounts
from wildlife_tools.features import SuperPointExtractor, AlikedExtractor, DiskExtractor, SiftExtractor, DeepFeatures
from wildlife_tools.similarity import MatchLightGlue
from wildlife_tools.similarity.calibration import IsotonicCalibration, LogisticCalibration
from wildlife_tools.similarity.wildfusion import SimilarityPipeline # WildFusion
from wildlife_tools.similarity.cosine import CosineSimilarity
from wildlife_datasets.datasets import WildlifeDataset, AnimalCLEF2025
from wildlife_datasets import splits

from wildlife_tools.similarity.pairwise.base import MatchPairs, PairDataset
from wildlife_tools.data import ImageDataset, FeatureDataset


In [None]:
from typing import List, Union, Callable
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

from sklearn.metrics import *

In [None]:
from transformers import AutoModel
import torchvision.transforms as T
import torch
import timm

import kornia.feature as KF
import kornia as K


In [None]:
import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation and pandas warnings, so remove it when debugging.
warnings.filterwarnings('ignore')

In [None]:
import random
# Fix the random seeds so that runs are reproducible.
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and, if present, CUDA).

    Also switches cuDNN to deterministic mode and disables its benchmark
    autotuner.

    Args:
        seed: Seed value passed to every random number generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(52)

In [None]:
# metrics
def BAKS(
    y_true: List,
    y_pred: List,
    identity_test_only: List,
) -> float:
    """Computes BAKS (balanced accuracy on known samples).

    Samples whose true label is listed in `identity_test_only` are ignored
    because they belong to unknown identities. The accuracy is computed per
    remaining identity and then averaged with equal weight per identity.

    Args:
        y_true (List): List of true labels.
        y_pred (List): List of predicted labels.
        identity_test_only (List): List of new identities (only in the testing set).

    Returns:
        Computed BAKS, or NaN when no known sample remains.
    """

    # Need to keep the object type due to mixed arrays
    y_true = np.array(y_true, dtype=object)
    y_pred = np.array(y_pred, dtype=object)
    identity_test_only = np.array(identity_test_only, dtype=object)

    # Keep only the samples of known identities
    known = ~np.isin(y_true, identity_test_only)
    if not known.any():
        return np.nan

    df = pd.DataFrame({"y_true": y_true[known], "y_pred": y_pred[known]})

    # Every identity contributes equally; the weight does not change inside the loop
    weight = 1 / df["y_true"].nunique()
    accuracy = 0
    for _, df_identity in df.groupby("y_true"):
        accuracy += weight * np.mean(df_identity["y_pred"] == df_identity["y_true"])
    return accuracy

def BAUS(
    y_true: List, y_pred: List, identity_test_only: List, new_class: Union[int, str]
) -> float:
    """Computes BAUS (balanced accuracy on unknown samples).

    Only samples whose true label is listed in `identity_test_only` are used.
    A prediction counts as correct when it equals `new_class`. The accuracy is
    computed per unknown identity and averaged with equal weight per identity.

    Args:
        y_true (List): List of true labels.
        y_pred (List): List of predicted labels.
        identity_test_only (List): List of new identities (only in the testing set).
        new_class (Union[int, str]): Name of the new class.

    Returns:
        Computed BAUS, or NaN when there is no unknown sample.
    """

    # Need to keep the object type due to mixed arrays
    y_true = np.array(y_true, dtype=object)
    y_pred = np.array(y_pred, dtype=object)
    identity_test_only = np.array(identity_test_only, dtype=object)

    # Keep only the samples of unknown identities
    unknown = np.isin(y_true, identity_test_only)
    if not unknown.any():
        return np.nan

    df = pd.DataFrame({"y_true": y_true[unknown], "y_pred": y_pred[unknown]})

    # Every identity contributes equally; the weight does not change inside the loop
    weight = 1 / df["y_true"].nunique()
    accuracy = 0
    for _, df_identity in df.groupby("y_true"):
        accuracy += weight * np.mean(df_identity["y_pred"] == new_class)
    return accuracy



In [None]:
# function
def split_data(dataset, data):
    """Split `data` once with an open-set splitter and return both subsets.

    Args:
        dataset: Object exposing `get_subset(indices)`.
        data: Metadata passed to the splitter's `split` method.

    Returns:
        Tuple `(train_subset, test_subset)` built from the first split that
        the splitter yields.
    """
    # assumes 0.8 is the train ratio and 0.1 the share reserved for
    # test-only identities — TODO confirm against wildlife-datasets docs
    open_set_splitter = splits.OpenSetSplit(0.8, 0.1)
    first_split = next(iter(open_set_splitter.split(data)))
    train_indices, test_indices = first_split
    train_subset = dataset.get_subset(train_indices)
    test_subset = dataset.get_subset(test_indices)
    return train_subset, test_subset

def get_preds(labels, similarity, query_size):
    """Pick, for every query row, the database label with the highest similarity.

    Args:
        labels: Array of database labels, indexable by an integer array.
        similarity: 2-D array with one row per query and one column per
            database item.
        query_size: Number of query rows in `similarity`.

    Returns:
        Two-element list `[predictions, pred_scores]`: the label of the best
        column for each row and the similarity value at that column.
    """
    # argsort is ascending, so the last column holds the best-scoring index
    ranking = similarity.argsort(axis=1)
    best_columns = ranking[:, -1]
    row_ids = range(query_size)
    best_scores = similarity[row_ids, best_columns]
    return [labels[best_columns], best_scores]

def to_torch(x: Union[List, np.ndarray, torch.Tensor]):
    """Convert a list, tuple, NumPy array or tensor to a `torch.Tensor`.

    Args:
        x: Python list/tuple, `np.ndarray` or `torch.Tensor`.

    Returns:
        `x` itself when it already is a tensor, a tensor sharing memory with
        the array for `np.ndarray` input (`torch.from_numpy`), or a new tensor
        for list/tuple input.

    Raises:
        TypeError: If `x` is of any other type.
    """
    # `type(x) is List` compared against typing.List and was never true, so
    # plain lists used to fall through to the TypeError branch.
    if isinstance(x, torch.Tensor):
        return x
    if isinstance(x, np.ndarray):
        return torch.from_numpy(x)
    if isinstance(x, (list, tuple)):
        return torch.tensor(x)
    raise TypeError('img should be List, np.array or torch.Tensor')

def get_hits(dataset0, dataset1):
    """Build the ground-truth match grid between two datasets.

    Args:
        dataset0: Object with a 1-D `labels_string` attribute (grid rows).
        dataset1: Object with a 1-D `labels_string` attribute (grid columns).

    Returns:
        Boolean array of shape `(len(labels0), len(labels1))` that is True
        where the two labels are equal.
    """
    row_labels = np.asarray(dataset0.labels_string)
    col_labels = np.asarray(dataset1.labels_string)
    # Broadcasting a column vector against a row vector compares every pair
    return row_labels[:, np.newaxis] == col_labels[np.newaxis, :]



In [None]:
# matchers_func
def fast_local_matchers(extractors, device='cpu'):
    """Create one LightGlue similarity pipeline per extractor.

    Args:
        extractors: Mapping from the LightGlue feature name (passed as
            `features=` to `MatchLightGlue`) to the extractor object.
        device: Device handed to `MatchLightGlue`.

    Returns:
        List of `SimilarityPipeline` objects in the iteration order of
        `extractors`. Each pipeline resizes images to 512x512 and uses its
        own `IsotonicCalibration`.
    """
    def build_pipeline(feature_name, feature_extractor):
        return SimilarityPipeline(
            matcher=MatchLightGlue(features=feature_name, device=device),
            extractor=feature_extractor,
            transform=T.Compose([T.Resize([512, 512]), T.ToTensor()]),
            calibration=IsotonicCalibration(),
        )

    return [build_pipeline(name, extractors[name]) for name in extractors]

def fast_priority_matcher(parallel=False, loaded_model=None, batch_size=64, device='cpu'):
    """Build the global-descriptor pipeline based on MegaDescriptor-L-384.

    Args:
        parallel (bool): Wrap the model in `torch.nn.DataParallel` when True.
        loaded_model: Optional checkpoint path. The checkpoint must hold the
            weights under the key 'model'.
        batch_size (int): Batch size used by `DeepFeatures`.
        device: Device used to create the model and to run feature extraction.

    Returns:
        `SimilarityPipeline` combining cosine similarity, the deep feature
        extractor, a 384x384 resize with mean/std normalisation, and isotonic
        calibration.
    """
    model = timm.create_model('hf-hub:BVRA/wildlife-mega-L-384', num_classes=0, pretrained=True, device=device)
    if loaded_model:
        # map_location keeps checkpoints saved on a GPU loadable on CPU-only hosts
        checkpoint = torch.load(loaded_model, map_location=device)
        # The first 7 characters of every key are dropped — presumably the
        # "module." prefix added by DataParallel; verify against the checkpoint.
        state_dict = {key[7:]: value for key, value in checkpoint['model'].items()}
        model.load_state_dict(state_dict)
    # Inference only: always use eval mode, not just after loading a checkpoint
    model.eval()
    if parallel:
        model = torch.nn.DataParallel(model)
    priority_matcher = SimilarityPipeline(
        matcher=CosineSimilarity(),
        extractor=DeepFeatures(model=model, device=device, batch_size=batch_size, num_workers=0),
        transform=T.Compose([T.Resize([384, 384]), T.ToTensor(), T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]),
        calibration=IsotonicCalibration()
    )
    return priority_matcher

In [None]:
class Kornia_Extractor:
    """Local-feature extractor built on top of kornia models.

    Calling an instance runs the selected model on every image of a dataset
    (batch size 1) and returns a `FeatureDataset` holding one feature
    dictionary per image with the keys 'keypoints', 'keypoint_scores',
    'descriptors' and 'image_size'.
    """

    def __init__(self, model_name: str, mx_keypoints: int = 1000, device: None | str = None, num_workers: int = 0, only_xy: bool = True):
        """
        Args:
            model_name (str): 'KeyNetAffNetHardNet', 'HesAffNetHardNet', 'KeyNetHardNet', 'GFTTAffNetHardNet', 'SIFT', 'DISK'.
            mx_keypoints (int, optional): maximum number of points for a Descriptor.
            device (str, optional): Device used for inference. Defaults to None,
                which selects "cuda" when available and "cpu" otherwise.
            num_workers(int, optional): how many subprocesses to use for data loading.
            only_xy(bool, optional):
                If False, returns the affine transformation.
                If True, returns the coordinates.

        Raises:
            ValueError: If `model_name` is not one of the names listed above.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        # Lazy constructors, so only the requested model is instantiated.
        # The second positional argument of the non-DISK models is True
        # (presumably kornia's `upright` flag — confirm in the kornia docs).
        constructors = {
            'KeyNetAffNetHardNet': lambda: KF.KeyNetAffNetHardNet(mx_keypoints, True),
            'HesAffNetHardNet': lambda: KF.HesAffNetHardNet(mx_keypoints, True),
            'KeyNetHardNet': lambda: KF.KeyNetHardNet(mx_keypoints, True),
            'GFTTAffNetHardNet': lambda: KF.GFTTAffNetHardNet(mx_keypoints, True),
            'SIFT': lambda: KF.SIFTFeature(mx_keypoints, True),
            'DISK': lambda: KF.DISK.from_pretrained("depth"),
        }
        if model_name not in constructors:
            raise ValueError(f"Unknown model name: {model_name}")

        self.model = constructors[model_name]().eval()
        self.device = device
        self.num_workers = num_workers
        self.only_xy = only_xy
        self.mx_keypoints = mx_keypoints
        self.model_name = model_name

    def _extract_disk(self, image):
        """Run DISK on one image batch and wrap its keypoints as LAFs."""
        img = image.to(self.device)
        with torch.inference_mode():
            detection = self.model(img, n=self.mx_keypoints)[0]
            centers = detection.keypoints.cpu()
            # DISK yields plain coordinates; build LAFs with a scale of one
            unit_scales = torch.ones(1, len(centers), 1, 1, device='cpu')
            lafs = KF.laf_from_center_scale_ori(centers.unsqueeze(0), unit_scales)
            return {
                'keypoints': lafs.squeeze(0).cpu(),  # [N, 2, 3]
                'keypoint_scores': detection.detection_scores.cpu(),
                'descriptors': detection.descriptors.cpu(),
                'image_size': torch.tensor(image.shape[2:])
            }

    def _extract_grayscale(self, image):
        """Run a non-DISK model on the grayscale version of one image batch."""
        img = K.color.rgb_to_grayscale(image).to(self.device)
        with torch.inference_mode():
            result = self.model(img)
            # The model output is indexed positionally: 0, 1, 2
            names = ('keypoints', 'keypoint_scores', 'descriptors')
            features = {name: result[pos].squeeze(0).cpu() for pos, name in enumerate(names)}
            features['image_size'] = torch.tensor(image.shape[2:])
            return features

    def _pad_to_budget(self, output):
        """Zero-pad keypoints, scores and descriptors up to `mx_keypoints` rows."""
        missing = self.mx_keypoints - output['keypoints'].shape[0]
        if missing <= 0:
            return output
        descriptor_dim = output['descriptors'].shape[1]
        fillers = {
            'keypoints': torch.zeros(missing, 2, 3, device='cpu'),
            'keypoint_scores': torch.zeros(missing, device='cpu'),
            'descriptors': torch.zeros(missing, descriptor_dim, device='cpu'),
        }
        for key, filler in fillers.items():
            output[key] = torch.cat([output[key], filler], dim=0)
        return output

    def __call__(self, dataset: ImageDataset) -> FeatureDataset:
        """Extract features for every image in `dataset`.

        The model is moved to `self.device` for the loop and back to the CPU
        afterwards.

        Args:
            dataset: Image dataset yielding `(image, label)` pairs.

        Returns:
            `FeatureDataset` with the metadata and label column of `dataset`
            and one feature dictionary per image.
        """
        loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=self.num_workers,
            batch_size=1,
            shuffle=False,
        )
        self.model.to(self.device)
        features = []
        for image, _ in tqdm(loader, mininterval=1, ncols=100):
            if self.model_name == 'DISK':
                output = self._extract_disk(image)
            else:
                output = self._extract_grayscale(image)

            # Pad the keypoints up to mx_keypoints
            output = self._pad_to_budget(output)

            if self.only_xy:
                # Reduce each LAF to its centre and shift it by half a pixel
                centers = KF.get_laf_center(output['keypoints'].unsqueeze(0))
                output['keypoints'] = centers.reshape(-1, 2) + 0.5
            features.append(output)
        self.model.to("cpu")

        return FeatureDataset(
            metadata=dataset.metadata,
            features=features,
            col_label=dataset.col_label,
        )


In [None]:
class Kornia_Matcher(MatchPairs):
    """Pairwise matcher that matches local features with kornia matchers.

    `get_matches` receives a batch of feature pairs and returns, for each
    pair, the matched keypoints and their scores. The supported matchers are
    listed in `_SUPPORTED_MATCHERS`.
    """

    # Matcher names that `_match_pair` can dispatch
    _SUPPORTED_MATCHERS = ('adalam', 'fginn', 'GADMatcher', 'LightGlueMatcher')
    # Extractor name -> feature name handed to `KF.LightGlueMatcher`
    _LIGHTGLUE_FEATURES = {
        'DISK': 'disk',
        'SIFT': 'sift',
        'KeyNetAffNetHardNet': 'keynet_affnet_hardnet',
    }

    def __init__(self, init_threshold: float = 0.1, device: str | None = None,
                 matcher_name='adalam', extract=None, **kwargs):
        """
        Args:
            init_threshold (float): Stored on the instance; not used by the
                code in this class.
            device (str, optional): Device used for matching. Defaults to
                "cuda" when available, otherwise "cpu".
            matcher_name (str): One of 'adalam', 'fginn', 'GADMatcher',
                'LightGlueMatcher'.
            extract (str, optional): Name of the extractor that produced the
                features. Required for 'LightGlueMatcher' ('DISK', 'SIFT' or
                'KeyNetAffNetHardNet').
            **kwargs: Forwarded to `MatchPairs.__init__`.

        Raises:
            ValueError: If `matcher_name` is unsupported, or if `extract` is
                unsupported for 'LightGlueMatcher'.
        """
        super().__init__(**kwargs)
        # Fail at construction instead of with an UnboundLocalError in get_matches
        if matcher_name not in self._SUPPORTED_MATCHERS:
            raise ValueError(
                f"Unknown matcher name: {matcher_name}. "
                f"Supported: {', '.join(self._SUPPORTED_MATCHERS)}"
            )
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.init_threshold = init_threshold
        self.matcher_name = matcher_name
        self.extract = extract

        # The LightGlue model is created lazily on the first get_matches call
        self.model = None
        if self.matcher_name == 'LightGlueMatcher':
            if extract not in self._LIGHTGLUE_FEATURES:
                raise ValueError(
                    f"LightGlueMatcher does not support extract={extract!r}. "
                    f"Supported: {', '.join(self._LIGHTGLUE_FEATURES)}"
                )
            self.model_name = self._LIGHTGLUE_FEATURES[extract]

    def _load_model(self):
        """Create the LightGlue matcher on first use (no-op for other matchers)."""
        if self.model is None and self.matcher_name == 'LightGlueMatcher':
            self.model = KF.LightGlueMatcher(self.model_name).eval().to(self.device)

    def _match_pair(self, desc1, desc2, lafs1, lafs2, hw1, hw2):
        """Match one pair of feature sets; returns `(scores, matches)`."""
        if self.matcher_name == 'adalam':
            return KF.match_adalam(
                desc1=desc1, desc2=desc2,
                lafs1=lafs1, lafs2=lafs2,
                hw1=hw1, hw2=hw2,
            )
        if self.matcher_name == 'fginn':
            return KF.match_fginn(
                desc1=desc1, desc2=desc2,
                lafs1=lafs1, lafs2=lafs2,
            )
        if self.matcher_name == 'GADMatcher':
            return KF.GeometryAwareDescriptorMatcher(match_mode='fginn')(
                desc1=desc1, desc2=desc2,
                lafs1=lafs1, lafs2=lafs2,
            )
        if self.matcher_name == 'LightGlueMatcher':
            return self.model(
                desc1=desc1, desc2=desc2,
                lafs1=lafs1, lafs2=lafs2,
                hw1=hw1, hw2=hw2,
            )
        # Reachable only if matcher_name was changed after construction
        raise ValueError(f"Unknown matcher name: {self.matcher_name}")

    @staticmethod
    def _empty_record(i0, i1):
        """Result entry for a pair without any match."""
        return {
            "idx0": i0,
            "idx1": i1,
            "kpts0": np.empty((0, 2)),
            "kpts1": np.empty((0, 2)),
            "scores": np.empty(0),
        }

    def get_matches(self, batch):
        """Match every pair in a batch.

        Args:
            batch: Tuple `(idx0, data0, idx1, data1)`. `data0`/`data1` are
                indexed with the keys 'keypoints', 'descriptors' and
                'image_size'; `idx0`/`idx1` hold one index per pair.

        Returns:
            List with one dictionary per pair containing 'idx0', 'idx1',
            'kpts0', 'kpts1' (matched keypoints as NumPy arrays) and
            'scores'. Pairs without keypoints or matches get empty arrays.
        """
        self._load_model()
        idx0, data0, idx1, data1 = batch
        data = []

        try:
            for i in range(data1['image_size'].shape[0]):
                # Release cached GPU memory before each pair
                torch.cuda.empty_cache()

                i0, i1 = idx0[i].item(), idx1[i].item()
                kpts0, kpts1 = data0["keypoints"][i], data1["keypoints"][i]
                if len(kpts0) == 0 or len(kpts1) == 0:
                    data.append(self._empty_record(i0, i1))
                    continue

                with torch.no_grad():
                    scores_, matches_ = self._match_pair(
                        desc1=data0["descriptors"][i].to(self.device),
                        desc2=data1["descriptors"][i].to(self.device),
                        lafs1=kpts0.unsqueeze(0).to(self.device),
                        lafs2=kpts1.unsqueeze(0).to(self.device),
                        hw1=data0['image_size'][i],
                        hw2=data1['image_size'][i],
                    )
                    scores = scores_.detach().cpu()
                    matches = matches_.long().detach().cpu()
                    del scores_, matches_

                if matches.shape[0] == 0:
                    data.append(self._empty_record(i0, i1))
                else:
                    data.append({
                        "idx0": i0,
                        "idx1": i1,
                        "kpts0": kpts0[matches[:, 0]].cpu().numpy(),
                        "kpts1": kpts1[matches[:, 1]].cpu().numpy(),
                        "scores": scores.numpy(),
                    })
        finally:
            torch.cuda.empty_cache()

        return data

    def __del__(self):
        """Drop the LightGlue model (if any) and release cached GPU memory."""
        if hasattr(self, 'model') and self.model is not None:
            del self.model
        torch.cuda.empty_cache()

In [None]:
from scipy.optimize import minimize

def get_preds_by_treshold(coefs, scores, dataset0, dataset1):
    """Objective for the fusion-weight search.

    Fuses the per-matcher score matrices with `coefs`, predicts the best
    database identity for every query, and scans rejection thresholds from
    0.02 to 0.96 in steps of 0.02. Predictions scoring below the threshold
    are relabelled 'new_individual'.

    Args:
        coefs: One weight per score matrix.
        scores: Stack of score matrices, one per matcher.
        dataset0: Query dataset (rows of the matrices).
        dataset1: Database dataset (columns of the matrices).

    Returns:
        `1 - g`, where `g` is the best `sqrt(BAKS * BAUS)` over all scanned
        thresholds.
    """
    # One weight per matcher, broadcast over the (query, database) grid
    weights = np.array(coefs)[:, np.newaxis, np.newaxis]
    fused = np.mean(np.array(scores) * weights, axis=0)
    fused = np.where(np.isnan(fused), 0, fused)

    # Identities that appear in the query set but not in the database
    query_only = list(set(dataset0.metadata["identity"]) - set(dataset1.metadata["identity"]))
    predictions, top_scores = get_preds(dataset1.labels_string, fused, len(dataset0))
    true_labels = dataset0.labels_string

    best_geo_mean = -1
    for threshold in np.arange(0.02, 0.98, 0.02):
        thresholded = predictions.copy()
        thresholded[top_scores < threshold] = 'new_individual'
        known_acc = BAKS(true_labels, thresholded, query_only)
        unknown_acc = BAUS(true_labels, thresholded, query_only, "new_individual")
        best_geo_mean = max(best_geo_mean, np.sqrt(known_acc * unknown_acc))
    return 1 - best_geo_mean

class WildFusion:
    """Weighted fusion of several calibrated similarity pipelines.

    Every pipeline in `calibrated_pipelines` yields a score matrix. NaN
    entries are replaced by 0, each matrix is multiplied by its coefficient,
    and the result is averaged over the pipelines. The optional
    `priority_pipeline` shortlists, for every query, the `B` database
    candidates that the other pipelines then score.

    The coefficients are learned by `get_optim`. Until it has been called,
    every pipeline has weight 1, i.e. the fusion is a plain average.

    Annotations are quoted so they are not evaluated when the class is defined.
    """

    def __init__(
        self,
        calibrated_pipelines: "list[SimilarityPipeline]",
        priority_pipeline: "SimilarityPipeline | None" = None,
    ):
        """
        Args:
            calibrated_pipelines: Pipelines whose scores are fused.
            priority_pipeline: Optional pipeline used to shortlist candidate
                pairs.
        """
        self.calibrated_pipelines = calibrated_pipelines
        self.priority_pipeline = priority_pipeline
        self.coefs = None

    def fit_calibration(self, dataset0: "ImageDataset", dataset1: "ImageDataset"):
        """Fit the calibration of every pipeline on the given dataset pair.

        The priority pipeline is fitted only when it exists and has a
        calibration object.
        """
        for matcher in self.calibrated_pipelines:
            matcher.fit_calibration(dataset0, dataset1)
        if (self.priority_pipeline is not None) and (self.priority_pipeline.calibration is not None):
            self.priority_pipeline.fit_calibration(dataset0, dataset1)

    def _collect_scores(self, dataset0, dataset1, pairs):
        """Stack the score matrices of all pipelines, with NaN replaced by 0."""
        stacked = np.array(
            [matcher(dataset0, dataset1, pairs=pairs) for matcher in self.calibrated_pipelines]
        )
        return np.where(np.isnan(stacked), 0, stacked)

    @staticmethod
    def _fuse(scores, coefs):
        """Multiply each score matrix by its coefficient and average over matrices."""
        weights = np.asarray(coefs)[:, np.newaxis, np.newaxis]
        return np.mean(scores * weights, axis=0)

    def get_optim(self, dataset0: "ImageDataset", dataset1: "ImageDataset", B=25):
        """Learn the fusion coefficients on a labelled dataset pair.

        Minimises `get_preds_by_treshold` with Powell's method, starting from
        0.5 for every pipeline, and stores the result in `self.coefs`.

        Args:
            dataset0: Query dataset.
            dataset1: Database dataset.
            B: Number of shortlisted candidates per query. None scores all
                pairs.

        Returns:
            Fused similarity matrix computed with the learned coefficients.
        """
        # `pairs` was left undefined when B was None
        pairs = self.get_priority_pairs(dataset0, dataset1, B=B) if B is not None else None
        scores = self._collect_scores(dataset0, dataset1, pairs)
        res = minimize(get_preds_by_treshold, ([0.5 for _ in range(len(scores))]), (scores, dataset0, dataset1), method='Powell')
        print(res.x, res.fun)
        # atleast_1d keeps the coefficients indexable with a single pipeline
        self.coefs = np.atleast_1d(res.x)
        return self._fuse(scores, self.coefs)

    def get_priority_pairs(self, dataset0: "ImageDataset", dataset1: "ImageDataset", B: int) -> np.ndarray:
        """Shortlist the top-`B` database candidates for every query.

        Args:
            dataset0: Query dataset.
            dataset1: Database dataset.
            B: Number of candidates per query, capped at the number of
                database items.

        Returns:
            Integer array of shape `(n_pairs, 2)` with `(query, database)`
            index pairs.

        Raises:
            ValueError: If no priority pipeline is assigned.
        """
        if self.priority_pipeline is None:
            raise ValueError("Priority matcher is not assigned.")
        priority = self.priority_pipeline(dataset0, dataset1)
        _, idx1 = torch.topk(torch.tensor(priority), min(B, priority.shape[1]))
        idx1 = idx1.numpy()
        # Row index of every shortlisted column
        idx0 = np.indices(idx1.shape)[0]
        grid_indices = np.stack([idx0.flatten(), idx1.flatten()]).T
        return grid_indices

    def __call__(
        self,
        dataset0: "ImageDataset",
        dataset1: "ImageDataset",
        pairs: "list | None" = None,
        B: "int | None" = None,
    ):
        """Compute the fused similarity between two datasets.

        Args:
            dataset0: Query dataset.
            dataset1: Database dataset.
            pairs: Optional explicit index pairs to score.
            B: When given, `pairs` is replaced by the top-`B` shortlist of
                the priority pipeline.

        Returns:
            Fused similarity matrix.
        """
        if B is not None:
            pairs = self.get_priority_pairs(dataset0, dataset1, B=B)
        scores = self._collect_scores(dataset0, dataset1, pairs)
        # Without learned coefficients, weight every pipeline equally instead
        # of failing on `None[:, ...]`
        coefs = np.ones(len(scores)) if self.coefs is None else self.coefs
        return self._fuse(scores, coefs)



In [None]:
# Conveyor
class Conveyor:
    """End-to-end helper around `WildFusion` for one query/database pair.

    Typical order of calls: `build_model`, `calibration`,
    `get_best_thresholds`, `similarity`, `predict`.
    """

    def __init__(self, query, database, device='cpu'):
        """Store the query set (`Test`), the database set (`Train`) and the device."""
        self.Test, self.Train = query, database
        self.device = device

    def build_model(self, priority_matcher, local_matchers):
        """Create the `WildFusion` model from a priority matcher and local matchers."""
        self.wildfusion = WildFusion(
            priority_pipeline=priority_matcher,
            calibrated_pipelines=local_matchers,
        )

    def similarity(self, BBB=25):
        """Compute the query-vs-database similarity and store it in `self.sim`.

        Args:
            BBB: Shortlist size passed to the fusion model as `B`.
        """
        self.sim = self.wildfusion(self.Test, self.Train, B=BBB)

    def predict(self, save_path = None):
        """Predict the best database identity for every query image.

        Uses `self.sim`, so `similarity` must have been called before.

        Args:
            save_path: Optional CSV path. When given, a table with the
                columns 'image_id', 'identity' and 'scores' is written there.

        Returns:
            Tuple `(predictions, scores)`.
        """
        predictions, scores = get_preds(self.Train.labels_string, self.sim, len(self.Test))
        if save_path:
            columns = {
                'image_id': self.Test.df['image_id'],
                'identity': predictions,
                'scores': scores
            }
            pd.DataFrame(columns).to_csv(save_path, index=False)
        return predictions, scores

    def calibration(self, data):
        """Fit the calibration on a `(database, query)` pair of datasets."""
        database, query = data
        self.wildfusion.fit_calibration(query, database)

    def get_best_thresholds(self, data, BBB=25):
        """Scan rejection thresholds on a labelled `(database, query)` pair.

        Thresholds run from 0.001 up to (but excluding) 0.9 in steps of
        0.001. Predictions scoring below the threshold become
        'new_individual'. The threshold with the highest `sqrt(BAKS * BAUS)`
        is stored in `self.best_thresholds`.

        Args:
            data: Tuple `(database, query)`.
            BBB: Shortlist size passed to the fusion model as `B`.

        Returns:
            Tuple `(best_threshold, best_geo_mean, scores)`, where `scores`
            lists `[threshold, baks, baus, geo_mean]` for every threshold.
        """
        database, query = data
        similarity = self.wildfusion(query, database, B=BBB)
        # Identities present in the query set but missing from the database
        query_only = list(set(query.metadata["identity"]) - set(database.metadata["identity"]))
        predictions, top_scores = get_preds(database.labels_string, similarity, len(query))

        best_threshold, best_geo_mean = None, -1
        history = []

        for threshold in tqdm(np.arange(0.001, 0.9, 0.001)):
            thresholded = predictions.copy()
            thresholded[top_scores < threshold] = 'new_individual'
            baks = BAKS(query.labels_string, thresholded, query_only)
            baus = BAUS(query.labels_string, thresholded, query_only, "new_individual")
            geo_mean = np.sqrt(baks * baus)
            if geo_mean > best_geo_mean:
                best_geo_mean, best_threshold = geo_mean, threshold
            history.append([threshold, baks, baus, geo_mean])
        self.best_thresholds = best_threshold
        return best_threshold, best_geo_mean, history

    def plot_scores(self, scores, graf_name='nothink'):
        """Plot BAKS, BAUS and their geometric mean against the threshold.

        Args:
            scores: Rows of `[threshold, baks, baus, geo_mean]`, as returned
                by `get_best_thresholds`.
            graf_name: Plot title.
        """
        thresholds = [row[0] for row in scores]
        curves = (
            (1, 'BAKS (Known)'),
            (2, 'BAUS (Unknown)'),
            (3, 'Geometrical Mean'),
        )
        for column, label in curves:
            plt.plot(thresholds, [row[column] for row in scores], label=label)
        plt.title(graf_name)
        plt.xlabel('thresholds')
        plt.ylabel('metrics')
        plt.legend()
        plt.show()


In [None]:
# Dataset location. The commented-out line is the local alternative to the Kaggle path.
# root = './animal-clef-2025'
root = '/kaggle/input/animal-clef-2025'
# NOTE(review): this default looks unused — the main loop later rebinds
# `threshold` with the tuned value; confirm before relying on it.
threshold = 0.6

# Resize-only transform given to the dataset object (no tensor conversion).
transform_display = T.Compose([T.Resize([384, 384])])
# Resize + tensor conversion, the same shape the local-feature pipelines use.
transform = T.Compose([T.Resize([512, 512]), T.ToTensor()])
# Collects one result dictionary per dataset in the main loop.
models = []

# Use the GPU when available; the bare name below displays the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Load AnimalCLEF2025 with labels; `transform_display` only resizes the images.
dataset = AnimalCLEF2025(root, transform=transform_display, load_label=True)
df = dataset.df
# The `split` column separates the reference database from the query images.
train = df[df['split'] == 'database']
test = df[df['split'] == 'query']
# Rows per source dataset (shown as the cell output).
df.dataset.value_counts()

In [None]:
# Precomputed segmentations (produced with YOLO, per the original note), merged in by image file name.
seg = pd.read_csv('/kaggle/input/data/segmentation_salamanders.csv') # path to segmentation by yolo
# NOTE(review): `eval` executes arbitrary code from the CSV; if the cells are
# plain literals, `ast.literal_eval` is the safer parser.
seg['segmentation']=seg['segmentation'].apply(lambda x: eval(x))
# NOTE(review): `bbox` is read from CSV, so this presumably wraps a string in a
# 0-d array rather than parsing the numbers — confirm the column contents.
seg['bbox']=seg['bbox'].apply(lambda x: np.array(x))
# Second dataset object that loads images through the 'bbox_mask' loader.
dataset_with_seg = AnimalCLEF2025(root, img_load='bbox_mask', load_label=True)
# File name (last path component) is the merge key.
dataset_with_seg.df['image_name'] = dataset_with_seg.df['path'].apply(lambda x: x.split('/')[-1])
# Left merge keeps every dataset row; rows without a segmentation get NaN.
# NOTE(review): merge returns a fresh 0..n-1 index — verify it still matches what `get_subset` expects.
dataset_with_seg.df = dataset_with_seg.df.merge(seg, on='image_name', how='left')
df_seg = dataset_with_seg.df
train_seg = df_seg[df_seg['split'] == 'database']
test_seg = df_seg[df_seg['split'] == 'query']
# Rows per source dataset (shown as the cell output).
df_seg.dataset.value_counts()

In [None]:
# MiewID global-descriptor model, loaded with remote code enabled and set to eval mode.
model = AutoModel.from_pretrained('conservationxlabs/miewid-msv3', trust_remote_code=True).eval()
# Priority pipeline: cosine similarity of deep features on 440x440 normalised
# images, with isotonic calibration. It is later passed as `priority_matcher`.
priority_matcher = SimilarityPipeline(
    matcher=CosineSimilarity(),
    extractor=DeepFeatures(model, device=device, batch_size=64, num_workers=0),
    transform=T.Compose([T.Resize((440, 440)),T.ToTensor(),T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]),
    calibration=IsotonicCalibration()
)

In [None]:
# LightGlue pipelines for every (keypoint budget, feature type) combination.
# Order: superpoint, aliked, disk with 512 keypoints, then the same three
# with 1024 keypoints. Each pipeline gets its own transform and calibration.
local_matchers = [
    SimilarityPipeline(
        matcher=MatchLightGlue(features=feature_name, device=device),
        extractor=extractor_cls(device=device, max_num_keypoints=num_keypoints),
        transform=T.Compose([T.Resize([512, 512]), T.ToTensor()]),
        calibration=IsotonicCalibration()
    )
    for num_keypoints in (512, 1024)
    for feature_name, extractor_cls in (
        ('superpoint', SuperPointExtractor),
        ('aliked', AlikedExtractor),
        ('disk', DiskExtractor),
    )
]

In [None]:
# Kornia pipelines for every (size, extractor, matcher) combination.
# `size` is used both as the keypoint budget and as the image side length.
# Order per size: DISK+GADMatcher, DISK+adalam, KeyNetAffNetHardNet+GADMatcher,
# KeyNetAffNetHardNet+adalam; first for 512, then for 1024.
kornia_matchers = [
    SimilarityPipeline(
        matcher=Kornia_Matcher(matcher_name=matcher_name, device=device, collector = CollectCounts(thresholds=[0.0])),
        extractor=Kornia_Extractor(model_name=extractor_name, mx_keypoints=size, device=device, only_xy=False),
        transform=T.Compose([T.Resize([size, size]), T.ToTensor()]),
        calibration=IsotonicCalibration()
    )
    for size in (512, 1024)
    for extractor_name in ('DISK', 'KeyNetAffNetHardNet')
    for matcher_name in ('GADMatcher', 'adalam')
]

In [None]:
# Add two global-descriptor pipelines to the fused set.
# First: the MiewID `model`, with the same settings as the priority matcher but its own calibration.
local_matchers.append(SimilarityPipeline(
    matcher=CosineSimilarity(),
    extractor=DeepFeatures(model, device=device, batch_size=64, num_workers=0),
    transform=T.Compose([T.Resize((440, 440)),T.ToTensor(),T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]),
    calibration=IsotonicCalibration()
))
# Second: MegaDescriptor-L-384 from the Hugging Face hub (num_classes=0 drops the classifier head).
model2 = timm.create_model('hf-hub:BVRA/wildlife-mega-L-384', num_classes=0, pretrained=True, device=device)
local_matchers.append(SimilarityPipeline(
    matcher=CosineSimilarity(),
    extractor=DeepFeatures(model=model2, device=device, batch_size=64, num_workers=0),
    transform=T.Compose([T.Resize([384, 384]), T.ToTensor(), T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]),
    calibration=IsotonicCalibration()
))

In [None]:
def _dataset_entry(name, source_dataset, database_rows, query_rows):
    """Assemble the evaluation bundle for one source dataset.

    Args:
        name: Value of the `dataset` column to select.
        source_dataset: Dataset object providing `get_subset`.
        database_rows: Metadata rows of the database split.
        query_rows: Metadata rows of the query split.

    Returns:
        Dict with the keys 'name', 'train', 'test', 'threshold_df' and
        'calibration_df'. The last two are `(train, test)` tuples from
        `split_data`; calibration uses only the first 300 database rows.
    """
    selected_database = database_rows.loc[database_rows['dataset'] == name]
    selected_query = query_rows.loc[query_rows['dataset'] == name]
    return {
        'name': name,
        'train': source_dataset.get_subset(selected_database.index),
        'test': source_dataset.get_subset(selected_query.index),
        'threshold_df': split_data(source_dataset, selected_database),
        'calibration_df': split_data(source_dataset, selected_database[:300])
    }


# Salamanders use the segmentation-aware dataset; the others use the plain one.
all_data = [
    _dataset_entry('SalamanderID2025', dataset_with_seg, train_seg, test_seg),
    _dataset_entry('LynxID2025', dataset, train, test),
    _dataset_entry('SeaTurtleID2022', dataset, train, test),
]

In [None]:
# Run the full pipeline per dataset: calibrate, tune the rejection threshold on
# a held-out split of the database, then score the real query set.
# Reset the collector so re-running this cell does not append duplicates.
models = []
for data in all_data:
    print(data['name'])
    # Named `conveyor` so the loop does not rebind the global `model`
    # (the MiewID network that earlier cells pass to DeepFeatures).
    conveyor = Conveyor(query=data['test'], database=data['train'], device=device)
    conveyor.build_model(priority_matcher=priority_matcher, local_matchers= kornia_matchers + local_matchers)
    print('calibration')
    conveyor.calibration(data['calibration_df'])

    print('get best threshold')
    best_threshold, geo_mean, metrics = conveyor.get_best_thresholds(data['threshold_df'], BBB=50)

    print(f"best threshold: {round(best_threshold, 4)}, with geo_mean: {round(geo_mean, 4)}")
    conveyor.plot_scores(scores=metrics, graf_name=f"Train metrics on {data['name']}")

    print('similarity')
    conveyor.similarity(BBB=50)
    # Raw, un-thresholded predictions are also saved per dataset
    predictions, scores = conveyor.predict(save_path=f'submission_{data["name"]}.csv')

    models.append({
        'model_name': data['name'],
        'model': conveyor,
        'data_predictions': predictions,
        'data_scores': scores,
        'data_threshold': best_threshold,
    })
    print('\n')

In [None]:
# Apply each dataset's tuned threshold: predictions scoring below it become 'new_individual'.
for dat in models:
    ans = dat['data_predictions'].copy()
    ans[dat['data_scores'] < dat['data_threshold']] = 'new_individual'
    # Positional assignment: assumes the prediction order equals the row order
    # of `test` for this dataset — TODO confirm, especially for the merged
    # segmentation dataset.
    # NOTE(review): `test` is a boolean-mask selection of `df`; pandas may emit
    # SettingWithCopyWarning here.
    test.loc[test['dataset'] == dat['model_name'], 'identity'] = ans
print(test.shape)
# Distribution of the final identities (shown as the cell output).
test.value_counts('identity')

In [None]:
# Final submission: one row per query image with its thresholded identity.
sub = pd.DataFrame({
    'image_id': test['image_id'],
    'identity': test['identity']
})
# Written to the current working directory, without the index column.
sub.to_csv('sample_submission.csv', index=False)
# sub.to_csv('./sample_submission.csv', index=False)