# WildlifeReID-10k creation

This is the notebook for creating the WildlifeReID-10k dataset. It copies the files to a separate folder, applies bounding boxes and masks, and combines them together. It also creates the splits. All these operations are performed dataset-wise.

First load the necessary packages.

In [None]:
import sys

sys.path.append("../..")

import os

import numpy as np
import pandas as pd
from thresholds import names_thresholds
from utils import SplitterByFeatures

from wildlife_datasets.datasets import WildlifeReID10k
from wildlife_datasets.preparation import prepare_functions, species_conversion

Then specify the roots where the datasets are located. The parameter `transform` can be used to resize the files, `copy_files` specifies whether the files are copied from `root_datasets` to `root`, and `add_split` specifies whether the splits should be added. Since bounding boxes and masks are applied and the black borders are cropped, copying the files is relatively time-consuming.

In [None]:
# Input location of the individual datasets and output layout of WildlifeReID-10k.
root_datasets = "/data/wildlife_datasets/data"
root = os.path.join(root_datasets, "WildlifeReID10k")
root_images = os.path.join(root, "images")
root_metadata = os.path.join(root, "metadata")

# Relative folders holding the extracted features and the cluster files.
# Only the cluster folder is created here.
root_features = "features_dino"
root_clusters = "clusters"
os.makedirs(root_clusters, exist_ok=True)

# Options forwarded to the prepare functions.
transform = None
copy_files = False
add_split = True

# Only datasets with an entry in `names_thresholds` are processed.
names_permissible = list(names_thresholds)
# Forwarded to the prepare functions as `remove_str` and `replace_extensions`.
remove_str = ["[", "]"]
replace_extensions = {".webp": ".jpg"}

Create metadata for each dataset and potentially copy the files. The structure is somewhat awkward because the notebook first needs to be run with `copy_files=True` and `add_split=False`, then the features need to be computed by the script `extract_features.py`, and finally the notebook needs to be rerun with both boolean parameters flipped (`copy_files=False` and `add_split=True`) to add the splits.

In [None]:
# Run the prepare function of every permissible dataset and store the resulting
# metadata as `<root_metadata>/<name>/metadata.csv`.
for name, prepare in prepare_functions.items():
    if name not in names_permissible:
        continue
    print(name)
    os.makedirs(f"{root_metadata}/{name}/", exist_ok=True)

    # Each entry of `names_thresholds` is a (threshold, splitter) pair.
    thr, splitter = names_thresholds[name]
    uses_feature_clusters = thr is not None and thr != "time-aware"
    if uses_feature_clusters:
        # Wrap the base splitter in SplitterByFeatures, passing the feature
        # file, the threshold and the cluster file path (`file_name`).
        path_features = f"{root_features}/features_{name}.npy"
        path_clusters = f"{root_clusters}/clusters_{name}_{thr}.npy"
        splitter = SplitterByFeatures(path_features, splitter, thr, file_name=path_clusters)

    source_dir = f"{root_datasets}/{name}"
    target_dir = f"{root_images}/{name}"
    metadata_part = prepare(
        source_dir,
        target_dir,
        transform=transform,
        add_split=add_split,
        splitter=splitter,
        copy_files=copy_files,
        remove_str=remove_str,
        replace_extensions=replace_extensions,
    )
    metadata_part.to_csv(f"{root_metadata}/{name}/metadata.csv", index=False)

The next cell adds additional information to the metadata of each dataset and combines them together.

In [None]:
# Enrich the metadata of every permissible dataset and merge it into one table.
metadata = []
for name in prepare_functions:
    if name not in names_permissible:
        continue
    metadata_part = pd.read_csv(f"{root_metadata}/{name}/metadata.csv")

    # Tag rows with the dataset name, prefix identities and image paths with it,
    # and translate species through `species_conversion`.
    metadata_part["dataset"] = name
    metadata_part["identity"] = f"{name}_" + metadata_part["identity"].astype(str)
    metadata_part["path"] = f"images/{name}/" + metadata_part["path"]
    metadata_part["species"] = metadata_part["species"].apply(lambda species: species_conversion[species])

    # `cluster_id` starts as an all-missing object column and is filled
    # depending on the threshold type of the dataset.
    thr, _ = names_thresholds[name]
    metadata_part["cluster_id"] = pd.Series(dtype=object)
    if thr == "time-aware":
        # One cluster per (identity, date) group, numbered in groupby order.
        for i, (_, metadata_date) in enumerate(metadata_part.groupby(["identity", "date"])):
            metadata_part.loc[metadata_date.index, "cluster_id"] = str(i)
    elif thr is not None:
        # Load the stored cluster file for this threshold, if it exists.
        path_clusters = f"{root_clusters}/clusters_{name}_{thr}.npy"
        if os.path.exists(path_clusters):
            clusters = np.load(path_clusters)
            metadata_part["cluster_id"] = clusters
            metadata_part["cluster_id"] = metadata_part["cluster_id"].astype(object)

    # Rows with a cluster get the id "<identity>_<cluster number>".
    has_cluster = metadata_part["cluster_id"].notnull()
    cluster_number = metadata_part.loc[has_cluster, "cluster_id"].astype(int).astype(str)
    metadata_part.loc[has_cluster, "cluster_id"] = metadata_part.loc[has_cluster, "identity"] + "_" + cluster_number

    metadata.append(metadata_part)

metadata = pd.concat(metadata).reset_index(drop=True)
metadata = metadata.drop(columns="image_id")

# Normalise the available dates: keep the first ten characters and parse them as YYYY-MM-DD.
rows_with_date = metadata.index[metadata["date"].notnull()]
date_prefix = metadata.loc[rows_with_date, "date"].astype(str).apply(lambda text: text[:10])
metadata.loc[rows_with_date, "date"] = pd.to_datetime(date_prefix, format="%Y-%m-%d").astype(str)

# Unify orientation labels.
orientation_aliases = {"below": "down", "up": "top", "above": "top"}
metadata["orientation"] = metadata["orientation"].replace(orientation_aliases)
metadata.to_csv(f"{root}/metadata.csv", index=False)

# Reload the saved dataset through WildlifeReID10k and overwrite the CSV with
# its dataframe, again without `image_id`.
# NOTE(review): presumably the loader adds or normalises columns - confirm.
dataset = WildlifeReID10k(root)
dataset.df = dataset.df.drop(columns="image_id")
dataset.df.to_csv(f"{root}/metadata.csv", index=False)