# Dataset


We chose the following datasets to evaluate different models on tracking tasks:

-   [TC-128](https://www3.cs.stonybrook.edu/~hling/data/TColor-128/TColor-128.html#dataset) (used in the article) (4.37 GB)
-   [VOT2021](https://www.votchallenge.net/vot2021/dataset.html) (recent dataset) (1.23 GB)


This notebook displays samples of the datasets and shows how they are reformatted to fit the models.

To fit the model evaluation code, each dataset folder should be placed in the **data** folder and be structured as follows:

-   *my*dataset_name
    -   _img_
        -   img1.jpg
        -   img2.jpg
        -   ...
    -   _groundtruth.txt_

---

groundtruth.txt should be structured as follows (one line per frame):

```
x,y,w,h
x,y,w,h
...
```


In [41]:
import csv
import cv2
import matplotlib.pyplot as plt
import re
import shutil
import urllib.request
import vot

from pathlib import Path
from typing import *
from vot import dataset

**Change the location with the following cell.**


In [42]:
%%writefile dataset.py
from pathlib import Path
from typing import *

# Root folder under which every dataset is stored; edit this path to relocate the data.
path_to_dataset = Path.cwd() / "data"

Overwriting dataset.py


In [43]:
from dataset import path_to_dataset

## TC-128


This dataset contains 128 videos.


In [44]:
def download_tc128_and_format(all: bool = False):
    """Download the TColor-128 dataset and format it to fit the code.

    The sequence index page is scraped for ``.zip`` links. Each archive is
    downloaded into ``path_to_dataset / "mytc128"``, unpacked there and then
    cleaned up: the archive, the attribute file and the frame-list file are
    deleted, and ``<sequence>_gt.txt`` is renamed to ``groundtruth.txt``.

    Nothing is done when the ``mytc128`` folder already exists.
    NOTE: because of that early return, a partially filled folder left behind
    by an interrupted run is not completed on the next call; delete the
    folder to retry.

    Args:
        all (bool, optional): Download all the sequences or only the first one. Defaults to False.
    """
    base_url = "https://www3.cs.stonybrook.edu/~hling/data/TColor-128/seqs/"

    data_folder: Path = path_to_dataset / "mytc128"
    if data_folder.exists():
        return

    data_folder.mkdir(parents=True)

    # Collect the archive names linked from the index page.
    with urllib.request.urlopen(base_url) as f:
        html: str = f.read().decode("utf-8")
    archive_names: List[str] = re.findall(r'href="(.*?\.zip)"', html)

    for archive_name in archive_names:
        sequence_name: str = archive_name[:-4]  # drop the ".zip" extension
        archive_path: Path = data_folder / archive_name
        sequence_folder: Path = data_folder / sequence_name

        # Download the archive, unpack it next to itself, then drop the archive
        urllib.request.urlretrieve(base_url + archive_name, archive_path)
        shutil.unpack_archive(archive_path, data_folder)
        archive_path.unlink()

        # Remove useless files
        attribute_file: str = sequence_name + "_att.txt"
        frame_file: str = (
            sequence_name if sequence_name != "Jogging2" else "jogging2"
        ) + "_frames.txt"  # Special case for Jogging2

        (sequence_folder / attribute_file).unlink()
        (sequence_folder / frame_file).unlink()

        # Rename groundtruth file to fit the code
        groundtruth_file: str = sequence_name + "_gt.txt"
        (sequence_folder / groundtruth_file).rename(
            sequence_folder / "groundtruth.txt"
        )

        if not all:
            break

In [45]:
# True fetches every sequence; the default (False) stops after the first one.
download_tc128_and_format(True)

# VOT 2021


This dataset contains semantic labeling of 60 videos. We will use this semantic labeling to compute the ground-truth bounding boxes.


In [46]:
def semantic2bbox(path_semantic: str, path_bbox: str):
    """Convert semantic segmentation to bounding box

    Every non-empty row of the input CSV is reduced to its first four values,
    after dropping the first character of the first field (presumably the
    ``m`` marker of the VOT mask encoding -- confirm against the dataset).
    Those values are written as integers, one row per input row.

    Blank rows are skipped. Only the fields that are kept are converted, so
    trailing data on a row is never parsed.

    Args:
        path_semantic (str): Path to the semantic segmentation file (any path-like object works)
        path_bbox (str): Path to the bounding box file (any path-like object works)

    Raises:
        ValueError: If one of the kept fields is not an integer.
    """
    # newline="" is what the csv module expects for both reading and writing;
    # without it, the writer emits "\r\r\n" line endings on Windows.
    with open(path_semantic, "r", newline="") as f:
        boxes = [
            [int(row[0][1:])] + [int(value) for value in row[1:4]]
            for row in csv.reader(f)
            if row
        ]

    with open(path_bbox, "w", newline="") as f:
        csv.writer(f).writerows(boxes)

In [47]:
def download_vot2021_dataset_and_format(all: bool = False):
    """Download the VOT2021 dataset and format it to fit the code.

    The raw dataset is downloaded into ``path_to_dataset / "vot2021"``. For
    each sequence folder, a folder of the same name is created under
    ``path_to_dataset / "myvot2021"`` containing:

    - ``groundtruth.txt``: produced by ``semantic2bbox`` from the sequence's
      own ``groundtruth.txt``;
    - ``img``: a copy of the sequence's ``color`` folder.

    The raw ``vot2021`` folder is deleted afterwards. Nothing is done when
    the ``myvot2021`` folder already exists.

    Args:
        all (bool, optional): Download all the sequences or only the first one. Defaults to False.
    """
    path_vot2021: Path = path_to_dataset / "vot2021"
    path_myvot2021: Path = path_to_dataset / "myvot2021"

    if path_myvot2021.exists():
        return

    dataset.download_dataset(dataset.vot._VOT_DATASETS["vot-st2021"], path_vot2021)
    path_myvot2021.mkdir(parents=True)

    # Sorted so that "the first one" is the same sequence on every run.
    for video in sorted(path_vot2021.iterdir()):
        if not video.is_dir():
            continue

        # iterdir() yields paths that already include path_vot2021. Joining
        # such a path onto path_myvot2021 would point back into the raw
        # folder (which is deleted below), so only the folder name is reused.
        target: Path = path_myvot2021 / video.name
        target.mkdir(exist_ok=True)

        semantic2bbox(video / "groundtruth.txt", target / "groundtruth.txt")
        shutil.copytree(video / "color", target / "img")

        if not all:
            break

    shutil.rmtree(path_vot2021)

In [48]:
# True converts every sequence; the default (False) stops after the first one.
download_vot2021_dataset_and_format(True)

# Dataloading


In [49]:
%%writefile -a dataset.py

def load_dataset(name: str) -> Dict:
    """Load a dataset.

    Each entry of ``path_to_dataset / name`` is treated as one sequence whose
    ``groundtruth.txt`` holds one comma-separated box per line and whose
    ``img`` folder holds the frames. Frames are returned in natural filename
    order (``img2`` before ``img10``) so that they line up with the
    ground-truth rows.

    A sequence that cannot be read, or whose number of boxes differs from its
    number of frames, is reported on stdout and left out of the result.

    Args:
        name (str): Name of the dataset to load.

    Returns:
        Dict: Dictionary containing the dataset. Maps each sequence name to a
        dict with the keys ``"name"`` (str), ``"gt"`` (list of int lists) and
        ``"image_files"`` (list of str paths).

    Raises:
        FileNotFoundError: If no entry of ``path_to_dataset`` whose name
            starts with ``"my"`` is called ``name``.
    """
    import re  # local import: only the natural-sort key below needs it

    def frame_order(filename: str) -> list:
        # re.split with a capturing group alternates text / digits / text...,
        # so odd positions are always digit runs and can be compared as ints.
        return [
            int(token) if index % 2 else token
            for index, token in enumerate(re.split(r"(\d+)", filename))
        ]

    is_known = any(
        entry.name == name
        for entry in path_to_dataset.iterdir()
        if entry.name.startswith("my")
    )
    if not is_known:
        raise FileNotFoundError(f"Dataset not found: {name}")

    ret: Dict = {}

    # iterdir() already yields complete paths, so `folder` is used as-is.
    for folder in sorted((path_to_dataset / name).iterdir()):
        try:
            with open(folder / "groundtruth.txt") as f:
                boxes = [
                    [int(float(x)) for x in line.split(",")]
                    for line in f
                    if line.strip()  # tolerate blank (e.g. trailing) lines
                ]

            images = sorted(
                (folder / "img").iterdir(), key=lambda p: frame_order(p.name)
            )
        except (OSError, ValueError) as err:
            # OSError: missing/unreadable files; ValueError: malformed numbers.
            print("Error while loading dataset", folder, err)
            continue

        if len(boxes) != len(images):
            print(f"Error while loading dataset {folder} gt and image files have different length")
            continue

        ret[folder.name] = {
            "name": folder.name,
            "gt": boxes,
            "image_files": [str(image) for image in images],
        }

    return ret

Appending to dataset.py


In [50]:
%%writefile -a dataset.py

def load_datasets():
    """Load every dataset found under the dataset root.

    Returns:
        Dict: Maps the name of each entry of ``path_to_dataset`` that starts
        with ``"my"`` to the result of ``load_dataset`` for that name.
    """
    return {
        entry.name: load_dataset(entry.name)
        for entry in path_to_dataset.iterdir()
        if entry.name.startswith("my")
    }

Appending to dataset.py
