# Dataset

We chose the following datasets to evaluate different models on tracking tasks:
- [TC-128](https://www3.cs.stonybrook.edu/~hling/data/TColor-128/TColor-128.html#dataset) (used in the article) (4.37 GB)
- [VOT2021](https://www.votchallenge.net/vot2021/dataset.html) (recent dataset) (1.23 GB)

This notebook displays samples of the datasets and shows how they are modified to fit the models.

To fit the model evaluation code, each dataset folder should be placed in the **data** folder and be structured as follows:

- *my*dataset_name (the folder name must start with `my`)
    - *sequence_name* (one folder per video)
        - *img*
            - img1.jpg
            - img2.jpg
            - ...
        - *groundtruth.txt*

---

groundtruth.txt should be structured as follows (one line per frame):

```
x,y,w,h
x,y,w,h
...
```

In [11]:
import csv
import cv2
import matplotlib.pyplot as plt
import os
import re
import shutil
import urllib.request
import vot

from dataset import path_to_dataset
from pathlib import Path
from vot import dataset

**Change the dataset location with the following cell.**

In [12]:
%%writefile dataset.py
import os
from pathlib import Path

# Root folder holding every dataset: "<current working directory>/data".
path_to_dataset = Path.cwd() / "data"

Overwriting dataset.py


## TC-128

This dataset contains 128 videos.

In [13]:
def download_tc128_and_format(all: bool = False):
    """Download TC-128 sequences into ``path_to_dataset / "mytc128"``.

    The sequence index page is scanned for ``.zip`` links. Each archive is
    downloaded, unpacked next to the others and then deleted. Inside every
    unpacked sequence folder the ``*_att.txt`` and ``*_frames.txt`` files are
    removed and ``*_gt.txt`` is renamed to ``groundtruth.txt``.

    The function returns immediately when the ``mytc128`` folder already
    exists, so a previous (even partial) download is never refreshed.

    Args:
        all: When False (the default) only the first archive listed on the
            index page is processed; when True every archive is processed.
            (The name shadows the builtin but is kept because it is part of
            the public signature.)
    """
    data_folder = path_to_dataset / "mytc128"
    if data_folder.exists():
        return

    os.makedirs(data_folder)

    base_url = 'https://www3.cs.stonybrook.edu/~hling/data/TColor-128/seqs/'

    with urllib.request.urlopen(base_url) as response:
        html = response.read().decode('utf-8')
    archive_names = re.findall(r'href="(.*?\.zip)"', html)

    for archive_name in archive_names:
        sequence_name = archive_name[:-4]  # strip the ".zip" extension
        archive_path = data_folder / archive_name
        sequence_path = data_folder / sequence_name

        # Download the archive, unpack it and drop the zip file.
        urllib.request.urlretrieve(base_url + archive_name, archive_path)
        shutil.unpack_archive(archive_path, data_folder)
        os.remove(archive_path)

        # Remove the files that the evaluation code does not use.
        attribute_file = sequence_name + "_att.txt"
        # Special case: the Jogging2 archive names its frame file in lowercase.
        frame_prefix = sequence_name if sequence_name != "Jogging2" else "jogging2"
        frame_file = frame_prefix + "_frames.txt"
        os.remove(str(sequence_path / attribute_file))
        os.remove(str(sequence_path / frame_file))

        # Rename the ground-truth file to the name the loader expects.
        groundtruth_file = sequence_name + "_gt.txt"
        os.rename(sequence_path / groundtruth_file, sequence_path / "groundtruth.txt")

        if not all:
            break

In [14]:
# Fetch every TC-128 sequence, not just the first one.
download_tc128_and_format(all=True)

## VOT2021

This dataset contains semantic segmentation labels for 60 videos. We will use these labels to compute the ground-truth bounding boxes.

In [15]:
def semantic2bbox(path_semantic: str, path_bbox: str):
    """Convert a segmentation ground-truth file into a bounding-box file.

    Every input row is comma-separated. The first character of the first
    field is dropped (presumably a one-letter marker in front of the first
    number -- confirm against the dataset format), and the first four fields
    are then written out as integers, one output row per input row.

    Args:
        path_semantic: Path of the ground-truth file to read.
        path_bbox: Path of the bounding-box file to write (overwritten if it
            already exists).

    Raises:
        ValueError: If one of the first four fields of a row is not an integer.
        IndexError: If a row is empty.
    """
    # The csv module expects files opened with newline=''; without it the
    # writer produces "\r\r\n" line endings on Windows, which read back as a
    # blank line after every row.
    with open(path_semantic, 'r', newline='') as f:
        rows = list(csv.reader(f))

    boxes = []
    for row in rows:
        # Only the first four fields are kept, so the rest of the row is
        # never parsed.
        fields = [row[0][1:]] + row[1:4]
        boxes.append([int(value) for value in fields])

    with open(path_bbox, 'w', newline='') as f:
        csv.writer(f).writerows(boxes)

In [16]:
def download_vot2021_dataset_and_format(all: bool = False):
    """Download VOT2021 and convert it into ``path_to_dataset / "myvot2021"``.

    The raw dataset is downloaded into ``path_to_dataset / "vot2021"``. For
    each sub-folder found there, ``groundtruth.txt`` is converted with
    ``semantic2bbox`` and the ``color`` folder is copied as ``img``. The raw
    download folder is deleted at the end.

    The function returns immediately when ``myvot2021`` already exists.

    Args:
        all: When False (the default) only the first sub-folder encountered is
            converted; when True every sub-folder is converted.
    """
    converted_root = path_to_dataset / "myvot2021"
    if converted_root.exists():
        return

    raw_root = path_to_dataset / "vot2021"

    dataset.download_dataset(dataset.vot._VOT_DATASETS["vot-st2021"], raw_root)
    os.mkdir(converted_root)

    for entry in os.listdir(raw_root):
        source_dir = raw_root / entry
        # Plain files sitting next to the sequence folders are ignored.
        if not os.path.isdir(source_dir):
            continue

        target_dir = converted_root / entry
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

        semantic2bbox(source_dir / "groundtruth.txt", target_dir / "groundtruth.txt")
        shutil.copytree(source_dir / "color", target_dir / "img")

        if not all:
            break

    # The raw download is no longer needed once the conversion is done.
    shutil.rmtree(raw_root)

In [17]:
# Convert every VOT2021 sequence, not just the first one.
download_vot2021_dataset_and_format(all=True)

# Data loading

In [18]:
%%writefile -a dataset.py

def _natural_key(file_name):
    """Return a sort key that orders embedded numbers numerically.

    With this key ``img2.jpg`` sorts before ``img10.jpg``; zero-padded names
    keep their usual order.
    """
    import re  # kept local so this block needs no extra module-level import

    parts = re.split(r"(\d+)", file_name)
    # re.split with a capturing group alternates text and digit runs, so the
    # odd positions are always the numeric pieces.
    pieces = [int(part) if index % 2 else part for index, part in enumerate(parts)]
    # The raw name breaks ties between names such as "img01" and "img1".
    return pieces, file_name


def load_dataset(name):
    """Load every sequence of the dataset folder ``path_to_dataset / name``.

    Each sub-folder is expected to contain a ``groundtruth.txt`` file (one
    comma-separated row of numbers per frame) and an ``img`` folder. Blank
    lines in ``groundtruth.txt`` are ignored and image files are sorted in
    natural order so that they line up with the ground-truth rows.

    A sequence that cannot be read, or whose number of ground-truth rows
    differs from its number of image files, is reported on stdout and
    skipped; the remaining sequences are still returned.

    Args:
        name: Dataset folder name; it must start with ``"my"``.

    Returns:
        A dict mapping each sequence folder name to a dict with the keys
        ``"name"`` (folder name), ``"gt"`` (list of rows, each a list of
        ints) and ``"image_files"`` (list of image paths).

    Raises:
        FileNotFoundError: If ``name`` is not one of the ``my*`` folders found
            in ``path_to_dataset``.
    """
    available = [folder for folder in os.listdir(path_to_dataset) if folder.startswith("my")]

    if name not in available:
        raise FileNotFoundError("Dataset not found")

    ret = {}
    dataset_path = path_to_dataset / name

    for folder in os.listdir(dataset_path):
        folder_path = dataset_path / folder

        try:
            gt = []
            with open(folder_path / "groundtruth.txt") as f:
                for line in f:
                    if not line.strip():
                        continue  # tolerate blank or trailing empty lines
                    gt.append([int(float(x)) for x in line.split(",")])

            # os.listdir returns entries in arbitrary order; sort them so that
            # frame i matches ground-truth row i.
            image_names = sorted(os.listdir(folder_path / "img"), key=_natural_key)
        except (OSError, ValueError, OverflowError) as err:
            print("Error while loading dataset", folder, "-", err)
            continue

        if len(gt) != len(image_names):
            print(f"Error while loading dataset {folder} gt and image files have different length")
            continue

        ret[folder] = {
            "name": folder,
            "gt": gt,
            "image_files": [folder_path / "img" / x for x in image_names],
        }

    return ret

Appending to dataset.py


In [19]:
%%writefile -a dataset.py

def load_datasets():
    """Load every dataset folder of ``path_to_dataset`` whose name starts with "my".

    Returns:
        A dict mapping each such folder name to the value returned by
        ``load_dataset`` for that folder.
    """
    return {
        entry: load_dataset(entry)
        for entry in os.listdir(path_to_dataset)
        if entry.startswith("my")
    }

Appending to dataset.py
