In [28]:
import json
import os

import pandas as pd
import shutil
from tqdm.auto import tqdm

In [29]:
def load_json(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's contents.
    """
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's default locale encoding.
    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)
    return data

def write_json(data, out_path):
    """Serialize ``data`` as 4-space-indented JSON and write it to ``out_path``.

    The file is opened in write mode, so any existing content is replaced.
    """
    # Serialize before opening the file so a failed dump never truncates it.
    serialized = json.dumps(data, indent=4)
    with open(out_path, "w") as sink:
        sink.write(serialized)
        
def export_annos(dfa, dfi, out_path, categories=None):
    """Write annotation and image tables to a COCO-style JSON file.

    Args:
        dfa: DataFrame of annotations; each row becomes one 'annotations' entry.
        dfi: DataFrame of images; each row becomes one 'images' entry.
        out_path: Path of the JSON file to write.
        categories: List of category dicts stored under 'categories'.
            Omitted or None means an empty list.

    Returns:
        ``out_path``, unchanged.
    """
    # A None sentinel replaces the former mutable `[]` default, and keeps an
    # explicit None from being written out as JSON null.
    if categories is None:
        categories = []

    print('out_path', out_path)
    print('shapes: ', dfa.shape, dfi.shape)
    annos_list = dfa.to_dict(orient='records')
    images_list = dfi.to_dict(orient='records')

    # 'info' and 'licenses' are written empty.
    data = {
        'info': {},
        'licenses': [],
        'images': images_list,
        'annotations': annos_list,
        'categories': categories,
    }
    write_json(data, out_path)

    return out_path

In [30]:
def load_and_sample_tiles(data, n_negative_samples=1000):
    """Split tiles into annotated and empty ones and subsample the empty ones.

    Args:
        data: COCO-style dict with 'annotations', 'images' and 'categories'.
        n_negative_samples: Maximum number of empty tiles (tiles without any
            annotation) to keep. If fewer empty tiles exist, all are kept.

    Returns:
        Tuple ``(dfa, dfi_s)``: the annotations DataFrame with an added
        'species' column (the category name), and the images DataFrame holding
        every annotated tile followed by the sampled empty tiles.
    """
    dfa = pd.DataFrame(data['annotations'])
    dfi = pd.DataFrame(data['images'])

    cat_map = {item['id']: item['name'] for item in data['categories']}
    dfa['species'] = dfa['category_id'].map(cat_map)

    # Vectorized membership test instead of a Python-level loop over every id.
    has_annotation = dfi['id'].isin(dfa['image_id'])
    dfi_p = dfi[has_annotation]
    dfi_n = dfi[~has_annotation]

    print('Number of non-empty tiles:', len(dfi_p))
    print('Number of empty tiles:', len(dfi_n))

    # Clamp the request: sampling without replacement raises ValueError when
    # more rows are asked for than are available.
    n_draw = min(n_negative_samples, len(dfi_n))
    # Fixed random_state keeps the selection reproducible across runs.
    dfi_ns = dfi_n.sample(n=n_draw, random_state=0)
    dfi_s = pd.concat([dfi_p, dfi_ns])

    print(f"Sampled {len(dfi_p)} positive tiles and {len(dfi_ns)} negative tiles. Total: {len(dfi_s)}")

    return dfa, dfi_s

# Convert test to yolo format

- NOTE: Everything below is for the test (val) set. The same must be repeated for the training set.

### Convert to JSON format first that will work with yolo converter

In [31]:
# Share of empty (negative) tiles to keep: experiment with roughly 10-25%.
# More negative tiles should make the model less prone to false positives,
# at the cost of recall (actual effect and magnitude to be determined).
n_val_negative_tiles = 500

images_dir = '/media/l3404/Mate/kaza_files/slice_kaza_val-25'
anno_path ='/media/l3404/Mate/kaza_files/slice_kaza_val-25/slice_kaza_val-25_coco.json'

# Alternative paths (left disabled):
# images_dir = '/mnt/space/space-home/wild-me/kaza/slice_kaza_val_v0/'
# anno_path = '/mnt/space/space-home/wild-me/kaza/slice_kaza_val_v0/slice_kaza_val_v0.json'

data = load_json(anno_path)
dfa, dfi_s = load_and_sample_tiles(data, n_negative_samples=n_val_negative_tiles)

# Tag every row with its source so it stays traceable once frames are combined.
dfa = dfa.assign(anno_path=anno_path)
dfi_s = dfi_s.assign(anno_path=anno_path, images_dir=images_dir)

# One-element lists: only a single annotation file is loaded in this cell.
dfas = [dfa]
dfis = [dfi_s]

dfa = pd.concat(dfas)
dfi = pd.concat(dfis)

  0%|          | 0/216648 [00:00<?, ?it/s]

Number of non-empty tiles: 2773
Number of empty tiles: 213875
Sampled 2773 positive tiles and 500 negative tiles. Total: 3273


In [26]:
# Note: Folders should be different for val and train

out_dir = '/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val'
# exist_ok=False makes this raise FileExistsError when the folder is already
# there, rather than silently reusing it.
os.makedirs(out_dir, exist_ok=False)

In [34]:
# Necessary to bump category ids by 1
# as conversion to yolo will subtract 1 from category_id
#
# NOTE: this cell is not idempotent - every run adds another 1 to
# dfa['category_id']. Run it exactly once per freshly loaded dfa and compare
# the (min, max) shown below with the ids in `categories`.

dfa['category_id'] = dfa['category_id'] + 1

# `categories` is rebuilt from the untouched data['categories'], so unlike dfa
# it comes out the same on every run.
categories = data['categories']
categories = [{'id':item['id'] + 1, 'name':item['name'], 'supercategory': ''} for item in categories]

# Cell result: range of the shifted annotation category ids.
dfa['category_id'].min(), dfa['category_id'].max()

(1, 46)

In [35]:
# Replace the index carried over from sampling/concatenation with a fresh
# 0..n-1 range; drop=True discards the old index instead of keeping it as a column.
dfa = dfa.reset_index(drop=True)
dfi = dfi.reset_index(drop=True)

In [41]:
# Modify json file name at will
# NOTE(review): out_dir is the val folder here, yet the result is stored in
# `train_coco_path`; confirm nothing later relies on it meaning the training set.
train_coco_path = export_annos(dfa, dfi, os.path.join(out_dir, 'kaza_v0.json'), categories)

out_path /mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val/kaza_v0.json
shapes:  (5230, 9) (3273, 6)


In [42]:
# Single-class variant: same annotations, every category_id collapsed to 1.
# NOTE(review): the full multi-class `categories` list is still written to this file.
dfa_onecat = dfa.assign(category_id=1)

train_coco_path_onecat = export_annos(
    dfa_onecat,
    dfi,
    os.path.join(out_dir, 'kaza_v0_onecat.json'),
    categories,
)

out_path /mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val/kaza_v0_onecat.json
shapes:  (5230, 9) (3273, 6)


In [46]:
# Import the submodule explicitly: a bare `import ultralytics` only exposes
# `ultralytics.data.converter` if the package happens to load it on import.
import ultralytics.data.converter

labels_dir = out_dir

# convert_coco is pointed at the folder holding the exported COCO json files
# and told to save the YOLO-format result under <out_dir>/coco_converted/.
# cls91to80=False: the category ids are this dataset's own, not COCO's 91-class ids.
ultralytics.data.converter.convert_coco(
    labels_dir=labels_dir,
    save_dir=f'{out_dir}/coco_converted/',
    use_segments=False,
    use_keypoints=False,
    cls91to80=False,
)

Annotations /mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val/kaza_v0.json

Annotations /mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val/kaza_v0.json
Annotations /mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val/kaza_v0_onec

COCO data converted successfully.
Results saved to /mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val/coco_converted





### Fill in the data directory (repeat for training set)

In [None]:
# Data folder generated above for the test (val) set 'kaza_v0_val';
# the train set should get a similar folder of its own.
# Repeat the steps below with the training set's 'data_folder'.
data_folder = '/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val'

In [54]:
# 'coco_converted/labels' is where the .txt labels in yolo format are stored.
# 'kaza_v0' holds the multi-class labels; 'kaza_v0_onecat' has all labels mapped to the same class.


!tree -d {data_folder}

[01;34m/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val[0m
└── [01;34mcoco_converted[0m
    ├── [01;34mimages[0m
    └── [01;34mlabels[0m
        ├── [01;34mkaza_v0[0m
        └── [01;34mkaza_v0_onecat[0m

5 directories


In [55]:
# Create the 'active' subfolder of the data folder together with its
# 'labels' and 'images' children (each call raises if the folder already exists).
for active_subpath in ('active', 'active/labels', 'active/images'):
    os.makedirs(f'{data_folder}/{active_subpath}')


In [59]:
# Check that 'active' now exists with its 'images' and 'labels' subfolders
!tree -d {data_folder}

[01;34m/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val[0m
├── [01;34mactive[0m
│   ├── [01;34mimages[0m
│   └── [01;34mlabels[0m
└── [01;34mcoco_converted[0m
    ├── [01;34mimages[0m
    └── [01;34mlabels[0m
        ├── [01;34mkaza_v0[0m
        └── [01;34mkaza_v0_onecat[0m

8 directories


In [62]:
# Copy either the multi-class or the one-class labels from 'coco_converted' to 'active'
# (copytree leaves the originals in place; the one-class set is used here)

source_labels = f'{data_folder}/coco_converted/labels/kaza_v0_onecat'
destination_labels = f'{data_folder}/active/labels/kaza_v0_onecat'

# copytree returns the destination path, which is shown as the cell output
shutil.copytree(source_labels, destination_labels)

'/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val/active/labels/kaza_v0_onecat'

In [63]:
# The copied labels folder should now show up under active/labels
!tree -d {data_folder}

[01;34m/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val[0m
├── [01;34mactive[0m
│   ├── [01;34mimages[0m
│   └── [01;34mlabels[0m
│       └── [01;34mkaza_v0_onecat[0m
└── [01;34mcoco_converted[0m
    ├── [01;34mimages[0m
    └── [01;34mlabels[0m
        ├── [01;34mkaza_v0[0m
        └── [01;34mkaza_v0_onecat[0m

9 directories


In [None]:
# Also copy the tile images folder created in Notebook 3

In [65]:
# Source should be the output_dir of notebook 3
# NOTE(review): this path differs from `images_dir` used when loading the
# annotations - confirm both hold the same tiles.

source_images = '/mnt/space/space-home/wild-me/kaza/slice_kaza_val_v0'
destination_images = f'{data_folder}/active/images/kaza_v0_onecat'

# NOTE(review): copytree copies everything under source_images, not only the
# tiles sampled into `dfi` - confirm that is intended.
shutil.copytree(source_images, destination_images)

'/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val/active/images/kaza_v0_onecat'

In [66]:
# This is what the final directory tree should look like for the dataset (repeat the same for the train set).
# Under 'active', the 'images' and 'labels' folders each contain a subfolder with the same name.

!tree -d {data_folder}

[01;34m/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val[0m
├── [01;34mactive[0m
│   ├── [01;34mimages[0m
│   │   └── [01;34mkaza_v0_onecat[0m
│   └── [01;34mlabels[0m
│       └── [01;34mkaza_v0_onecat[0m
└── [01;34mcoco_converted[0m
    ├── [01;34mimages[0m
    └── [01;34mlabels[0m
        ├── [01;34mkaza_v0[0m
        └── [01;34mkaza_v0_onecat[0m

10 directories


# TODO: repeat the above steps for the training set before proceeding to the final section

- Note: After repeating the steps for the train set (and an optional holdout set), you should be left with two folders - one for train and one for test. (Hint: this line should be different for train and test - `out_dir = '/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val'`)

### This is what kind of config.yaml you should create after this subsection

```


# YOLOv5 🚀 by Ultralytics, AGPL-3.0 license
# COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017) by Ultralytics
# Example usage: python train.py --data coco128.yaml
# parent
# ├── yolov5
# └── datasets
#     └── coco128  ← downloads here (7 MB)
#

# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
train: 
 - '/mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_full_round1/active/images/kaza_full_round1_onecat'
val: /mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_val/active/images/kaza_val_onecat

# Classes
names:
  # TO BE FILLED IN THE NEXT SUB-SECTION
  ```

# Final section: Create yolov8 yaml file (manual step)

### Fill in the classes mapping

In [None]:
# Number of annotations per species name, most frequent first
dfa['species'].value_counts()

red lechwe          2069
elephant            1178
buffalo              411
impala               319
zebra                205
wildebeest           149
kudu                 112
giraffe              110
puku                  79
white_bones           64
baboon                64
reedbuck              59
topi                  58
bird                  54
sable                 53
unknown antelope      41
waterbuck             34
unknown mammal        34
hartebeest            25
warthog               21
roan                  18
crocodile             18
sitatunga              8
hippo                  8
ec4                    7
steenbok               6
eland                  4
vervet monkey          4
canoe                  3
duiker                 3
bushpig                3
ostrich                3
unknown animal         2
unknown carcass        2
ec3                    2
Name: species, dtype: int64

In [None]:
# Print one "<id> : <name>" line per category; the result can be pasted under
# `names:` in the yolo config.yaml.
for cat in data['categories']:
    print(f"{cat['id']} : {cat['name']}")

0 : baboon
1 : bird
2 : buffalo
3 : bushpig
4 : canoe
5 : car
6 : crocodile
7 : duiker
8 : ec3
9 : ec4
10 : eland
11 : elephant
12 : elephant bull
13 : gazelle_thomsons
14 : giraffe
15 : hartebeest
16 : hippo
17 : hyena
18 : impala
19 : kob
20 : kudu
21 : lion
22 : ostrich
23 : puku
24 : red lechwe
25 : reedbuck
26 : roan
27 : roof_mabati
28 : sable
29 : sheep
30 : sitatunga
31 : steenbok
32 : topi
33 : unknown animal
34 : unknown antelope
35 : unknown carcass
36 : unknown mammal
37 : unknown_carcas
38 : unknown_carcass
39 : vervet monkey
40 : warthog
41 : waterbuck
42 : white_bones
43 : wild dog
44 : wildebeest
45 : zebra


### This is what config.yaml should look like at the end

- Note: The train and val entries point to the /images/{folder_name} subfolder of the dataset folder. The script will attempt to navigate around this folder to find the labels in the parent folder.

```


# YOLOv5 🚀 by Ultralytics, AGPL-3.0 license
# COCO128 dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017) by Ultralytics
# Example usage: python train.py --data coco128.yaml
# parent
# ├── yolov5
# └── datasets
#     └── coco128  ← downloads here (7 MB)
#

# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
train: 
 - /mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_train/active/images/kaza_v0_onecat
val: /mnt/space/space-home/wild-me/kaza/exp_dirs/kaza_v0_val/active/images/kaza_v0_onecat

# Classes
names:
  0 : baboon
  1 : bird
  2 : buffalo
  3 : bushpig
  4 : canoe
  5 : car
  6 : crocodile
  7 : duiker
  8 : ec3
  9 : ec4
  10 : eland
  11 : elephant
  12 : elephant bull
  13 : gazelle_thomsons
  14 : giraffe
  15 : hartebeest
  16 : hippo
  17 : hyena
  18 : impala
  19 : kob
  20 : kudu
  21 : lion
  22 : ostrich
  23 : puku
  24 : red lechwe
  25 : reedbuck
  26 : roan
  27 : roof_mabati
  28 : sable
  29 : sheep
  30 : sitatunga
  31 : steenbok
  32 : topi
  33 : unknown animal
  34 : unknown antelope
  35 : unknown carcass
  36 : unknown mammal
  37 : unknown_carcas
  38 : unknown_carcass
  39 : vervet monkey
  40 : warthog
  41 : waterbuck
  42 : white_bones
  43 : wild dog
  44 : wildebeest
  45 : zebra


```