In [2]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import cv2
import warnings
warnings.filterwarnings("ignore")
from mpl_toolkits.axes_grid1 import ImageGrid
from PIL import Image
import glob

# Preprocess

- The intent is to generate full-image-level COCO JSON files for the train and test sets. These files will be used by the slicing script, which in turn generates a tile-level COCO file.

- Working with two dataframes - dfa and dfi. The fields of these dataframes correspond to the 'annotations' and 'images' fields in the COCO files.

### Point to data files processed from notebook 1.

In [4]:
# Root folder of the exported dataset. The default is the original workstation path;
# set the KAZA_DATA_DIR environment variable to run the notebook on another machine.
data_dir = os.environ.get('KAZA_DATA_DIR', '/media/l3404/Mate/kaza_files/kaza_export_v0')
# Folder that the image file names in dfi['file_name'] are resolved against.
img_dir = os.path.join(data_dir, 'exports')

In [5]:
# Load the train/test split tables and the full annotation-level (dfa) and
# image-level (dfi) tables from data_dir, in that order.
csv_names = ('kaza_train_v0.csv', 'kaza_test_v0.csv', 'dfa_kaza_v0.csv', 'dfi_kaza_v0.csv')
df_train, df_test, dfa, dfi = (
    pd.read_csv(os.path.join(data_dir, csv_name)) for csv_name in csv_names
)

### Generate metadata fields

In [6]:
# Resolve every image file name to an absolute path inside img_dir
dfi['file_path'] = [os.path.abspath(os.path.join(img_dir, name)) for name in dfi['file_name']]

# Count how many of the resolved paths exist on disk
dfi['file_path'].map(os.path.exists).value_counts()

True    5325
Name: file_path, dtype: int64

In [7]:
# The CSV stores each bbox (a list of coordinates) as a string, so it has to be
# deserialized back into a list. Values that are not strings (i.e. already parsed)
# are left untouched, which keeps this cell safe to re-run.

dfa['bbox'] = dfa['bbox'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

### Encode annot-level and image-level 'ids' into yolo-compatible ids (from 0 to N)

- COCO datasets in general rely on indexing on two levels - annotations and images. Thus far composite IDs have been used to avoid collisions, but they must be converted to a uniform 0 to N range.

In [8]:
# Inspect the raw annotation IDs.
# Need to convert these annot IDs to 0, 1, 2, 3, ... N

dfa['id'].unique()

array(['SH06_1', 'SH06_2', 'SH06_3', ..., 'SH09_83', 'SH09_84', 'SH09_85'],
      dtype=object)

In [9]:
# Same for image IDs: inspect the raw values, which also need converting to 0, 1, 2, 3, ... N

dfi['id'].unique()

array(['SH06_1', 'SH06_2', 'SH06_3', ..., 'SH09_26', 'SH09_27', 'SH09_28'],
      dtype=object)

In [10]:
# Encode the annotation IDs as integers with a label encoder fitted on the full set.
# The fitted encoder stays bound to `le`; the next cell uses it for the train/test annotation IDs.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(dfa['id'].values)
dfa = dfa.assign(id=le.transform(dfa['id'])).sort_values(by=['id'])
# Both numbers should match, i.e. every annotation keeps a unique ID
dfa['id'].nunique(), len(dfa)

(15480, 15480)

In [11]:
# Convert the annotation IDs of the train and test splits.
# NOTE: `le` must still be the encoder fitted on dfa['id'] when this cell runs.
# Rows without an annotation ID (missing id_x) are skipped via dropna().

train_ids = le.transform(df_train['id_x'].dropna().unique())
print(len(train_ids), df_train['id_x'].notna().sum())

test_ids = le.transform(df_test['id_x'].dropna().unique())
print(len(test_ids), df_test['id_x'].notna().sum())

12299 12299
3181 3181


In [12]:
# Every image_id referenced by an annotation must exist among the image IDs
assert dfa['image_id'].isin(dfi['id']).all()

In [13]:
# Number of images that carry annotations vs. all images (empty + annotated).
# More empty images are not really needed since they get subsampled anyway.

n_annotated_images = dfa['image_id'].nunique()
n_all_images = dfi['id'].nunique()
print('Annotated images: ', n_annotated_images)
print('All images: ', n_all_images)

Annotated images:  2483
All images:  5325


In [14]:
# Encode the image IDs as integers with a label encoder fitted on the full set.
# NOTE: this rebinds `le`; from here on it is the image-ID encoder, not the annotation one.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(dfi['id'].values)
dfi = dfi.assign(id=le.transform(dfi['id'])).sort_values(by=['id'])
# Both numbers should match, i.e. every image keeps a unique ID
dfi['id'].nunique(), len(dfi)

(5325, 5325)

In [15]:
# Encode the annotations' image references with `le` so they line up with dfi['id'].
# NOTE: relies on `le` being the encoder fitted on dfi['id'] in the previous cell.
dfa['image_id'] = le.transform(dfa['image_id'].values)

In [16]:
# Convert the image IDs of the train and test splits with the image-ID encoder.
# Each print shows the encoded count next to the split's own unique count.

train_imgids = le.transform(df_train['id_y'].unique())
print(len(train_imgids), df_train['id_y'].nunique())

test_imgids = le.transform(df_test['id_y'].unique())
print(len(test_imgids), df_test['id_y'].nunique())

4260 4260
1065 1065


In [17]:
# Check the image to annot mapping is valid: right-joining the annotations onto the
# images must give as many rows as the train and test tables hold together.

n_rows_merged = len(dfa.merge(dfi, left_on='image_id', right_on='id', how='right'))
n_rows_splits = len(df_train) + len(df_test)
assert n_rows_merged == n_rows_splits

In [18]:
# Show the shape of the annotation/image right-join next to the shape of train + test combined
dfa.merge(dfi, left_on='image_id', right_on='id', how='right').shape, pd.concat([df_train, df_test]).shape  

((18322, 22), (18322, 29))

In [19]:
# Drop the 'Unnamed: 0' column (presumably an index saved with the CSV - TODO confirm).
# errors='ignore' keeps this cell re-runnable and tolerant of CSVs that lack the column.

dfa = dfa.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)
dfi = dfi.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)

### Convert species keys to category IDs

In [20]:
# Simple conversion: the integer code of each species when treated as a categorical

dfa['category_id'] = pd.Categorical(dfa['species']).codes

In [21]:
# Build the 'categories' field for the COCO file: one entry per distinct
# (species, category_id) pair, ordered by category id.

species_records = dfa[['species', 'category_id']].drop_duplicates().to_dict(orient='records')
categories = sorted(
    (
        {'id': record['category_id'], 'name': record['species'], 'supercategory': ''}
        for record in species_records
    ),
    key=lambda category: category['id'],
)
categories

[{'id': 0, 'name': 'baboon', 'supercategory': ''},
 {'id': 1, 'name': 'bird', 'supercategory': ''},
 {'id': 2, 'name': 'buffalo', 'supercategory': ''},
 {'id': 3, 'name': 'bushpig', 'supercategory': ''},
 {'id': 4, 'name': 'canoe', 'supercategory': ''},
 {'id': 5, 'name': 'car', 'supercategory': ''},
 {'id': 6, 'name': 'crocodile', 'supercategory': ''},
 {'id': 7, 'name': 'duiker', 'supercategory': ''},
 {'id': 8, 'name': 'ec3', 'supercategory': ''},
 {'id': 9, 'name': 'ec4', 'supercategory': ''},
 {'id': 10, 'name': 'eland', 'supercategory': ''},
 {'id': 11, 'name': 'elephant', 'supercategory': ''},
 {'id': 12, 'name': 'elephant bull', 'supercategory': ''},
 {'id': 13, 'name': 'gazelle_thomsons', 'supercategory': ''},
 {'id': 14, 'name': 'giraffe', 'supercategory': ''},
 {'id': 15, 'name': 'hartebeest', 'supercategory': ''},
 {'id': 16, 'name': 'hippo', 'supercategory': ''},
 {'id': 17, 'name': 'hyena', 'supercategory': ''},
 {'id': 18, 'name': 'impala', 'supercategory': ''},
 {'id': 

### Subset down to train and test sets from the processed full dataset above

- Note: This certainly violates the DRY principle.

In [22]:
## Subset the training set

# Sanity check: encoded ID counts next to the counts in the train split table
print(len(train_ids), df_train['id_x'].notna().sum())
print(len(train_imgids), df_train['id_y'].nunique())

# Keep only the annotations and images that belong to the training split
dfa_train = dfa.loc[dfa['id'].isin(train_ids)]
dfi_train = dfi.loc[dfi['id'].isin(train_imgids)]
print(dfa_train.shape, dfi_train.shape)

dfa_train, dfi_train = dfa_train.reset_index(drop=True), dfi_train.reset_index(drop=True)

# The right-join should have as many rows as the train split table
train_merged = dfa_train.merge(dfi_train, left_on='image_id', right_on='id', how='right')
print(train_merged.shape, df_train.shape)

12299 12299
4260 4260
(12299, 9) (4260, 11)
(14551, 20) (14551, 29)


In [23]:
## Subset the test set

# Sanity check: encoded ID counts next to the counts in the test split table
print(len(test_ids), df_test['id_x'].notna().sum())
print(len(test_imgids), df_test['id_y'].nunique())

# Keep only the annotations and images that belong to the test split
dfa_test = dfa.loc[dfa['id'].isin(test_ids)]
dfi_test = dfi.loc[dfi['id'].isin(test_imgids)]
print(dfa_test.shape, dfi_test.shape)

dfa_test, dfi_test = dfa_test.reset_index(drop=True), dfi_test.reset_index(drop=True)

# The right-join should have as many rows as the test split table
test_merged = dfa_test.merge(dfi_test, left_on='image_id', right_on='id', how='right')
print(test_merged.shape, df_test.shape)

3181 3181
1065 1065
(3181, 9) (1065, 11)
(3771, 20) (3771, 29)


### Generate additional fields from file name

- They are currently unused, but kept for the possibility of more detailed analysis after training


In [24]:
# Derive metadata columns from the '_' / '-' separated parts of the image file name.
# Dict order defines the order in which the columns are added.
file_name_fields = {
    'survey_code': lambda name: name.split('_')[0],
    'aircraft_registration': lambda name: name.split('_')[1].split('-')[0],
    'camera_side': lambda name: name.split('_')[1].split('-')[1],
    'flight_session': lambda name: name.split('_')[2].split('-')[0],
    'exif_timestamp': lambda name: name.split('_')[2].split('-')[1],
    'orig_file_name': lambda name: name.split('_')[3].split('.')[0],
}
for column, extract in file_name_fields.items():
    dfi_train[column] = dfi_train['file_name'].apply(extract)

# Composite 'strat' key: survey, aircraft, camera side and flight session joined by '_'
dfi_train['strat'] = dfi_train['survey_code'].str.cat(
    [dfi_train['aircraft_registration'], dfi_train['camera_side'], dfi_train['flight_session']],
    sep='_',
)


In [25]:
# Derive metadata columns from the '_' / '-' separated parts of the image file name.
# Dict order defines the order in which the columns are added.
file_name_fields = {
    'survey_code': lambda name: name.split('_')[0],
    'aircraft_registration': lambda name: name.split('_')[1].split('-')[0],
    'camera_side': lambda name: name.split('_')[1].split('-')[1],
    'flight_session': lambda name: name.split('_')[2].split('-')[0],
    'exif_timestamp': lambda name: name.split('_')[2].split('-')[1],
    'orig_file_name': lambda name: name.split('_')[3].split('.')[0],
}
for column, extract in file_name_fields.items():
    dfi_test[column] = dfi_test['file_name'].apply(extract)

# Composite 'strat' key: survey, aircraft, camera side and flight session joined by '_'
dfi_test['strat'] = dfi_test['survey_code'].str.cat(
    [dfi_test['aircraft_registration'], dfi_test['camera_side'], dfi_test['flight_session']],
    sep='_',
)


### Export image and annot (dfa and dfi) dataframes to coco format

In [25]:
def load_json(file_path):
    """Read the JSON document stored at ``file_path`` and return the parsed object."""
    with open(file_path) as handle:
        return json.load(handle)

def write_json(data, out_path):
    """Serialize ``data`` as JSON (4-space indent) and write it to ``out_path``.

    NumPy scalars and arrays found inside ``data`` are converted to their native
    Python equivalents, so records that still hold NumPy values can be exported
    instead of failing inside ``json.dumps``.

    Raises:
        TypeError: if ``data`` contains an object that is neither JSON
            serializable nor a NumPy scalar/array.
    """
    def _to_native(value):
        # Called by json only for objects it cannot serialize on its own
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')

    # Serialize before opening the file so a failure does not truncate an existing file
    json_object = json.dumps(data, indent=4, default=_to_native)
    with open(out_path, "w") as outfile:
        outfile.write(json_object)
        
def export_annos(dfa, dfi, out_path, categories=None):
    """Write annotation and image dataframes to a COCO-style JSON file.

    Args:
        dfa: dataframe whose rows become the entries of the 'annotations' field.
        dfi: dataframe whose rows become the entries of the 'images' field.
        out_path: destination path of the JSON file.
        categories: list stored under the 'categories' field; defaults to an
            empty list when omitted.

    Returns:
        ``out_path``, unchanged.
    """
    # None sentinel instead of a mutable default list shared between calls
    if categories is None:
        categories = []

    print('out_path', out_path)
    print('shapes: ', dfa.shape, dfi.shape)

    # One dict per dataframe row; 'info' and 'licenses' are written empty
    data = {
        'info': {},
        'licenses': [],
        'images': dfi.to_dict(orient='records'),
        'annotations': dfa.to_dict(orient='records'),
        'categories': categories,
    }
    write_json(data, out_path)

    return out_path

In [26]:
# Export the train and test subsets as full-image level COCO files inside data_dir
train_coco_path = export_annos(
    dfa_train,
    dfi_train,
    out_path=os.path.join(data_dir, 'kaza_train.v0.json'),
    categories=categories,
)
test_coco_path = export_annos(
    dfa_test,
    dfi_test,
    out_path=os.path.join(data_dir, 'kaza_test.v0.json'),
    categories=categories,
)

out_path /media/l3404/43b317c3-b94d-4e83-9199-c98069ecabfc/kaza_files/kaza_export_v0/kaza_train.v0.json
shapes:  (12299, 9) (4260, 18)
out_path /media/l3404/43b317c3-b94d-4e83-9199-c98069ecabfc/kaza_files/kaza_export_v0/kaza_test.v0.json
shapes:  (3181, 9) (1065, 18)
