In [1]:
# if local use new_torch_env

import os

import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter

import cv2
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from xml.etree import ElementTree, ElementInclude

import pickle

from detectron2.structures import BoxMode


In [2]:

np.random.seed(42) # see if this is the culprit.

def get_classes(img_dir):
    """Creates a list of classes and corrosponding ints. also a dict to translate"""

    obj_name = []

    # Get all objects that have been annotated
    for filename in os.listdir(img_dir):
        if filename.split('.')[1] == 'xml':
            box_path = os.path.join(img_dir, filename)

            tree = ElementTree.parse(box_path)
            lst_obj = tree.findall('object')

            for j in lst_obj:
                obj_name.append(j.find('name').text)
    
    classes = list(sorted(set(obj_name))) # all labesl
    classes_int = list(np.arange(0,len(classes))) # corrospoding int
    class_to_int = dict(zip(classes,classes_int)) # a dict to translate between them

    return(classes, classes_int, class_to_int)


def get_img_path(img_dir):

    """Creates a list of all image paths."""

    # right now this does not take into account whether the image was anotated or not.
    # It also does not handle test or train.

    img_path_list = []

    for root, dirs, files in os.walk(img_dir):
        for img_name in files:
            if img_name.split('.')[1] == 'jpg':
                img_path = os.path.join(img_dir, img_name)                
                img_path_list.append(img_path)

    return(img_path_list)


def get_annotation_path(img_dir):

    """Creates a list of all box paths."""

    annotation_list = []

    for filename in os.listdir(img_dir):
        if filename.split('.')[1] == 'xml':
            annotation_list.append(filename)

    return(annotation_list)

def get_train_test(annotation_list, train_ratio = 0.8):

    train_n = int(len(annotation_list) * train_ratio)
    train_set = np.random.choice(annotation_list, train_n, replace = False)
    test_set = [i for i in annotation_list if i not in train_set]

    return(train_set, test_set)


def get_img_dicts(img_dir, train = True):

    _, _, class_to_int = get_classes(img_dir) # only need the dict here.
    annotation_list = get_annotation_path(img_dir) # new
    train_set, test_set = get_train_test(annotation_list) 

    dataset_dicts = []
    idx = 0

    # if you just want a list to go through, you cna generalizr the function below (get_img_path)... 
    # and if you had that function splitting into train and test would be simple.

    if train == True:
        subset = train_set
    
    elif train == False:
        subset = test_set

    # here you need to think about val (you use test right now) and also the full set...

    for filename in subset:

    # for filename in os.listdir(img_dir):
    #    if filename.split('.')[1] == 'xml': # only for annotated images. filename is now effectively annotationes.

        img_name = filename.split('.')[0] + '.jpg' # the image name w/ correct extension.
        
        record = {}
        img_path = os.path.join(img_dir, img_name)

        height, width = cv2.imread(img_path).shape[:2]

        record["file_name"] = img_path #  needs to be the full path to the image file acccording to docs.
        record["image_id"] = idx
        record["height"] = height
        record["width"] = width

        objs = []
        obj_path = os.path.join(img_dir, filename)
        tree = ElementTree.parse(obj_path)

        annotations = tree.findall('object')

        for i in annotations: # go through all annotated objs in a given image

            label = i.find('name').text # get the label
            box = i.findall('bndbox') # find the box

            for j in box: # get the 4 measures from the box

                xmin = float(j.find('xmin').text) 
                xmax = float(j.find('xmax').text) 
                ymin = float(j.find('ymin').text)
                ymax = float(j.find('ymax').text) 

            obj = { 'bbox': [xmin, ymin, xmax, ymax],
                    'bbox_mode': BoxMode.XYXY_ABS, # remember to change!
                    'category_id': class_to_int[label],
                    'catagory_label': label,
                    'iscrowd' : 0}

            objs.append(obj)

        record["annotations"] = objs

        dataset_dicts.append(record)
        idx += 1
        print(idx, end="\r")
  
    return(dataset_dicts)

In [3]:
img_dir = '/home/simon/Documents/Bodies/data/jeppe/images' #'/home/projects/ku_00017/data/raw/bodies/OD_images_annotated' 

train_img_dicts = get_img_dicts(img_dir, train = True)
val_img_dicts = get_img_dicts(img_dir, train = False)

# classes = get_classes(img_dir)

202

In [4]:
def get_count_df(img_dicts):

    list_of_counts = []

    for i in img_dicts:

        list_of_annotations = []
        for j in i['annotations']:

            list_of_annotations.append(j['catagory_label'])

        dict_of_counts_a = {'image_id' : i['image_id'], 'file_name': i['file_name']}

        dict_of_counts_b = dict(Counter(list_of_annotations))

        dict_of_counts = {**dict_of_counts_a, **dict_of_counts_b} # merge dicts

        list_of_counts.append(dict_of_counts)

    count_df = pd.DataFrame(list_of_counts)
    count_df.fillna(0, inplace = True)

    # make all values ints, when applicable
    for i in count_df.columns:
        try: count_df[[i]] = count_df[[i]].astype(int)
        except: pass

    return(count_df)

In [5]:
train_df = get_count_df(train_img_dicts)
val_df = get_count_df(val_img_dicts)

In [6]:
# N objects
train_df.sum(axis=0)[2:] # ignore image_id and file name

person                      1953
male                        1633
adult                       1503
uniformed                    440
hostage                       37
child                        207
blooded_area                  28
casualty                      23
female                       274
elderly                       66
religious_garment_female     158
firearm                      185
youth                        151
flag_iraqi                    56
infant                        10
flag_us                       42
military_vehicle              62
prayer_salah                   6
prayer_informal                2
dtype: object

In [7]:
# N images containing objects
train_df.astype(bool).astype(int).sum(axis=0)[2:] # ignore image_id and file name

person                      739
male                        669
adult                       656
uniformed                   234
hostage                      21
child                       121
blooded_area                 19
casualty                     22
female                      169
elderly                      42
religious_garment_female    104
firearm                     107
youth                        87
flag_iraqi                   39
infant                       10
flag_us                      37
military_vehicle             54
prayer_salah                  2
prayer_informal               2
dtype: int64

In [8]:
# N objects
val_df.sum(axis=0)[2:] # ignore image_id and file name

person                      468
child                        44
female                       81
casualty                      6
adult                       369
male                        375
elderly                      19
flag_iraqi                   20
uniformed                   114
firearm                      50
youth                        33
military_vehicle             30
religious_garment_female     39
flag_us                      11
blooded_area                  4
infant                        1
hostage                       2
dtype: object

In [9]:
# N images containing objects
val_df.astype(bool).astype(int).sum(axis=0)[2:] # ignore image_id and file name

person                      190
child                        24
female                       54
casualty                      6
adult                       176
male                        161
elderly                      13
flag_iraqi                   14
uniformed                    63
firearm                      25
youth                        18
military_vehicle             20
religious_garment_female     30
flag_us                      10
blooded_area                  4
infant                        1
hostage                       2
dtype: int64

In [10]:
# hist of train, val and test

In [11]:
# hist of objects in full set

In [12]:
# hist of objects in train, val and test set

In [13]:
# Hist of total objects