In [1]:
# if local use new_torch_env

import os

import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter

import cv2
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from xml.etree import ElementTree, ElementInclude

import pickle

from detectron2.structures import BoxMode


In [2]:

np.random.seed(42) # see if this is the culprit.

def get_classes(img_dir):
    """Creates a list of classes and corrosponding ints. also a dict to translate"""

    obj_name = []

    # Get all objects that have been annotated
    for filename in os.listdir(img_dir):
        if filename.split('.')[1] == 'xml':
            box_path = os.path.join(img_dir, filename)

            tree = ElementTree.parse(box_path)
            lst_obj = tree.findall('object')

            for j in lst_obj:
                obj_name.append(j.find('name').text)
    
    classes = list(sorted(set(obj_name))) # all labesl
    classes_int = list(np.arange(0,len(classes))) # corrospoding int
    class_to_int = dict(zip(classes,classes_int)) # a dict to translate between them

    return(classes, classes_int, class_to_int)


def get_img_path(img_dir):

    """Creates a list of all image paths."""

    # right now this does not take into account whether the image was anotated or not.
    # It also does not handle test or train.

    img_path_list = []

    for root, dirs, files in os.walk(img_dir):
        for img_name in files:
            if img_name.split('.')[1] == 'jpg':
                img_path = os.path.join(img_dir, img_name)                
                img_path_list.append(img_path)

    return(img_path_list)


def get_annotation_path(img_dir):

    """Creates a list of all box paths."""

    annotation_list = []

    for filename in os.listdir(img_dir):
        if filename.split('.')[1] == 'xml':
            annotation_list.append(filename)

    return(annotation_list)

def get_train_test(annotation_list, train_ratio = 0.8):

    train_n = int(len(annotation_list) * train_ratio)
    train_set = np.random.choice(annotation_list, train_n, replace = False)
    test_set = [i for i in annotation_list if i not in train_set]

    return(train_set, test_set)


def get_img_dicts(img_dir, train = True):

    _, _, class_to_int = get_classes(img_dir) # only need the dict here.
    annotation_list = get_annotation_path(img_dir) # new
    train_set, test_set = get_train_test(annotation_list) 

    dataset_dicts = []
    idx = 0

    # if you just want a list to go through, you cna generalizr the function below (get_img_path)... 
    # and if you had that function splitting into train and test would be simple.

    if train == True:
        subset = train_set
    
    elif train == False:
        subset = test_set

    for filename in subset:

    # for filename in os.listdir(img_dir):
    #    if filename.split('.')[1] == 'xml': # only for annotated images. filename is now effectively annotationes.

        img_name = filename.split('.')[0] + '.jpg' # the image name w/ correct extension.
        
        record = {}
        img_path = os.path.join(img_dir, img_name)

        height, width = cv2.imread(img_path).shape[:2]

        record["file_name"] = img_path #  needs to be the full path to the image file acccording to docs.
        record["image_id"] = idx
        record["height"] = height
        record["width"] = width

        objs = []
        obj_path = os.path.join(img_dir, filename)
        tree = ElementTree.parse(obj_path)

        annotations = tree.findall('object')

        for i in annotations: # go through all annotated objs in a given image

            label = i.find('name').text # get the label
            box = i.findall('bndbox') # find the box

            for j in box: # get the 4 measures from the box

                xmin = float(j.find('xmin').text) 
                xmax = float(j.find('xmax').text) 
                ymin = float(j.find('ymin').text)
                ymax = float(j.find('ymax').text) 

            obj = { 'bbox': [xmin, ymin, xmax, ymax],
                    'bbox_mode': BoxMode.XYXY_ABS, # remember to change!
                    'category_id': class_to_int[label],
                    'catagory_label': label,
                    'iscrowd' : 0}

            objs.append(obj)

        record["annotations"] = objs

        dataset_dicts.append(record)
        idx += 1
        print(idx, end="\r")
  
    return(dataset_dicts)

In [3]:
img_dir = '/home/simon/Documents/Bodies/data/jeppe/images' #'/home/projects/ku_00017/data/raw/bodies/OD_images_annotated' 

train_img_dicts = get_img_dicts(img_dir, train = True)
val_img_dicts = get_img_dicts(img_dir, train = False)

classes = get_classes(img_dir)

202

In [4]:
def get_count_df(img_dicts):

    list_of_counts = []

    for i in img_dicts[0:3]:

        list_of_annotations = []
        for j in i['annotations']:

            list_of_annotations.append(j['catagory_label'])

        dict_of_counts_a = {'image_id' : i['image_id'], 'file_name': i['file_name']}

        dict_of_counts_b = dict(Counter(list_of_annotations))

        dict_of_counts = {**dict_of_counts_a, **dict_of_counts_b} # merge dicts

        list_of_counts.append(dict_of_counts)

    count_df = pd.DataFrame(list_of_counts)
    count_df.fillna(0, inplace = True)

    # make all values ints, when applicable
    for i in count_df.columns:
        try: count_df[[i]] = count_df[[i]].astype(int)
        except: pass

    return(count_df)

In [5]:
train_df = get_count_df(train_img_dicts)
train_df

Unnamed: 0,image_id,file_name,person,male,adult,uniformed,hostage,child,blooded_area,casualty,female
0,0,/home/simon/Documents/Bodies/data/jeppe/images...,2,2,2,1,1,0,0,0,0
1,1,/home/simon/Documents/Bodies/data/jeppe/images...,2,2,2,0,0,0,0,0,0
2,2,/home/simon/Documents/Bodies/data/jeppe/images...,1,0,0,0,0,1,1,1,1


In [6]:
# N objects
train_df.sum(axis=0)[2:] # ignore image_id and file name

person          5
male            4
adult           4
uniformed       1
hostage         1
child           1
blooded_area    1
casualty        1
female          1
dtype: object

In [7]:
# N images containing objects
train_df.astype(bool).astype(int).sum(axis=0)[2:] # ignore image_id and file name

person          3
male            2
adult           2
uniformed       1
hostage         1
child           1
blooded_area    1
casualty        1
female          1
dtype: int64

In [72]:
list_of_counts = []

for i in train_img_dicts[0:3]:

    list_of_annotations = []
    for j in i['annotations']:

        list_of_annotations.append(j['catagory_label'])

    dict_of_counts_a = {'image_id' : i['image_id'], 'file_name': i['file_name']}

    dict_of_counts_b = dict(Counter(list_of_annotations))

    dict_of_counts = {**dict_of_counts_a, **dict_of_counts_b} # merge dicts

    list_of_counts.append(dict_of_counts)

print(list_of_counts)

train_df = pd.DataFrame(list_of_counts)
train_df.fillna(0, inplace = True)

# make all values ints, when applicable
for i in train_df.columns:
    try: train_df[[i]] = train_df[[i]].astype(int)
    except: pass

train_df

[{'image_id': 0, 'file_name': '/home/simon/Documents/Bodies/data/jeppe/images/JS10088.jpg', 'flag_us': 2, 'person': 5, 'adult': 5, 'male': 5, 'uniformed': 4, 'firearm': 2}, {'image_id': 1, 'file_name': '/home/simon/Documents/Bodies/data/jeppe/images/JS10743.jpg', 'person': 4, 'adult': 4, 'female': 2, 'male': 2}, {'image_id': 2, 'file_name': '/home/simon/Documents/Bodies/data/jeppe/images/JS36167.jpg', 'person': 2, 'adult': 2, 'male': 2}]


Unnamed: 0,image_id,file_name,flag_us,person,adult,male,uniformed,firearm,female
0,0,/home/simon/Documents/Bodies/data/jeppe/images...,2,5,5,5,4,2,0
1,1,/home/simon/Documents/Bodies/data/jeppe/images...,0,4,4,2,0,0,2
2,2,/home/simon/Documents/Bodies/data/jeppe/images...,0,2,2,2,0,0,0


In [13]:
train_img_dicts[0]['annotations'][0]['catagory_label']

# get classes
# Make list to be a list of image specific dicts: list_counts 

# For each image i in list images:
    # create annotaiton_list
    # get number of annotations
    # For each annotation a in image i:
        # get catagory_label
        # append to image specific annotation_list

    # img_dict = img_dict_dict(Counter(annotation_list))
    # img_dict[img] = img_id

    # list_counts.append(img_dict) 

#train_df = pd.DataFrame(list_counts)

# you want to end up w/ a datafram where columns are X_count (where X are a specific object) and rows are images.

'person'

In [19]:
test_dict = {'image_id' : [1,2,4,5], 'X1_count' : [3,5,63,2], 'X2_count' : [13,52,3,12]}
pd.DataFrame(test_dict)

Unnamed: 0,image_id,X1_count,X2_count
0,1,3,13
1,2,5,52
2,4,63,3
3,5,2,12


In [39]:
test_list0 = ['i', 'i', 'm']
test_dict0= dict(Counter(test_list0))
test_dict0['img_id'] = 0

test_list1 = ['m', 'i', 'm','a']
test_dict1= dict(Counter(test_list1))
test_dict1['img_id'] = 1


In [62]:
test_dict0a = {'img_id' : 0}
test_list0 = ['i', 'i', 'm']
test_dict0b= dict(Counter(test_list0))
test_dict0 = {**test_dict0a, **test_dict0b}

test_dict1a = {'img_id' : 1}
test_list1 = ['m', 'i', 'm','a']
test_dict1b= dict(Counter(test_list1))
test_dict1 = {**test_dict1a, **test_dict1b}


In [63]:
test_dict1

{'img_id': 1, 'm': 2, 'i': 1, 'a': 1}

In [64]:
list_0f_dicts = [test_dict0, test_dict1]

pd.DataFrame(list_0f_dicts)

Unnamed: 0,img_id,i,m,a
0,0,2,1,
1,1,1,2,1.0


In [None]:
# hist of train, val and test

In [None]:
# hist of objects in full set

In [None]:
# hist of objects in train, val and test set

In [None]:
# Hist of total objects