Creates tfrecord files for Google Object Detection API

In [None]:
import sys
import os

import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

import tensorflow as tf

from object_detection.utils import dataset_util
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
from time import time

from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset index and give every row a short code "<prefix>_<NN>",
# where <prefix> is the part of the label before the first underscore and
# <NN> counts consecutive rows (in label order) that share that prefix.
LABEL_FOLDER = './data/'
labels = pd.read_csv(os.path.join(LABEL_FOLDER, 'data.csv')).sort_values('label')
labels['code'] = [name.split('_')[0] for name in labels['label']]

numbered_codes = []
prev_name, prev_value = '', 1
for prefix in labels['code']:
    if prefix == prev_name:
        prev_value += 1
    else:
        prev_name, prev_value = prefix, 1
    numbered_codes.append('{}_{}'.format(prev_name, str(prev_value).zfill(2)))
labels['code'] = numbered_codes

# Back to the original row order, with missing cells replaced by 0.
labels = labels.sort_index().fillna(0)
labels.head()

In [None]:

# Output path templates; generate_tf_record fills '{}' with the zero-padded
# dataset index.
OUTPUT_TRAIN = './tfrecord/train_{}.record'
OUTPUT_TEST = './tfrecord/test_{}.record'

TEST_IDX = 0  # Row of `labels` loaded by the ad-hoc test cells below.

In [None]:
def set_first_frame(df, fps):
    """Re-base the ``frame`` column on the time of the first annotation.

    If the earliest ``TimeStamp`` is later than zero, every distinct timestamp
    (in ascending order) gets a consecutive frame number starting at
    ``round(first_time_in_seconds * fps)``; any existing ``frame`` column is
    replaced. Otherwise ``df`` is returned untouched.

    Args:
        df: DataFrame with a ``TimeStamp`` column (timedelta, or anything
            ``pd.to_timedelta`` accepts). A ``frame`` column is optional.
        fps: Frame rate of the video the annotations belong to.

    Returns:
        The input DataFrame, or a merged copy with a recomputed (float)
        ``frame`` column.
    """
    # total_seconds() avoids the platform-dependent astype(int) cast
    # (int is 32 bits on some platforms, which cannot hold nanoseconds).
    first_time = pd.to_timedelta(df['TimeStamp']).min().total_seconds()
    if not first_time > 0:  # also covers NaN from an empty frame
        return df

    first_frame = np.round(first_time * fps)
    frames = (df[['TimeStamp']]
              .dropna()
              .drop_duplicates()
              .sort_values('TimeStamp')
              .reset_index(drop=True))
    frames['frame'] = np.arange(len(frames)) + first_frame
    # errors='ignore' lets the function work on frames without a 'frame' column.
    return df.drop(columns='frame', errors='ignore').merge(frames, on='TimeStamp')

def change_values(df):
    """Apply project-specific corrections to the annotation data.

    Put everything that you want to change in data here. Currently relabels
    every ``'motorcycle'`` annotation as ``'vehicle'``.

    Args:
        df: Annotation DataFrame with a ``type`` column; modified in place.

    Returns:
        The same DataFrame, for chaining.
    """
    # Assign the result back instead of using a chained inplace replace, which
    # may silently operate on a temporary copy.
    df['type'] = df['type'].replace('motorcycle', 'vehicle')
    return df

def read_data(idx):
    """Load the annotations and open the video for row ``idx`` of ``labels``.

    Every distinct timestamp of the annotation sheet is numbered in ascending
    order (offset by the row's ``shift``), then ``set_first_frame`` may re-base
    that numbering using the video's frame rate.

    Returns:
        ``(df, cap)``: the annotation DataFrame with integer ``x``, ``y`` and
        ``frame`` columns, and the opened ``cv2.VideoCapture``.
    """
    annotation_path = os.path.join(LABEL_FOLDER, labels.at[idx, 'label'])
    df = pd.read_excel(annotation_path)
    df['TimeStamp'] = pd.to_timedelta(df['TimeStamp'])

    # One row per distinct timestamp, in ascending order.
    per_stamp = df.groupby('TimeStamp').size().sort_index().reset_index()
    per_stamp = per_stamp.drop(0, axis=1)
    per_stamp['frame'] = np.arange(len(per_stamp)) - labels.at[idx, 'shift']
    df = df.merge(per_stamp, on='TimeStamp')

    cap = cv2.VideoCapture(labels.loc[idx, 'video'])
    df = set_first_frame(df, cap.get(cv2.CAP_PROP_FPS))

    df = df.astype({'x': int, 'y': int, 'frame': int})
    return df, cap


In [None]:
# Build the class table: indexed by class name, with a `class_id` column.
if False:  # Automatic: one id per annotation type found in `df`
    df_classes = df['type'].value_counts().to_frame()
    df_classes.insert(0, 'class_id', range(1, 1 + len(df_classes)))
else:  # Manual
    classes_dic = {1: 'vehicle', 2: 'truck', -1: 'delete'}
    df_classes = pd.DataFrame({
        'class_id': list(classes_dic),
        'class_name': list(classes_dic.values()),
    }).set_index('class_name')
df_classes

In [None]:
def process_image(img, rows):
    """Black out every region annotated as ``'delete'``.

    Args:
        img: Image array; filled rectangles are drawn into it in place.
        rows: Annotation rows for this image, with ``type``, ``x``, ``y``,
            ``width`` and ``height`` columns.

    Returns:
        The same ``img`` object that was passed in.
    """
    masked = rows[rows['type'] == 'delete']
    for x, y, w, h in zip(masked['x'], masked['y'], masked['width'], masked['height']):
        # cv2.rectangle needs integer points; width/height may be read as floats.
        top_left = (int(x), int(y))
        bottom_right = (int(x + w), int(y + h))
        cv2.rectangle(img, top_left, bottom_right, (0, 0, 0), -1)
    return img

# Visual check of process_image on the first annotated frame of the 'Amir_01'
# dataset. Leaves idx, test_df, test_cap, frameno and test_img defined at
# module level.
if True: ### Let's test it
    idx = labels[labels.code == 'Amir_01'].index[0]
    test_df, test_cap = read_data(idx)
    frameno = test_df.frame.min()
    # Seek to the first annotated frame and read it.
    test_cap.set(cv2.CAP_PROP_POS_FRAMES, frameno)
    # NOTE(review): `ret` is never checked; if the read fails test_img is
    # presumably None and the next line raises -- confirm.
    ret, test_img = test_cap.read()
    # Convert BGR -> RGB before showing the frame with matplotlib.
    test_img = cv2.cvtColor(test_img, cv2.COLOR_BGR2RGB)
    plt.imshow(test_img)
    plt.show()
    # process_image draws into the array it is given and returns that array.
    test_img = process_image(test_img, test_df[test_df.frame == frameno])
    plt.imshow(test_img)
    plt.show()



In [None]:
def create_tf_example(filename, img, frameno, df, debug=False):
    """Build a ``tf.train.Example`` for one video frame.

    Regions annotated as ``'delete'`` are blacked out in the image and are not
    exported as objects. The frame is stored JPEG-encoded, and box coordinates
    are normalised by the image width/height.

    Args:
        filename: Path of the source video (str).
        img: Frame as an array in the channel order ``cv2.imencode`` expects;
            it is modified in place by ``process_image``.
        frameno: Frame number used to select rows from ``df``.
        df: Annotations with ``frame``, ``type``, ``x``, ``y``, ``width`` and
            ``height`` columns.
        debug: Print the extracted values when True.

    Returns:
        The populated ``tf.train.Example``.

    Raises:
        ValueError: If the frame cannot be JPEG-encoded.
        KeyError: If an annotation type is missing from ``df_classes``.
    """
    rows = df[df.frame == frameno]
    img = process_image(img, rows)
    height, width = img.shape[0], img.shape[1]

    # 'delete' rows only mark areas to mask; exporting them would add boxes
    # with the invalid class id -1.
    objects = rows[rows['type'] != 'delete']

    xmins = (objects['x'] / width).tolist()
    xmaxs = ((objects['x'] + objects['width']) / width).tolist()
    ymins = (objects['y'] / height).tolist()
    ymaxs = ((objects['y'] + objects['height']) / height).tolist()

    classes_text = objects['type'].tolist()
    classes = df_classes.loc[classes_text, 'class_id'].tolist()

    is_success, im_buf_arr = cv2.imencode(".jpg", img)
    if not is_success:
        raise ValueError('Could not JPEG-encode frame {} of {}'.format(frameno, filename))
    encoded_image_data = im_buf_arr.tobytes()

    # The payload is always JPEG, whatever the extension of the source video.
    image_format = b'jpeg'
    # Every frame of a video shares `filename`, so add the frame number to
    # keep source_id unique per image.
    source_id = '{}_{}'.format(filename, frameno).encode('utf-8')
    # Change strings into bytes
    filename = filename.encode('utf-8')
    classes_text = [x.encode('utf-8') for x in classes_text]

    if debug: #For debugging
        print ('W, H:', width, height)
        print ('xmins:', xmins)
        print ('xmaxs:', xmaxs)
        print ('ymins:', ymins)
        print ('ymaxs:', ymaxs)
        print ('classes_text:', classes_text)
        print ('classes:', classes)
        print ('Len of encoded data:', len(encoded_image_data))
        print ('Image format:', image_format)

    tf_label_and_data = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(source_id),
        'image/encoded': dataset_util.bytes_feature(encoded_image_data),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }))

    return tf_label_and_data

In [None]:
# Load the annotations and open the video for dataset row TEST_IDX.
test_df, test_cap = read_data(TEST_IDX)
test_df.head()

In [None]:
# Disabled smoke test: build one tf.train.Example (with debug output) from the
# first annotated frame of dataset TEST_IDX.
if False: ### Let's test it
    test_filename = labels.loc[TEST_IDX, 'video']
    frameno = test_df.frame.min()
    test_cap.set(cv2.CAP_PROP_POS_FRAMES, frameno)
    ret, test_img = test_cap.read()
    # NOTE(review): create_tf_example encodes with cv2.imencode, which
    # presumably expects BGR; this RGB conversion may swap channels -- confirm.
    test_img = cv2.cvtColor(test_img, cv2.COLOR_BGR2RGB)
    tld = create_tf_example(test_filename, test_img, frameno, test_df, debug=True)

In [None]:
# Hold out three datasets for testing; every other dataset is used for
# training. reset_index() keeps the original row label in an 'index' column.
test_codes = ['Amir_01', 'Qinglian_01', 'Binya_01']
is_test = labels['code'].isin(test_codes)
labels_test = labels.loc[is_test].copy().reset_index()
labels_train = labels.loc[~is_test].copy().reset_index()

print ('Labels test has {} datasets: {}'.format(
    len(labels_test), labels_test['code'].unique()))
print ('Labels train has {} datasets: {}'.format(
    len(labels_train), labels_train['code'].unique()))

In [None]:
def generate_tf_record(labels, idx, output_path):
    """Write one TFRecord file holding every annotated frame of a dataset.

    Args:
        labels: Dataset table (possibly a ``reset_index()`` subset of the
            module-level ``labels``) with ``video`` and ``code`` columns.
        idx: Row label of the dataset inside ``labels``.
        output_path: Path template; ``'{}'`` is replaced by the zero-padded
            ``idx``.

    Frames between the first and last annotated frame that carry no
    annotation are skipped with a warning. The video capture returned by
    ``read_data`` is released before returning.
    """
    t = time()

    # read_data looks rows up in the module-level `labels` table. A subset made
    # with reset_index() keeps the original row label in its 'index' column,
    # so map back to it; otherwise idx would address an unrelated dataset.
    source_idx = labels.at[idx, 'index'] if 'index' in labels.columns else idx
    df, cap = read_data(source_idx)
    output_path = output_path.format(str(idx).zfill(2))

    filename = labels.loc[idx, 'video']

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # `x in series` tests the index, not the values, so use a set of values.
    annotated_frames = set(df.frame)

    i = 0
    frameno, frameno_max = df.frame.min(), df.frame.max()
    # Both ends are inclusive; +1 also avoids dividing by zero for one frame.
    nr_frames = frameno_max - frameno + 1
    cap.set(cv2.CAP_PROP_POS_FRAMES, frameno)
    try:
        with tf.compat.v1.python_io.TFRecordWriter(output_path) as writer:
            while frameno <= frameno_max:
                if i%20 == 0:
                    sys.stdout.write('{:.1f}% ({}/{}) images processed in {:.1f} seconds           \r'
                                    .format(100 * i / nr_frames, i, nr_frames, time() - t))
                i += 1
                ret, img = cap.read()
                if not ret:
                    print ('Warning: Could not read frame: {}'.format(frameno))
                    break

                if frameno not in annotated_frames:
                    print ('Warning: No vehicles for frame: {}'.format(frameno))
                    frameno += 1
                    continue

                tf_example = create_tf_example(filename, img, frameno, df, debug=False)
                writer.write(tf_example.SerializeToString())
                frameno += 1
    finally:
        cap.release()

    print('{}/{} ({}) done. {} images has been processed in {:.1f} seconds and written to {}                              '
          .format(
              idx+1, len(labels), labels.at[idx, 'code'],
              nr_frames, time() - t, output_path))
    

In [None]:
# Write one TFRecord file per dataset: the training datasets first, then the
# test datasets.
for subset, path_template in ((labels_train, OUTPUT_TRAIN), (labels_test, OUTPUT_TEST)):
    for idx in subset.index:
        generate_tf_record(subset, idx, path_template)


In [None]:
labels

In [None]:
def create_file_dic(height, width, img_format, xmin, xmax, ymin, ymax, label, text, image = None):
    """Bundle the fields of one loaded record into a dict.

    The ``'image'`` key is only present when ``image`` is not None.
    """
    record = {
        'height': height,
        'width': width,
        'format': img_format,
        'xmin': xmin,
        'xmax': xmax,
        'ymin': ymin,
        'ymax': ymax,
        'label': label,
        'text': text,
    }
    if image is None:
        return record
    record['image'] = image
    return record

def load_file_data(tfrecord_path, load_images = True, max_files = 6):
    """Read records back from a TFRecord file written by this notebook.

    Args:
        tfrecord_path: Path of the TFRecord file.
        load_images: Also decode the image of every record when True.
        max_files: Stop after this many records; ``None`` reads the whole
            file. The default keeps the previous hard-coded limit of 6.

    Returns:
        Dict mapping each record's ``image/filename`` to the dict built by
        ``create_file_dic``. Records sharing a filename overwrite each other.

    Raises:
        ValueError: If a record's image format is not png, jpg or jpeg.
    """
    tf1 = tf.compat.v1
    features = {'image/filename' : tf1.FixedLenFeature([], tf.string),
                'image/height' : tf1.FixedLenFeature([], tf.int64),
                'image/width' : tf1.FixedLenFeature([], tf.int64),
                'image/format' : tf1.FixedLenFeature([], tf.string),
                'image/encoded' : tf1.FixedLenFeature([], tf.string),
                'image/object/bbox/xmin': tf1.VarLenFeature(dtype=tf.float32),
                'image/object/bbox/xmax': tf1.VarLenFeature(dtype=tf.float32),
                'image/object/bbox/ymin': tf1.VarLenFeature(dtype=tf.float32),
                'image/object/bbox/ymax': tf1.VarLenFeature(dtype=tf.float32),
                'image/object/class/label': tf1.VarLenFeature(dtype=tf.int64),
                'image/object/class/text': tf1.VarLenFeature(dtype=tf.string),
               }

    loaded_files = {}

    # Build the parsing graph once and feed each record through a placeholder,
    # instead of adding new ops to the graph for every record.
    with tf1.Graph().as_default(), tf1.Session() as sess:
        serialized = tf1.placeholder(tf.string, shape=[])
        example = tf1.parse_single_example(serialized, features=features)
        metadata = [
            example['image/filename'],
            example['image/height'],
            example['image/width'],
            example['image/format'],
            example['image/object/bbox/xmin'],
            example['image/object/bbox/xmax'],
            example['image/object/bbox/ymin'],
            example['image/object/bbox/ymax'],
            example['image/object/class/label'],
            example['image/object/class/text'],
        ]
        decode_png = tf.image.decode_png(example['image/encoded'])
        decode_jpeg = tf.image.decode_jpeg(example['image/encoded'])
        decoders = {'png': decode_png, 'jpg': decode_jpeg, 'jpeg': decode_jpeg}

        records = tf1.python_io.tf_record_iterator(tfrecord_path)
        for i, s_example in enumerate(records, start=1):
            feed = {serialized: s_example}
            (filename, height, width, img_format,
             xmin, xmax, ymin, ymax, label, text) = sess.run(metadata, feed_dict=feed)

            filename = filename.decode("utf-8")
            img_format = img_format.decode("utf-8")

            # Sparse results: only the `.values` arrays are of interest.
            text = [x.decode('utf-8') for x in text.values]
            label = label.values
            xmin = xmin.values
            xmax = xmax.values
            ymin = ymin.values
            ymax = ymax.values

            image = None
            if load_images:
                if img_format not in decoders:
                    raise ValueError('Unknown Image Format:' + img_format)
                image = sess.run(decoders[img_format], feed_dict=feed)

            loaded_files[filename] = create_file_dic(height, width, img_format,
                xmin, xmax, ymin, ymax,
                label, text, image)

            sys.stdout.write('{} files processed.     \r'.format(i))
            if max_files is not None and i >= max_files:
                break

    # Always return the dict, also when the file holds fewer than max_files.
    return loaded_files

In [None]:
# OUTPUT_TEST is a path template; load the first test file ("00").
fd = load_file_data(OUTPUT_TEST.format('00'))

In [None]:
fd['vid3/frameAnnotations-vid_cmp2.avi_annotations/stop_1323819291.avi_image14.png']

In [None]:
# Show a decoded image.
# NOTE(review): `image` is not defined at module level anywhere in the visible
# notebook (it is a local of load_file_data) -- this cell looks stale; confirm.
import matplotlib.pyplot as plt
plt.imshow(image)
plt.show()

In [None]:
# Parse the first serialized example of the first test file ("00").
# OUTPUT_TEST is a path template, so it has to be formatted first; the
# tf.compat.v1 namespace matches the writer used in generate_tf_record.
record_iterator = tf.compat.v1.python_io.tf_record_iterator(path=OUTPUT_TEST.format('00'))

for string_record in record_iterator:

    example = tf.train.Example()
    example.ParseFromString(string_record)
    break


In [None]:
example

In [None]:
string_record

In [None]:
example.features.feature['image/object/bbox/xmin']

# Old code, may be useful for creating an empty pascal file

In [None]:
import xml.etree.ElementTree as ET

In [None]:
def createXML(filename, folder = './img', imgsize = [640, 480, 3]):
    """Fill the ``empty.xml`` template with file and image-size details.

    Args:
        filename: Image file name, written to ``<filename>``.
        folder: Folder name, written to ``<folder>`` and used as the prefix
            of ``<path>``.
        imgsize: ``[width, height, depth]`` written to ``<size>``.

    Returns:
        The parsed and updated ``ElementTree``.
    """
    tree = ET.parse("empty.xml")
    root = tree.getroot()

    top_level = (
        ('folder', folder),
        ('filename', filename),
        ('path', '{}/{}'.format(folder, filename)),
    )
    for tag, value in top_level:
        root.find(tag).text = value

    size = root.find('size')
    dimensions = (imgsize[0], imgsize[1], imgsize[2])
    for tag, value in zip(('width', 'height', 'depth'), dimensions):
        size.find(tag).text = str(value)

    return tree
# Build an annotation for a sample file name and dump its XML to stdout.
annotation = createXML('myfile.jpg')
ET.dump(annotation)

In [None]:
# Dump the annotation XML for the first file name found in df.Filename,
# splitting each name into its folder part and its base name.
files = df.Filename.unique()
for filename in files[:1]:
    folder, _, file = filename.rpartition('/')
    myXML = createXML(file, folder)
    ET.dump(myXML)

In [None]:
# Print the type and <name> text of every <object> element under `root`.
# NOTE(review): `root` is not defined at module level anywhere in the visible
# notebook -- this cell looks stale; confirm.
for obj in root.findall('object'):
    print (type(obj))
    name = obj.find("name").text
    print (name)

In [None]:
from xml.etree import ElementTree
from xml.dom import minidom

def prettify(elem):
    """Serialize an Element to an indented (two spaces per level) XML string."""
    serialized = ElementTree.tostring(elem, 'utf-8')
    return minidom.parseString(serialized).toprettyxml(indent="  ")

def createXML(filename, folder = './img', imgsize = [640, 480, 3]):
    """Build an ``<annotation>`` element from scratch (no template file).

    Args:
        filename: Image file name, written to ``<filename>``.
        folder: Folder name, written to ``<folder>`` and used as the prefix
            of ``<path>``.
        imgsize: ``[width, height, depth]`` written to ``<size>``.

    Returns:
        The root ``<annotation>`` Element.
    """
    # <annotation>
    annot = ET.Element('annotation')
    annot.set('version', '1.0')
    #     <folder></folder>
    fol = ET.SubElement(annot, 'folder')
    fol.text = folder
    #     <filename></filename>
    fn = ET.SubElement(annot, 'filename')
    # Use the argument; this used to be the literal string 'Filename'.
    fn.text = filename
    #     <path></path>
    path = ET.SubElement(annot, 'path')
    # Join with a separator; plain concatenation gave './imgmyfile.jpg'.
    path.text = '{}/{}'.format(folder, filename)
    #     <source>
    #         <database></database>
    #     </source>
    source = ET.SubElement(annot, 'source')
    database = ET.SubElement(source, 'database')
    database.text = 'Unknown'
    #     <size>
    #         <width></width>
    #         <height></height>
    #         <depth></depth>
    #     </size>
    size = ET.SubElement(annot, 'size')
    width = ET.SubElement(size, 'width')
    height = ET.SubElement(size, 'height')
    depth = ET.SubElement(size, 'depth')
    width.text = '{}'.format(imgsize[0])
    height.text = '{}'.format(imgsize[1])
    depth.text = '{}'.format(imgsize[2])
    #     <segmented></segmented>
    segmented = ET.SubElement(annot, 'segmented')
    segmented.text = '0'
    return annot

# Build an annotation for a sample file name and print it as indented XML.
annotation = createXML('myfile.jpg')
#ET.dump(annotation)
print (prettify(annotation))
#annotation.write('test.xml')

In [None]:

<annotation>
     <folder></folder>
     <filename></filename>
     <path></path>
     <source>
         <database></database>
     </source>
     <size>
         <width></width>
         <height></height>
         <depth></depth>
     </size>
     <segmented></segmented>
</annotation>


In [None]:
ET.dump(tree)

In [None]:
import six.moves.urllib as urllib
import tarfile
import os
# What model to download.
MODEL_NAME = 'ssd_mobilenet_v1_coco_2017_11_17'
MODEL_FILE = MODEL_NAME + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'

# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'

# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')

NUM_CLASSES = 90

# Download the model archive into the working directory. urlretrieve replaces
# the deprecated URLopener class, which recent Python versions no longer ship.
urllib.request.urlretrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)

# Extract only the frozen inference graph; the `with` block closes the archive.
with tarfile.open(MODEL_FILE) as tar_file:
    for member in tar_file.getmembers():
        if 'frozen_inference_graph.pb' in os.path.basename(member.name):
            tar_file.extract(member, os.getcwd())