# Test how well we can write and read TFRecord files

In [21]:
# where to save tmp files?
# NOTE: despite its name this is a directory prefix (it ends in '/'), not a file;
# later cells append sub-paths such as 'TMPTFRECORD/test.tfrecords' to it.
# The value is machine-specific and must be changed before running elsewhere.
binaries_file = '/Users/jillnaiman/Downloads/tmp/'

In [33]:
import numpy as np
from sys import path
path.append('../')
import config
import pandas as pd
from glob import glob
import tensorflow as tf
from annotation_utils import get_all_ocr_files, make_ann_directories, collect_ocr_process_results, \
   get_makesense_info_and_years, get_years, get_cross_index, get_pdffigures_info, get_annotation_name, \
   true_box_caption_mod
from post_processing_utils import parse_annotations_to_labels
from feature_generation_utils import generate_single_feature
from general_utils import parse_annotation

In [None]:
# Locate every OCR output file.
ocrFiles = get_all_ocr_files()

# Extract the per-entry quantities from those files (slow step).
print('retreiving OCR data, this can take a moment...')
(ws, paragraphs, squares,
 html, rotations, colorbars) = collect_ocr_process_results(ocrFiles)

# Assemble everything into one table, keep only the first row for each
# repeated 'ws' value, and use 'ws' as the index so rows can be looked up
# by it later (presumably the image file name -- TODO confirm).
df = (
    pd.DataFrame({
        'ws': ws,
        'paragraphs': paragraphs,
        'squares': squares,
        'hocr': html,
        'rotation': rotations,
        'colorbars': colorbars,
    })
    .drop_duplicates(subset='ws')
    .set_index('ws')
)

In [16]:
# Directory holding the annotation XML files for the configured image size.
imgDirAnn = ''.join([
    config.save_binary_dir,
    config.ann_name,
    str(int(config.IMAGE_H)), 'x', str(int(config.IMAGE_W)),
    '_ann/',
])

# List every annotation file in that directory.
# NOTE: this list is replaced by the `annotations` value returned just below.
annotations = glob(imgDirAnn + '*.xml')

# Parse the annotations into label structures; `maxboxes` is only returned
# because return_max_boxes=True is requested.
(LABELS, labels, slabels,
 CLASS, annotations, Y_full, maxboxes) = parse_annotations_to_labels(
    imgDirAnn,
    '',
    benchmark=True,
    return_max_boxes=True,
)



In [24]:
# Index of the annotation used for this single-file test.
iw = 5
img_resize = (config.IMAGE_H, config.IMAGE_W)

# Base name of the chosen annotation: strip the directory and the '.xml' suffix.
fname = annotations[iw].rsplit('/', 1)[-1].partition('.xml')[0]

# OCR row for the matching image (the OCR table is indexed by '<name>.jpeg').
dfsingle = df.loc[fname + '.jpeg']

# Generate the feature file for this one page, using grayscale only.
feature_list = ['grayscale']
feature_name = generate_single_feature(
    dfsingle,
    LABELS,
    maxboxes,
    feature_list=feature_list,
    binary_dir=binaries_file,
)


In [25]:
# different kinds of features
def _bytes_feature(value):
    """Wrap a single string / bytes value in a ``tf.train.Feature`` (BytesList)."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList cannot unpack an EagerTensor, so extract its numpy value first.
    payload = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` (FloatList)."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` (Int64List)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)


In [48]:
# Create a dictionary with features that may be relevant.
def image_example(image, boxes, img_name):
    """Pack one image, its boxes and its name into a ``tf.train.Example``.

    Parameters
    ----------
    image : numpy.ndarray
        Image array with at least three axes; the size of the third axis is
        stored as ``nfeatures``.  Pixel values are divided by 255 before
        being stored (assumes a 0-255 input range -- TODO confirm).
    boxes : array-like
        One row per box; may contain zero rows.
    img_name : str
        Name stored alongside the image, UTF-8 encoded.

    Returns
    -------
    tf.train.Example
        Features ``nbox`` and ``nfeatures`` (floats) plus ``boxes``,
        ``image_raw`` and ``image_name`` (bytes).  ``boxes`` and
        ``image_raw`` hold flattened float32 buffers.
    """
    # Scale to float32 and flatten.  The scaled array itself must be the one
    # that is flattened: reshaping the raw `image` instead would silently
    # drop the /255 normalisation.
    pixels = (image.astype('float32') / 255.0).reshape(-1)

    nfeatures = image.shape[2]

    # Flattening with -1 also covers the zero-box case (empty buffer).
    boxes = np.asarray(boxes)
    nboxes = boxes.shape[0]
    boxout = boxes.astype('float32').reshape(-1)

    feature = {
      'nbox': _float_feature(np.float32(nboxes)),
      'nfeatures': _float_feature(np.float32(nfeatures)),
      'boxes': _bytes_feature(boxout.tobytes()),
      'image_raw': _bytes_feature(pixels.tobytes()),
      'image_name': _bytes_feature(img_name.encode('utf-8')),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [49]:
# scratch check: str.encode('utf-8') yields a bytes object
'hi'.encode('utf-8')

b'hi'

In [50]:
# Write a single image to a temporary TFRecord file so we can see how big
# one record is on disk (as a proxy for the average size per file).
record_file = binaries_file+'TMPTFRECORD/test.tfrecords'
compress = 'GZIP'
tf_record_options = tf.io.TFRecordOptions(compression_type = compress)

# TFRecordWriter does not create missing parent directories, so make sure
# the target folder exists (no-op when it is already there).
tf.io.gfile.makedirs(record_file.rsplit('/', 1)[0])

# Build the example *before* opening the writer, so a failure while parsing
# or loading does not leave an empty record file behind.
# NOTE(review): the features are looked up under config.tmp_storage_dir while
# the record itself is written under binaries_file -- confirm both point to
# the same location.
a = imgDirAnn + annotations[iw].split('/')[-1]
imgs_name, bbox = parse_annotation([a], LABELS,
                                   feature_dir=config.tmp_storage_dir+'TMPTFRECORD/',
                                   annotation_dir=imgDirAnn)

# Close the .npz archive as soon as the array has been read.
with np.load(imgs_name[0]) as npz_file:
    arr = npz_file['arr_0']

# fake boxes -- seeded so that re-running the cell writes the same record
fakebox = np.random.default_rng(0).random((maxboxes, 5))
tf_example = image_example(arr,fakebox,imgs_name[0])

with tf.io.TFRecordWriter(record_file, options=tf_record_options) as writer:
    writer.write(tf_example.SerializeToString())

## Now, let's try to read this data.

In [36]:
# Open the GZIP-compressed TFRecord file(s) as a dataset of serialized examples.
test_list = [f'{binaries_file}TMPTFRECORD/test.tfrecords']
test_raw_data = tf.data.TFRecordDataset(
    test_list,
    compression_type='GZIP',
    buffer_size=None,
    num_parallel_reads=tf.data.AUTOTUNE,
)

In [41]:
# Parsing schema for one serialized example: every entry is a single scalar,
# either a float32 count or a byte string.
image_feature_description = dict(
    nbox=tf.io.FixedLenFeature([], tf.float32),
    nfeatures=tf.io.FixedLenFeature([], tf.float32),
    boxes=tf.io.FixedLenFeature([], tf.string),
    image_raw=tf.io.FixedLenFeature([], tf.string),
    image_name=tf.io.FixedLenFeature([], tf.string),
)

def _parse_image_function_test(example_proto):
    """Decode one serialized example into image, name, counts and boxes.

    Parameters
    ----------
    example_proto : scalar string tensor
        A serialized ``tf.train.Example`` matching ``image_feature_description``.

    Returns
    -------
    tuple
        ``(image, img_name, nboxes, nfeatures, boxes)`` where ``image`` is a
        float32 tensor of shape ``[IMAGE_H, IMAGE_W, nfeatures]``, ``boxes``
        is a float32 tensor of shape ``[nboxes, 5]``, ``img_name`` is the
        stored byte string, and ``nboxes`` / ``nfeatures`` are the float32
        scalars exactly as stored in the record.
    """
    image_features = tf.io.parse_single_example(example_proto, image_feature_description)

    # The counts are stored as float32 scalars; they are returned unchanged.
    nboxes = image_features['nbox']
    nfeatures = image_features['nfeatures']

    # tf.reshape needs integer dimensions, so cast explicitly instead of
    # relying on implicit conversion of float values inside the shape list.
    box_count = tf.cast(nboxes, tf.int32)
    channel_count = tf.cast(nfeatures, tf.int32)

    boxes = tf.io.decode_raw(image_features['boxes'], tf.float32)
    boxes = tf.reshape(boxes, [box_count, 5])

    image = tf.io.decode_raw(image_features['image_raw'], tf.float32)
    image = tf.reshape(image, [int(config.IMAGE_H), int(config.IMAGE_W), channel_count])

    # Already a string tensor; no cast or decode is needed.
    img_name = image_features['image_name']
    return image, img_name, nboxes, nfeatures, boxes

# Parse every serialized record exactly once.  Interleaving over the raw
# dataset while mapping that same dataset inside the lambda ignores the
# interleaved element and re-reads the whole file once per record, so each
# example would be emitted N times for a file holding N records.
test_dataset = test_raw_data.map(_parse_image_function_test,
                                 num_parallel_calls=tf.data.AUTOTUNE)

In [44]:
# Peek at the first parsed example: print its feature count and stored name.
for image, img_name, nbox, nfeatures, boxes in test_dataset.take(1):
    print(nfeatures, img_name, sep='\n')

tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(b'/\x00\x00\x00U\x00\x00\x00s\x00\x00\x00e\x00\x00\x00r\x00\x00\x00s\x00\x00\x00/\x00\x00\x00j\x00\x00\x00i\x00\x00\x00l\x00\x00\x00l\x00\x00\x00n\x00\x00\x00a\x00\x00\x00i\x00\x00\x00m\x00\x00\x00a\x00\x00\x00n\x00\x00\x00/\x00\x00\x00D\x00\x00\x00o\x00\x00\x00w\x00\x00\x00n\x00\x00\x00l\x00\x00\x00o\x00\x00\x00a\x00\x00\x00d\x00\x00\x00s\x00\x00\x00/\x00\x00\x00t\x00\x00\x00m\x00\x00\x00p\x00\x00\x00/\x00\x00\x00T\x00\x00\x00M\x00\x00\x00P\x00\x00\x00T\x00\x00\x00F\x00\x00\x00R\x00\x00\x00E\x00\x00\x00C\x00\x00\x00O\x00\x00\x00R\x00\x00\x00D\x00\x00\x00/\x00\x00\x001\x00\x00\x008\x00\x00\x009\x00\x00\x005\x00\x00\x00A\x00\x00\x00p\x00\x00\x00J\x00\x00\x00_\x00\x00\x00_\x00\x00\x00_\x00\x00\x00_\x00\x00\x00_\x00\x00\x001\x00\x00\x00_\x00\x00\x00_\x00\x00\x003\x00\x00\x000\x00\x00\x005\x00\x00\x00P\x00\x00\x00_\x00\x00\x00p\x00\x00\x000\x00\x00\x00.\x00\x00\x00n\x00\x00\x00p\x00\x00\x00z\x00\x00\x00', shape=(), dtype=string)
