In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
AUTO = tf.data.experimental.AUTOTUNE
from PIL import Image
import os
import IPython.display as display

In [None]:
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a ``tf.train.Feature`` (``bytes_list`` field)."""
    bytes_list = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=bytes_list)

def _int_feature(list_of_ints):
    """Wrap a list of integers in a ``tf.train.Feature`` (``int64_list`` field)."""
    int64_list = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=int64_list)

def _float_feature(list_of_floats):
    """Wrap a list of floats in a ``tf.train.Feature`` (``float_list`` field)."""
    float_list = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=float_list)

In [None]:
# Full table of samples. The label encoders below are fitted on it so that the
# train / val / test splits all share one label -> integer mapping.
df = pd.read_csv('df.csv')

# Train - 67.5%
# Val - 22.5%
# Test - 10%

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
val_data = pd.read_csv('val.csv')

# Fit each encoder exactly once and keep both objects around: their
# `classes_` attribute gives the number of classes and `inverse_transform`
# maps integer labels back to the original ids.
make_encoder = LabelEncoder().fit(df.make_id.astype(str))
model_encoder = LabelEncoder().fit(df.model_id.astype(str))

# Replace the raw ids with their integer codes in every split.
# `transform` raises ValueError for a label that was not present in `df`.
for split in (train_data, val_data, test_data):
    split['make_id'] = make_encoder.transform(split['make_id'].astype(str))
    split['model_id'] = model_encoder.transform(split['model_id'].astype(str))

# Kept for backward compatibility: this name previously ended up bound to the
# encoder fitted on `df.model_id`.
label_encoder = model_encoder

In [None]:
# Image paths and the two label columns (make, model) for each split.
train_image_paths = train_data['filename']
train_labels = train_data[['make_id', 'model_id']]

val_image_paths = val_data['filename']
val_labels = val_data[['make_id', 'model_id']]

test_image_paths = test_data['filename']
test_labels = test_data[['make_id', 'model_id']]

# Output directories for the TFRecord shards of each split.
tfrecord_train_dir = 'tfrecords/train/'
tfrecord_val_dir = 'tfrecords/val/'
tfrecord_test_dir = 'tfrecords/test/'

# Make sure the output directories exist before any shard is written; the
# writer cells open files inside them and do not create the directories.
for tfrecord_dir in (tfrecord_train_dir, tfrecord_val_dir, tfrecord_test_dir):
    os.makedirs(tfrecord_dir, exist_ok=True)

## TRAIN DATA

In [None]:
# Number of output files for the training split.
SHARDS = 128
nb_images = len(train_data)
# Round up so the final (possibly smaller) shard still holds the remainder.
shard_size = math.ceil(float(nb_images) / SHARDS)
summary = "Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each."
print(summary.format(nb_images, SHARDS, shard_size))

Pattern matches 92201 images which will be rewritten as 128 .tfrec files containing 721 images each.


In [None]:
def _parse_function(filename, label):
    """Read the raw file contents at ``filename`` and pass ``label`` through unchanged."""
    return tf.io.read_file(filename), label

In [None]:
# Pair every training image path with its (make_id, model_id) labels, load the
# raw file bytes, and group the records so that one batch == one output shard.
files = tf.data.Dataset.from_tensor_slices((train_image_paths, train_labels))
dataset = files.map(_parse_function).batch(shard_size)

In [None]:
def _one_hot_list(index, depth, field):
    """Return a length-``depth`` list of floats with 1.0 at ``index``, 0.0 elsewhere."""
    if not -depth <= index < depth:
        raise IndexError(
            "{} label {} is out of range for {} classes; pass the real class "
            "count to to_tfrecord".format(field, index, depth)
        )
    # Build only the single row that is needed instead of a full
    # depth x depth identity matrix per record.
    row = np.zeros(depth)
    row[index] = 1.0
    return row.tolist()


def to_tfrecord(tfrec_filewriter, img_bytes, label, num_makes=163, num_models=1716):
    """Build the ``tf.train.Example`` for one image and its two labels.

    Args:
        tfrec_filewriter: unused; kept so existing call sites keep working.
        img_bytes: raw bytes of the image file.
        label: indexable pair; ``label[0]`` is the integer make id and
            ``label[1]`` the integer model id.
        num_makes: length of the make one-hot vector (default 163).
        num_models: length of the model one-hot vector (default 1716).

    Returns:
        A ``tf.train.Example`` with features ``image``, ``make_id``,
        ``make_id_oh``, ``model_id`` and ``model_id_oh``.

    Raises:
        IndexError: if a label does not fit in the corresponding class count.
    """
    make_id = int(label[0])
    model_id = int(label[1])

    feature = {
        "image": _bytestring_feature([img_bytes]),  # one image in the list
        "make_id": _int_feature([make_id]),
        "make_id_oh": _float_feature(_one_hot_list(make_id, num_makes, "make_id")),
        "model_id": _int_feature([model_id]),
        "model_id_oh": _float_feature(_one_hot_list(model_id, num_models, "model_id")),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
print("Writing TFRecords")
for shard, (image, label) in enumerate(dataset):
    # Convert each batch to NumPy once per shard rather than once per record.
    images_np = image.numpy()
    labels_np = label.numpy()
    # Batch size is the shard size here. It is stored under its own name so the
    # `shard_size` used to batch the dataset is not overwritten by this loop.
    records_in_shard = images_np.shape[0]
    # good practice to have the number of records in the filename
    filename = tfrecord_train_dir + "{:02d}-{}.tfrec".format(shard, records_in_shard)

    with tf.io.TFRecordWriter(filename) as out_file:
        for img_bytes, img_label in zip(images_np, labels_np):
            example = to_tfrecord(out_file, img_bytes, img_label)
            out_file.write(example.SerializeToString())

    # Report only after the writer has been closed.
    print("Wrote file {} containing {} records".format(filename, records_in_shard))

Writing TFRecords


IndexError: index 1702 is out of bounds for axis 0 with size 1677

## VALIDATION DATA

In [None]:
# Number of output files for the validation split.
SHARDS = 32
nb_images = len(val_data)
# Round up so the final (possibly smaller) shard still holds the remainder.
shard_size = math.ceil(float(nb_images) / SHARDS)
summary = "Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each."
print(summary.format(nb_images, SHARDS, shard_size))

Pattern matches 30734 images which will be rewritten as 32 .tfrec files containing 961 images each.


In [None]:
# def _parse_function(filename, label):
#     img_raw = tf.io.read_file(filename)
#     return img_raw, label

In [None]:
# Pair every validation image path with its (make_id, model_id) labels, load
# the raw file bytes, and group the records so that one batch == one shard.
files = tf.data.Dataset.from_tensor_slices((val_image_paths, val_labels))
dataset = files.map(_parse_function).batch(shard_size)

In [None]:
# def to_tfrecord(tfrec_filewriter, img_bytes, label):
#     one_hot_class = np.eye(42)[label] 
#     feature = {
#         "image": _bytestring_feature([img_bytes]), # one image in the list
#         "class": _int_feature([label]),
#         "one_hot_class": _float_feature(one_hot_class.tolist())
#     }
#     return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
print("Writing TFRecords")
for shard, (image, label) in enumerate(dataset):
    # Convert each batch to NumPy once per shard rather than once per record.
    images_np = image.numpy()
    labels_np = label.numpy()
    # Batch size is the shard size here. It is stored under its own name so the
    # `shard_size` used to batch the dataset is not overwritten by this loop.
    records_in_shard = images_np.shape[0]
    # good practice to have the number of records in the filename
    filename = tfrecord_val_dir + "{:02d}-{}.tfrec".format(shard, records_in_shard)

    with tf.io.TFRecordWriter(filename) as out_file:
        for img_bytes, img_label in zip(images_np, labels_np):
            example = to_tfrecord(out_file, img_bytes, img_label)
            out_file.write(example.SerializeToString())

    # Report only after the writer has been closed.
    print("Wrote file {} containing {} records".format(filename, records_in_shard))

Writing TFRecords
Wrote file tfrecords/val/00-961.tfrec containing 961 records
Wrote file tfrecords/val/01-961.tfrec containing 961 records
Wrote file tfrecords/val/02-961.tfrec containing 961 records
Wrote file tfrecords/val/03-961.tfrec containing 961 records
Wrote file tfrecords/val/04-961.tfrec containing 961 records
Wrote file tfrecords/val/05-961.tfrec containing 961 records
Wrote file tfrecords/val/06-961.tfrec containing 961 records
Wrote file tfrecords/val/07-961.tfrec containing 961 records
Wrote file tfrecords/val/08-961.tfrec containing 961 records
Wrote file tfrecords/val/09-961.tfrec containing 961 records
Wrote file tfrecords/val/10-961.tfrec containing 961 records
Wrote file tfrecords/val/11-961.tfrec containing 961 records
Wrote file tfrecords/val/12-961.tfrec containing 961 records
Wrote file tfrecords/val/13-961.tfrec containing 961 records
Wrote file tfrecords/val/14-961.tfrec containing 961 records
Wrote file tfrecords/val/15-961.tfrec containing 961 records
Wrote 

## TEST DATA

In [None]:
# Number of output files for the test split.
SHARDS = 16
nb_images = len(test_data)
# Round up so the final (possibly smaller) shard still holds the remainder.
shard_size = math.ceil(float(nb_images) / SHARDS)
summary = "Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each."
print(summary.format(nb_images, SHARDS, shard_size))

Pattern matches 13791 images which will be rewritten as 16 .tfrec files containing 862 images each.


In [None]:
# def _parse_function(filename):
#     img_raw = tf.io.read_file(filename)
#     return img_raw

In [None]:
# Pair every test image path with its (make_id, model_id) labels, load the raw
# file bytes, and group the records so that one batch == one output shard.
files = tf.data.Dataset.from_tensor_slices((test_image_paths, test_labels))
dataset = files.map(_parse_function).batch(shard_size)

In [None]:
# def to_tfrecord(tfrec_filewriter, img_bytes):
#     feature = {
#         "image": _bytestring_feature([img_bytes]), # one image in the list
#     }
#     return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
print("Writing TFRecords")
for shard, (image, label) in enumerate(dataset):
    # Convert each batch to NumPy once per shard rather than once per record.
    images_np = image.numpy()
    labels_np = label.numpy()
    # Batch size is the shard size here. It is stored under its own name so the
    # `shard_size` used to batch the dataset is not overwritten by this loop.
    records_in_shard = images_np.shape[0]
    # good practice to have the number of records in the filename
    filename = tfrecord_test_dir + "{:02d}-{}.tfrec".format(shard, records_in_shard)

    with tf.io.TFRecordWriter(filename) as out_file:
        for img_bytes, img_label in zip(images_np, labels_np):
            example = to_tfrecord(out_file, img_bytes, img_label)
            out_file.write(example.SerializeToString())

    # Report only after the writer has been closed.
    print("Wrote file {} containing {} records".format(filename, records_in_shard))

Writing TFRecords
Wrote file tfrecords/test/00-862.tfrec containing 862 records
Wrote file tfrecords/test/01-862.tfrec containing 862 records
Wrote file tfrecords/test/02-862.tfrec containing 862 records
Wrote file tfrecords/test/03-862.tfrec containing 862 records
Wrote file tfrecords/test/04-862.tfrec containing 862 records
Wrote file tfrecords/test/05-862.tfrec containing 862 records
Wrote file tfrecords/test/06-862.tfrec containing 862 records
Wrote file tfrecords/test/07-862.tfrec containing 862 records
Wrote file tfrecords/test/08-862.tfrec containing 862 records
Wrote file tfrecords/test/09-862.tfrec containing 862 records
Wrote file tfrecords/test/10-862.tfrec containing 862 records
Wrote file tfrecords/test/11-862.tfrec containing 862 records
Wrote file tfrecords/test/12-862.tfrec containing 862 records
Wrote file tfrecords/test/13-862.tfrec containing 862 records
Wrote file tfrecords/test/14-862.tfrec containing 862 records
Wrote file tfrecords/test/15-861.tfrec containing 86

## READ TRAIN/VAL TFRECORDS

In [None]:
# Target size handed to tf.image.resize when decoding images.
IMAGE_SIZE = [224, 224]
# Number of examples per batch in the datasets built below.
BATCH_SIZE = 128

def read_tfrecord(example, num_makes=163, num_models=1716):
    """Parse one serialized example into an image and its one-hot labels.

    Args:
        example: scalar string tensor holding a serialized ``tf.train.Example``
            with the features ``image``, ``make_id``, ``make_id_oh``,
            ``model_id`` and ``model_id_oh``.
        num_makes: length of the stored make one-hot vector (default 163).
        num_models: length of the stored model one-hot vector (default 1716).

    Returns:
        ``(image, (make_one_hot, model_one_hot))`` where ``image`` is a float32
        tensor resized to ``IMAGE_SIZE`` with 3 channels, and the two labels
        are float32 vectors of length ``num_makes`` and ``num_models``.
    """
    features = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string = bytestring (not text string)
        "make_id": tf.io.FixedLenFeature([], tf.int64),  # shape [] means scalar
        "make_id_oh": tf.io.VarLenFeature(tf.float32),  # a certain number of floats
        "model_id": tf.io.FixedLenFeature([], tf.int64),  # shape [] means scalar
        "model_id_oh": tf.io.VarLenFeature(tf.float32),  # a certain number of floats
    }

    feature = tf.io.parse_single_example(example, features)
    image = tf.image.decode_jpeg(feature['image'], channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, [*IMAGE_SIZE])

    # VarLenFeature parses to a SparseTensor; densify it and pin the static
    # length so that batching sees a fully defined shape.
    make_one_hot = tf.reshape(tf.sparse.to_dense(feature['make_id_oh']), [num_makes])
    model_one_hot = tf.reshape(tf.sparse.to_dense(feature['model_id_oh']), [num_models])
    return image, (make_one_hot, model_one_hot)

    
# Read from TFRecords. Several TFRecord files are read at once and
# experimental_deterministic is set to False so that tf.data is allowed to
# apply order-altering optimizations.
option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False

train_path = tf.io.gfile.glob(tfrecord_train_dir + "*.tfrec")
val_path = tf.io.gfile.glob(tfrecord_val_dir + "*.tfrec")

# Training pipeline: parallel file reads -> relaxed ordering -> parse -> batch.
training_dataset = (
    tf.data.TFRecordDataset(train_path, num_parallel_reads=AUTO)
    .with_options(option_no_order)
    .map(read_tfrecord, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

# Validation pipeline: identical stages, fed from the validation shards.
val_dataset = (
    tf.data.TFRecordDataset(val_path, num_parallel_reads=AUTO)
    .with_options(option_no_order)
    .map(read_tfrecord, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

In [None]:
# Sanity check: print the shapes of the first training batch.
for image, label in training_dataset.take(1):
    # `label` may be a single tensor or a nested structure of tensors (for
    # example one tensor per output head); report the shape of every leaf.
    label_shapes = tf.nest.map_structure(lambda t: tuple(t.shape), label)
    print(image.numpy().shape, label_shapes)
    

(128, 224, 224, 3) (128, 42)


In [None]:
# Sanity check: print the shapes of the first validation batch.
for image, label in val_dataset.take(1):
    # `label` may be a single tensor or a nested structure of tensors (for
    # example one tensor per output head); report the shape of every leaf.
    label_shapes = tf.nest.map_structure(lambda t: tuple(t.shape), label)
    print(image.numpy().shape, label_shapes)

(128, 224, 224, 3) (128, 42)


## READ TEST TFRECORDS

In [None]:
# Target size handed to tf.image.resize when decoding images.
IMAGE_SIZE = [224, 224]
# Number of examples per batch in the dataset built below.
BATCH_SIZE = 128

def read_tfrecord(example):
    """Parse one serialized example and return only its decoded image.

    NOTE: this reuses the name ``read_tfrecord`` from the train/val section,
    so from this cell onward the name refers to this image-only reader.

    Args:
        example: scalar string tensor holding a serialized ``tf.train.Example``
            that contains an ``image`` bytes feature.

    Returns:
        float32 image tensor with 3 channels, resized to ``IMAGE_SIZE``.
    """
    # Only the image is requested from the record.
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string = bytestring (not text string)
    }
    parsed = tf.io.parse_single_example(example, schema)

    pixels = tf.image.decode_jpeg(parsed['image'], channels=3)
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)
    return tf.image.resize(pixels, list(IMAGE_SIZE))

    
# Read from TFRecords. Several TFRecord files are read at once and
# experimental_deterministic is set to False so that tf.data is allowed to
# apply order-altering optimizations.
option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False

test_path = tf.io.gfile.glob(tfrecord_test_dir + "*.tfrec")

# Test pipeline: parallel file reads -> relaxed ordering -> parse -> batch.
test_dataset = (
    tf.data.TFRecordDataset(test_path, num_parallel_reads=AUTO)
    .with_options(option_no_order)
    .map(read_tfrecord, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

In [None]:
# Sanity check: print the shape of the first batch of test images.
for image in test_dataset.take(1):
    batch_shape = image.numpy().shape
    print(batch_shape)

(128, 224, 224, 3)
