## What are TPUs?
The Tensor Processing Unit (TPU) is a custom integrated circuit (an ASIC) designed specifically to accelerate the training of machine learning models.

## TPUs for free at Kaggle
**You can use up to 30 hours per week of TPUs and up to 9h at a time in a single session.**
**For more info you can visit [here](https://www.kaggle.com/docs/tpu).**

## Why do we need TFRecord format?
The TFRecord format is TensorFlow's own data format for storing a sequence of binary records. Its advantages include more efficient storage, fast I/O, and self-contained files. The main benefit of TFRecords when training on TPUs is faster I/O, which results in faster model training.

To understand the basics of TFRecords, please visit Ryan Holbrook's notebook: [TFRecords Basics](https://www.kaggle.com/ryanholbrook/tfrecords-basics).

### In this notebook you will learn how to convert an image dataset into the TFRecord format.

## Useful resources which helped me:
* https://www.tensorflow.org/tutorials/load_data/tfrecord
* https://www.kaggle.com/mgornergoogle/five-flowers-with-keras-and-xception-on-tpu
* https://towardsdatascience.com/a-practical-guide-to-tfrecords-584536bc786c
* https://keras.io/examples/keras_recipes/creating_tfrecords/
* https://www.kaggle.com/lqdisme/dog-breed-identification
* https://cloud.google.com/blog/products/ai-machine-learning/what-makes-tpus-fine-tuned-for-deep-learning
* https://pub.towardsai.net/writing-tfrecord-files-the-right-way-7c3cee3d7b12

# Imports

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
from numpy import asarray
from tensorflow.keras.preprocessing.image import img_to_array,load_img,ImageDataGenerator
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Loading datasets

In [None]:
# Locations of the dataset's train/test image folders, given relative to the
# notebook's working directory.
TRAIN_DIR = '../input/dog-breed-identification/train'
TEST_DIR = '../input/dog-breed-identification/test'
# labels.csv is read further down through its 'id' and 'breed' columns.
df = pd.read_csv('../input/dog-breed-identification/labels.csv')
# sample_submission.csv supplies the 'id' values used to load the test images.
submission_df = pd.read_csv('../input/dog-breed-identification/sample_submission.csv')

# Feature Engineering

In [None]:
# Sorted list of the distinct breed names that appear in the labels table.
dog_breeds = sorted(set(df['breed']))
num_classes = len(dog_breeds)
# Lookup table: breed name -> integer class index (its position in dog_breeds).
class_to_num = {breed: index for index, breed in enumerate(dog_breeds)}

In [None]:
from tqdm import tqdm

def load_image(TRAIN_DIR, df, image_size):
    """Load all labelled training images into memory and one-hot encode the labels.

    Parameters
    ----------
    TRAIN_DIR : str
        Directory holding the training images, stored as ``<id>.jpg``.
    df : pandas.DataFrame
        Table with an ``id`` column (file name without extension) and a
        ``breed`` column (class name). Rows are read by position, so the
        frame may carry any index (e.g. after filtering or shuffling).
    image_size : sequence of int
        ``(height, width, channels)`` of the returned images.

    Returns
    -------
    X : numpy.ndarray
        ``uint8`` array of shape ``(n, height, width, channels)``.
    y : numpy.ndarray
        One-hot labels of shape ``(n, len(class_to_num))``.

    Both arrays are returned in the same random order.

    Raises
    ------
    KeyError
        If a breed in ``df`` is missing from the module-level ``class_to_num``.
    """
    # Plain arrays give positional access; indexing a Series with ``[i]`` is
    # label-based and fails (or picks the wrong row) on a non-default index.
    image_names = df['id'].to_numpy()
    labels = df['breed'].to_numpy()
    data_size = len(labels)
    height, width, channels = image_size[:3]

    X = np.empty([data_size, height, width, channels], dtype=np.uint8)
    # int32 rather than uint8 so class indices above 255 cannot wrap around.
    y = np.empty([data_size, 1], dtype=np.int32)

    # Draw the shuffle order up front and write each sample straight into its
    # shuffled slot; this avoids the second full-size copy that ``X[ind]``
    # would create after loading.
    order = np.random.permutation(data_size)
    for dst, src in enumerate(tqdm(order)):
        img_path = os.path.join(TRAIN_DIR, image_names[src] + '.jpg')
        # load_img expects a (height, width) target; channels are not part of it.
        X[dst] = load_img(img_path, target_size=(height, width))
        y[dst] = class_to_num[labels[src]]

    # Pin the one-hot width to the full class count so the encoding does not
    # depend on which breeds happen to be present in ``df`` (and so an empty
    # ``df`` still yields a well-formed array).
    y = to_categorical(y, num_classes=len(class_to_num))
    print(f"X shape: {X.shape}\ny shape: {y.shape}")
    return X, y

# Target image size as (height, width, channels); load_image uses it for the shape of X.
image_size = (331, 331, 3)
# Load the training images together with their one-hot labels, in shuffled order.
X, y = load_image(TRAIN_DIR, df, image_size) 

In [None]:
def load_test_image(TEST_DIR, test_df, image_size):
    """Load all test images into a single in-memory array.

    Parameters
    ----------
    TEST_DIR : str
        Directory holding the test images, stored as ``<id>.jpg``.
    test_df : pandas.DataFrame
        Table with an ``id`` column (file name without extension). Rows are
        read by position, so the frame may carry any index.
    image_size : sequence of int
        ``(height, width, channels)`` of the returned images.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(n, height, width, channels)``, in the
        same order as the rows of ``test_df``.
    """
    # Positional access: ``Series[i]`` is label-based and breaks on a
    # non-default index.
    image_names = test_df['id'].to_numpy()
    data_size = len(image_names)
    height, width, channels = image_size[:3]
    X = np.empty([data_size, height, width, channels], dtype=np.uint8)

    for i, name in enumerate(tqdm(image_names)):
        image_path = os.path.join(TEST_DIR, name + '.jpg')
        # load_img expects a (height, width) target; channels are not part of it.
        X[i] = load_img(image_path, target_size=(height, width))

    print(f"Test data shape: {X.shape}")
    return X

# Load the test images named in the sample submission's 'id' column (no labels).
test = load_test_image(TEST_DIR, submission_df, image_size)

# Functions for feature creation
The following functions each take a single scalar value and wrap it in a `tf.train.Feature` of the matching type (bytes, float, or int64).

In [None]:
def _bytes_feature(value):
    """Wrap a bytes/string value in a ``tf.train.Feature`` holding a ``BytesList``.

    A value of the same type as ``tf.constant(0)`` (a tensor) is first
    unwrapped with ``.numpy()``; anything else is stored unchanged.
    """
    tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float in a ``tf.train.Feature`` holding a ``FloatList``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` holding an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_array(array):
    """Return ``array`` serialized with ``tf.io.serialize_tensor``."""
    return tf.io.serialize_tensor(array)

In [None]:
def parse_single_image(image, label=None):
    """Build a ``tf.train.Example`` for one image and, optionally, its label.

    The example stores the first three entries of ``image.shape`` under
    ``height``, ``width`` and ``depth``, and the serialized image under
    ``raw_image``. When ``label`` is given it is serialized the same way and
    stored under ``label``; otherwise that key is left out.
    """
    # Features shared by labelled and unlabelled examples.
    features = {
        'height': _int64_feature(image.shape[0]),
        'width': _int64_feature(image.shape[1]),
        'depth': _int64_feature(image.shape[2]),
        'raw_image': _bytes_feature(serialize_array(image)),
    }
    if label is not None:
        features['label'] = _bytes_feature(serialize_array(label))

    return tf.train.Example(features=tf.train.Features(feature=features))

# Writing and Converting to TFRecord

Now, we'll create a dictionary to store the actual image, height, width and depth of the image and the label where we first serialize the array and then convert it to a bytes_feature. All these key:value mappings make up the features for one Example.


In [None]:
import tqdm
def write_images_to_tfr(images, filename, labels=None, max_files=800, out_dir="./"):
    """Write images (and optional labels) to one or more TFRecord shard files.

    Each shard is named ``<i>_<splits><filename>.tfrecords`` inside
    ``out_dir``, where ``i`` is the 1-based shard number and ``splits`` is the
    total number of shards.

    Parameters
    ----------
    images : sequence
        Indexable collection of images; each item is passed to
        ``parse_single_image``.
    filename : str
        Suffix used in every shard's file name.
    labels : sequence, optional
        Labels aligned with ``images``. When omitted, the records are written
        without a label.
    max_files : int, optional
        Maximum number of samples per shard (default 800).
    out_dir : str, optional
        Directory the shards are written to (default: current directory).

    Returns
    -------
    int
        Number of records written across all shards.

    Raises
    ------
    ValueError
        If ``max_files`` is not positive, or ``labels`` is given but its
        length differs from ``images``.
    """
    if max_files < 1:
        raise ValueError(f"max_files must be a positive integer, got {max_files}")
    total = len(images)
    if labels is not None and len(labels) != total:
        raise ValueError(
            f"images and labels differ in length: {total} vs {len(labels)}"
        )

    # Ceiling division: number of shards needed to hold every sample.
    splits = -(-total // max_files)
    print(f"\nUsing {splits} shard(s) for {total} files, with up to {max_files} samples per shard")

    file_count = 0

    for i in tqdm.tqdm(range(splits)):
        shard_file = "{}_{}{}.tfrecords".format(i+1, splits, filename)
        current_shard_name = os.path.join(out_dir, shard_file)
        first = i * max_files
        last = min(first + max_files, total)

        # The context manager closes the shard even if serialization fails
        # part-way, so no file handle is leaked.
        with tf.io.TFRecordWriter(current_shard_name) as writer:
            for index in range(first, last):
                current_label = None if labels is None else labels[index]
                out = parse_single_image(image=images[index], label=current_label)
                writer.write(out.SerializeToString())
                file_count += 1

    print(f"\nWrote {file_count} elements to TFRecord")
    return file_count

In [None]:
# Write the training images together with their labels into TFRecord shards.
write_images_to_tfr(X, "train_images", y)
# Write the test images; no labels are passed, so these records carry image data only.
write_images_to_tfr(test, "test_images")
