### Acknowledgement

In this notebook, we follow the approach outlined by Martin Görner in [Part 1 of his Keras on TPU series](https://codelabs.developers.google.com/codelabs/keras-flowers-data/#0).

### Loading libraries

In [None]:
import numpy as np
import pandas as pd
import os, sys, math
import tensorflow as tf
from pathlib import Path
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

# AUTO will be used in tf.data.Dataset API
AUTO = tf.data.experimental.AUTOTUNE 

print("Tensorflow version " + tf.__version__)

### Setting up basic parameters

In [None]:
# Set show_files=1 to print every file found under the
# '/kaggle/input' directory; leave it at 0 to skip the listing.
show_files=0

if show_files:
    for folder, _subfolders, names in os.walk('/kaggle/input'):
        for name in names:
            full_path = os.path.join(folder, name)
            print(full_path)

In [None]:
# Number of .tfrecord files the images are split across (drives the shard size below).
SHARDS = 20
# Output image size; read by resize_and_crop_image.
TARGET_SIZE = [512, 512]
# Class labels as bytestrings.
# NOTE(review): not referenced in the visible cells of this notebook.
CLASSES = [b'benign', b'malignant']

# Kaggle input directory holding train.csv and test.csv.
PATH_DATA=Path('/kaggle/input/siim-isic-melanoma-classification/')
# NOTE(review): not referenced in the visible cells; presumably a leftover
# from the training-set version of this notebook - confirm before removing.
PATH_FOLDS=Path('/kaggle/input/siim-stratified-groupkfold-5-folds/')

### Loading training data

In [None]:
train=pd.read_csv(PATH_DATA/'train.csv')
print(f"The shape of the `train` is {train.shape}.\n")
print(f"The columns present in `train` are {train.columns.values}.")

In [None]:
test=pd.read_csv(PATH_DATA/'test.csv')
print(f"The shape of the `test` is {test.shape}.\n")
print(f"The columns present in `test` are {test.columns.values}.")

Note that there is no `diagnosis` column in the test set. 

### Imputing missing values

The `sex`, `age_approx`, and `anatom_site_general_challenge` columns of the training set contain missing values:

In [None]:
train.isna().sum()

The `anatom_site_general_challenge` column of the test set contains missing values as well:

In [None]:
test.isna().sum()

We will replace the missing values for `age_approx` with the median age of the patients present in the dataset. As for the other two columns, we will mark the missing values with the word "unknown". 

In [None]:
median_age=train['age_approx'].median()
print(f"The median age of the patients in the training set is {median_age} years.")

In [None]:
# Impute by assignment instead of `inplace=True` on a column selection:
# `train['age_approx']` can be a temporary object, in which case an in-place
# fill never reaches `train` (pandas warns about this pattern and, with
# copy-on-write enabled, the call has no effect at all).
train['age_approx'] = train['age_approx'].fillna(median_age)

# Everything still missing is marked with the word "unknown".
# NOTE(review): this fills *every* remaining NaN, whatever the column; it
# relies on `age_approx` having no gaps in `test` - confirm with the
# `test.isna().sum()` output above.
train = train.fillna('unknown')
test = test.fillna('unknown')

In [None]:
print(f"The total number of NA's after imputation in `train` is {train.isna().sum().sum()}.")
print(f"The total number of NA's after imputation in `test` is {test.isna().sum().sum()}.")

### One-hot encoding for categorical variables

The unique values in `train`:

In [None]:
print("The unique values of 'age_approx':")
print(np.unique(train['age_approx'].values))
print("\nThe unique values of 'sex':")
print(np.unique(train['sex'].values))
print("\nThe unique values of 'anatom_site_general_challenge':")
print(np.unique(train['anatom_site_general_challenge'].values))
print("\nThe unique values of 'diagnosis':")
print(np.unique(train['diagnosis'].values))

The unique values in `test`:

In [None]:
print("The unique values of 'age_approx':")
print(np.unique(test['age_approx'].values))
print("\nThe unique values of 'sex':")
print(np.unique(test['sex'].values))
print("\nThe unique values of 'anatom_site_general_challenge':")
print(np.unique(test['anatom_site_general_challenge'].values))

Observe that the age values are all integers in both `train` and `test`. Let's cast `age_approx` to `np.uint8`.

In [None]:
# Store the (whole-number) ages as unsigned 8-bit integers in both frames.
for frame in (train, test):
    frame['age_approx'] = frame['age_approx'].astype(np.uint8)

Checking if `anatom_site_general_challenge` has the same set of values in `train` and `test`:

In [None]:
# `np.array_equal` also copes with the two sets having different sizes
# (it returns False), whereas an element-wise `np.equal` cannot compare
# arrays of different lengths.
np.array_equal(np.unique(test['anatom_site_general_challenge'].values),
               np.unique(train['anatom_site_general_challenge'].values))

Yes, it does. Now we will apply one-hot encoding to `sex` and `anatom_site_general_challenge`. We will not be one-hot encoding `diagnosis` since it is present only in the training set. 

In [None]:
# One-hot encode `sex` and `anatom_site_general_challenge` and append the
# indicator columns to `train` in a single concat.
# The dtype is pinned to uint8 so the indicators are small integers on every
# pandas version (newer releases default to bool).
train = pd.concat(
    [
        train,
        pd.get_dummies(train['sex'], prefix='sex', dtype=np.uint8),
        pd.get_dummies(train['anatom_site_general_challenge'],
                       prefix='site', dtype=np.uint8),
    ],
    axis=1,
)

train.shape

In [None]:
# One-hot encode `sex` and `anatom_site_general_challenge` and append the
# indicator columns to `test`. The dtype is pinned to uint8 so the indicators
# are small integers on every pandas version (newer releases default to bool).
test = pd.concat(
    [
        test,
        pd.get_dummies(test['sex'], prefix='sex', dtype=np.uint8),
        pd.get_dummies(test['anatom_site_general_challenge'],
                       prefix='site', dtype=np.uint8),
    ],
    axis=1,
)

# The following column is added for consistency with `train`. It is only
# created when the encoding above did not already produce it, so genuine
# `sex_unknown` indicators are never overwritten with zeros.
if 'sex_unknown' not in test.columns:
    test['sex_unknown'] = np.zeros(len(test), dtype=np.uint8)

test.shape

In [None]:
pd.set_option('display.max_columns', None)
test.head()

### Scaling the age feature

In [None]:
%%time

scaler = StandardScaler()

# The scaler is fitted on the training ages only; the test ages are then
# transformed with those same fitted statistics.
train_ages = train['age_approx'].values.reshape(-1, 1)
test_ages = test['age_approx'].values.reshape(-1, 1)

train['age_scaled'] = scaler.fit_transform(train_ages)
test['age_scaled'] = scaler.transform(test_ages)

### Turning the fold data into a TF dataset

In [None]:
# Raw categorical columns (already one-hot encoded above) and the diagnosis
# are left out of the feature list.
excluded_cols=['sex', 'anatom_site_general_challenge', 'diagnosis', ]

# Keep the remaining columns of `test`, in their original order.
cols=[]
for column in test.columns:
    if column in excluded_cols:
        continue
    cols.append(column)

print(cols)
print(f"\nThe total number of features is {len(cols)}.")

In [None]:
dataset0 = tf.data.Dataset.from_tensor_slices(dict(test[cols]))

In [None]:
def show_instance(item, special):
    """Print one dataset element, entry by entry.

    Args:
        item: mapping from feature name to an object exposing `.numpy()`.
        special: container of feature names for which only the array shape
            is printed (as "Image shape ...") instead of the value itself.
    """
    for key in item:
        value = item[key].numpy()
        if key in special:
            # Large arrays (images) are summarised by their shape.
            print("Image shape", value.shape)
        else:
            print(key, value)

In [None]:
def show_ds(ds, n=1, special=('image',)):
    """Print the first `n` elements of a dataset.

    Args:
        ds: dataset exposing `.take(n)` and yielding mappings accepted by
            `show_instance`.
        n: number of elements to print.
        special: feature names whose values are summarised by shape only.
            The default is an immutable tuple rather than a list, so the
            default can never be altered between calls.
    """
    for item in ds.take(n):
        show_instance(item, special)

In [None]:
show_ds(dataset0)

In [None]:
def decode_jpeg(data_dict):
    """Read and decode the test-set JPEG named by `data_dict['image_name']`.

    Args:
        data_dict: mapping of feature tensors containing an `image_name`
            string tensor (the file name without extension).

    Returns:
        A new dict with all entries of `data_dict` plus `image`, the decoded
        picture as a 3-channel tensor.
    """
    # Build the location from PATH_DATA so the image directory follows the
    # notebook's configured data path instead of a duplicated literal.
    image_dir = str(PATH_DATA / 'jpeg' / 'test') + "/"
    fname = image_dir + data_dict['image_name'] + ".jpg"
    bits = tf.io.read_file(fname)

    # channels=3 forces RGB output even for a grayscale file, so every image
    # has the same channel count and can be batched with the others.
    decoded = dict(data_dict)
    decoded['image'] = tf.image.decode_jpeg(bits, channels=3)
    return decoded

In [None]:
dataset1 = dataset0.map(decode_jpeg, num_parallel_calls=AUTO)

In [None]:
show_ds(dataset1)

### Visualization function

In [None]:
def show_9(dataset):
    """Display the first nine images of `dataset` on a 3x3 grid.

    Each element of `dataset` must be a mapping with an `image` entry that
    exposes `.numpy()`; pixel values are cast to uint8 for display.
    """
    plt.figure(figsize=(13,13))
    for position, data in enumerate(dataset, start=1):
        # Grid cells are numbered 1..9, row by row.
        plt.subplot(3, 3, position)
        plt.axis('off')
        pixels = data['image'].numpy().astype(np.uint8)
        plt.imshow(pixels)
        if position == 9:
            break
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.1, hspace=0.1)
    plt.show()

In [None]:
show_9(dataset1)

### Resizing and cropping

In [None]:
def resize_and_crop_image(data):
    """Resize and centre-crop `data['image']` with the "fill" algorithm.

    The result is always cut out of the source image so that it fills the
    target size entirely, with no black bars and a preserved aspect ratio.

    Args:
        data: mapping of feature tensors with a decoded `image` of shape
            [height, width, channels].

    Returns:
        A tuple `(data, height, width)`: a new dict with the resized and
        cropped `image`, followed by the height and width of the *source*
        image, in that order.
    """
    source = data['image']
    # A decoded image is laid out as [height, width, channels], so axis 0 is
    # the height and axis 1 the width. (The previous version had these two
    # names swapped, so the values returned as height/width were exchanged.)
    height = tf.shape(source)[0]
    width = tf.shape(source)[1]
    # Unchanged from the original computation: TARGET_SIZE[1] is the target
    # extent along axis 0 and TARGET_SIZE[0] the target extent along axis 1.
    target_rows = TARGET_SIZE[1]
    target_cols = TARGET_SIZE[0]

    # Below 1, fitting axis 0 to its target leaves axis 1 at least as large
    # as its own target; otherwise fit axis 1 instead.
    resize_crit = (height * target_cols) / (width * target_rows)
    resized = tf.cond(resize_crit < 1,
                      # if true: scale so that axis 0 matches its target
                      lambda: tf.image.resize(source,
                                              [height*target_rows/height,
                                               width*target_rows/height],
                                              method='lanczos3',
                                              antialias=True
                                             ),
                      # if false: scale so that axis 1 matches its target
                      lambda: tf.image.resize(source,
                                              [height*target_cols/width,
                                               width*target_cols/width],
                                              method='lanczos3',
                                              antialias=True
                                             )
                     )

    # Centre-crop whatever exceeds the target along either axis.
    new_rows = tf.shape(resized)[0]
    new_cols = tf.shape(resized)[1]
    cropped = tf.image.crop_to_bounding_box(resized,
                                            (new_rows - target_rows) // 2,
                                            (new_cols - target_cols) // 2,
                                            target_rows, target_cols
                                           )

    result = dict(data)
    result['image'] = cropped
    return result, height, width

In [None]:
dataset2 = dataset1.map(resize_and_crop_image, num_parallel_calls=AUTO)

Redefine our plotting function to account for the new height and width features (alternatively, you can just add these features to the `data` dictionary).

In [None]:
def show_9(dataset):
    """Display the first nine images of `dataset` on a 3x3 grid.

    Each element of `dataset` must be a `(data, h, w)` triple; only
    `data['image']` is used, cast to uint8 for display.
    """
    plt.figure(figsize=(13,13))
    shown = 0
    for data, _h, _w in dataset:
        # 331 is the first cell of a 3x3 grid; later cells follow in order.
        plt.subplot(331 + shown)
        plt.axis('off')
        pixels = data['image'].numpy().astype(np.uint8)
        plt.imshow(pixels)
        shown += 1
        if shown == 9:
            break
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.1, hspace=0.1)
    plt.show()

In [None]:
show_9(dataset2)

### Speed test: too slow

Google Cloud Storage is capable of great throughput but has a per-file access penalty. Run the cell below and see that throughput is around 5 images per second (at least this was the speed at the time of writing this notebook).

In [None]:
%%time

display_dataset = dataset2.batch(10)
for item, h, w in display_dataset.take(10):
    print(f"Image batch shape {item['image'].numpy().shape}")

### Recompress the images

As we just saw, working with thousands of individual files will be too slow. We have to use the TFRecord format to group files together. To do that, we first need to recompress our images. The bandwidth savings outweigh the decoding CPU cost.

In [None]:
def recompress_image(data, h, w):
    """Re-encode `data['image']` as a JPEG byte string.

    Args:
        data: mapping of feature tensors with a numeric `image` tensor.
        h, w: passed through unchanged.

    Returns:
        A tuple `(data, h, w)` where `data` is a new dict whose `image` is
        the JPEG-encoded byte string.
    """
    # The resized image holds floats, and Lanczos resampling can overshoot
    # the 0..255 range. A plain cast of out-of-range floats to uint8 is not
    # well defined, so clamp to the valid range while converting.
    pixels = tf.saturate_cast(data['image'], tf.uint8)

    recompressed = dict(data)
    # Quality is left at the encode_jpeg default.
    recompressed['image'] = tf.image.encode_jpeg(pixels,
                                                 optimize_size=True,
                                                 chroma_downsampling=False)
    return recompressed, h, w

In [None]:
dataset3 = dataset2.map(recompress_image, num_parallel_calls=AUTO)

### Write dataset to TFRecord files 

In [None]:
nb_images = len(test)
shard_size = math.ceil(1.0 * nb_images / SHARDS)

print(f"The total number of images = {nb_images}")
print(f"The number of  .tfrecord files = {SHARDS}")
print(f"The number of images in each .tfrecord file = {shard_size}")

Sharding: there will be one "batch" of images per file.

In [None]:
dataset4 = dataset3.batch(shard_size)

Three types of data can be stored in TFRecords: bytestrings, integers and floats. They are always stored as lists, a single data element will be a list of size 1.

In [None]:
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a `tf.train.Feature`."""
    payload = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=payload)

In [None]:
def _int_feature(list_of_ints): # int64
    """Wrap a list of integers in a `tf.train.Feature` (stored as int64)."""
    payload = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=payload)

In [None]:
def _float_feature(list_of_floats): # float32
    """Wrap a list of floats in a `tf.train.Feature` (stored as float32)."""
    payload = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=payload)

In [None]:
def to_tfrecord(tfrec_filewriter, image, image_name, patient_id, 
                age, age_scaled, sex_female, sex_male, sex_unknown, 
                site_head_neck, site_lower_extremity, site_oral_genital, 
                site_palms_soles, site_torso, site_unknown, site_upper_extremity, 
                height, width):
    """Build the `tf.train.Example` for one image and its metadata.

    Args:
        tfrec_filewriter: not used by this function; kept so that existing
            callers keep working.
        image, image_name, patient_id: byte strings.
        age, sex_*, site_*, height, width: integer-like scalars.
        age_scaled: float-like scalar.

    Returns:
        A `tf.train.Example` holding one single-element feature per argument.
    """
    integer_values = {
        "age": age,
        "sex_female": sex_female,
        "sex_male": sex_male,
        "sex_unknown": sex_unknown,
        "site_head/neck": site_head_neck,
        "site_lower extremity": site_lower_extremity,
        "site_oral/genital": site_oral_genital,
        "site_palms/soles": site_palms_soles,
        "site_torso": site_torso,
        "site_unknown": site_unknown,
        "site_upper extremity": site_upper_extremity,
        "height": height,
        "width": width,
    }

    feature = {
        # bytestring features
        "image": _bytestring_feature([image]),
        "image_name": _bytestring_feature([image_name]),
        "patient_id": _bytestring_feature([patient_id]),
        # float features
        "age_scaled": _float_feature([float(age_scaled)]),
    }
    # integer features: the values arrive as numpy scalars (uint8, int32 or,
    # depending on how the indicators were built, bool); converting them to
    # plain Python ints keeps the protobuf conversion independent of the
    # exact numpy type.
    for key, value in integer_values.items():
        feature[key] = _int_feature([int(value)])

    return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
print("Writing TFRecords")
for shard, (data, height, width) in enumerate(dataset4):
    # Convert every batched column to numpy once per shard. Doing the
    # conversion inside the per-record loop would rebuild the whole batch
    # (including all encoded images) for each record written.
    columns = {key: tensor.numpy() for key, tensor in data.items()}
    heights = height.numpy()
    widths = width.numpy()

    # The batch size is the number of records in this shard (the last shard
    # may be smaller). A separate name is used so that the global
    # `shard_size` that defines the batching is not overwritten.
    records_in_shard = columns['image'].shape[0]
    # good practice to have the number of records in the filename
    filename = "{:02d}-{}.tfrec".format(shard, records_in_shard)

    with tf.io.TFRecordWriter(filename) as out_file:
        for i in range(records_in_shard):
            example = to_tfrecord(out_file,
                                  # re-compressed image: already a byte string
                                  columns['image'][i],
                                  columns['image_name'][i],
                                  columns['patient_id'][i],
                                  columns['age_approx'][i],
                                  columns['age_scaled'][i],
                                  columns['sex_female'][i],
                                  columns['sex_male'][i],
                                  columns['sex_unknown'][i],
                                  columns['site_head/neck'][i],
                                  columns['site_lower extremity'][i],
                                  columns['site_oral/genital'][i],
                                  columns['site_palms/soles'][i],
                                  columns['site_torso'][i],
                                  columns['site_unknown'][i],
                                  columns['site_upper extremity'][i],
                                  heights[i],
                                  widths[i]
                                 )

            out_file.write(example.SerializeToString())

    print("Wrote file {} containing {} records".format(filename, records_in_shard))