# Create TFRecords
* The TFRecord format is useful for storing a sequence of binary records.

In [14]:
import os
from os.path import join, dirname
import tensorflow as tf
import time
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Dense
from tensorflow.keras.layers import AvgPool2D, GlobalAveragePooling2D, MaxPool2D
from tensorflow.keras.models import Model
from tensorflow.keras.layers import ReLU, concatenate
import tensorflow.keras.backend as K
import numpy as np                                    
import pandas as pd 
import os
import random
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator,img_to_array
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
warnings.filterwarnings("ignore")

## Load Training Metadata

In [15]:
# Load the training metadata.
# The path is built with os.path.join so it works on every OS; the previous
# "\cleaned_isic.csv" literal relied on a Windows-only separator and on "\c"
# not being a recognised escape sequence.
path = join(os.getcwd(), "cleaned_isic.csv")
y_train = pd.read_csv(path)

# Use the shorter column name for the anatomic site
y_train = y_train.rename(columns={'anatom_site_general_challenge':'anatom_site_general'})

# Sort the rows by image name
y_train = y_train.sort_values(by=['image_name'])

# Display the first rows of the dataset.
y_train.head()

Unnamed: 0.1,Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general,diagnosis,benign_malignant,target
1,1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0
5,5,ISIC_0074311,IP_2950485,female,40.0,lower extremity,unknown,benign,0


In [16]:
# Collect every image id listed in the training metadata
image_id_list = y_train.image_name.to_list()
print("%d training images." % len(image_id_list))

30032 training images.


## Load testing metadata

In [17]:
# Read the metadata of both test splits (AA and A) from the working directory
path_test_AA = os.getcwd()+"/test_ISIC_AA.csv"
path_test_A = os.getcwd()+"/test_ISIC_A.csv"
test_meta_AA = pd.read_csv(path_test_AA)
test_meta_A = pd.read_csv(path_test_A)

# Rename the anatomic-site column, then order the rows by image name
test_meta_AA = (
    test_meta_AA
    .rename(columns={'anatom_site_general_challenge':'anatom_site_general'})
    .sort_values(by='image_name')
)
test_meta_A = (
    test_meta_A
    .rename(columns={'anatom_site_general_challenge':'anatom_site_general'})
    .sort_values(by='image_name')
)

# Show the first rows of the AA split.
test_meta_AA.head()

Unnamed: 0.1,Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general,diagnosis,benign_malignant,target
0,78,ISIC_0125706,IP_4096093,female,60.0,torso,unknown,benign,0
1,85,ISIC_0148465,IP_1517386,male,40.0,head/neck,unknown,benign,0
2,112,ISIC_0155173,IP_1730314,male,30.0,upper extremity,unknown,benign,0
3,148,ISIC_0164603,IP_6648913,female,75.0,upper extremity,unknown,benign,0
4,161,ISIC_0168418,IP_1697215,female,45.0,lower extremity,unknown,benign,0


In [18]:
# Collect the image ids of each test split, then report how many there are
test_AA_ids = test_meta_AA.image_name.to_list()
test_A_ids = test_meta_A.image_name.to_list()
print("%d test AA images." % len(test_AA_ids))
print("%d test A images." % len(test_A_ids))

826 test AA images.
1921 test A images.


## Encode categorical values

In [19]:
# Categorical columns
cats = ['patient_id','sex','anatom_site_general', "diagnosis", "benign_malignant"]

# Convert categorical values to integer codes.
# All three frames share ONE vocabulary per column: factorizing each frame on
# its own numbers categories by order of first appearance, so the same value
# (e.g. a given sex or diagnosis) could get a different code in train and test.
# Training rows come first in the concatenation, so their codes are exactly
# what a train-only factorize would produce. Missing values are encoded as -1.
n_train, n_test_AA = len(y_train), len(test_meta_AA)
for c in cats:
    combined = pd.concat([y_train[c], test_meta_AA[c], test_meta_A[c]], ignore_index=True)
    codes, _ = combined.factorize()
    y_train[c] = codes[:n_train]
    test_meta_AA[c] = codes[n_train:n_train + n_test_AA]
    test_meta_A[c] = codes[n_train + n_test_AA:]

# Fill the null age values with the mean age of the respective dataset
print('Imputing Age NaN count =',y_train.age_approx.isnull().sum())

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that is chained assignment and does not update the frame under
# pandas copy-on-write.
for frame in (y_train, test_meta_AA, test_meta_A):
    ages = frame["age_approx"].astype(float)
    frame["age_approx"] = ages.fillna(ages.mean()).astype('int')

Imputing Age NaN count = 44


## Write TFRecords (Train & Test)

In [20]:
# Functions to convert a value to a type compatible with tf.train.Example
def _bytes_feature(value):
  """Returns a bytes_list from a string / byte.

  Args:
    value: a bytes object, a Python str (encoded as UTF-8), or an eager
      tensor, which is converted with ``.numpy()`` first.

  Returns:
    A tf.train.Feature whose bytes_list holds the single value.
  """
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  if isinstance(value, str):
    value = value.encode('utf-8') # BytesList only accepts bytes, not a Python 3 str.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Wraps a single float / double in a tf.train.Feature holding a float_list."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wraps a single bool / enum / int / uint in a tf.train.Feature holding an int64_list."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

In [21]:
# Generate flexible message type with a feature mapping
def serialize_example(feature0, feature1, feature2, feature3, feature4, feature5, feature6, feature7):
    """
    Build a tf.train.Example from one image and its metadata and serialize it.

    INPUT:  feature0:   the encoded image            -> stored as bytes under 'image'
            feature1:   the image name               -> stored as bytes under 'image_name'
            feature2:   the patient id               -> int64 under 'patient_id'
            feature3:   sex of the patient           -> int64 under 'sex'
            feature4:   age of the patient           -> int64 under 'age_approx'
            feature5:   anatomic site of the area    -> int64 under 'anatom_site_general'
            feature6:   the diagnosis                -> int64 under 'diagnosis'
            feature7:   the target, in [0,1]         -> int64 under 'target'
    OUTPUT: the serialized tf.train.Example message
    """
    # The two byte-valued entries come first ...
    feature = {
      'image': _bytes_feature(feature0),
      'image_name': _bytes_feature(feature1),
      }
    # ... followed by the integer-valued metadata, in the same order as before.
    int_fields = (
      ('patient_id', feature2),
      ('sex', feature3),
      ('age_approx', feature4),
      ('anatom_site_general', feature5),
      ('diagnosis', feature6),
      ('target', feature7),
      )
    for key, val in int_fields:
        feature[key] = _int64_feature(val)

    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features).SerializeToString()

In [22]:
# @TODO: before running, point TRAIN_PATH at your local folder of training images
SIZE = 200 # How many images will be stored in one TFRecord
TRAIN_PATH = "C:\\Users\\clara\\DataSets\\ISIC_2020_Training_JPEG\\train\\"

# Get a sorted list of image file names
IMGS = sorted(os.listdir(TRAIN_PATH))

# Keep only the images listed in cleaned_isic.csv (unknown diagnoses were removed there).
# A set gives O(1) membership tests; scanning the id list for every file was quadratic.
# i[0:-4] strips the extension and assumes a 4-character suffix such as ".jpg".
_known_image_ids = set(image_id_list)
IMGS = [i for i in IMGS if i[0:-4] in _known_image_ids]

In [11]:
# Number of training images kept after filtering
print("{}".format(len(IMGS)))

30032


## Write Records

In [38]:
def write_tfrecords(IMGS, SIZE, csv_file, in_path, out_path):
    """Write the images in IMGS, together with their metadata, to TFRecord files.

    Every image is read from ``in_path``, resized to 512x512, re-encoded as a
    JPEG (quality 94) and serialized with ``serialize_example``. Records are
    written to files named ``out_path + '<index>-<count>.tfrec'``, each holding
    at most ``SIZE`` images.

    Args:
        IMGS:     list of image file names (relative to ``in_path``).
        SIZE:     maximum number of images per TFRecord file.
        csv_file: DataFrame with the columns image_name, patient_id, sex,
                  age_approx, anatom_site_general, diagnosis and target.
        in_path:  prefix prepended to each file name to read the image.
        out_path: prefix of the TFRecord files to write.

    Raises:
        FileNotFoundError: if OpenCV cannot read an image.
        IndexError: if an image has no row in ``csv_file``.
    """
    # Create the output directory if it does not exist yet, so the writer
    # does not fail on a fresh checkout.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # How many TFRecords will be written (the last one may be only partly filled)
    num_records = len(IMGS)//SIZE + int(len(IMGS)%SIZE!=0)

    # Write the TFRecords
    for j in range(num_records):
        print(); print('Writing TFRecord %i of %i...'%(j,num_records))

        # Number of images in this record: SIZE, or whatever is left for the last one.
        record_len = min(SIZE,len(IMGS)-j*SIZE)
        # Name of the record to write
        with tf.io.TFRecordWriter(out_path + '%.2i-%i.tfrec'%(j,record_len)) as writer:
            for k in range(record_len):
                file_name = IMGS[SIZE*j+k]
                # Open the current image; cv2.imread returns None instead of raising.
                img = cv2.imread(in_path+file_name)
                if img is None:
                    raise FileNotFoundError('Could not read image: ' + in_path + file_name)
                # NOTE(review): cv2.imread already returns BGR and cv2.imencode expects
                # BGR, so this swap looks like it stores red/blue-swapped JPEGs. Left
                # unchanged because existing records were written this way - confirm
                # whether it is intentional.
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                # Resize all images
                img = cv2.resize(img, (512, 512))
                # Encode the image (tobytes replaces the deprecated ndarray.tostring alias)
                img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tobytes()
                name = file_name.split('.')[0]
                row = csv_file.loc[csv_file.image_name==name]
                if row.empty:
                    raise IndexError('No metadata row found for image: ' + name)
                # Create flexible feature mapping message
                example = serialize_example(
                    img, str.encode(name),
                    row.patient_id.values[0],
                    row.sex.values[0],
                    row.age_approx.values[0],
                    row.anatom_site_general.values[0],
                    row.diagnosis.values[0],
                    row.target.values[0])
                writer.write(example)
                # Progress indicator; it must sit inside the image loop to fire every 100 images.
                if k%100==0: print(k,', ',end='')

In [None]:
# @TODO: before running, point TRAIN_PATH at your local folder of training images
SIZE_1 = 200 # How many images will be stored in one TFRecord
SIZE_2 = 1000
TRAIN_PATH = "C:\\Users\\clara\\DataSets\\ISIC_2020_Training_JPEG\\train\\"
OUT_PATH_1 = "tfrecords/train/200/train"
OUT_PATH_2 = "tfrecords/train/1000/train"

# Get a sorted list of image file names
IMGS = sorted(os.listdir(TRAIN_PATH))

# Keep only the images listed in cleaned_isic.csv (unknown diagnoses were removed there).
# A set gives O(1) membership tests; scanning the id list for every file was quadratic.
_train_id_set = set(image_id_list)
IMGS = [i for i in IMGS if i[0:-4] in _train_id_set]

# Write the training set twice: once with 200 and once with 1000 images per record
write_tfrecords(IMGS, SIZE_1, y_train, TRAIN_PATH, OUT_PATH_1)
write_tfrecords(IMGS, SIZE_2, y_train, TRAIN_PATH, OUT_PATH_2)

In [39]:
# @TODO: before running, point TRAIN_PATH at your local folder of images.
# Both test splits are read from the same folder as the training images.
SIZE_TEST = 50 # How many images will be stored in one TFRecord
TRAIN_PATH = "C:\\Users\\clara\\DataSets\\ISIC_2020_Training_JPEG\\train\\"
OUT_PATH_AA = "tfrecords/test/AA/50/test"
OUT_PATH_A = "tfrecords/test/A/50/test"

IMGS_TEST = sorted(os.listdir(TRAIN_PATH))
# Keep only the images that belong to the respective test split.
# Sets give O(1) membership tests instead of scanning the id lists for every file.
_test_AA_id_set = set(test_AA_ids)
_test_A_id_set = set(test_A_ids)
IMGS_AA = [i for i in IMGS_TEST if i[0:-4] in _test_AA_id_set]
IMGS_A = [i for i in IMGS_TEST if i[0:-4] in _test_A_id_set]

write_tfrecords(IMGS_AA, SIZE_TEST, test_meta_AA, TRAIN_PATH, OUT_PATH_AA)
write_tfrecords(IMGS_A, SIZE_TEST, test_meta_A, TRAIN_PATH, OUT_PATH_A)


Writing TFRecord 0 of 17...

Writing TFRecord 1 of 17...

Writing TFRecord 2 of 17...

Writing TFRecord 3 of 17...

Writing TFRecord 4 of 17...

Writing TFRecord 5 of 17...

Writing TFRecord 6 of 17...

Writing TFRecord 7 of 17...

Writing TFRecord 8 of 17...

Writing TFRecord 9 of 17...

Writing TFRecord 10 of 17...

Writing TFRecord 11 of 17...

Writing TFRecord 12 of 17...

Writing TFRecord 13 of 17...

Writing TFRecord 14 of 17...

Writing TFRecord 15 of 17...

Writing TFRecord 16 of 17...

Writing TFRecord 0 of 39...

Writing TFRecord 1 of 39...

Writing TFRecord 2 of 39...

Writing TFRecord 3 of 39...

Writing TFRecord 4 of 39...

Writing TFRecord 5 of 39...

Writing TFRecord 6 of 39...

Writing TFRecord 7 of 39...

Writing TFRecord 8 of 39...

Writing TFRecord 9 of 39...

Writing TFRecord 10 of 39...

Writing TFRecord 11 of 39...

Writing TFRecord 12 of 39...

Writing TFRecord 13 of 39...

Writing TFRecord 14 of 39...

Writing TFRecord 15 of 39...

Writing TFRecord 16 of 39...
