In [None]:
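# Create TFRecords
* Useful for storing a sequence of binary records
* Each record written here bundles one JPEG-encoded image with its metadata (image name, patient id, sex, age, anatomic site, diagnosis, target)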

In [70]:
import os
from os.path import join, dirname
import tensorflow as tf
import time
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Dense
from tensorflow.keras.layers import AvgPool2D, GlobalAveragePooling2D, MaxPool2D
from tensorflow.keras.models import Model
from tensorflow.keras.layers import ReLU, concatenate
import tensorflow.keras.backend as K
import numpy as np                                    
import pandas as pd 
import os
import random
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator,img_to_array
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
warnings.filterwarnings("ignore")

## Load Training Metadata

In [71]:
# Load the training dataset.
# The path is built with os.path.join instead of concatenating "\cleaned_isic.csv":
# the backslash form only works on Windows and relies on "\c" not being a
# recognised escape sequence.
path = os.path.join(os.getcwd(), "cleaned_isic.csv")
y_train = pd.read_csv(path)

# Rename the anatomic-site column to the shorter name used in the rest of the notebook.
# Re-assignment (rather than inplace=True) keeps the cell safe to re-run.
y_train = y_train.rename(columns={'anatom_site_general_challenge': 'anatom_site_general'})

# Sort values by image name
y_train = y_train.sort_values(by=['image_name'])

# Display the dataset.
y_train.head()

Unnamed: 0.1,Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general,diagnosis,benign_malignant,target
0,2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
1,12,ISIC_0076995,IP_2235340,female,55.0,torso,nevus,benign,0
2,26,ISIC_0084086,IP_4023055,male,60.0,lower extremity,nevus,benign,0
3,27,ISIC_0084270,IP_2961528,male,40.0,lower extremity,nevus,benign,0
4,28,ISIC_0084395,IP_0175539,female,45.0,torso,nevus,benign,0


In [72]:
# Collect the ids of all images listed in the training metadata
image_id_list = list(y_train["image_name"])
n_training_images = len(image_id_list)
print("{} training images.".format(n_training_images))

6002 training images.


## Load testing metadata

In [73]:
# Load the test dataset.
# os.path.join replaces the Windows-only "\ISIC_2020_Test_Metadata.csv" concatenation,
# which also relied on "\I" not being a recognised escape sequence.
path_test = os.path.join(os.getcwd(), "ISIC_2020_Test_Metadata.csv")
test_meta = pd.read_csv(path_test)

# Rename the columns so they match the names used by the training metadata
test_meta = test_meta.rename(columns={'image': 'image_name', 'patient': 'patient_id'})

# Sort values by image name
test_meta = test_meta.sort_values(by=['image_name'])

# Display the dataset.
test_meta.head()

Unnamed: 0,image_name,patient_id,age_approx,anatom_site_general,sex
0,ISIC_0052060,IP_3579794,70,,male
1,ISIC_0052349,IP_7782715,40,lower extremity,male
2,ISIC_0058510,IP_7960270,55,torso,female
3,ISIC_0073313,IP_6375035,50,torso,female
4,ISIC_0073502,IP_0589375,45,lower extremity,female


## Encode categorical values:

In [74]:
# Categorical columns
cats = ['patient_id','sex','anatom_site_general', "diagnosis", "benign_malignant"]

# Convert categorical values to numerical values
for c in cats:
    # factorize() numbers the categories in order of first appearance; `mp` holds
    # the category for each code (missing values get code -1).
    y_train[c], mp = y_train[c].factorize()

    # The test dataset does not contain the columns "diagnosis" and "benign_malignant",
    # so only encode the columns it actually has.
    if c in test_meta.columns:
        if c == 'patient_id':
            # Patient ids are only identifiers; they are numbered independently per dataset.
            test_meta[c], mp2 = test_meta[c].factorize()
        else:
            # Reuse the TRAIN mapping so a code means the same category in both datasets.
            # Factorizing the test set on its own would assign codes by order of first
            # appearance there (e.g. 'male' -> 0 in test while 'female' -> 0 in train).
            # Missing values and categories unseen in train are encoded as -1.
            test_meta[c] = pd.Categorical(test_meta[c], categories=mp).codes.astype('int64')
    print(mp)

# Fill the null age values with the mean age of the respective dataset
print('Imputing Age NaN count =',y_train.age_approx.isnull().sum())

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the in-place form is chained assignment and is not guaranteed to
# update the parent frame (it is a no-op under pandas copy-on-write).
y_train["age_approx"] = y_train["age_approx"].fillna(y_train["age_approx"].astype(float).mean())
test_meta["age_approx"] = test_meta["age_approx"].fillna(test_meta["age_approx"].astype(float).mean())

# Ages are stored as int64 features in the TFRecords, so truncate to integers
y_train['age_approx'] = y_train['age_approx'].astype('int')
test_meta['age_approx'] = test_meta['age_approx'].astype('int')

Index(['IP_2842074', 'IP_2235340', 'IP_4023055', 'IP_2961528', 'IP_0175539',
       'IP_2825529', 'IP_3933152', 'IP_3076695', 'IP_4042098', 'IP_1273286',
       ...
       'IP_3222187', 'IP_0733805', 'IP_6814737', 'IP_9327025', 'IP_2772363',
       'IP_9362467', 'IP_2172761', 'IP_3298186', 'IP_6776978', 'IP_7507212'],
      dtype='object', length=1224)
Index(['female', 'male'], dtype='object')
Index(['lower extremity', 'torso', 'upper extremity', 'head/neck',
       'palms/soles', 'oral/genital'],
      dtype='object')
Index(['nevus', 'melanoma', 'seborrheic keratosis', 'lentigo NOS',
       'lichenoid keratosis', 'solar lentigo', 'cafe-au-lait macule',
       'atypical melanocytic proliferation'],
      dtype='object')
Index(['benign', 'malignant'], dtype='object')
Imputing Age NaN count = 3


## Write TFRecords (Train)

In [75]:
# Functions to convert a value to a type compatible with tf.train.Example
def _bytes_feature(value):
  """Returns a tf.train.Feature holding a one-element bytes_list.

  Args:
    value: a ``bytes`` object, a ``str`` (encoded as UTF-8) or an eager
      string tensor (unwrapped with ``.numpy()``).

  Returns:
    A ``tf.train.Feature`` whose ``bytes_list`` contains ``value``.
  """
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  if isinstance(value, str):
    value = value.encode('utf-8') # BytesList only accepts bytes, not str.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Wraps a single float / double in a tf.train.Feature backed by a float_list."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wraps a single bool / enum / int / uint in a tf.train.Feature backed by an int64_list."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

In [76]:
# Generate flexible message type with a feature mapping
def serialize_example(feature0, feature1, feature2, feature3, feature4, feature5, feature6, feature7):
  """Builds a tf.train.Example from one image and its metadata and serializes it.

  INPUT:  feature0:   the encoded image (bytes)
          feature1:   the image name (bytes)
          feature2:   the patient id (int)
          feature3:   sex of the patient (int)
          feature4:   age of the patient (int)
          feature5:   anatomic site of area to inspect (int)
          feature6:   the diagnosis (int)
          feature7:   the target \\in [0,1] (int)
  OUTPUT: the serialized tf.train.Example (bytes) with the feature mapping below
  """
  # Keys are the feature names a reader has to use when parsing the record.
  feature = {
      'image': _bytes_feature(feature0),
      'image_name': _bytes_feature(feature1),
      'patient_id': _int64_feature(feature2),
      'sex': _int64_feature(feature3),
      'age_approx': _int64_feature(feature4),
      'anatom_site_general': _int64_feature(feature5),
      'diagnosis': _int64_feature(feature6),
      'target': _int64_feature(feature7)
  }
  example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
  return example_proto.SerializeToString()

In [77]:
# @TODO to run update your train and test path to your train and test images
SIZE = 2071 # How many images will be stored in one TFRecord
TRAIN_PATH = "C:\\Users\\clara\\DataSets\\ISIC_2020_Training_JPEG\\train\\"
TEST_PATH = "C:\\Users\\clara\\DataSets\\ISIC_2020_Test_JPEG\\ISIC_2020_Test_Input\\"

# Get a sorted list of the test image file names
IMGS_TEST = sorted(os.listdir(TEST_PATH))

# Get a sorted list of the training image file names, keeping only images that
# appear in the metadata (removes unknown diagnoses, which are not contained in
# the cleaned_isic.csv file).
# A set makes each membership test O(1) instead of scanning the whole id list,
# and os.path.splitext strips the extension whatever its length.
_known_image_ids = set(image_id_list)
IMGS = sorted(
    file_name for file_name in os.listdir(TRAIN_PATH)
    if os.path.splitext(file_name)[0] in _known_image_ids
)

6002 6002


In [78]:
# How many TFRecords will be written (len(IMGS) / SIZE, rounded up)
CT = len(IMGS)//SIZE + int(len(IMGS)%SIZE!=0)

# Size and JPEG quality of the images stored in the records
IMG_SIZE = (512, 512)
JPEG_QUALITY = 94

# Index the metadata by image name once, so each image needs a single lookup
# instead of a scan over the whole frame. If an image name occurs more than once
# the first row is kept.
meta_by_name = y_train.drop_duplicates(subset='image_name').set_index('image_name')

# Write the TFRecords
for j in range(CT):
    print(); print('Writing TFRecord %i of %i...'%(j,CT))

    # Number of images in this record: SIZE, or fewer for the last record if it
    # is not completely filled.
    CT2 = min(SIZE,len(IMGS)-j*SIZE)
    # Name of the record to write: train<record index>-<number of images>.tfrec
    with tf.io.TFRecordWriter('train%.2i-%i.tfrec'%(j,CT2)) as writer:
        for k in range(CT2):
            file_name = IMGS[SIZE*j+k]
            img_path = os.path.join(TRAIN_PATH, file_name)

            # Open the current image; cv2.imread returns None instead of raising
            # when the file is missing or cannot be decoded.
            img = cv2.imread(img_path)
            if img is None:
                raise IOError('Could not read image: %s' % img_path)
            # NOTE(review): cv2.imread yields BGR and cv2.imencode expects BGR, so
            # this swap stores the JPEG with red and blue exchanged. Kept to stay
            # compatible with records already written - confirm whether intended.
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            # Resize all images
            img = cv2.resize(img, IMG_SIZE)
            # Encode the image as JPEG bytes
            encoded_ok, encoded = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, JPEG_QUALITY))
            if not encoded_ok:
                raise IOError('Could not JPEG-encode image: %s' % img_path)
            # tobytes() replaces ndarray.tostring(), which is deprecated and
            # removed in NumPy 2.
            img = encoded.tobytes()

            name = file_name.split('.')[0]
            row = meta_by_name.loc[name]
            # Create flexible feature mapping message; int() turns the values
            # into plain Python ints for the int64 features.
            example = serialize_example(
                img, str.encode(name),
                int(row['patient_id']),
                int(row['sex']),
                int(row['age_approx']),
                int(row['anatom_site_general']),
                int(row['diagnosis']),
                int(row['target']))
            writer.write(example)
            if k%100==0: print(k,', ',end='')


Writing TFRecord 0 of 3...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 
Writing TFRecord 1 of 3...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 
Writing TFRecord 2 of 3...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 

## Test TFRecords

In [None]:
# Currently left empty (could be used to generate the test TFRecords)