# Introduction
**This notebook is based on  [Jesse Mostipak’s Tutorial](https://www.kaggle.com/jessemostipak/getting-started-tpus-cassava-leaf-disease)**  
In this notebook we check the importance of colours in the classification process.  
We study a baseline model.

# Set up environment

In [None]:
import math, re, os, json
import seaborn as sn
import cv2
import itertools
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from kaggle_datasets import KaggleDatasets
from tensorflow import keras
import tensorflow as tf, tensorflow.keras.backend as K
from tensorflow.keras.utils import plot_model,to_categorical
from tensorflow.keras.models import load_model
from functools import partial
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import confusion_matrix
print("Tensorflow version " + tf.__version__)

# TPU


In [None]:
# Connect to a TPU when one is reachable; otherwise fall back to the default
# distribution strategy so the notebook still runs.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as err:
    # `Exception` rather than a bare `except:` so KeyboardInterrupt and
    # SystemExit are no longer swallowed. The reason is printed so a failed
    # TPU initialisation is not silently replaced by a slower run.
    print('TPU not available, using the default strategy:', err)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

# Set up variables
We'll set up some of our variables for our notebook here. 

In [None]:
# Configuration shared by the cells below.
BASE_DIR = '../input/cassava-leaf-disease-classification/'
GCS_PATH = KaggleDatasets().get_gcs_path()
AUTOTUNE = tf.data.experimental.AUTOTUNE

IMAGE_SIZE = [512, 512]
CLASSES = [str(class_id) for class_id in range(5)]
BATCH_SIZE = strategy.num_replicas_in_sync * 16  # 16 images per replica
EPOCHS = 50
PROBA_CONTRAST = 1.0  # default probability argument of func_standard

### F1 score.

In [None]:
def recall_m(y_true, y_pred):
    """Return sum(y_true * y_pred) / sum(y_true), both summed over axis 0.

    `K.epsilon()` is added to the denominator, so a column of `y_true` that
    sums to zero yields 0 instead of a division by zero.
    """
    hits = K.sum(y_true * y_pred, axis=0)
    actual = K.sum(y_true, axis=0)
    return hits / (actual + K.epsilon())

def precision_m(y_true, y_pred):
    """Return sum(y_true * y_pred) / sum(y_pred), both summed over axis 0.

    `K.epsilon()` is added to the denominator, so a column of `y_pred` that
    sums to zero yields 0 instead of a division by zero.
    """
    hits = K.sum(y_true * y_pred, axis=0)
    predicted = K.sum(y_pred, axis=0)
    return hits / (predicted + K.epsilon())

def f1_m(y_true, y_pred):
    """Return the mean over classes of the per-class F1 score.

    `y_pred` is first hardened: its argmax along the last axis is one-hot
    encoded over `len(CLASSES)` classes. Precision and recall are then taken
    per class and combined as 2 * mean(p * r / (p + r + epsilon)).
    """
    hard_pred = tf.one_hot(tf.argmax(y_pred, axis=-1), len(CLASSES))
    prec = precision_m(y_true, hard_pred)
    rec = recall_m(y_true, hard_pred)
    per_class = (prec * rec) / (prec + rec + K.epsilon())
    return 2 * K.mean(per_class)

### Confusion matrix.

In [None]:
def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues):
    """
    Print the confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D numpy array of counts; rows are true labels, columns predictions.
    classes : sequence of tick labels, one per row/column.
    normalize : if True, each row is divided by its sum before plotting.
    title : title of the plot.
    cmap : matplotlib colormap passed to `plt.imshow`.

    Returns nothing; the matrix is printed and the plot is left on the
    current figure.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true sample has an all-zero row. Dividing it by its
        # sum would fill the row with NaN and make the colour threshold below
        # NaN as well, so such rows are left at 0 instead.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    print(cm)

    # Cells above half of the maximum value get white text, the others black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Values are shown rounded to three decimals.
        plt.text(j, i, np.round(1000*cm[i, j])/1000,
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Load the data

## Decode the data

In [None]:
def decode_image(image):
    """Decode JPEG bytes into a float32 tensor reshaped to [*IMAGE_SIZE, 3]."""
    pixels = tf.image.decode_jpeg(image, channels=3)
    pixels = tf.cast(pixels, tf.float32)
    return tf.reshape(pixels, [*IMAGE_SIZE, 3])

In [None]:
def read_tfrecord(example, labeled):
    """Parse one serialized TFRecord example.

    Returns `(image, label)` with an int32 label read from the "target"
    feature when `labeled` is true, otherwise `(image, image_name)`.
    """
    features = {"image": tf.io.FixedLenFeature([], tf.string)}
    if labeled:
        features["target"] = tf.io.FixedLenFeature([], tf.int64)
    else:
        features["image_name"] = tf.io.FixedLenFeature([], tf.string)

    parsed = tf.io.parse_single_example(example, features)
    image = decode_image(parsed['image'])
    if not labeled:
        return image, parsed['image_name']
    return image, tf.cast(parsed['target'], tf.int32)

In [None]:
def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed examples from the TFRecord files `filenames`.

    Files are read in parallel. Unless `ordered` is true, deterministic
    ordering is switched off so records are used as soon as they stream in.
    Each record is parsed with `read_tfrecord(..., labeled=labeled)`.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False  # trade order for speed
    parse = partial(read_tfrecord, labeled=labeled)
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
    return records.with_options(options).map(parse, num_parallel_calls=AUTOTUNE)

In [None]:
# Test shards are listed as-is; 35% of the training shards are held out for
# validation, with a fixed seed so the split is reproducible.
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/test_tfrecords/ld_test*.tfrec')

TRAINING_FILENAMES, VALID_FILENAMES = train_test_split(
    tf.io.gfile.glob(GCS_PATH + '/train_tfrecords/ld_train*.tfrec'),
    random_state=5,
    test_size=0.35,
)

## Adding in augmentations 

In [None]:
# One hot label and float images.
def data_treat(image,label):
    """Cast `image` to float32 and one-hot encode `label` over len(CLASSES)."""
    return tf.cast(image, tf.float32), tf.one_hot(label, len(CLASSES))

In [None]:
# Values going from 0 to 255.
def data_treat_test(image,label):
    """Min-max rescale `image` to the range [0, 255]; `label` is passed through.

    The image is cast to float32, shifted so its minimum is 0, divided by its
    new maximum and multiplied by 255.

    NOTE(review): unlike `func_standard`, the result is not rounded, so test
    images are not preprocessed exactly like training images. Confirm that
    this is intended.
    """
    image = tf.cast(image, tf.float32)
    image = image - tf.math.reduce_min(image)
    # After the shift a constant-colour image has a maximum of 0.
    # divide_no_nan returns 0 there instead of NaN.
    image = tf.math.divide_no_nan(image, tf.math.reduce_max(image))
    return image * 255, label

In [None]:
def data_augment(image, label):
    """Randomly flip `image` left-right, then up-down; `label` is unchanged."""
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(image))
    return flipped, label

In [None]:
def func_standard(p=PROBA_CONTRAST):
    """Return a map function that min-max rescales an image with probability `p`.

    The returned `data_standard(image, label)` draws a uniform number in
    [0, 1). When it is below `p`, the image is shifted to a minimum of 0,
    divided by its maximum, scaled to [0, 255] and rounded. Otherwise the
    image is returned untouched. `label` is always passed through.
    Expects a floating-point image (the pipelines apply `data_treat` first).
    """
    def data_standard(image, label):
        if tf.random.uniform(shape=(), minval=0, maxval=1)<p:
            image = image-tf.math.reduce_min(image)
            # A constant-colour image has a maximum of 0 after the shift.
            # divide_no_nan yields 0 there instead of NaN.
            image = tf.math.divide_no_nan(image, tf.math.reduce_max(image))
            image = tf.math.round(image*255)
        return image, label
    return data_standard

## Define data loading methods
The following functions will be used to load our `training`, `validation`, and `test` datasets, as well as print out the number of images in each dataset.

In [None]:
# The item count is read from a "-<count>." fragment of the file name.
# Compiled once here instead of once per file.
_ITEM_COUNT_RE = re.compile(r"-([0-9]+)\.")


def count_data_items(filenames):
    """Return the total number of items announced by the file names.

    Each name must contain "-<digits>." (e.g. "ld_train00-1338.tfrec"); the
    first such fragment gives the number of items in that file.

    Raises:
        ValueError: if a file name carries no item count.
    """
    counts = []
    for filename in filenames:
        match = _ITEM_COUNT_RE.search(filename)
        if match is None:
            raise ValueError(
                f"no '-<count>.' item count in file name: {filename!r}")
        counts.append(int(match.group(1)))
    # An explicit integer dtype keeps the total an integer for an empty list.
    return np.sum(counts, dtype=np.int64)

In [None]:
# Item counts of the three splits, read from the shard file names.
NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES = [
    count_data_items(names)
    for names in (TRAINING_FILENAMES, VALID_FILENAMES, TEST_FILENAMES)
]

summary = 'Dataset: {} training images, {} validation images, {} (unlabeled) test images'
print(summary.format(NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES))

# Brief exploratory data analysis (EDA)
First we'll print the mapping from label numbers to disease names, count the training images, and plot how the training images are distributed across the classes:

In [None]:
# Load the label -> disease-name mapping; JSON keys are strings, so they are
# converted to int.
with open(os.path.join(BASE_DIR, "label_num_to_disease_map.json")) as file:
    map_classes = {int(k): v for k, v in json.load(file).items()}

print(json.dumps(map_classes, indent=4))

In [None]:
# List the entries of the train_images folder and report how many there are.
input_files = os.listdir(os.path.join(BASE_DIR, "train_images"))
print(f"Number of train images: {len(input_files)}")

In [None]:
# Read the training labels, attach the readable class name and plot how many
# images each class has.
df_train = pd.read_csv(os.path.join(BASE_DIR, "train.csv")).assign(
    class_name=lambda frame: frame["label"].map(map_classes)
)
plt.figure(figsize=(8, 4))
sn.countplot(data=df_train, y="class_name");

In [None]:
# This function returns the labels weights, compounded by a coefficient n.
def c_weights(labels,n=3/4,num_classes=5):
    """Return class weights proportional to (class count) ** -n.

    Args:
        labels: iterable of integer class labels.
        n: exponent of the power law. n=1 gives inverse-frequency weights and
            n=0 gives every class a weight of 1.
        num_classes: weights are returned for classes 0 .. num_classes - 1.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        dict mapping each class index to its weight. Weights are scaled so
        that the weights of all labels observed in `labels` sum to the number
        of observed labels.

    Raises:
        ValueError: if `labels` is empty or one of the requested classes has
            no sample (its weight would be a division by zero).
    """
    c_labels = Counter(labels)
    if not c_labels:
        raise ValueError("labels is empty: cannot compute class weights")
    missing = [i for i in range(num_classes) if i not in c_labels]
    if missing:
        raise ValueError(
            f"no sample for class(es) {missing}: their weight is undefined")
    # Normalising constant, computed over every label observed in `labels`.
    A = len(c_labels) / np.sum([x ** -n for x in c_labels.values()])
    return {i: A * c_labels[i] ** -n for i in range(num_classes)}

In [None]:
# Class weights computed from the training labels with exponent n = 3/4.
cw = c_weights(df_train["label"],n=3/4)

The following code chunk sets up a series of functions that will print out a grid of images. The grid of images will contain images and their corresponding labels.

# Building the model
## Learning rate schedule
We learned about learning rates in the **[Intro to Deep Learning: Stochastic Gradient Descent](https://www.kaggle.com/ryanholbrook/stochastic-gradient-descent)** lesson, and here I've created a learning rate schedule mostly using the defaults in the **[Keras Exponential Decay Learning Rate Scheduler](https://keras.io/api/optimizers/learning_rate_schedules/exponential_decay/)** documentation (I did change the `initial_learning_rate`). You can adjust the learning rate scheduler below, and read more about the other types of schedulers available to you in the **[Keras learning rate schedules API](https://keras.io/api/optimizers/learning_rate_schedules/)**.

In [None]:
# Exponential decay starting at 1e-4: the rate is multiplied by 0.9 for every
# 1000 optimizer steps.
lr_scheduler = keras.optimizers.schedules.ExponentialDecay(
    decay_rate=0.9,
    decay_steps=1000,
    initial_learning_rate=1e-4,
)

## Simple color model.
We want to check the importance of colors to the illness detection. We create a color only model.

In [None]:
def create_histogram(x,l):
    """Replace an image by its concatenated per-channel histograms.

    `x` is split into 3 parts along its last axis. Each part is binned into
    32 fixed-width bins over [0, 255], and the three histograms (raw counts,
    not normalised) are concatenated. `l` is returned unchanged.
    """
    channels = tf.split(x, 3, axis=-1)
    per_channel = [
        tf.histogram_fixed_width(channel, [0., 255.], nbins=32)
        for channel in channels
    ]
    return tf.concat(per_channel, -1), l

In [None]:
def get_training_histogram(ordered=False):
    """Return the batched, infinitely repeating training dataset of histograms.

    Pipeline: parse the records, one-hot encode the labels (`data_treat`),
    min-max rescale every image (`func_standard(p=1)`), turn images into
    colour histograms, repeat, shuffle, batch and prefetch.

    Args:
        ordered: when true, records are read deterministically and the
            shuffle step is skipped, so the examples really come out in file
            order. It used to be applied unconditionally, which silently
            undid `ordered=True`.
    """
    dataset = load_dataset(TRAINING_FILENAMES, labeled=True, ordered=ordered)
    dataset = dataset.map(data_treat, num_parallel_calls=AUTOTUNE)
    dataset = dataset.map(func_standard(p=1),num_parallel_calls=AUTOTUNE)
    dataset = dataset.map(create_histogram, num_parallel_calls=AUTOTUNE)
    dataset = dataset.repeat()
    if not ordered:
        dataset = dataset.shuffle(2048)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

In [None]:
def get_test_histogram(ordered=False):
    """Return the batched test dataset of (histogram, image_name) pairs.

    Pipeline: parse the unlabeled records, min-max rescale the images
    (`data_treat_test`), turn them into colour histograms, batch and prefetch.

    The dataset is finite: it is no longer repeated, so `model.predict` can
    consume it without a `steps` argument and stops after one pass.

    Args:
        ordered: when true, records are read deterministically in file order.
    """
    dataset = load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
    dataset = dataset.map(data_treat_test, num_parallel_calls=AUTOTUNE)
    dataset = dataset.map(create_histogram, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

In [None]:
def get_validation_histogram(ordered=False):
    """Return the batched, cached validation dataset of histograms.

    Same preprocessing as the training pipeline (one-hot labels, min-max
    rescaling, colour histograms) but without repeat or shuffle; batches are
    cached after their first computation.
    """
    return (
        load_dataset(VALID_FILENAMES, labeled=True, ordered=ordered)
        .map(data_treat, num_parallel_calls=AUTOTUNE)
        .map(func_standard(p=1), num_parallel_calls=AUTOTUNE)
        .map(create_histogram, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTOTUNE)
    )

Dropout rate used by the `Dropout` layers of the model.

In [None]:
# Dropout rate read by the Dropout layers built in create_model().
p = 0.1

Model based on the colours distribution.

In [None]:
def create_model():
    """Build the dense classifier fed with the 3 x 32-bin colour histograms.

    Architecture: Dense(2048, relu) -> Dropout(p) -> Dense(128, relu) ->
    Dropout(p) -> Dense(len(CLASSES), softmax). The dropout rate is the
    module-level `p`. The returned model is not compiled.
    """
    layers = tf.keras.layers
    inputs = layers.Input(shape=(32*3,))
    hidden = layers.Dense(2048, activation='relu')(inputs)
    hidden = layers.Dropout(p)(hidden)
    hidden = layers.Dense(128, activation='relu')(hidden)
    hidden = layers.Dropout(p)(hidden)
    outputs = layers.Dense(len(CLASSES), activation='softmax')(hidden)
    return tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Build the model under the distribution strategy, resume from the saved
# weights when a checkpoint file exists, then compile it.
with strategy.scope():
    model = create_model()
    if not os.path.exists('pierre_color.h5'):
        print('creating')
    else:
        print('loading')
        modelp = tf.keras.models.load_model('pierre_color.h5', compile=False)
        model.set_weights(modelp.get_weights())
    model.compile(
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy', f1_m],
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr_scheduler),
    )

In [None]:
# Print the layer-by-layer description of the model.
model.summary()

In [None]:
# Build the histogram pipelines used for fitting (default: unordered reads).
valid_dataset = get_validation_histogram()
train_dataset = get_training_histogram()

We first train without class weights.

In [None]:
# Integer division: only whole batches are counted in an epoch.
VALID_STEPS = NUM_VALIDATION_IMAGES // BATCH_SIZE
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE

# First run: no class_weight is passed.
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=valid_dataset,
    validation_steps=VALID_STEPS,
)

In [None]:
# Save the unweighted model; the model-building cell reloads this file when it exists.
model.save('pierre_color.h5')

## Checking the results.

In [None]:
def to_float32(image, label):
    """Cast `image` to float32 and pass `label` through unchanged."""
    image = tf.cast(image, tf.float32)
    return image, label

In [None]:
# Rebuild both pipelines with deterministic reads for evaluation.
valid_dataset = get_validation_histogram(ordered=True)
train_dataset = get_training_histogram(ordered=True)

In [None]:
# Collect the true label of every validation example, read in file order so
# they line up with the predictions made on the ordered validation pipeline.
dataset = load_dataset(VALID_FILENAMES, labeled=True, ordered=True)
# take(-1) iterates over the whole dataset, not just its first element.
valid_labels = [labels.numpy() for _, labels in dataset.take(-1)]

In [None]:
# Predict on one epoch's worth of training batches and keep, for each
# example, the index of the most probable class.
train_ds = train_dataset.map(to_float32)
fit_train_label = np.argmax(
    model.predict(train_ds, steps=STEPS_PER_EPOCH), axis=1)

In [None]:
# Predict on the whole validation set and keep the most probable class index.
valid_ds = valid_dataset.map(to_float32)
fit_valid_label = np.argmax(model.predict(valid_ds), axis=1)

In [None]:
# Compare true and predicted validation labels; the true labels are cut to
# the number of predictions so both sequences have the same length.
cm = confusion_matrix(valid_labels[:len(fit_valid_label)],fit_valid_label)

In [None]:
# Row-normalised confusion matrix of the unweighted model.
plot_confusion_matrix(cm, list(range(5)), normalize=True)

This model simply returns the most common class (here 3). We will compare it with the same model trained with class weights.

In [None]:
# Fresh model for the class-weighted run; resumes from its own checkpoint
# file when one exists. Adam uses epsilon=0.001 here.
with strategy.scope():
    model = create_model()
    if not os.path.exists('pierre_color_W.h5'):
        print('creating')
    else:
        print('loading')
        modelp = tf.keras.models.load_model('pierre_color_W.h5', compile=False)
        model.set_weights(modelp.get_weights())
    model.compile(
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy', f1_m],
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=lr_scheduler, epsilon=0.001),
    )

In [None]:
# Back to unordered pipelines for training.
valid_dataset = get_validation_histogram(ordered=False)
train_dataset = get_training_histogram(ordered=False)

In [None]:
# First weighting: exponent n=1, i.e. weights proportional to 1 / class count,
# so that every class carries the same total weight.
cw = c_weights(df_train["label"],n=1)

In [None]:
# Integer division: only whole batches are counted in an epoch.
VALID_STEPS = NUM_VALIDATION_IMAGES // BATCH_SIZE
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE

# Same training loop as before, now with the class weights `cw`.
history = model.fit(
    train_dataset,
    class_weight=cw,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=valid_dataset,
    validation_steps=VALID_STEPS,
)

In [None]:
# Save the n=1 weighted model; the model-building cell reloads this file when it exists.
model.save('pierre_color_W.h5')

## Visualizing the new results.
Let's see if the distribution has changed.

In [None]:
# Rebuild both pipelines with deterministic reads for evaluation.
valid_dataset = get_validation_histogram(ordered=True)
train_dataset = get_training_histogram(ordered=True)

In [None]:
# Predict on one epoch's worth of training batches and keep, for each
# example, the index of the most probable class.
train_ds = train_dataset.map(to_float32)
fit_train_label = np.argmax(
    model.predict(train_ds, steps=STEPS_PER_EPOCH), axis=1)

In [None]:
# Predict on the whole validation set and keep the most probable class index.
valid_ds = valid_dataset.map(to_float32)
fit_valid_label = np.argmax(model.predict(valid_ds), axis=1)

In [None]:
# Compare true and predicted validation labels; the true labels are cut to
# the number of predictions so both sequences have the same length.
cm = confusion_matrix(valid_labels[:len(fit_valid_label)],fit_valid_label)

In [None]:
# Row-normalised confusion matrix of the n=1 weighted model.
plot_confusion_matrix(cm, list(range(5)), normalize=True)

Now, all entries go to the smaller class.  
Checking the literature, a power law of 3/4 is often preferred. We try it as our last model.

## Last model n=3/4. 

In [None]:
# Fresh model for the n=3/4 weighted run; resumes from its own checkpoint
# file when one exists. Adam uses epsilon=0.001 here.
with strategy.scope():
    model = create_model()
    if not os.path.exists('pierre_color_34.h5'):
        print('creating')
    else:
        print('loading')
        modelp = tf.keras.models.load_model('pierre_color_34.h5', compile=False)
        model.set_weights(modelp.get_weights())
    model.compile(
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy', f1_m],
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=lr_scheduler, epsilon=0.001),
    )

In [None]:
# Back to unordered pipelines for training.
valid_dataset = get_validation_histogram(ordered=False)
train_dataset = get_training_histogram(ordered=False)

In [None]:
# Second weighting: exponent n=3/4, i.e. weights proportional to
# (class count) ** -0.75.
cw = c_weights(df_train["label"],n=3/4)

In [None]:
# Integer division: only whole batches are counted in an epoch.
VALID_STEPS = NUM_VALIDATION_IMAGES // BATCH_SIZE
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE

# Same training loop, with the n=3/4 class weights `cw`.
history = model.fit(
    train_dataset,
    class_weight=cw,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=valid_dataset,
    validation_steps=VALID_STEPS,
)

In [None]:
# Save the n=3/4 weighted model; the model-building cell reloads this file when it exists.
model.save('pierre_color_34.h5')

## Visualizing the last results.
Let's see if the distribution has changed.

In [None]:
# Rebuild both pipelines with deterministic reads for evaluation.
valid_dataset = get_validation_histogram(ordered=True)
train_dataset = get_training_histogram(ordered=True)

In [None]:
# Predict on one epoch's worth of training batches and keep, for each
# example, the index of the most probable class.
train_ds = train_dataset.map(to_float32)
fit_train_label = np.argmax(
    model.predict(train_ds, steps=STEPS_PER_EPOCH), axis=1)

In [None]:
# Predict on the whole validation set and keep the most probable class index.
valid_ds = valid_dataset.map(to_float32)
fit_valid_label = np.argmax(model.predict(valid_ds), axis=1)

In [None]:
# Compare true and predicted validation labels; the true labels are cut to
# the number of predictions so both sequences have the same length.
cm = confusion_matrix(valid_labels[:len(fit_valid_label)],fit_valid_label)

In [None]:
# Row-normalised confusion matrix of the n=3/4 weighted model.
plot_confusion_matrix(cm, list(range(5)), normalize=True)

Once again, only one class is found. We can conclude that colour distribution is not a good parameter to determine the illness of cassava leaf. A model based on correlation seems needed.

# Creating a submission file
Now that we've trained a model and made predictions we're ready to submit to the competition! You can run the following code below to get your submission file.

In [None]:
#print('Generating submission.csv file...')
#test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
#test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U') # all in one batch
#np.savetxt('submission.csv', np.rec.fromarrays([test_ids, predictions]), fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')
#!head submission.csv

Be aware that because this is a code competition with a hidden test set, internet and TPUs cannot be enabled on your submission notebook. Therefore TPUs will only be available for training models. For a walk-through on how to train on TPUs and run inference/submit on GPUs, see our [TPU Docs](https://www.kaggle.com/docs/tpu#tpu6).