In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time
import matplotlib.pyplot as plt
import os
from PIL import Image

# List the competition input directory to see which files and folders it contains.
os.listdir("../input/cassava-leaf-disease-classification")

In [None]:
# code from https://www.kaggle.com/ryanholbrook/the-convolutional-classifier
# Reproducibility
def set_seed(seed=2020):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and TensorFlow's
    global generator, and sets the ``PYTHONHASHSEED`` and
    ``TF_DETERMINISTIC_OPS`` environment variables.

    Args:
        seed: Integer seed applied to every generator (default 2020).

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting
        it here only affects child processes, not the running kernel.
    """
    # Local import: keeps this cell self-contained without touching the
    # notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'


set_seed()

In [None]:
# Load the competition's train.csv into a DataFrame and preview the first rows.
train_df = pd.read_csv("../input/cassava-leaf-disease-classification/train.csv")
train_df.head()

In [None]:
# Count the rows per label value to inspect the class balance.
train_df["label"].value_counts()

In [None]:
# Cast labels to strings: later cells compare them against "0".."4" and pass the
# column as y_col to flow_from_dataframe with class_mode="sparse".
train_df["label"] = train_df["label"].astype(str)
train_df.dtypes

In [None]:
import json

# Read the label-number -> disease-name mapping first, then print one entry per line.
label_map_path = "../input/cassava-leaf-disease-classification/label_num_to_disease_map.json"
with open(label_map_path) as json_file:
    label_num_to_disease = json.load(json_file)

for label_num, disease_name in label_num_to_disease.items():
    print(f"{label_num}: {disease_name}")

# checking some sample images

## sample images of label 0: Cassava Bacterial Blight (CBB)
Some leaves look like they are turning yellow or brown (maybe some cells are dead).

In [None]:
# Directory holding the training images; joined with each image_id when loading samples.
train_images_dir = "../input/cassava-leaf-disease-classification/train_images"

In [None]:
# First (up to) 16 images labelled "0", opened from the training image directory.
label0_sample_image_filenames = train_df[train_df["label"] == "0"][:16]["image_id"].to_list()
label0_sample_images = [Image.open(os.path.join(train_images_dir, path)) for path in label0_sample_image_filenames]

plt.figure(figsize=(16, 16))
# Iterate over what was actually loaded so fewer than 16 samples cannot raise IndexError.
for i, (filename, image) in enumerate(zip(label0_sample_image_filenames, label0_sample_images)):
    # Select the subplot BEFORE setting the title: plt.title() acts on the current
    # axes, so calling it first attached every filename to the previous image.
    plt.subplot(4, 4, i + 1)
    plt.title(filename)
    plt.imshow(image)

plt.show()

## sample images of label 1: Cassava Brown Streak Disease (CBSD)
Some leaves are turning yellow, so it's hard for me to distinguish them from label 0 (CBB).  
I don't know why some images of cassava potatoes are labeled as CBSD.🤔

In [None]:
# First (up to) 16 images labelled "1", opened from the training image directory.
label1_sample_image_filenames = train_df[train_df["label"] == "1"][:16]["image_id"].to_list()
label1_sample_images = [Image.open(os.path.join(train_images_dir, path)) for path in label1_sample_image_filenames]

plt.figure(figsize=(16, 16))
# Iterate over what was actually loaded so fewer than 16 samples cannot raise IndexError.
for i, (filename, image) in enumerate(zip(label1_sample_image_filenames, label1_sample_images)):
    # Select the subplot BEFORE setting the title: plt.title() acts on the current
    # axes, so calling it first attached every filename to the previous image.
    plt.subplot(4, 4, i + 1)
    plt.title(filename)
    plt.imshow(image)

plt.show()

## sample images of label 2: Cassava Green Mottle (CGM)
Some leaves show white mottling.

In [None]:
# First (up to) 16 images labelled "2", opened from the training image directory.
label2_sample_image_filenames = train_df[train_df["label"] == "2"][:16]["image_id"].to_list()
label2_sample_images = [Image.open(os.path.join(train_images_dir, path)) for path in label2_sample_image_filenames]

plt.figure(figsize=(16, 16))
# Iterate over what was actually loaded so fewer than 16 samples cannot raise IndexError.
for i, (filename, image) in enumerate(zip(label2_sample_image_filenames, label2_sample_images)):
    # Select the subplot BEFORE setting the title: plt.title() acts on the current
    # axes, so calling it first attached every filename to the previous image.
    plt.subplot(4, 4, i + 1)
    plt.title(filename)
    plt.imshow(image)

plt.show()

## sample images of label 3: Cassava Mosaic Disease (CMD)
Some leaves have a weird shape.

In [None]:
# First (up to) 16 images labelled "3", opened from the training image directory.
label3_sample_image_filenames = train_df[train_df["label"] == "3"][:16]["image_id"].to_list()
label3_sample_images = [Image.open(os.path.join(train_images_dir, path)) for path in label3_sample_image_filenames]

plt.figure(figsize=(16, 16))
# Iterate over what was actually loaded so fewer than 16 samples cannot raise IndexError.
for i, (filename, image) in enumerate(zip(label3_sample_image_filenames, label3_sample_images)):
    # Select the subplot BEFORE setting the title: plt.title() acts on the current
    # axes, so calling it first attached every filename to the previous image.
    plt.subplot(4, 4, i + 1)
    plt.title(filename)
    plt.imshow(image)

plt.show()

## sample images of label 4: Healthy(...Really?)
Some leaves are nice and green, with no yellow part or white mottle.  
But the top left image(1003442061.jpg) seems far from healthy...

In [None]:
# First (up to) 16 images labelled "4", opened from the training image directory.
label4_sample_image_filenames = train_df[train_df["label"] == "4"][:16]["image_id"].to_list()
label4_sample_images = [Image.open(os.path.join(train_images_dir, path)) for path in label4_sample_image_filenames]

plt.figure(figsize=(16, 16))
# Iterate over what was actually loaded so fewer than 16 samples cannot raise IndexError.
for i, (filename, image) in enumerate(zip(label4_sample_image_filenames, label4_sample_images)):
    # Select the subplot BEFORE setting the title: plt.title() acts on the current
    # axes, so calling it first attached every filename to the previous image.
    plt.subplot(4, 4, i + 1)
    plt.title(filename)
    plt.imshow(image)

plt.show()

## Is it OK to resize the images? Can we still find the yellow part or white mottle?
The original images are 800 by 600 pixels, which is a little too big.  
So let's try resizing some images down to 300 by 300.

In [None]:
# Re-plot the label-0 samples after shrinking each one to 300x300 to check that
# the visual symptoms survive the resize.
plt.figure(figsize=(16, 16))
for i, (filename, image) in enumerate(zip(label0_sample_image_filenames, label0_sample_images)):
    # Select the subplot BEFORE setting the title: plt.title() acts on the current
    # axes, so calling it first attached every filename to the previous image.
    plt.subplot(4, 4, i + 1)
    plt.title(filename)
    plt.imshow(image.resize((300, 300)))

plt.show()

As you can see, we can still find the yellow or brown parts, so it seems fine to resize.

# making ImageDataGenerator

In [None]:
# Size the generators (and the inference loop) resize images to.
target_size = (299, 299)
# Shape of the model's Input layer: target_size plus 3 channels.
input_shape = (299, 299, 3)
# Number of images per generator batch.
batch_size = 64

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Generator configured to reserve 5% of the dataframe rows as a validation subset;
# the same object builds both the training and the validation iterators.
datagen = ImageDataGenerator(validation_split=0.05)

# Training subset: file names come from "image_id", labels from "label"; images are
# resized to target_size and batched. class_mode="sparse" pairs with the
# sparse_categorical_crossentropy loss used when the model is compiled.
train_generator = datagen.flow_from_dataframe(
    train_df,
    directory="../input/cassava-leaf-disease-classification/train_images",
    x_col="image_id",
    y_col="label",
    target_size=target_size,
    batch_size=batch_size,
    class_mode="sparse",
    subset="training",
)

In [None]:
# Validation subset: same dataframe, directory and settings as the training
# generator, but selecting the rows held out by validation_split.
val_generator = datagen.flow_from_dataframe(
    train_df,
    directory="../input/cassava-leaf-disease-classification/train_images",
    x_col="image_id",
    y_col="label",
    target_size=target_size,
    batch_size=batch_size,
    class_mode="sparse",
    subset="validation",
)

# making a model with InceptionResNetV2

In [None]:
from tensorflow.keras.applications import DenseNet169, ResNet50V2, InceptionResNetV2

# Build the InceptionResNetV2 backbone without its top (classification) layers,
# loading weights from a local .h5 file instead of downloading them.
inception_resnet_v2 = InceptionResNetV2(
    include_top=False,
    weights="../input/inceptionresnetv2/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5",
    input_shape=input_shape,
)

In [None]:
# Number of layers in the backbone.
len(inception_resnet_v2.layers)

In [None]:
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.layers.experimental.preprocessing import RandomRotation, RandomFlip, RandomZoom, CenterCrop, Rescaling
from tensorflow.keras.applications.inception_resnet_v2 import preprocess_input

def create_model():
    """Build and compile the 5-class classifier on top of the InceptionResNetV2 backbone.

    Pipeline: backbone-specific input preprocessing -> random flip/rotation
    augmentation -> batch normalisation -> ``inception_resnet_v2`` backbone ->
    max-pool + 1x1 convolution head -> dense layer with dropout -> 5-way softmax.

    Reads the module-level ``input_shape`` and ``inception_resnet_v2``.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 0.001),
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    inputs = Input(input_shape)

    # preprocess_input already performs the scaling the backbone expects, so no
    # further Rescaling layer is applied. The previous version additionally
    # multiplied by 1/255, scaling the pixels twice and squashing them to ~0.
    # Rescaling has no weights, so previously saved weight files still load.
    x = preprocess_input(inputs)

    # some layers for data augmentation
    x = RandomFlip()(x)
    x = RandomRotation(factor=0.3)(x)

    x = BatchNormalization()(x)

    x = inception_resnet_v2(x)

    # Classification head on top of the backbone's feature map.
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(256, (1, 1), activation=LeakyReLU())(x)
    x = BatchNormalization()(x)

    x = Flatten()(x)
    x = Dropout(0.75)(x)

    x = Dense(256, activation=LeakyReLU())(x)
    x = Dropout(0.75)(x)
    x = BatchNormalization()(x)

    # One softmax output per label ("0".."4").
    outputs = Dense(5, activation="softmax")(x)

    model = tf.keras.Model(inputs, outputs)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    return model

In [None]:
# Instantiate the compiled model and print its layer-by-layer summary.
model = create_model()
model.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Checkpoint: writes weights only (not the full model) to best_model_weights.h5,
# keeping just the best epoch as judged by val_loss. The file is reloaded after training.
cp = ModelCheckpoint(
    "best_model_weights.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
)

# Early stopping on val_loss with a patience of 10 epochs.
es = EarlyStopping(
    monitor="val_loss",
    patience=10,
)

# Learning-rate reduction on a val_loss plateau with a patience of 2 epochs
# (shorter than the early-stopping patience, so it triggers first).
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    patience=2,
)

In [None]:
from sklearn.utils import class_weight

# Per-class weights that counteract the label imbalance. Arguments are passed by
# keyword: recent scikit-learn versions reject the positional form used before.
class_weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_df["label"]),
    y=train_df["label"],
)

# Keras expects {class_index: weight}. np.unique returns the labels sorted, so
# position i holds the weight of the i-th sorted label.
# NOTE(review): assumes the generator's class indices follow the same sorted
# order; confirm against train_generator.class_indices.
class_weights = dict(enumerate(class_weights))

class_weights

# training my model

In [None]:
# Wall-clock timestamps around fit(); the next cell reports the elapsed time.
tic = time.time()

# Train for at most 100 epochs, validating on val_generator each epoch, with the
# checkpoint / early-stopping / LR-reduction callbacks and per-class loss weights.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=100,
    callbacks=[cp, es, reduce_lr],
    class_weight=class_weights,
)

toc = time.time()

In [None]:
# Elapsed training time in whole minutes (int() truncates the fractional part).
print(f"model training took {int((toc - tic) / 60)} minutes")

In [None]:
# code from https://www.kaggle.com/ryanholbrook/the-convolutional-classifier
# One figure for the loss curves and one for the accuracy curves, per epoch.
history_frame = pd.DataFrame(history.history)
history_frame[['loss', 'val_loss']].plot(ylim=(0, 3))
history_frame[['accuracy', 'val_accuracy']].plot(ylim=(0., 1.))

In [None]:
# Evaluate the weights from the final training epoch on the validation subset.
# Model.evaluate accepts generators directly; evaluate_generator is deprecated
# and absent from recent Keras releases.
model.evaluate(val_generator)

In [None]:
# Restore the checkpoint with the lowest val_loss and evaluate it on the validation
# subset. Model.evaluate accepts generators directly; evaluate_generator is
# deprecated and absent from recent Keras releases.
model.load_weights("best_model_weights.h5")
model.evaluate(val_generator)

In [None]:
# Names of the values returned by the evaluation calls above, in order.
model.metrics_names

In [None]:
# Load the sample submission file to inspect the expected output format.
submission_df = pd.read_csv("../input/cassava-leaf-disease-classification/sample_submission.csv")
submission_df.head()

In [None]:
# test_datagen = ImageDataGenerator()

# test_generator = test_datagen.flow_from_dataframe(
#     submission_df,
#     directory="../input/cassava-leaf-disease-classification/test_images",
#     x_col="image_id",
#     target_size=target_size,
#     batch_size=batch_size,
#     class_mode=None
# )

In [None]:
# y_pred = model.predict(test_generator)
# y_pred

In [None]:
from PIL import Image

# code from https://www.kaggle.com/sinamhd9/keras-available-models-part-2-inference
# Predict one label per test image. The file list is sorted so the row order of
# the submission is deterministic.
test_images_dir = '/kaggle/input/cassava-leaf-disease-classification/test_images/'
test_images = sorted(os.listdir(test_images_dir))
y_preds = []

for i in test_images:
    # Close the file handle once the pixels are loaded; force 3 channels so that
    # a grayscale/RGBA/palette file still matches the model's input shape.
    with Image.open(os.path.join(test_images_dir, i)) as test_file:
        image = test_file.convert("RGB").resize(target_size)
    # Add the batch dimension: (height, width, 3) -> (1, height, width, 3).
    image = np.expand_dims(np.asarray(image), axis=0)
    y_preds.append(np.argmax(model.predict(image)))

In [None]:
# Assemble the submission table: one row per test image with its predicted label.
df_sub = pd.DataFrame({'image_id': test_images, 'label': y_preds})
display(df_sub)

In [None]:
# y_pred = np.argmax(y_pred, axis=1)
# y_pred

In [None]:
# submission_df["label"] = y_pred
# submission_df.head()

In [None]:
# submission_df.to_csv("submission.csv", index=None)
# Write the predictions to submission.csv without the DataFrame index column.
df_sub.to_csv("submission.csv", index=False)