# Dependencies

In [None]:
!pip install -q efficientnet # Efficientnet not supported in tensorflow yet
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from kaggle_datasets import KaggleDatasets
from sklearn.linear_model import LogisticRegression
import efficientnet.tfkeras as efn
from tensorflow.keras.applications import *
from tensorflow.keras.applications import InceptionResNetV2
from tensorflow.keras.models import save_model, Sequential, load_model
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import CategoricalCrossentropy
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Kaggle TPU Activation

In [None]:
def _detect_tpu():
    """Return a TPU cluster resolver, or None when resolving raises ValueError."""
    try:
        # No arguments are passed: the resolver relies on the TPU_NAME
        # environment variable, which is always set on Kaggle.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', resolver.master())
        return resolver
    except ValueError:
        return None


tpu = _detect_tpu()

if not tpu:
    # TensorFlow's default distribution strategy; works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Data access: bucket path of the competition dataset, used to build image paths.
GCS_DS_PATH = KaggleDatasets().get_gcs_path("plant-pathology-2020-fgvc7")

In [None]:
# Configs
# Let tf.data choose parallelism / prefetch depth on its own.
AUTO = tf.data.experimental.AUTOTUNE
# Global batch size = per-replica batch size times the number of replicas.
PER_REPLICA_BATCH = 8
BATCH_SIZE = PER_REPLICA_BATCH * strategy.num_replicas_in_sync
# Side length (pixels) that images are resized to and the models are built for.
IMG_SIZE = 800

# Helper Functions

In [None]:
# Show a random set of images
def show_images_in(path, n=16):
    """Plot up to ``n`` randomly chosen images from the directory ``path``.

    Only files ending in ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive)
    are considered; sub-directories are not scanned. The images are laid out
    on a near-square grid sized to the number actually drawn.

    Args:
        path: Directory containing the image files.
        n: Maximum number of images to draw.

    Raises:
        ValueError: If ``n`` is not positive or ``path`` has no image files.
    """
    if n < 1:
        raise ValueError(f"n must be positive, got {n}")

    image_files = sorted(
        name for name in os.listdir(path)
        if name.lower().endswith((".jpg", ".jpeg", ".png"))
    )
    if not image_files:
        raise ValueError(f"no image files found in {path!r}")

    # Shuffle the whole listing *before* slicing so the sample is drawn from
    # every image in the directory, not just the first n entries.
    np.random.shuffle(image_files)
    chosen = image_files[:n]

    # Near-square grid; for 16 images this is the 4x4 layout.
    cols = int(np.ceil(np.sqrt(len(chosen))))
    rows = int(np.ceil(len(chosen) / cols))

    plt.figure(figsize=(12, 12))
    for position, name in enumerate(chosen, start=1):
        plt.subplot(rows, cols, position)
        plt.imshow(plt.imread(os.path.join(path, name)))
    plt.tight_layout()
show_images_in("../input/plant-pathology-2020-fgvc7/images/",n=16)

In [None]:
# Learning rate scheduler
def build_lrfn(lr_start=0.00001, lr_max=0.00005,
               lr_min=0.00001, lr_rampup_epochs=5,
               lr_sustain_epochs=0, lr_exp_decay=.8,
               num_replicas=None):
    """Build an epoch -> learning-rate function (ramp-up, sustain, exponential decay).

    The schedule rises linearly from ``lr_start`` to the peak rate over
    ``lr_rampup_epochs`` epochs, holds the peak for ``lr_sustain_epochs``
    epochs, then decays exponentially towards ``lr_min``.

    Args:
        lr_start: Learning rate at epoch 0.
        lr_max: Per-replica peak learning rate; multiplied by ``num_replicas``.
        lr_min: Floor the decay phase converges to.
        lr_rampup_epochs: Number of epochs in the linear ramp-up.
        lr_sustain_epochs: Number of epochs the peak rate is held.
        lr_exp_decay: Per-epoch multiplicative decay factor after the peak.
        num_replicas: Factor the peak rate is scaled by. ``None`` (default)
            uses ``strategy.num_replicas_in_sync`` from the module scope.

    Returns:
        A function taking the epoch index and returning the learning rate.
    """
    if num_replicas is None:
        # Default: scale with the replica count of the active strategy.
        num_replicas = strategy.num_replicas_in_sync
    peak_lr = lr_max * num_replicas
    decay_start = lr_rampup_epochs + lr_sustain_epochs

    def lrfn(epoch):
        if epoch < lr_rampup_epochs:
            # Linear interpolation between lr_start and the peak rate.
            return (peak_lr - lr_start) / lr_rampup_epochs * epoch + lr_start
        if epoch < decay_start:
            return peak_lr
        # Exponential decay of the distance between the peak and lr_min.
        return (peak_lr - lr_min) * lr_exp_decay ** (epoch - decay_start) + lr_min

    return lrfn

# `lrfn` only exists inside build_lrfn(); call the factory to obtain the
# schedule function before handing it to the Keras callback.
lrfn = build_lrfn()
lr_schedule = LearningRateScheduler(lrfn, verbose=1)

In [None]:
def read_image(filename, label=None, image_size=(IMG_SIZE, IMG_SIZE)):
    """Load a JPEG file and return it as a resized float32 image tensor.

    Args:
        filename: Path of the image file to read.
        label: Optional label passed through unchanged; omitted for the
            unlabelled test data.
        image_size: Target size handed to ``tf.image.resize``.

    Returns:
        The image alone when ``label`` is None, otherwise ``(image, label)``.
    """
    raw_bytes = tf.io.read_file(filename)
    # Decode to three colour channels, then rescale the pixel values by 1/255.
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    resized = tf.image.resize(scaled, image_size)

    return resized if label is None else (resized, label)

def data_augment(image, label=None):
    """Apply random horizontal and vertical flips to ``image``.

    Args:
        image: Image tensor to augment.
        label: Optional label passed through unchanged.

    Returns:
        The flipped image alone when ``label`` is None, otherwise
        ``(image, label)``.
    """
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(image)
    )
    return flipped if label is None else (flipped, label)

In [None]:
def _image_uris(frame):
    """Return the bucket path of every image listed in ``frame.image_id``."""
    return (GCS_DS_PATH + "/images/" + frame["image_id"] + ".jpg").values


# Competition tables: labelled training rows, test rows, submission template.
train = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/train.csv')
test = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/test.csv')
sub = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/sample_submission.csv')

# Image ids become full paths under the dataset bucket; the label matrix is
# every column of the training table from 'healthy' onwards.
train_paths = _image_uris(train)
test_paths = _image_uris(test)
train_labels = train.loc[:, 'healthy':].values

In [None]:
# Creating the dataset objects for TPU feeding

# Fraction of the labelled images held out for validation. At 0 every labelled
# image is used for fitting and no validation dataset is built.
VALID_FRACTION = 0.0
# Number of decoded images held in the shuffle buffer. Kept at the value the
# pipeline used before (800), but no longer tied to IMG_SIZE.
SHUFFLE_BUFFER_SIZE = 800

# valid_paths / valid_labels were referenced without ever being defined, which
# raised NameError. They are now derived here from the labelled data.
if VALID_FRACTION > 0:
    fit_paths, valid_paths, fit_labels, valid_labels = train_test_split(
        train_paths,
        train_labels,
        test_size=VALID_FRACTION,
        random_state=42,
        # assumes each label row is one-hot, so argmax recovers the class
        # used for stratification -- TODO confirm
        stratify=train_labels.argmax(axis=1),
    )
else:
    fit_paths, fit_labels = train_paths, train_labels
    valid_paths = valid_labels = None

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((fit_paths, fit_labels))
    .map(read_image, num_parallel_calls=AUTO)
    # Cache the decoded images so files are only read and decoded once;
    # augmentation stays after the cache so it is re-drawn on every pass.
    .cache()
    .map(data_augment, num_parallel_calls=AUTO)
    # Shuffle before repeat so one pass finishes before the next begins.
    .shuffle(SHUFFLE_BUFFER_SIZE, reshuffle_each_iteration=True)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# None when no validation split is requested; Keras accepts
# validation_data=None as "no validation".
valid_dataset = None
if valid_paths is not None:
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((valid_paths, valid_labels))
        .map(read_image, num_parallel_calls=AUTO)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )

# Test images carry no labels, so read_image yields images only.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(test_paths)
    .map(read_image, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

# Training

In [None]:
arrs = []  # For model ensembling: after every trained model, its test predictions are appended here

# Train 2 EfficientNetB7s and append predictions to arrs
for _ in range(2):
    with strategy.scope():
        # ImageNet-pretrained backbone without its classification head,
        # followed by dropout, global average pooling and a 4-way softmax.
        model = Sequential()
        base = efn.EfficientNetB7(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights="imagenet")
        model.add(base)
        model.add(Dropout(0.5))
        model.add(GlobalAvgPool2D())
        model.add(Dense(4, activation="softmax"))
        model.compile(
            optimizer="adam",
            loss=CategoricalCrossentropy(label_smoothing=0.1),
            metrics=["categorical_accuracy"],
        )
        # Only the training loss is monitored: no validation data is passed to fit.
        early_stop = EarlyStopping(monitor="loss", patience=8, min_delta=0.01)
        # The ReduceLROnPlateau callback that used to be built here was never
        # passed to fit(). It would have monitored the default "val_loss",
        # which is not produced without validation data, and lr_schedule
        # resets the rate every epoch anyway, so it has been dropped.
        H = model.fit(
            train_dataset,
            epochs=15,
            steps_per_epoch=len(train_paths) // BATCH_SIZE,
            callbacks=[early_stop, lr_schedule],
        )
        # predict() already returns a NumPy array; no extra copy is needed.
        preds = model.predict(test_dataset)
        arrs.append(preds)

In [None]:
# Train 2 InceptionResNetV2 models and append predictions to arrs
for _ in range(2):
    with strategy.scope():
        # ImageNet-pretrained backbone without its classification head,
        # followed by dropout, global average pooling and a 4-way softmax.
        model = Sequential()
        base = InceptionResNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights="imagenet")
        model.add(base)
        model.add(Dropout(0.5))
        model.add(GlobalAveragePooling2D())
        model.add(Dense(4, activation="softmax"))
        model.compile(
            optimizer="adam",
            loss=CategoricalCrossentropy(label_smoothing=0.1),
            metrics=["categorical_accuracy"],
        )
        # Only the training loss is monitored: no validation data is passed to fit.
        early_stop = EarlyStopping(monitor="loss", min_delta=.01, patience=8)
        H = model.fit(
            train_dataset,
            epochs=15,
            steps_per_epoch=len(train_paths) // BATCH_SIZE,
            callbacks=[early_stop, lr_schedule],
        )
        # Predictions: run inference over the test set once. The previous
        # code called predict() twice and threw the first result away.
        preds = model.predict(test_dataset)
        arrs.append(preds)

# Testing

In [None]:
# Inference function, averages all predictions then creates a submission dataframe for competition
def make_new_preds(arr_list, NUM_OLD_MODELS=1, submission_name="new_preds.csv",
                   old_preds_path="../input/incesres/2IncResNets  2EFNB7.csv",
                   submission=None):
    """Average model predictions with a saved ensemble and write a submission CSV.

    The saved predictions are treated as the mean of ``NUM_OLD_MODELS``
    models, so the result is the plain average over all
    ``len(arr_list) + NUM_OLD_MODELS`` models.

    Args:
        arr_list: Sequence of prediction arrays, one per newly trained model,
            all of the same shape.
        NUM_OLD_MODELS: Weight (model count) given to the saved predictions.
            With 0 the saved file is not read at all.
        submission_name: Path of the CSV file to write.
        old_preds_path: CSV holding the previously saved predictions; the
            columns from ``"healthy"`` onwards are used.
        submission: DataFrame whose columns from ``"healthy"`` onwards are
            overwritten in place. ``None`` (default) uses the module-level
            ``sub`` template.

    Returns:
        The updated submission DataFrame (the same object that was modified).

    Raises:
        ValueError: If there are no models to average.
    """
    total_models = len(arr_list) + NUM_OLD_MODELS
    if total_models == 0:
        raise ValueError("no predictions to average")

    frame = sub if submission is None else submission

    # Sum (not mean) of the new predictions; the division happens once below.
    pred_sum = np.sum(np.asarray(arr_list), axis=0)
    if NUM_OLD_MODELS:
        # Scale the saved average back up to a sum over its models.
        old_preds = pd.read_csv(old_preds_path).loc[:, "healthy":].values
        pred_sum = pred_sum + old_preds * NUM_OLD_MODELS

    frame.loc[:, "healthy":] = pred_sum / total_models
    frame.to_csv(submission_name, index=False)
    return frame
    
# Blend the freshly trained models with the saved predictions, which are
# weighted as 4 models (presumably the 2 + 2 named in the CSV file -- confirm).
make_new_preds(arrs, NUM_OLD_MODELS=4)