In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Plant Pathology 2020-FGVC7 challenge competition had a pilot dataset of 3,651 RGB images of foliar disease of apples. For Plant Pathology 2021-FGVC8, we have significantly increased the number of foliar disease images and added additional disease categories. This year’s dataset contains approximately 18,632 (trainset) high-quality RGB images of apple foliar diseases, including a large expert-annotated disease dataset. This dataset reflects real field scenarios by representing non-homogeneous backgrounds of leaf images taken at different maturity stages and at different times of day under different focal camera settings.

In [None]:
# to prevent unnecessary warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# TensorFlow and tf.keras
import tensorflow as tf

from pathlib import Path

#import useful module for keras library
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from keras.preprocessing.image import ImageDataGenerator

# get modules from sklearn library
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report 

#import libraries
import matplotlib.pyplot as plt
import seaborn as sns
import random

#computer vision library
import cv2

### **Load the image Dataset**

In [None]:
# Load the training metadata CSV into a DataFrame.
folia_data = pd.read_csv(filepath_or_buffer="../input/plant-pathology-2021-fgvc8/train.csv")

# Preview the first five rows (the default row count of head()).
folia_data.head(n=5)

In [None]:
# (number of rows, number of columns) of the training metadata table
folia_data.shape

In [None]:
# Print a summary of the metadata table: column names, dtypes and non-null counts
folia_data.info()

### Visualization

In [None]:
# Distinct raw label strings that occur in the 'labels' column
folia_data['labels'].unique()

In [None]:
# Number of rows (images) for each raw label string in the 'labels' column
folia_data['labels'].value_counts()

In [None]:
# Bar chart of the number of images per raw label string.
fig, ax = plt.subplots(figsize=(10, 5))

# Pass the column through the `x` keyword: recent seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=folia_data['labels'], ax=ax)

# Rotate and shrink the tick labels after plotting, so the settings apply to
# the ticks that countplot actually created.
ax.tick_params(axis='x', labelrotation=90, labelsize=7)

In [None]:
# create a function to visualize the images
def visualize_batch(path, image_ids, labels):
    """Plot the given images in a three-column grid, each titled with its label.

    Parameters
    ----------
    path : str
        Directory that contains the image files.
    image_ids : iterable of str
        Image file names; each one is joined onto ``path``.
    labels : iterable
        Labels paired with ``image_ids`` by position.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when there
        are no (image, label) pairs.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read.
    """
    pairs = list(zip(image_ids, labels))
    if not pairs:
        # Nothing to draw; avoid creating an empty figure.
        return

    n_cols = 3
    # Ceiling division gives the rows needed; never fewer than five rows, so
    # batches of up to 15 images keep the 5x3 layout and the 18x12 figure.
    n_rows = max(5, -(-len(pairs) // n_cols))
    plt.figure(figsize=(18, 12 * n_rows / 5))

    for ind, (image_id, label) in enumerate(pairs):
        plt.subplot(n_rows, n_cols, ind + 1)

        image_file = os.path.join(path, image_id)
        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {image_file}")
        # OpenCV loads images as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)
        plt.title(f"Class: {label}", fontsize = 8)
        plt.axis("off")
    plt.show()

In [None]:
# Directory that holds the training image files.
image_path = '../input/plant-pathology-2021-fgvc8/train_images'

# Draw 15 random rows, then pull out their image file names and label strings.
tmp_df = folia_data.sample(n=15)
image_ids, labels = tmp_df["image"].values, tmp_df["labels"].values

# Show the 15 sampled images in a grid.
visualize_batch(image_path, image_ids, labels)

In [None]:
# Show nine random images whose label string is exactly 'healthy'.
th_df = folia_data.loc[folia_data["labels"] == 'healthy'].sample(n=9)
image_ids, labels = th_df["image"].values, th_df["labels"].values

visualize_batch(image_path, image_ids, labels)

In [None]:
# Show nine random images whose label string is exactly 'complex'.
tmc_df = folia_data.loc[folia_data["labels"] == 'complex'].sample(n=9)
image_ids, labels = tmc_df["image"].values, tmc_df["labels"].values

visualize_batch(image_path, image_ids, labels)

In [None]:
# Show nine random images whose label string is exactly 'rust'.
tmr_df = folia_data.loc[folia_data["labels"] == 'rust'].sample(n=9)
image_ids, labels = tmr_df["image"].values, tmr_df["labels"].values

visualize_batch(image_path, image_ids, labels)

In [None]:
# Show nine random images whose label string is exactly 'scab'.
tms_df = folia_data.loc[folia_data["labels"] == 'scab'].sample(n=9)
image_ids, labels = tms_df["image"].values, tms_df["labels"].values

visualize_batch(image_path, image_ids, labels)

In [None]:
# Show nine random images whose label string is exactly 'powdery_mildew'.
tpm_df = folia_data.loc[folia_data["labels"] == 'powdery_mildew'].sample(n=9)
image_ids, labels = tpm_df["image"].values, tpm_df["labels"].values

visualize_batch(image_path, image_ids, labels)

In [None]:
# Show nine random images whose label string is exactly 'frog_eye_leaf_spot'.
tfe_df = folia_data.loc[folia_data["labels"] == 'frog_eye_leaf_spot'].sample(n=9)
image_ids, labels = tfe_df["image"].values, tfe_df["labels"].values

visualize_batch(image_path, image_ids, labels)

#### The label counts show that there are 12 different classes.
**In actuality there are 6 labels, 5 diseases and 1 healthy case.**

 1.rust
 
 2.scab
 
 3.complex
 
 4.frog eye leaf spot
 
 5.powdery mildew

and another label is

6.healthy (healthy leaves)

**But there are cases where an image contains more than one disease, which means this is a multi-label classification problem.**

**Next, let's find out the actual frequencies of the individual labels.**

We split each label string on the space character (" ") to get the individual labels for each image.



In [None]:
# Work on a copy of the original table in which every space-separated label
# string (e.g. 'scab frog_eye_leaf_spot complex') becomes a list of the
# individual class names.
folia_df = folia_data.assign(
    labels=folia_data['labels'].map(lambda label_string: label_string.split(' '))
)
folia_df

#### We can check the label for image two to understand what has taken place. Previously its label was the single string 'scab frog_eye_leaf_spot complex', which made it look like a separate disease class.
#### Applying the lambda split that string into the individual diseases: [scab, frog_eye_leaf_spot, complex]

In [None]:
# One-hot encode the per-image label lists: one 0/1 column per individual class.
from sklearn.preprocessing import MultiLabelBinarizer

f = folia_df['labels'].tolist()
mlb = MultiLabelBinarizer()
encoded_labels = mlb.fit_transform(f)
df = pd.DataFrame(data = encoded_labels, index = folia_df.index, columns = mlb.classes_)
print(df.columns)

In [None]:
# Column-wise totals of the 0/1 label table, i.e. how many images carry each individual label
df.sum()

In [None]:
# Per-class image totals: class names for the x axis, counts for the bar heights.
class_totals = df.sum()
labels = class_totals.index.tolist()
label_counts = class_totals.tolist()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 6))

sns.barplot(x=labels, y=label_counts, ax=ax)

#### Splitting the dataset

In [None]:
# The competition ships a separate test set, so the labelled data only needs
# to be divided into a training part (80%) and a validation part (20%).
train_set, val_set = train_test_split(folia_df, random_state = 42, test_size = 0.2)

for subset in (train_set, val_set):
    print(subset.shape)

### Image Generator Preprocessing

In [None]:
# Training generator: pixel scaling plus random augmentation.
# NOTE(review): mobilenet_v2.preprocess_input already scales pixels to [-1, 1];
# the additional `rescale = 1/255` shrinks them further. It is kept so that
# inputs match whatever scaling is used at prediction time - change both
# places together if this is revisited.
img_gen = ImageDataGenerator(preprocessing_function = tf.keras.applications.mobilenet_v2.preprocess_input,
                             rescale = 1/255,
                             zoom_range = 0.2,
                             rotation_range = 20,
                             width_shift_range = 0.2,
                             height_shift_range = 0.2,
                             horizontal_flip = True)

# Validation generator: identical pixel scaling but NO random augmentation, so
# validation metrics are measured on unmodified images and are comparable
# from epoch to epoch.
val_gen = ImageDataGenerator(preprocessing_function = tf.keras.applications.mobilenet_v2.preprocess_input,
                             rescale = 1/255)

# flow_from_dataframe needs a DataFrame (not an array). Because the 'labels'
# column holds lists of class names, class_mode = 'categorical' produces one
# 0/1 target column per class, and an image may have several columns set.
train = img_gen.flow_from_dataframe(dataframe = train_set,
    directory = '../input/plant-pathology-2021-fgvc8/train_images',  # folder containing the images
    x_col = 'image',   # column with the image file names
    y_col = 'labels',  # column with the list of labels per image
    target_size = (224, 224),
    color_mode = 'rgb',
    class_mode = 'categorical',
    batch_size = 32,
    shuffle = True,  # reshuffle every epoch so batches differ between epochs
    seed = 42        # fixed seed keeps the shuffling/augmentation reproducible
)

val = val_gen.flow_from_dataframe(dataframe = val_set,
    directory = '../input/plant-pathology-2021-fgvc8/train_images',  # folder containing the images
    x_col = 'image',   # column with the image file names
    y_col = 'labels',  # column with the list of labels per image
    target_size = (224, 224),
    color_mode = 'rgb',
    class_mode = 'categorical',
    batch_size = 32,
    shuffle = False  # keep a fixed order for evaluation
)

### Building the Sequential Convolution model 

In [None]:
# define sequential model
model = tf.keras.models.Sequential()
# define conv-pool layers - set 1
model.add(tf.keras.layers.Conv2D(filters = 32, kernel_size=(3, 3), strides=(1, 1), 
                                activation='relu', padding='valid', input_shape = (224, 224, 3)))
model.add(BatchNormalization(axis=3))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))


# define conv-pool layers - set 2
model.add(tf.keras.layers.Conv2D(filters = 16, kernel_size=(3, 3), strides=(1, 1), 
                                activation='relu', padding='valid'))
model.add(BatchNormalization(axis=3))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))

# define conv-pool layers - set 3
model.add(tf.keras.layers.Conv2D(filters = 16, kernel_size=(3, 3), strides=(1, 1), 
                                activation='relu', padding='valid'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))

# add flatten layer
model.add(tf.keras.layers.Flatten())

# add dense layers with some dropout
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(rate = 0.3))
model.add(tf.keras.layers.Dense(64, activation='relu'))

model.add(tf.keras.layers.Dense(16, activation='relu'))

# add output layer
model.add(tf.keras.layers.Dense(6, activation='softmax')) #use softmax as activation in the output layer
#for multiclass. Sigmoid activation is used for binary and 'relu' shouldnt be use for output layer


# view model layers
model.summary()

In [None]:
#import modules that will wnable early stopping for optimization during model training
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from datetime import datetime

#tensorboard
logdir = "logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=logdir)

#define the early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

In [None]:
# compile model
model.compile(optimizer='adam', # optimize the model with adam optimizer
              loss="categorical_crossentropy", 
              metrics=['accuracy']) #to get accuracy of the model in each run

In [None]:
# Fit the model on the training iterator and evaluate on the validation
# iterator after every epoch.
# `batch_size` is intentionally not passed: `train` already yields batches
# (the batch size is set where the iterator is created), and Keras documents
# that batch_size must not be specified for generator/Sequence inputs.
history = model.fit(
    train,
    epochs = 25,
    verbose = 1,  # print a progress bar for every epoch
    validation_data = val,
    callbacks = [tensorboard_callback, es],  # TensorBoard logging + early stopping
)


In [None]:
# Persist the trained model under the path 'model.folia' (relative to the current working directory)
model.save('model.folia')

In [None]:
# Per-epoch curves recorded during training.
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

# Accuracy curves, drawn in the top slot of a two-row layout.
plt.figure(figsize=(8, 8))
ax = plt.subplot(2, 1, 1)

ax.plot(acc, label='Training Accuracy')
ax.plot(val_acc, label='Validation Accuracy')

ax.legend(loc='lower right')
ax.set_ylabel('Accuracy')
# Keep the automatic lower limit but pin the top of the axis at 1.
ax.set_ylim([min(ax.get_ylim()), 1])
ax.set_title('Training and Validation Accuracy')

In [None]:
# Loss curves for the training and validation sets, one point per epoch.
plt.figure(figsize=(8, 8))
ax = plt.subplot(2, 1, 1)

ax.plot(loss, label='Training Loss')
ax.plot(val_loss, label='Validation Loss')

# Loss is not bounded by 1, so the y axis is left to autoscale; capping it at
# 1 would clip any epoch whose loss is larger.
# Losses normally fall over time, so the legend goes in the upper right where
# it is least likely to cover the curves.
ax.legend(loc='upper right')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')

### Prediction

In [None]:
# Load the sample submission file; its 'image' column is used below to locate the test images.
test_path = "../input/plant-pathology-2021-fgvc8/sample_submission.csv"
test_set = pd.read_csv(filepath_or_buffer=test_path)
test_set

In [None]:
# Generator for the test images: same pixel scaling as used for training
# (preprocess_input followed by rescale = 1/255), with no augmentation.
imgen = ImageDataGenerator(preprocessing_function = tf.keras.applications.mobilenet_v2.preprocess_input,
                           rescale = 1/255)

# The iterator is stored in `test` (the name the prediction cell uses) so that
# it does not overwrite the training iterator `train`, whose `class_indices`
# are needed later to decode the predictions.
# There are no real labels for the test images, so no target column is read
# (class_mode = None yields image batches only).
test = imgen.flow_from_dataframe(dataframe = test_set,
    directory = '../input/plant-pathology-2021-fgvc8/test_images',  # folder containing the test images
    x_col = 'image',  # column with the image file names
    y_col = None,
    target_size = (224, 224),
    color_mode = 'rgb',
    class_mode = None,
    batch_size = 32,
    shuffle = False  # keep the row order of test_set so predictions line up with it
)

In [None]:
# Minimum score a class needs to be included in an image's predicted labels.
PREDICTION_THRESHOLD = 0.23

# One row of class scores per test image, converted to plain Python lists.
preds = model.predict(test)
preds = preds.tolist()

# For every image, collect the positions of all classes whose score reaches
# the threshold. Positions come from enumerate() rather than list.index():
# index() returns the first matching value, so two classes with identical
# scores would yield the same position twice and lose the second class.
indices = []
for pred in preds:
    temp = [position for position, score in enumerate(pred)
            if score >= PREDICTION_THRESHOLD]
    if not temp:
        # No class reached the threshold: fall back to the single best class.
        temp = [int(np.argmax(pred))]
    indices.append(temp)
    

In [None]:
# Invert the iterator's {class name: column index} mapping into
# {column index: class name}.
# NOTE(review): this relies on `train` being the training-data iterator at
# this point - confirm it has not been reassigned by an earlier cell.
labels = {position: class_name for class_name, position in train.class_indices.items()}
print(labels)

# Turn each image's list of predicted column indices into one
# space-separated string of class names.
testlabels = [' '.join(str(labels[i]) for i in image) for image in indices]
print(testlabels)

In [None]:
# Build the submission table from a copy of the sample-submission frame
# (`submission` is not defined anywhere earlier), keeping its 'image' column
# and row order, and replace the labels with the predicted strings.
submission = test_set.copy()
submission['labels'] = testlabels
submission.head()

In [None]:
# Write the submission table to 'submission.csv' without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)