## Modelling and Evaluation

### Objectives
* Answer business requirement 2:
    - The client is interested in predicting if a cherry leaf is healthy or contains powdery mildew

### Inputs
* Split datasets:
    - inputs/datasets/cherry-leaves/train
    - inputs/datasets/cherry-leaves/validation
    - inputs/datasets/cherry-leaves/test

### Outputs
* Leaf health classification.
* Save model.
* Save class names, label-distribution and learning-curve plots, and the test evaluation.

---

## Change working directory
Change the working directory from the notebook's folder to the project root directory.

In [None]:
import os
# Record the directory the notebook is currently running from.
current_dir = os.getcwd()
# Bare expression so the notebook displays the path.
current_dir

In [None]:
# Make the parent of the recorded directory the new working directory.
# NOTE(review): re-running this cell moves up one more level each time,
# because current_dir is refreshed below — run it once per session.
os.chdir(os.path.dirname(current_dir))

# Refresh current_dir so it reflects the new working directory.
current_dir = os.getcwd()
current_dir

---

### Import packages

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread

sns.set_style("white")

### Setup directories and variables

#### Store file paths
Define the paths to the train, validation and test image folders.

In [None]:
# Root folder of the split image dataset. The notebook's Inputs section
# lists the splits under "inputs/datasets/cherry-leaves".
data_dir = "inputs/datasets/cherry-leaves"

# One sub-folder per split.
train_dir = data_dir + "/train"
val_dir = data_dir + "/validation"
test_dir = data_dir + "/test"

### Create outputs directory

In [None]:
# Version tag for this run; every artefact is written under outputs/<Version>.
Version = "v1"

file_path = f"outputs/{Version}"

# Create the folder only when this version has not been used yet. The check
# targets the same relative path that os.makedirs creates.
if os.path.exists(file_path):
    print("This version is already in use. Create a new version.")
else:
    os.makedirs(name=file_path)

### Store labels

In [None]:
# Entries of the training directory are used as the class labels
# (presumably one sub-folder per class — confirm against the dataset layout).
labels = os.listdir(train_dir)
print("Label is:", labels)

---

### Display the balance of target labels

In [None]:
# Function is from the CI walkthrough.

def plot_target_balance_per_set(data_dir, save_image=False):
    """Plot how many images each label has in the train, validation and test sets.

    Prints one line per set/label pair, then draws a grouped bar plot of the
    counts. Relies on the notebook-level ``labels`` list and, when saving,
    on the notebook-level ``file_path`` output folder.

    Args:
        data_dir: Dataset root containing ``train``, ``validation`` and
            ``test`` folders, each holding one sub-folder per label.
        save_image: When True, also write the figure to
            ``{file_path}/labels_distribution.png``.
    """
    records = []
    for folder in ["train", "validation", "test"]:
        for label in labels:
            # Count once and reuse the value for both the table and the log line.
            n_images = len(os.listdir(data_dir + "/" + folder + "/" + label))
            records.append({"Set": folder, "Label": label, "Frequency": n_images})
            print(f"* {folder} - {label}: {n_images} images")

    # DataFrame.append was removed in pandas 2.0, so build the frame in one go.
    df_freq = pd.DataFrame(records, columns=["Set", "Label", "Frequency"])

    print("\n")
    sns.set_style("white")
    plt.figure(figsize=(8, 5))
    sns.barplot(data=df_freq, x="Set", y="Frequency", hue="Label")

    if save_image:
        plt.savefig(
            f"{file_path}/labels_distribution.png", bbox_inches="tight", dpi=150
        )

    plt.show()

In [None]:
# Show the label distribution without writing the figure to disk.
plot_target_balance_per_set(data_dir=data_dir, save_image=False)

Save image

In [None]:
# Draw the distribution again, this time saving the figure as well.
plot_target_balance_per_set(data_dir=data_dir, save_image=True)

---

### Load images

In [None]:
from tensorflow.keras.utils import image_dataset_from_directory

# Number of images per batch; reused by every dataset loaded in this notebook.
batch_size = 20

# Training images with "categorical" labels; the seed is fixed for reproducibility.
train_set = image_dataset_from_directory(
    directory=train_dir,
    batch_size=batch_size,
    seed=123,
    label_mode="categorical",
)

train_set

In [None]:
# Load the validation split from its own folder (val_dir), not the training
# folder, so validation metrics are measured on images the model is not fitted on.
validation_set = image_dataset_from_directory(
    val_dir,
    label_mode="categorical",
    seed=123,
    batch_size=batch_size,
)

validation_set


In [None]:
# Load the held-out test split from its own folder (test_dir), not the training
# folder, so the final evaluation uses images unseen during training.
test_set = image_dataset_from_directory(
    test_dir,
    label_mode="categorical",
    seed=123,
    batch_size=batch_size,
)

test_set

---


### Save class names

In [None]:
import joblib

# Persist the class names of the training dataset in this version's output folder.
class_names_path = f"{file_path}/class_names.pkl"
joblib.dump(value=train_set.class_names, filename=class_names_path)

---

### Model Creation

Import model packages

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, Conv2D, MaxPooling2D, Rescaling

In [None]:
# Height, width and colour channels of the model input.
# NOTE(review): assumed to match the images produced by the dataset loaders
# (image_dataset_from_directory defaults to 256x256) — confirm.
image_shape = (256, 256, 3)

def create_model():
    """Build and compile the CNN used to classify the leaf images.

    Architecture: rescaling of pixel values by 1/255, three
    Conv2D + MaxPooling2D stages (32, 64 and 64 filters), a 128-unit dense
    layer with 50% dropout, and a 2-unit softmax output.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss,
        the Adam optimizer and the accuracy metric.
    """
    model = Sequential()

    # The input shape is declared once, on the first layer. Previously the
    # first layer had none while every Conv2D carried a redundant copy.
    model.add(Rescaling(1.0 / 255, input_shape=image_shape))

    # Three convolution + pooling stages with identical kernel and pool sizes.
    for n_filters in (32, 64, 64):
        model.add(Conv2D(filters=n_filters, kernel_size=(3, 3), activation="relu"))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())

    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))

    # Two output units to match the "categorical" labels of the datasets.
    model.add(Dense(2, activation="softmax"))

    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )

    return model

---

### Model training

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Callback that watches the validation loss with a patience of 3 epochs.
early_stop = EarlyStopping(patience=3, monitor="val_loss")

### Fit model for training

In [None]:
# Build the network with create_model(), the factory defined in this notebook
# (the previous call to create_tf_model() referenced a name that does not exist).
model = create_model()
# Train for up to 100 epochs; the early-stopping callback may end training sooner.
model.fit(
    train_set,
    epochs=100,
    validation_data=validation_set,
    callbacks=[early_stop],
    verbose=1,
)

Save model

In [None]:
# Save under the versioned output folder (file_path) instead of a hard-coded
# "outputs/v1", so the model follows the version set earlier in the notebook.
# NOTE(review): the name ends in "_h5" rather than ".h5", so the saved format
# depends on the Keras version's handling of extension-less paths — confirm intent.
model.save(f"{file_path}/leaf_health_model_h5")

---

### Model Evaluation

#### Show model learning curve

In [None]:
def _plot_metric_curve(history_df, metric, title, image_path=None):
    """Draw one metric and its ``val_`` counterpart against the epoch index."""
    for column in (metric, f"val_{metric}"):
        sns.lineplot(
            x=range(len(history_df[column])), y=history_df[column], label=column
        )
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel(title)
    plt.legend()
    if image_path is not None:
        plt.savefig(image_path, bbox_inches="tight", dpi=150)
    plt.show()


def plot_learning_curve(model, file_path=None, save_image=False):
    """Plot the training and validation loss and accuracy curves of a model.

    Args:
        model: Model whose ``history.history`` mapping holds the ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` series.
        file_path: Folder the figures are written to. Required when
            ``save_image`` is True.
        save_image: When True, save the figures as
            ``model_training_losses.png`` and ``model_training_accuracy.png``
            inside ``file_path``.

    Raises:
        ValueError: If ``save_image`` is True but ``file_path`` is None.
    """
    if save_image and file_path is None:
        # Without this guard the figures would target a folder literally named "None".
        raise ValueError("file_path is required when save_image=True")

    losses = pd.DataFrame(model.history.history)
    sns.set_style("whitegrid")

    loss_png = f"{file_path}/model_training_losses.png" if save_image else None
    _plot_metric_curve(losses, "loss", "Loss", loss_png)

    print("\n")
    accuracy_png = f"{file_path}/model_training_accuracy.png" if save_image else None
    _plot_metric_curve(losses, "accuracy", "Accuracy", accuracy_png)

# Display the learning curves only; nothing is written to disk here.
plot_learning_curve(model, file_path=file_path, save_image=False)

#### Save the images

In [None]:
# Pass the output folder explicitly: with file_path left at its default of None
# the figures would be written to "None/...".
plot_learning_curve(model, file_path=file_path, save_image=True)

#### Model testing

In [None]:
# Evaluate on the test dataset; the result is indexed as [loss, accuracy] below.
evaluation = model.evaluate(test_set)

test_loss = evaluation[0]
print("Loss: ", test_loss)
test_accuracy = evaluation[1]
print("Accuracy: ", test_accuracy)

Save the test evaluation

In [None]:
# Store the evaluation in the versioned output folder (file_path) rather than
# a hard-coded "outputs/v1", matching how the other artefacts are saved.
joblib.dump(value=evaluation, filename=f"{file_path}/evaluation.pkl")