## 03 - Modelling and Evaluating

### Objective

Answer Business Requirement 2:

- The client is interested in predicting whether a cherry leaf is healthy or contains powdery mildew.

### Inputs

- inputs/cherry_leaves_split/train
- inputs/cherry_leaves_split/val
- inputs/cherry_leaves_split/test
- outputs/02_data_visualisation/image_shape.pkl

### Outputs

- Model saved as .h5
- Class indices saved as .pkl
- Learning curves
- Model evaluation score

### Imports

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from matplotlib.image import imread
from matplotlib.image import imread
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K

sns.set_style("whitegrid")

### Set Working Directory

In [None]:
# Switch to the project root and echo the resulting working directory.
project_root = '/workspaces/cherry-leaves-health'
os.chdir(project_root)
print("Current directory:", os.getcwd())

### Set paths

In [None]:
# Resolve the dataset splits relative to the current working directory
# instead of repeating a hard-coded absolute project root, so the notebook
# keeps working when the repository is checked out somewhere else.
cwd = os.getcwd()
print("Working directory set:", cwd)

split_root = os.path.join(cwd, "inputs", "cherry_leaves_split")
train_path = os.path.join(split_root, "train")
val_path = os.path.join(split_root, "val")
test_path = os.path.join(split_root, "test")

# All artefacts produced by this notebook are written below this folder.
version = "03_modelling_and_evaluating"
file_path = f"outputs/{version}"
os.makedirs(file_path, exist_ok=True)

### Labels and image shape

In [None]:
# Class labels are the sub-directory names of the training split.
# os.listdir returns entries in arbitrary order and may include stray files
# (e.g. .DS_Store), so keep directories only and sort them; sorted order also
# matches the alphanumeric class ordering used by flow_from_directory.
labels = sorted(
    entry
    for entry in os.listdir(train_path)
    if os.path.isdir(os.path.join(train_path, entry))
)
print("Project Labels:", labels)

# Image shape persisted earlier under outputs/02_data_visualisation.
image_shape = joblib.load("outputs/02_data_visualisation/image_shape.pkl")
print("Image shape:", image_shape)

### Image count distribution

In [None]:
# Count the files per (set, label) pair. Rows are collected in a list and the
# DataFrame is built once: calling pd.concat inside the loop re-copies the
# frame on every iteration and leaves every row with the same index label 0.
freq_rows = []
for folder in ['train', 'val', 'test']:
    for label in labels:
        count = len(os.listdir(f"inputs/cherry_leaves_split/{folder}/{label}"))
        freq_rows.append({"Set": folder, "Label": label, "Frequency": count})

df_freq = pd.DataFrame(freq_rows, columns=["Set", "Label", "Frequency"])

# Grouped bar chart: one group per set, one bar per label.
sns.barplot(data=df_freq, x='Set', y='Frequency', hue='Label')
plt.title("Image Distribution by Set and Class")
plt.tight_layout()
plt.savefig(f"{file_path}/labels_distribution.png", dpi=150)
plt.show()

### Image Augmentation

In [None]:
batch_size = 20

# Training images are rescaled to [0, 1] and randomly augmented.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest'
)

# Validation and test images are only rescaled, never augmented.
val_test_datagen = ImageDataGenerator(rescale=1./255)

# Settings shared by all three directory iterators.
flow_kwargs = dict(
    target_size=image_shape[:2],
    batch_size=batch_size,
    class_mode='binary',
)

train_set = train_datagen.flow_from_directory(train_path, **flow_kwargs)
# shuffle=False keeps validation/test batches in a fixed order, so evaluation
# is deterministic and predictions line up with `.classes` / `.filenames`.
val_set = val_test_datagen.flow_from_directory(val_path, shuffle=False, **flow_kwargs)
test_set = val_test_datagen.flow_from_directory(test_path, shuffle=False, **flow_kwargs)

### Preview augmented images

In [None]:
# Pull a few batches from the training generator and display the first
# image of each one.
n_previews = 3
for _ in range(n_previews):
    img, label = next(train_set)
    first_image = img[0]
    plt.imshow(first_image)
    plt.axis("off")
    plt.show()

### Save class indices

In [None]:
# Persist the class-name -> index mapping reported by the training generator.
class_indices_file = f"{file_path}/class_indices.pkl"
joblib.dump(train_set.class_indices, class_indices_file)

### Create CNN Model

In [None]:
def create_model():
    """Build and compile the binary-classification CNN.

    The network stacks three convolution + max-pooling stages (32, 64 and 64
    filters), flattens the result, and finishes with a 128-unit dense layer,
    50% dropout and a single sigmoid output unit. The input shape is read
    from the notebook-level ``image_shape`` variable.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    model = Sequential()

    # Feature extractor: first stage declares the input shape.
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=image_shape))
    model.add(MaxPooling2D(2, 2))
    for n_filters in (64, 64):
        model.add(Conv2D(n_filters, (3, 3), activation='relu'))
        model.add(MaxPooling2D(2, 2))

    # Classifier head.
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model


model = create_model()
model.summary()

### Train the model

In [None]:
# Stop once val_loss has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when early stopping triggers; without it the model keeps the
# weights from the last epoch run, i.e. after `patience` non-improving epochs.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    train_set,
    epochs=25,
    steps_per_epoch=len(train_set),
    validation_data=val_set,
    validation_steps=len(val_set),
    callbacks=[early_stop],
    verbose=1
)

### Save the model

In [None]:
# Write the trained model to disk in HDF5 (.h5) format.
model_file = "/workspaces/cherry-leaves-health/jupyter_notebooks/outputs/v1/cherry_leaf_mildew_model.h5"
model.save(model_file)

### Learning Curves

In [None]:
# One row per epoch, one column per metric recorded during training.
loss_df = pd.DataFrame(history.history)

# (columns to plot, figure title, output file) for each learning curve.
curve_specs = (
    (['loss', 'val_loss'],
     "Training vs Validation Loss",
     f"{file_path}/training_loss.png"),
    (['accuracy', 'val_accuracy'],
     "Training vs Validation Accuracy",
     f"{file_path}/training_accuracy.png"),
)

for metric_columns, curve_title, curve_file in curve_specs:
    loss_df[metric_columns].plot(style='.-')
    plt.title(curve_title)
    plt.savefig(curve_file, dpi=150)
    plt.show()

### Evaluate on Test Set

In [None]:
# Locations of the saved model and of the evaluation result to be written.
saved_model_file = "/workspaces/cherry-leaves-health/jupyter_notebooks/outputs/v1/cherry_leaf_mildew_model.h5"
evaluation_file = "/workspaces/cherry-leaves-health/jupyter_notebooks/outputs/v1/evaluation.pkl"

# Clear previous model traces, then reload the model from disk so the
# evaluation runs against the persisted artefact.
K.clear_session()
model = load_model(saved_model_file)

test_eval = model.evaluate(test_set)
print("Test Set Evaluation (Loss, Accuracy):", test_eval)

# Persist the evaluation result alongside the model.
joblib.dump(test_eval, evaluation_file)

### Predict on a sample test image

In [None]:
def predict_image(model, image_path, class_map):
    """Classify a single image file and report the confidence.

    Args:
        model: Object whose ``predict`` is called on a one-image batch; the
            value at ``[0, 0]`` of its output is used as the score.
        image_path: Path of the image to load; it is resized to the first two
            dimensions of the notebook-level ``image_shape``.
        class_map: Mapping from class index (0 or 1) to class name.

    Returns:
        A ``(pred_class, confidence)`` tuple. ``pred_class`` is
        ``class_map[1]`` when the score is above 0.5, otherwise
        ``class_map[0]``; ``confidence`` is the score for class 1 or its
        complement for class 0.
    """
    resized = load_img(image_path, target_size=image_shape[:2])
    # Scale pixel values by 1/255 and add a leading batch axis.
    batch = np.expand_dims(img_to_array(resized) / 255., axis=0)
    prob = model.predict(batch)[0, 0]

    predicted_index = int(prob > 0.5)
    pred_class = class_map[predicted_index]
    if pred_class == class_map[1]:
        confidence = prob
    else:
        confidence = 1 - prob
    return pred_class, confidence

# Select one test image: entry number `pointer` in the folder of the first label.
pointer = 10
label = labels[0]
label_dir = os.path.join(test_path, label)
img_file = os.listdir(label_dir)[pointer]
img_path = os.path.join(label_dir, img_file)

# Show the selected image at its original size.
sample_image = load_img(img_path)
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Test Image - {label}")
plt.show()

# Invert the generator's name -> index mapping into index -> name.
class_map = {index: name for name, index in train_set.class_indices.items()}
pred, conf = predict_image(model, img_path, class_map)
print(f"Predicted: {pred} ({conf:.2f} confidence)")

### Push files to repo