<a href="https://colab.research.google.com/github/RaymondBrien/cherry-ml/blob/main/jupyter_notebooks/ColabOnly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Notebook for Google Colab**

# This notebook is for use in Google Colab only, compiled in a single workflow.

## Objectives
Answer Business requirement 2: Binary Classification using Convolutional Neural Networks

* predict if a given leaf is infected or not judging by the presence of powdery mildew.
* use the CNN to map relationships between features and labels.
* build a binary classifier and generate reports.

## Inputs

* inputs/cherry-leaves-dataset/cherry-leaves/train
* inputs/cherry-leaves-dataset/cherry-leaves/test
* inputs/cherry-leaves-dataset/cherry-leaves/validation
* image shape embeddings pickle file

## Outputs

* cherry-tree-model.h5 (model)
* class_distribution.png
* class_indices.pkl
* model_training_acc.png
* model_training_losses.png
* test-evaluation.pkl
* train-evaluation.pkl
* val-evaluation.pkl





---

### ANNOTATE MODEL VERSION

In [1]:
# Version tag: names the outputs/<version> folder and the artefacts saved in it
version = 'v6'  # change as needed

Mount drive for backup

In [None]:
# Mount Google Drive at /content/drive; later cells copy the outputs there as a backup
from google.colab import drive
drive.mount('/content/drive')


### Import regular packages

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread

### Change working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
current_dir = os.getcwd()
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
current_dir = os.getcwd()
current_dir

In [None]:
# manually upload kaggle.json
from google.colab import files
files.upload()

In [None]:
!pip install kaggle==1.5.12

In [9]:
# allow kaggle.json access
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
! chmod 600 kaggle.json

In [None]:
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/cherry-leaves-dataset"  # creates new dir/dir
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

In [11]:
import zipfile

# Unpack the downloaded Kaggle archive into the dataset folder.
# Any failure is printed rather than raised, so check this cell's output
# before running the cells that read the extracted images.
try:
    with zipfile.ZipFile(DestinationFolder + '/cherry-leaves.zip', 'r') as zip_ref:
        zip_ref.extractall(DestinationFolder)
except Exception as e:
    print(e)


### Set input directory

In [12]:
# Dataset root and the three split folders read by the image generators
my_data_dir = 'inputs/cherry-leaves-dataset/cherry-leaves'
train_path = my_data_dir + '/train'
val_path = my_data_dir + '/validation'
test_path = my_data_dir + '/test'

### Set output directory

In [13]:

# Folder that holds every artefact produced for this model version
file_path = f'outputs/{version}'

# Test the target folder directly rather than listing a previously captured
# working directory, so the check stays correct even if the cwd has changed.
if os.path.isdir(file_path):
    print('Old version is already available, create a new version.')
else:
    os.makedirs(file_path)

### Gather labels

In [None]:
# Class labels are the sub-folder names of the training set. They are sorted
# because os.listdir returns entries in arbitrary order and later cells index
# into `labels` by position.
try:
    labels = sorted(os.listdir(train_path))
except OSError:
    # e.g. the train folder does not exist yet; fall back to the known classes
    labels = ['healthy', 'powdery_mildew']

print(f"Project Labels: {labels}")

### Load image shape embeddings

In [None]:
import joblib

try:
    # Import saved image shape embedding pickle file
    image_shape = joblib.load(filename=f"outputs/{version}/image_shape.pkl")
except Exception as err:
    # Fall back to a fixed shape when the pickle cannot be loaded (e.g. on a
    # fresh Colab runtime). Catching Exception instead of using a bare except
    # lets KeyboardInterrupt/SystemExit propagate.
    print(f'Could not load image_shape.pkl ({err}); using default image shape.')
    image_shape = (256, 256, 3)

print(image_shape)



## Validate image files:

In [16]:
# Helper that deletes non-image files from the dataset folders (Google Colab workflow)

def remove_non_image_files(my_data_dir):
    """
    Delete files that are not .png, .jpg or .jpeg images from each immediate
    sub-folder of ``my_data_dir`` and print a per-folder summary.

    Only regular files are examined: entries of ``my_data_dir`` that are not
    folders, and directories nested inside a sub-folder, are skipped. An error
    raised while deleting a file is printed and the scan continues.

    Args:
        my_data_dir: path of the folder whose sub-folders are cleaned.
    """
    print('Removing non image files...\n')
    # Every suffix carries its leading dot; without it a name that merely ends
    # in the letters "jpeg" would be treated as an image.
    image_extensions = ('.png', '.jpg', '.jpeg')

    for folder in sorted(os.listdir(my_data_dir)):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            continue

        image_count = 0
        removed_count = 0
        for given_file in os.listdir(folder_path):
            file_location = os.path.join(folder_path, given_file)
            if not os.path.isfile(file_location):
                # nested directory: nothing to validate or delete at this level
                continue
            if given_file.lower().endswith(image_extensions):
                image_count += 1
                continue
            try:
                os.remove(file_location)  # remove non image file
                removed_count += 1
            except OSError as e:
                print(e)

        print(f'Folder: {folder} has - {image_count} image files')
        print(f'Folder: {folder} has - {removed_count} non image files, which have been removed')

In [None]:
remove_non_image_files('inputs/cherry-leaves-dataset/cherry-leaves')

# Split train, val, test sets with dirs

In [18]:
import os
import shutil
import random
import joblib

def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """
    Split per-class image folders into train, validation and test folders.

    ``my_data_dir`` is expected to contain one sub-folder per class label. For
    each label the file names are sorted, shuffled with a fixed seed and moved
    to ``<my_data_dir>/<train|validation|test>/<label>/``; the emptied label
    folder is then removed. If ``my_data_dir`` already contains a ``test``
    entry the data is treated as already split and nothing is moved.

    Args:
        my_data_dir: folder holding the class sub-folders.
        train_set_ratio: fraction of each class moved to ``train``.
        validation_set_ratio: fraction of each class moved to ``validation``.
        test_set_ratio: fraction for ``test`` (receives the remaining files).

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly
    # 1.0 in floating point and an equality test would reject them.
    ratio_total = train_set_ratio + validation_set_ratio + test_set_ratio
    if abs(ratio_total - 1.0) > 1e-9:
        print('Ratios should total 1.0.')
        print('You entered:\n')
        print(f'Train ratio: {train_set_ratio}')
        print(f'Validation ratio: {validation_set_ratio}')
        print(f'Test ratio: {test_set_ratio}')
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        # already split on a previous run
        print('Done!')
        return

    # class labels are the sub-folders; stray files are ignored
    labels = sorted(entry for entry in entries
                    if os.path.isdir(os.path.join(my_data_dir, entry)))

    try:
        # create train, validation, test folders with classes labels sub-folder
        for folder in ['train', 'validation', 'test']:
            for label in labels:
                os.makedirs(os.path.join(my_data_dir, folder, label), exist_ok=True)

        for label in labels:
            label_dir = os.path.join(my_data_dir, label)

            # Sort before shuffling: os.listdir order is arbitrary, so the
            # seeded shuffle is only reproducible from a fixed starting order.
            # A local Random instance avoids reseeding the global generator.
            files = sorted(os.listdir(label_dir))
            random.Random(42).shuffle(files)

            train_end = int(len(files) * train_set_ratio)
            validation_end = train_end + int(len(files) * validation_set_ratio)

            for position, file_name in enumerate(files):
                if position < train_end:
                    destination = 'train'
                elif position < validation_end:
                    destination = 'validation'
                else:
                    destination = 'test'
                shutil.move(os.path.join(label_dir, file_name),
                            os.path.join(my_data_dir, destination, label, file_name))

            os.rmdir(label_dir)

    except OSError as e:
        # report the failure and stop without claiming success
        print(e)
        return

    print('Done!')


In [None]:
split_train_validation_test_images(
    my_data_dir='inputs/cherry-leaves-dataset/cherry-leaves',
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2
)

***

# Review class distribution

* across whole dataset
* per train, test, and validation

In [None]:
# gather info: one record per (set, label) pair, assembled into a DataFrame
# once instead of concatenating a new frame on every iteration
freq_records = []
for folder in ['train', 'validation', 'test']:
    for label in labels:

        path = my_data_dir + '/' + folder + '/' + label

        image_count = len(os.listdir(path))
        freq_records.append({'Set': folder, 'Label': label, 'Frequency': image_count})

        print(f"* {folder}- {label}: {image_count} images\n")

df_freq = pd.DataFrame(freq_records, columns=['Set', 'Label', 'Frequency'])
total_images_count = int(df_freq['Frequency'].sum())

print(f'{total_images_count} images total')
print('--------')

### plot class distribution
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x='Set', y='Frequency', hue='Label', data=df_freq, ax=ax)
ax.set_title('Class Distribution')
fig.savefig(f'{file_path}/class_distribution.png', bbox_inches='tight', dpi=150)
plt.show()
print('\n')

print('--------')

# confirm percentages of dataset (each row as a share of all images)
df_freq = df_freq.set_index('Label')
df_freq['Percent of DataSet'] = round(df_freq['Frequency'] / total_images_count * 100)

print(df_freq)

We can confirm that train, validation and test set percentages of dataset are split as expected, and that there are equal amounts of both classes (healthy and powdery_mildew) in each set.

***

# Image Augmentation

### Define image data generator, initialize


In [39]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Initialize the generator used for the training set: random geometric
# augmentations plus rescaling of pixel values by 1/255
augmented_image_data = ImageDataGenerator(rotation_range=20,
                                          width_shift_range=0.1,
                                          height_shift_range=0.1,
                                        #   brightness_range=[0.8, 1.2],  (disabled)
                                          shear_range=0.1,
                                          zoom_range=0.1,
                                          horizontal_flip=True,
                                          vertical_flip=True,
                                          fill_mode='nearest',
                                          rescale=1./255
                                          )

### Define batch size

In [40]:
batch_size = 20

### Augment TRAINING image dataset


In [None]:
# Stream augmented, shuffled batches from the train folder; class_mode='binary'
# yields one 0/1 label per image, matching the single sigmoid output of the model
train_set = augmented_image_data.flow_from_directory(train_path,
                                                     target_size=image_shape[:2],
                                                     color_mode='rgb',
                                                     batch_size=batch_size,
                                                     class_mode='binary',
                                                     shuffle=True,
                                                     # seed=42,  (not set: shuffle order differs between runs)
                                                     )


# show the label -> index mapping
train_set.class_indices

### Rescale validation image dataset


In [None]:
# Validation images are only rescaled (no augmentation) and served in a fixed order
validation_generator = ImageDataGenerator(rescale=1./255)
validation_set = validation_generator.flow_from_directory(
    val_path,
    target_size=image_shape[:2],
    color_mode='rgb',
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False,
)

validation_set.class_indices

### Rescale test image dataset

In [None]:
# Test images are only rescaled (no augmentation) and served in a fixed order
test_generator = ImageDataGenerator(rescale=1./255)
test_set = test_generator.flow_from_directory(
    test_path,
    target_size=image_shape[:2],
    color_mode='rgb',
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False,
)

test_set.class_indices

### Plot augmented training images

In [None]:
# Show the first image of three training batches. The built-in next() drives
# the iterator directly, so no bare try/except around `.next()` is needed.
for _ in range(3):
    img, label = next(train_set)

    print(f'{img.shape}\n')  # expect: (20, 256, 256, 3)
    plt.imshow(img[0])
    print('--------------')
    plt.show()


### Plot rescaled validation and test images

In [None]:
# validation_set: show the first image of three batches. The built-in next()
# drives the iterator directly, so no bare try/except around `.next()` is needed.
for _ in range(3):
    img, label = next(validation_set)
    print(f'{img.shape}\n')
    plt.imshow(img[0])
    print('--------------')
    plt.show()



In [None]:
# test set: show the first image of three batches. The built-in next()
# drives the iterator directly, so no bare try/except around `.next()` is needed.
for _ in range(3):
    img, label = next(test_set)
    print(f'{img.shape}\n')
    plt.imshow(img[0])
    print('--------------')
    plt.show()

###  Observations
Training images have been augmented, while validation and test images have only been rescaled. In all three sets the pixel values are rescaled from the 0–255 range to between 0 and 1. As you can see, the images are ready to be used for developing and training a CNN model.

### Save class indices

In [None]:
# Persist the label -> index mapping so predictions can be decoded outside this notebook
joblib.dump(value=train_set.class_indices,
            filename=f"{file_path}/class_indices.pkl")

---

# Model Creation

---

### ML Model

* Import model packages

In [57]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from tensorflow.keras.utils import plot_model

* ### Model

In [58]:
def create_tf_model():
    """
    Build and compile the CNN used for binary classification of leaf images.

    Three Conv2D + MaxPooling2D stages (32, 12 and 8 filters) are followed by
    a flattened 128-unit dense layer with 50% dropout and a single sigmoid
    output unit. The input shape is read from the notebook-level
    ``image_shape`` variable.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    conv_kernel = (3, 3)
    pool_window = (2, 2)

    layer_stack = [
        # Input layer: CONV1
        Conv2D(filters=32, kernel_size=conv_kernel,
               input_shape=image_shape,  # average image shape
               activation='relu'),
        MaxPooling2D(pool_size=pool_window),
        # CONV2
        Conv2D(filters=12, kernel_size=conv_kernel, activation='relu'),
        MaxPooling2D(pool_size=pool_window),
        # CONV3
        Conv2D(filters=8, kernel_size=conv_kernel, activation='relu'),
        MaxPooling2D(pool_size=pool_window),
        # Flatten + fully connected head
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        # Output
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # Compile
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])

    return model

## Model Summary

In [None]:
# summary() prints the table and returns None, so there is nothing to assign
create_tf_model().summary(show_trainable=True)

In [None]:
# Draw the architecture from a freshly built model: the `model` variable is
# only assigned later, in the training cell, so it cannot be used here.
# plot_model may fail (for example when its plotting dependencies are missing);
# the diagram is optional, so the error is printed and the notebook continues.
try:
    plot_model(create_tf_model(), show_shapes=True, to_file=f'model_{version}.png')
except Exception as e:
    print(e)


Early Stopping
* Avoid overfitting

In [61]:
# Stop training once val_loss has not improved for 3 consecutive epochs.
# verbose=1 prints a message when stopping (documented values are 0 and 1).
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

***

In [None]:
# Sanity-check the generator sizes before training.
# Note: train_set.classes holds one class index per sample, so its length is
# the sample count; the number of classes comes from class_indices.
print(f'Train set object: {train_set}')
print(f"Number of samples in training set: {train_set.samples}")
print(f"Number of classes: {len(train_set.class_indices)}")
print(f"Batch size: {batch_size}")
print(f"Training steps per epoch (samples/batch_size): {train_set.samples // batch_size}")
print(f"Number of validation samples: {validation_set.samples}")
print(f"Validation steps per epoch: {validation_set.samples // batch_size}")

# Fit Model for training

### Save checkpoints

In [65]:
import os

# Keep checkpoints inside this version's outputs folder (relative path) so
# they sit with the other artefacts instead of under a hard-coded absolute path
checkpoint_folder = f'outputs/{version}/training_checkpoints'

if not os.path.exists(checkpoint_folder):
    os.makedirs(checkpoint_folder)
    print('training checkpoints folder made')

# dynamically include the epoch in checkpoint file name
# ({{epoch:04d}} stays a literal placeholder that Keras fills in per epoch)
checkpoint_path = f"{checkpoint_folder}/cp-{{epoch:04d}}.weights.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)

# callback to save model weights at the end of an epoch; with
# save_best_only=True a file is written only when training accuracy improves
cp_callback = ModelCheckpoint(filepath=checkpoint_path,
                              verbose=1,
                              save_weights_only=True,
                              save_freq='epoch',
                              monitor='accuracy',
                              save_best_only=True)


Brief untrained model evaluation

In [None]:
# Baseline: evaluate a freshly built (untrained) model on the test set
loss, acc = create_tf_model().evaluate(test_set, verbose=2)
print("Untrained model, accuracy: {:5.2f}%".format(100 * acc)) # expect c.50% accuracy

Add CSV logger for history access in case of training runtime errors

In [68]:
history_csv_logger = CSVLogger('training.log', separator=',', append=False)

Define epoch count

In [None]:
EPOCHS = 25

In [None]:
import glob

# Build the model before the try block so `model` always exists in the handler
model = create_tf_model()

# Shared arguments for the initial run and for a resumed run
fit_kwargs = dict(
    epochs=EPOCHS,
    steps_per_epoch=train_set.samples // batch_size,
    validation_data=validation_set,
    callbacks=[early_stop, cp_callback, history_csv_logger],
    verbose=1,
)

try:
    model.fit(train_set, **fit_kwargs)
except Exception as e:
    print(f'{e}\n')

    # Load the latest saved weights. The checkpoints are `.weights.h5` files,
    # so they are located by file name; the zero-padded epoch number makes the
    # lexical sort order match the epoch order.
    saved_weights = sorted(glob.glob(os.path.join(checkpoint_dir, 'cp-*.weights.h5')))
    if not saved_weights:
        # nothing to resume from: surface the original error
        raise
    model.load_weights(saved_weights[-1])

    print('Model restored! Continuing...')
    # continue model training
    model.fit(train_set, **fit_kwargs)


### Save model

In [None]:
# Save the trained model in HDF5 format inside this version's outputs folder.
# A failure is printed rather than raised, so confirm 'model saved!' appears
# before running the cell that reloads the file.
try:
    model.save(f'outputs/{version}/cherry-tree-model.h5')
    print('model saved!')
except Exception as e:
    print(e)

***

# Evaluate Model Performance

Model learning curve

In [None]:
# Per-epoch metrics recorded by the most recent model.fit call
losses = pd.DataFrame(model.history.history)

# Training vs validation loss
sns.set_style("darkgrid")
losses[['loss','val_loss']].plot(style='.-')
plt.title("Loss")
plt.savefig(f'{file_path}/model_training_losses.png', bbox_inches='tight', dpi=150)
plt.show()


# Training vs validation accuracy
print("\n")
losses[['accuracy','val_accuracy']].plot(style='.-')
plt.title("Accuracy")
plt.savefig(f'{file_path}/model_training_acc.png', bbox_inches='tight', dpi=150)
plt.show()

### Evaluate and save

In [None]:
# Use the same Keras namespace (tensorflow.keras) as the rest of the notebook
# so the model is loaded by the implementation that saved it
from tensorflow.keras.models import load_model

model = load_model(f'{file_path}/cherry-tree-model.h5')

In [None]:
# Evaluate the test set once, outside the try block, so a failure on the other
# sets never triggers a second, redundant test evaluation
eval1 = model.evaluate(test_set)

try:
    evaluation_train = model.evaluate(train_set)
    evaluation_val = model.evaluate(validation_set)
except Exception as e:
    # best effort: the test evaluation above is still available
    print(e)

Save evaluations

In [None]:
# Save all three evaluations; if anything fails (for example the train or
# validation evaluation is missing), fall back to saving the test one only
try:
    joblib.dump(value=eval1,filename=f"outputs/{version}/test-evaluation.pkl")
    joblib.dump(value=evaluation_train,filename=f"outputs/{version}/train-evaluation.pkl")
    joblib.dump(value=evaluation_val,filename=f"outputs/{version}/val-evaluation.pkl")
except Exception as e:
    print(e)
    print('-----')
    print('saving successful test eval')
    joblib.dump(value=eval1,filename=f"outputs/{version}/test-evaluation.pkl")


Save backup model to drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp -r outputs/{version} /content/drive/MyDrive/{version}


# Run Live Prediction

Load image as PIL

In [None]:
from tensorflow.keras.preprocessing import image

pointer = 66  # index of the test image to load
label = labels[1]  # choose 0 or 1 (healthy, infected)

# Sort the listing so `pointer` selects the same file on every run
# (os.listdir returns names in arbitrary order)
image_files = sorted(os.listdir(test_path + '/' + label))

# target_size takes (height, width) only, as in the generators above
pil_image = image.load_img(test_path + '/' + label + '/' + image_files[pointer],
                           target_size=image_shape[:2], color_mode='rgb')
print(f'Image shape: {pil_image.size}, Image mode: {pil_image.mode}')
pil_image

Convert prediction image to array for prediction

In [None]:
# Convert to an array, add a leading batch axis and rescale by 1/255 to match
# the rescaling applied by the image generators
pred_img = image.img_to_array(pil_image)
pred_img = np.expand_dims(pred_img, axis=0)/255
print(pred_img.shape)

Predict class probability on test image

In [None]:
# Single sigmoid output for the one image in the batch
pred_proba = model.predict(pred_img)[0, 0]

# Invert class_indices so a 0/1 prediction can be mapped back to its label
target_map = {index: name for name, index in train_set.class_indices.items()}
pred_class = target_map[pred_proba > 0.5]  # define binary boundary

# Report confidence in the predicted class: for index 0 that is 1 - output
pred_proba = 1 - pred_proba if pred_class == target_map[0] else pred_proba

print(f'Prediction: {pred_class}\nConfidence: {pred_proba*100:.2f}%')

## Save backup files to drive

Save tf version used in notebook to file

In [None]:
# tensorflow itself is imported here because the rest of the notebook only
# imports names from tensorflow.keras, leaving `tf` undefined
import tensorflow as tf

tf_version = tf.__version__
print(tf_version)

joblib.dump(value=tf_version,filename=f"outputs/{version}/tf_version.pkl")

In [82]:
!pip freeze > outputs/{version}/colab_requirements.txt

Reconfirm all files saved in backup

In [None]:
from google.colab import drive
drive.mount('/content/drive')


!cp -r outputs/{version} /content/drive/MyDrive/{version}

***