## Loading the splits and building the input pipeline

In [1]:
import pandas as pd
import numpy as np
import warnings
#import mlflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers




In [6]:
# loading the data splits

from pathlib import Path

split_dir = Path("../data/splits")


def load_split(name):
    """Load the path and label arrays saved for one data split.

    Args:
        name: Split prefix used in the file names, e.g. "train" reads
            ``train_paths.npy`` and ``train_labels.npy`` from ``split_dir``.

    Returns:
        Tuple ``(paths, labels)`` of NumPy arrays.

    Raises:
        ValueError: If the two arrays do not have the same length.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
    # which can execute code. Only load split files produced by this project;
    # saving the paths as a fixed-width string array would remove the need for it.
    paths = np.load(split_dir / f"{name}_paths.npy", allow_pickle=True)
    labels = np.load(split_dir / f"{name}_labels.npy", allow_pickle=True)
    if len(paths) != len(labels):
        raise ValueError(
            f"{name} split is inconsistent: {len(paths)} paths vs {len(labels)} labels"
        )
    return paths, labels


train_paths, train_labels = load_split("train")
val_paths, val_labels = load_split("val")
test_paths, test_labels = load_split("test")

# Quick sanity check: split sizes and the first training sample.
print(len(train_paths), len(val_paths), len(test_paths))
print(train_paths[0], train_labels[0])


14034 3509 3096
..\data\raw\PlantVillage\YellowLeaf__Curl_Virus\60d14bc3-b703-4b83-8bf9-f13124970145___YLCV_GCREC 2934.JPG 7


In [8]:
# Build TensorFlow datasets from the loaded path/label arrays

import tensorflow as tf

# Pipeline configuration shared by every dataset built in this cell.
SEED = 42              # seed handed to Dataset.shuffle
BATCH_SIZE = 32        # number of samples grouped into each batch
IMG_SIZE = (224, 224)  # target size passed to tf.image.resize

def load_image(path, label):
    """Read one image file and return it as a resized float32 tensor with its label.

    Args:
        path: Scalar string tensor holding the path of the image file.
        label: Label associated with the image; passed through unchanged.

    Returns:
        Tuple ``(img, label)`` where ``img`` is the decoded 3-channel image
        resized to ``IMG_SIZE`` and cast to float32. No rescaling is applied
        here, so pixel values stay on the 0..255 scale.
    """
    raw = tf.io.read_file(path)  # raw encoded file contents
    img = tf.image.decode_jpeg(raw, channels=3)  # uint8 tensor with 3 (RGB) channels
    img = tf.image.resize(img, IMG_SIZE)  # resize to the configured target size
    img = tf.cast(img, tf.float32)  # explicit float32; values are kept in [0..255]
    return img, label

def make_dataset(paths, labels, shuffle=False):
    """Build a batched, prefetched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        paths: NumPy array of image file paths (converted with ``astype(str)``).
        labels: Labels aligned element-wise with ``paths``.
        shuffle: When True, the samples are shuffled (with ``SEED``) before
            being decoded. Meant for the training split so the model does not
            see the data in the same order every epoch.

    Returns:
        A ``tf.data.Dataset`` whose elements are batches of up to
        ``BATCH_SIZE`` samples produced by ``load_image``.
    """
    # Each element starts out as a lightweight (path, label) pair.
    ds = tf.data.Dataset.from_tensor_slices((paths.astype(str), labels))
    if shuffle:
        # Shuffle *before* decoding: the buffer then holds short strings instead
        # of decoded images, so it can span the whole split. That gives a
        # uniform shuffle rather than one limited to a 1000-image window.
        # max(..., 1) keeps the buffer size valid for an empty split.
        ds = ds.shuffle(max(len(paths), 1), seed=SEED)
    # Decode and resize images in parallel.
    ds = ds.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    # Group samples into batches and fetch upcoming batches in the background.
    return ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Only the training split is shuffled; validation and test are built without shuffling.
train_ds = make_dataset(train_paths, train_labels, shuffle=True)
val_ds = make_dataset(val_paths, val_labels, shuffle=False)
test_ds = make_dataset(test_paths, test_labels, shuffle=False)