In [23]:
import pickle
import os
import numpy as np 
import tensorflow.compat.v1 as tf

AUTOTUNE = tf.data.experimental.AUTOTUNE


def get_dataset(n_devices, batch_size, normalize, dtype):
    """Get DeFungi dataset splits."""
    if batch_size % n_devices:
        raise ValueError("Batch size %d isn't divided evenly by n_devices %d" %
                     (batch_size, n_devices))

    relative_path = os.path.join('..', 'EDA', 'Dataset')
    
    # Dataset that is all grayscale, or all RGB. Pulled from ../EDA/Dataset
    all_dataset = tf.data.Dataset.list_files(os.path.join(relative_path, f'*/{dtype}_*.jpg'))
    
    print('Inspecting the first 5 filepaths to make sure nothing is awry\n')
    # Take and display the first 5 file paths
    for file_path in all_dataset.take(5).as_numpy_iterator():
        print(file_path.decode())

    def decode(x):
        # Read and decode the image
        image = tf.io.read_file(x)
        image = tf.image.decode_jpeg(image, channels=3)
        
        # Extract class label from the directory path
        class_name_tensor = tf.strings.regex_replace(x, r'.*/Dataset/([a-zA-Z0-9]{2}).*', r'\1')
        class_name_str = tf.py_function(lambda x: x.numpy().decode("utf-8"), [class_name_tensor], tf.string)
        
        
        # Define different class labels as a dict
        class_labels = {'H1': 0, 'H2': 1, 'H3': 2, 'H5': 3, 'H6': 4}
        
        # Create inputs and targets dictionary
        decoded = {
        'inputs':
            tf.cast(tf.image.rgb_to_grayscale(image), dtype=tf.int32),
        'targets':
            class_labels[class_name_str.numpy().decode()]
        }
        if normalize:
            decoded['inputs'] = decoded['inputs'] / 255
        return decoded
    
    # Maps the entire dataset to its corresponding label (supervised learning)
    # This takes a bit to run
    all_dataset = all_dataset.map(decode, num_parallel_calls=AUTOTUNE)
    
    # Check the first 20 members of all_dataset
    print('Inspecting the first 20 images after extraction and labelling\n')
    for file_path in all_dataset.take(20).as_numpy_iterator():
        print(file_path.decode())
    
    # Shuffle, then split all_dataset into training, testing, validating
    train_ratio = 0.7
    val_ratio = 0.15
    test_ratio = 0.15
    
    all_size = all_dataset.cardinality().numpy()


    # train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.batch(batch_size, drop_remainder=True)

    train_dataset = train_dataset.shuffle(
        buffer_size=256, reshuffle_each_iteration=True)

    return train_dataset, val_dataset, test_dataset, 10, 256, (batch_size, 32, 32,
                                                             1)



# Pass in grayscale or RGB selection from Python args
dtype = input('Enter `rgb` if you want an rgb dataset, `grayscale` otherwise: ')

# Return trainset, evalset, testset
train_ds, eval_ds, test_ds, num_classes, vocab_size, input_shape = get_dataset(1, 256, True, dtype)

# Map the 3 sets, encode in binary 
mapping = {"train": train_ds, "dev": eval_ds, "test": test_ds}
for component in mapping:
    ds_list = []
    for idx, inst in enumerate(iter(mapping[component])):
        ds_list.append({
            "input_ids_0": inst["inputs"].numpy()[0].reshape(-1),
            "label": inst["targets"].numpy()[0]
        })
        if idx % 100 == 0:
            print(f"{idx}\t\t", end="\r")
    with open(f"../defungi_datasets/image.{component}.pickle", "wb") as f:
        pickle.dump(ds_list, f)


Enter `rgb` if you want an rgb dataset, `grayscale` otherwise:  rgb


Inspecting the first 5 filepaths to make sure nothing is awry

../EDA/Dataset/H6_rgb/rgb_H6_50a_5.jpg
../EDA/Dataset/H5_rgb_pt2/rgb_H5_141a_6.jpg
../EDA/Dataset/H2/rgb_H2_95d_3.jpg
../EDA/Dataset/H5_rgb_pt1/rgb_H5_9a_3.jpg
../EDA/Dataset/H5_rgb_pt1/rgb_H5_35b_1.jpg


AttributeError: in user code:

    File "/tmp/ipykernel_44568/1903835726.py", line 39, in decode  *
        decoded = {

    AttributeError: 'SymbolicTensor' object has no attribute 'numpy'
