In [1]:
import os

import tensorflow as tf
from tensorflow import keras

AUTOTUNE = tf.data.experimental.AUTOTUNE

import numpy as np

import matplotlib.pyplot as plt

import matplotlib.image as mpimg

import kaggle

from definitions import *

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Preprocess files from raw dataset

### Download data

In [2]:
# Dataset download is currently disabled; uncomment both lines below to fetch the raw data again.
# NOTE(review): dataset_name and raw_data_dir are presumably provided by `definitions` - confirm.
#kaggle.api.authenticate()
#kaggle.api.dataset_download_files(dataset_name, path=raw_data_dir, unzip=True, quiet=False)

<span style="color:red"> !!! TODO: next steps should be: </span>
    1. Take the images from the bee1 and bee2 directories, create a folder named bee, and save them there
    2. Take the images from the wasp1 and wasp2 directories, create a folder named wasp, and save them there
    3. Delete: example_notebook, label_generator, labels.csv and README.md
    4. Move all remaining folders one level up in a directory tree, to raw_data_dir directory.
        
<span style="color:red"> !!! For now this has to be done manually </span>

### Create list of files and labels

In [3]:
# Every immediate sub-directory of raw_data_dir is treated as one class folder.
subdirs = [entry.name for entry in os.scandir(raw_data_dir) if entry.is_dir()]

# Gather the full path of every regular file found directly inside a class folder.
files = []
for subdir in subdirs:
    subdir_path = os.path.join(raw_data_dir, subdir)
    files += [os.path.join(subdir_path, f.name) for f in os.scandir(subdir_path) if f.is_file()]

# Remove non-image files from the dataset.
# `not_jpg` is kept so the rejected files can still be inspected; the filter itself is a
# single pass over `files` instead of a list-membership test per file.
not_jpg = [f for f in files if not f.endswith(".jpg")]
files = [file for file in files if file.endswith(".jpg")]

# Sort files to maintain a deterministic order
files = sorted(files)

# The label of a file is the name of its parent directory. Using os.path instead of
# splitting on "\\" keeps this working on platforms whose path separator is not a backslash.
labels = [os.path.basename(os.path.dirname(file)) for file in files]

### Encode labels

In [4]:
# Give every distinct label name an integer id, numbered in sorted order of the names.
label2index = {name: position for position, name in enumerate(sorted(set(labels)))}
# Translate the per-file label names into their integer ids, keeping the file order.
encoded_labels = list(map(label2index.__getitem__, labels))

In [5]:
# Display the distinct integer ids that were assigned to the labels.
set(encoded_labels)

{0, 1, 2, 3}

### Split files into training, testing and validation

In [6]:
# Dataset-wide totals: number of image files and number of distinct label classes.
NUMBER_OF_FILES, NUMBER_OF_LABELS = len(files), len(label2index)

In [7]:
from sklearn.model_selection import train_test_split

# Step 1: hold out the test portion from the complete list of files and labels.
train_files, test_files, train_labels, test_labels = train_test_split(
    files,
    encoded_labels,
    test_size=TEST_SPLIT_FACTOR,
    random_state=1969,
)

# Step 2: carve the validation portion out of what remained after step 1,
# so VAL_SPLIT_FACTOR is relative to the remaining training data, not to the full dataset.
train_files, val_files, train_labels, val_labels = train_test_split(
    train_files,
    train_labels,
    test_size=VAL_SPLIT_FACTOR,
    random_state=1969,
)

In [8]:
# TO DELETE: temporary sanity check of the split sizes (train, test, val, then their total).
split_sizes = [len(train_files), len(test_files), len(val_files)]
for split_size in split_sizes:
    print(split_size)
print(sum(split_sizes))

7308
2284
1828
11420


### <span style="color:red"> TODO: Data augmentation on: train_files, test_files, val_files </span>
#### We also have to remember to augment the labels accordingly

### Prepare images in dataset

In [9]:
def prepare_image(image):
    """Re-encode a JPEG image at the target resolution.

    Args:
        image: Encoded JPEG content, as produced by ``tf.io.read_file``.

    Returns:
        The image decoded to 3 channels, resized to ``IMG_SIZE`` x ``IMG_SIZE``
        and encoded as JPEG again.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    resized = tf.image.resize(decoded, [IMG_SIZE, IMG_SIZE])
    # The resized tensor is cast back to uint8 before it is handed to the JPEG encoder.
    pixels = tf.cast(resized, tf.uint8)

    return tf.image.encode_jpeg(
        pixels,
        optimize_size=True,
        x_density=96,
        y_density=96
    )

def load_and_prepare_image(path):
    """Read the file at ``path`` and return the result of ``prepare_image`` on its content."""
    return prepare_image(tf.io.read_file(path))

# Create TFRecord files

### Create split datasets

In [16]:
# One dataset of file paths per split (train, test, val) ...
train_images_ds, test_images_ds, val_images_ds = (
    tf.data.Dataset.from_tensor_slices(paths)
    for paths in (train_files, test_files, val_files)
)

# ... and one dataset of encoded labels per split, in the same order as the paths.
train_labels_ds, test_labels_ds, val_labels_ds = (
    tf.data.Dataset.from_tensor_slices(targets)
    for targets in (train_labels, test_labels, val_labels)
)

### Apply preprocessing to images datasets

In [17]:
# Replace every file path with the prepared (resized, re-encoded) image read from that path.
# AUTOTUNE (defined next to the imports) lets tf.data pick the level of parallelism instead
# of a hard-coded worker count; the order of the elements is not affected.
# Gotcha: the datasets are rebound in place, so run this cell only once per kernel session -
# a second run would pass image content to load_and_prepare_image where it expects a path.
train_images_ds = train_images_ds.map(load_and_prepare_image, num_parallel_calls=AUTOTUNE)
test_images_ds  =  test_images_ds.map(load_and_prepare_image, num_parallel_calls=AUTOTUNE)
val_images_ds   =   val_images_ds.map(load_and_prepare_image, num_parallel_calls=AUTOTUNE)

### Save images datasets to binary files

In [18]:
# Serialize every prepared image tensor so that it can be stored as a single TFRecord entry.
# AUTOTUNE (defined next to the imports) replaces the hard-coded worker count; the order of
# the elements is not affected.
# Gotcha: the datasets are rebound in place, so run this cell only once per kernel session -
# a second run would serialize the already-serialized tensors again.
train_images_ds = train_images_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)
test_images_ds  =  test_images_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)
val_images_ds   =   val_images_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)

# One writer per split, each bound to the images file of that split.
train_writer = tf.data.experimental.TFRecordWriter(train_images_file)
test_writer  = tf.data.experimental.TFRecordWriter( test_images_file)
val_writer   = tf.data.experimental.TFRecordWriter(  val_images_file)

# <span style="color:red">Do not run the code below!!!</span>
### ...unless you're 100% sure you know why you are doing this
This will overwrite our dataset, and it will no longer be consistent with the previous version

In [19]:
 # Writing replaces the stored image records, so an existing file is only overwritten
 # when this flag is deliberately switched to True.
 OVERWRITE_IMAGE_RECORDS = False

 for writer, dataset, target_file in (
     (train_writer, train_images_ds, train_images_file),
     (test_writer, test_images_ds, test_images_file),
     (val_writer, val_images_ds, val_images_file),
 ):
     # Skip files that already exist so a plain "Run All" cannot clobber the dataset.
     if tf.io.gfile.exists(target_file) and not OVERWRITE_IMAGE_RECORDS:
         print("Skipping existing file (set OVERWRITE_IMAGE_RECORDS = True to regenerate): {}".format(target_file))
         continue
     writer.write(dataset)

### Save labels datasets to binary files

In [20]:
# Serialize every encoded label so that it can be stored as a single TFRecord entry.
# AUTOTUNE (defined next to the imports) replaces the hard-coded worker count; the order of
# the elements is not affected.
# Gotcha: the datasets are rebound in place, so run this cell only once per kernel session -
# a second run would serialize the already-serialized tensors again.
train_labels_ds = train_labels_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)
test_labels_ds  =  test_labels_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)
val_labels_ds   =   val_labels_ds.map(tf.io.serialize_tensor, num_parallel_calls=AUTOTUNE)

# The writer names are rebound here: from this cell on they target the labels files.
train_writer = tf.data.experimental.TFRecordWriter(train_labels_file)
test_writer  = tf.data.experimental.TFRecordWriter( test_labels_file)
val_writer   = tf.data.experimental.TFRecordWriter(  val_labels_file)

# <span style="color:red">Do not run the code below!!!</span>
### ...unless you're 100% sure you know why you are doing this
This will overwrite our dataset, and it will no longer be consistent with the previous version

In [21]:
# Writing replaces the stored label records, so an existing file is only overwritten
# when this flag is deliberately switched to True.
OVERWRITE_LABEL_RECORDS = False

for writer, dataset, target_file in (
    (train_writer, train_labels_ds, train_labels_file),
    (test_writer, test_labels_ds, test_labels_file),
    (val_writer, val_labels_ds, val_labels_file),
):
    # Skip files that already exist so a plain "Run All" cannot clobber the dataset.
    if tf.io.gfile.exists(target_file) and not OVERWRITE_LABEL_RECORDS:
        print("Skipping existing file (set OVERWRITE_LABEL_RECORDS = True to regenerate): {}".format(target_file))
        continue
    writer.write(dataset)