# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import skimage
import os
import glob
import importlib
import wandb
import numpy as np
import matplotlib.pyplot as plt
import egg_class_functions as ecf
import tensorflow as tf
import albumentations as A
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint
from tensorflow.keras.callbacks import ModelCheckpoint

importlib.reload(ecf)

2025-06-27 11:56:55.563474: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-27 11:56:55.572349: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751018215.581612   29765 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751018215.584406   29765 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751018215.593006   29765 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

<module 'egg_class_functions' from '/home/tibor/Documents/Python/mosquito-egg-identification/egg_class_functions.py'>

# Variable setup

In [2]:
seg_data_path = "Data/predicted_segmentation_data.csv"
image_paths = sorted(glob.glob("Data/raw/microscope/**/*.*", recursive=True), key=lambda x: (os.path.dirname(x), os.path.basename(x)))
BATCH_SIZE = 32
EPOCHS = 100
AUTOTUNE = tf.data.AUTOTUNE

In [3]:
df_pred = ecf.segmented_image_import(seg_data_path)
single_egg_df = df_pred.loc[df_pred["single"] == 1].reset_index(drop=True)
single_egg_df = single_egg_df.dropna()
single_egg_df["segment"] = single_egg_df.apply(ecf.rotate_and_pad_rgb_segment, axis=1)
single_egg_df['species'] = single_egg_df['species'].replace("aegypti_old", "aegypti")
single_egg_df['species'] = single_egg_df['species'].replace("albopictus_old", "albopictus")
train, test = train_test_split(single_egg_df, test_size=0.1)

X_train = train['segment']
X_train = np.stack(X_train.to_list()).astype(np.float32)
y_train = train['species']
X_test = test['segment']
X_test = np.stack(X_test.to_list()).astype(np.float32)
y_test = test['species']
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_train_onehot = to_categorical(y_train_encoded)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train_onehot, test_size=0.1, random_state=42)

In [6]:
print(f"albopictus: {sum(single_egg_df['species'] == 'albopictus')}")
print(f"aegypti: {sum(single_egg_df['species'] == 'aegypti')}")

albopictus: 152
aegypti: 160


### Albumentation image augmenation

In [4]:
transforms = A.Compose([
    A.HorizontalFlip(),
    A.VerticalFlip(),
    A.ShiftScaleRotate(shift_limit=(0, 0.0625), scale_limit=0.0, rotate_limit=(-10, 10)),
    A.RandomBrightnessContrast(),
    A.RandomGamma(),
])

def albumentations_augment(image):
    """
    Image: Numpy-Array [H,W,3] float32 (0-255)
    Output: Augmentiertes Image (float32, 0-1)
    """
    image = image.astype(np.uint8)  # Albumentations erwartet uint8-Bilder
    augmented = transforms(image=image)
    aug_image = augmented['image'].astype(np.float32) / 255.0
    return aug_image


def tf_albumentations_augment(image, label):
    aug_image = tf.numpy_function(albumentations_augment, [image], tf.float32)
    aug_image.set_shape(image.shape)
    return aug_image, label

train_ds = ecf.prepare_dataset_alb(X_train_split, y_train_split, tf_albumentations_augment, BATCH_SIZE)
val_ds = ecf.prepare_dataset_alb(X_val_split, y_val_split, None, BATCH_SIZE, shuffle=False)

  original_init(self, **validated_kwargs)
I0000 00:00:1751017361.032893   23882 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6285 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:01:00.0, compute capability: 8.6


### Tensorflow image augmentation

In [4]:
data_augmentation = tf.keras.Sequential([
  layers.RandomFlip("horizontal_and_vertical"),
  layers.RandomRotation(0.0277),
  layers.RandomBrightness(factor=0.2),
  layers.RandomContrast(factor=0.2)
])

train_ds = ecf.prepare_dataset_tf(X_train_split, y_train_split, data_augmentation, BATCH_SIZE)
val_ds = ecf.prepare_dataset_tf(X_val_split, y_val_split, data_augmentation, BATCH_SIZE, shuffle=False)

I0000 00:00:1751014987.807521    6338 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6314 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:01:00.0, compute capability: 8.6


# Model Training

In [5]:
base_model = tf.keras.applications.EfficientNetV2B0(
    input_shape=(200, 200, 3),
    include_top=False,
    weights="imagenet"
)
base_model.trainable = False

model = tf.keras.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(2, activation='softmax')
])

In [6]:
wandb.init(project="egg-classification", config={
    "architecture": "EfficientNetV2B0",
    "input_shape": (200, 200, 3),
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "optimizer": "adam",
    "loss": "categorical_crossentropy",
    "num_classes": 2
})


local_checkpoint = ModelCheckpoint(
    filepath="models/model.{epoch:02d}.h5",
    save_best_only=True,
    save_weights_only=False
)

wandb_checkpoint = WandbModelCheckpoint(
    filepath="models-wandb/model-{epoch:02d}.keras",
    save_best_only=True
)

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_ds,
                    epochs=EPOCHS,
                    validation_data=val_ds,
                    callbacks=[WandbMetricsLogger(),
                               wandb_checkpoint,
                               local_checkpoint
                                ]
                    )

y_true = []
y_pred = []

for x, y in val_ds:
    preds = model.predict(x)
    y_true.extend(np.argmax(y.numpy(), axis=1))
    y_pred.extend(np.argmax(preds, axis=1))

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
wandb.log({"confusion_matrix": wandb.plot.confusion_matrix(
    probs=None,
    y_true=y_true,
    preds=y_pred,
    class_names=["aegypti", "albopictus"]
)})

wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mtibor-schaffrin[0m ([33mtibor-schaffrin-universit-t-hamburg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/100


I0000 00:00:1751017373.690634   23945 service.cc:152] XLA service 0x7baa94015e80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751017373.690648   23945 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2025-06-27 11:42:53.881778: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1751017374.773807   23945 cuda_dnn.cc:529] Loaded cuDNN version 90501









[1m6/8[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 12ms/step - accuracy: 0.4744 - loss: 0.8015 

I0000 00:00:1751017381.850817   23945 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.










[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.4776 - loss: 0.7975  



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2s/step - accuracy: 0.4792 - loss: 0.7954 - val_accuracy: 0.3929 - val_loss: 0.9390
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.4766 - loss: 0.7531 - val_accuracy: 0.3929 - val_loss: 0.9414
Epoch 3/100
[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 14ms/step - accuracy: 0.5424 - loss: 0.6940



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 119ms/step - accuracy: 0.5179 - loss: 0.7047 - val_accuracy: 0.3929 - val_loss: 0.9185
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.5057 - loss: 0.7051 - val_accuracy: 0.3929 - val_loss: 0.9393
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.6034 - loss: 0.6704 - val_accuracy: 0.3929 - val_loss: 0.9311
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.4733 - loss: 0.7209 - val_accuracy: 0.3929 - val_loss: 0.9544
Epoch 7/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.5120 - loss: 0.6991 - val_accuracy: 0.3929 - val_loss: 0.9675
Epoch 8/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.4678 - loss: 0.7018 - val_accuracy: 0.3929 - val_loss: 0.9654
Epoch 9/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 116ms/step - accuracy: 0.5048 - loss: 0.7025 - val_accuracy: 0.3929 - val_loss: 0.9093
Epoch 16/100
[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 15ms/step - accuracy: 0.5352 - loss: 0.6993



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 114ms/step - accuracy: 0.5092 - loss: 0.7028 - val_accuracy: 0.3929 - val_loss: 0.8993
Epoch 17/100
[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 14ms/step - accuracy: 0.5660 - loss: 0.6993



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 231ms/step - accuracy: 0.5416 - loss: 0.7106 - val_accuracy: 0.3929 - val_loss: 0.8819
Epoch 18/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.4569 - loss: 0.7270 - val_accuracy: 0.3929 - val_loss: 0.8930
Epoch 19/100
[1m5/8[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 14ms/step - accuracy: 0.5182 - loss: 0.6926



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 150ms/step - accuracy: 0.5067 - loss: 0.6983 - val_accuracy: 0.3929 - val_loss: 0.8798
Epoch 20/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.4770 - loss: 0.7145 - val_accuracy: 0.3929 - val_loss: 0.8903
Epoch 21/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.4936 - loss: 0.6981 - val_accuracy: 0.3929 - val_loss: 0.8901
Epoch 22/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.4886 - loss: 0.6925 - val_accuracy: 0.3929 - val_loss: 0.9019
Epoch 23/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.5380 - loss: 0.6937 - val_accuracy: 0.3929 - val_loss: 0.9053
Epoch 24/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.4540 - loss: 0.7035 - val_accuracy: 0.3929 - val_loss: 0.9093
Epoch 25/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step - accuracy: 0.4940 - loss: 0.7064 - val_accuracy: 0.3929 - val_loss: 0.8754
Epoch 40/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.4025 - loss: 0.7008 - val_accuracy: 0.3929 - val_loss: 0.8783
Epoch 41/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.4821 - loss: 0.6963 - val_accuracy: 0.3929 - val_loss: 0.8824
Epoch 42/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.4443 - loss: 0.7144 - val_accuracy: 0.3929 - val_loss: 0.8832
Epoch 43/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.4285 - loss: 0.7204 - val_accuracy: 0.3929 - val_loss: 0.9045
Epoch 44/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.5061 - loss: 0.6987 - val_accuracy: 0.3929 - val_loss: 0.9041
Epoch 45/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━

2025-06-27 11:43:57.035071: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch/accuracy,▄▄█▆▆▅▂▄▃▆▅▄▆▆▅▁▂▃▅▆▆▅▅▅▅▅▃▆▂▅▃▁▆▅█▆▅▆▃▅
epoch/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▅▁▄▃▄▃▄▄▃▃▃▃▃▃▃▆▄▃▂▃▃▃▃▃▃▃▃▃▂▂▂▃▃▂▂▃▃▃▃
epoch/val_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/val_loss,▆▇██▆▂▃▃▃▂▂▂▁▁▃▄▄▄▃▃▃▃▄▄▄▄▄▄▅▅▄▄▄▄▅▅▅▅▄▄

0,1
epoch/accuracy,0.50794
epoch/epoch,99.0
epoch/learning_rate,0.001
epoch/loss,0.69488
epoch/val_accuracy,0.39286
epoch/val_loss,0.91272
