<a href="https://colab.research.google.com/github/Peter-Phi-Tran/AI-caramba/blob/cv/ai_caramba_plant_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
!pip install -q tensorflow

In [29]:
import os

from google.colab import files

# files.upload() returns a {filename: bytes} mapping. Colab renames an upload
# when a file with the same name already exists (e.g. "kaggle (1).json"), so
# copying "kaggle.json" from the working directory can install a stale file.
# Write the bytes that were just uploaded instead.
uploaded = files.upload()

credentials = uploaded.get("kaggle.json")
if credentials is None:
    if len(uploaded) != 1:
        raise ValueError(
            "Expected exactly one uploaded kaggle.json file, got: "
            f"{sorted(uploaded)}"
        )
    credentials = next(iter(uploaded.values()))

kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_path = os.path.join(kaggle_dir, "kaggle.json")

# Create the file owner-only from the start; the API token must not be
# readable by other users.
fd = os.open(kaggle_json_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "wb") as token_file:
    token_file.write(credentials)
# A pre-existing file keeps its old mode through os.open, so enforce it.
os.chmod(kaggle_json_path, 0o600)

Saving kaggle.json to kaggle (1).json


In [80]:
import os

import kagglehub

# Fetch the latest version of the dataset and locate its "datasets" folder.
dataset_handle = "gverzea/edible-wild-plants"
dataset_root = kagglehub.dataset_download(dataset_handle)
datasets_path = os.path.join(dataset_root, "datasets")

# Show what was downloaded so the folder layout is visible in the notebook.
print("Path to dataset files:", dataset_root)
print("Root folder contents:", os.listdir(dataset_root))
print("Datasets folder contains:", os.listdir(datasets_path))

Path to dataset files: /kaggle/input/edible-wild-plants
Root folder contents: ['datasets', 'final_model_weights.hdf5', 'edible wild plants metadata.xls', 'vanilla_model_weights.hdf5']
Datasets folder contains: ['dataset', 'dataset-user_images', 'dataset-test']


In [81]:
import tensorflow as tf

# Input resolution and batch size shared by every dataset loader below.
img_size = (224, 224)
batch_size = 32

# Source folders inside the downloaded dataset: training images and the
# held-out test images.
train_dir, test_dir = (
    os.path.join(datasets_path, subfolder)
    for subfolder in ("dataset", "dataset-test")
)

In [82]:
import shutil

# Writable copies of the train and test datasets under /kaggle/working.
working_train_dir = "/kaggle/working/train_dataset"
working_test_dir = "/kaggle/working/test_dataset"

# Copy each tree only if its destination does not exist yet, so re-running
# the cell does not repeat the copy.
for source_dir, working_dir in (
    (train_dir, working_train_dir),
    (test_dir, working_test_dir),
):
    if not os.path.exists(working_dir):
        shutil.copytree(source_dir, working_dir)

print("Train dir copied to:", working_train_dir)
print("Test dir copied to:", working_test_dir)

Train dir copied to: /kaggle/working/train_dataset
Test dir copied to: /kaggle/working/test_dataset


In [83]:
from PIL import Image, UnidentifiedImageError
import os

def _unique_jpeg_path(source_path):
    """Return a ``.jpg`` path for ``source_path`` that clobbers no other file."""
    stem = os.path.splitext(source_path)[0]
    candidate = stem + ".jpg"
    counter = 1
    # Landing on the source's own path is fine (it is replaced in place);
    # any *other* existing file must be left untouched.
    while candidate != source_path and os.path.exists(candidate):
        candidate = f"{stem}_{counter}.jpg"
        counter += 1
    return candidate


def force_clean_dataset(directories, bad_formats=("WEBP", "MPO")):
    """
    Convert every image whose *actual* format is in ``bad_formats`` to JPEG.

    The format is taken from the decoded file header, not from the file
    extension, so mislabeled files (e.g. WEBP data saved as ``photo.jpg``)
    are converted too. The original file is removed once the JPEG has been
    written successfully.

    Args:
        directories: Iterable of directory paths to walk recursively.
        bad_formats: Pillow format names that must be re-encoded as JPEG.

    Returns:
        None. Per-file results and a per-directory summary are printed.
        Files that cannot be opened or converted are counted as failed and
        left in place.
    """
    for directory in directories:
        print(f"\n🔍 Scanning {directory} ...")
        converted, failed = 0, 0

        for root, _, filenames in os.walk(directory):
            for filename in filenames:
                file_path = os.path.join(root, filename)
                tmp_path = None
                try:
                    with Image.open(file_path) as img:
                        fmt = img.format  # actual format, not guessed by extension
                        if fmt not in bad_formats:
                            continue
                        # convert() decodes the pixels into a new image, so the
                        # source file can be closed before anything is written.
                        rgb = img.convert("RGB")

                    new_path = _unique_jpeg_path(file_path)
                    # Write to a temporary sibling and move it into place, so a
                    # failed save never leaves a truncated image behind.
                    tmp_path = new_path + ".tmp"
                    rgb.save(tmp_path, "JPEG")
                    os.replace(tmp_path, new_path)
                    tmp_path = None

                    # When the source already ended in ".jpg" the JPEG replaced
                    # it in place; deleting file_path would delete the result.
                    if new_path != file_path:
                        os.remove(file_path)

                    converted += 1
                    print(f"✅ {fmt} → JPEG:", file_path, "→", new_path)
                except (UnidentifiedImageError, OSError) as e:
                    failed += 1
                    print("⚠️ Failed to process:", file_path, "| Error:", e)
                finally:
                    # Drop a leftover temporary file from a failed conversion.
                    if tmp_path is not None and os.path.exists(tmp_path):
                        os.remove(tmp_path)

        print(f"✔️ Finished {directory}. Converted {converted}, Failed {failed}.")

# Run it on both train and test dirs
force_clean_dataset([working_train_dir, working_test_dir])

# Recheck formats: count the actual (decoded) format of every training file.
# The loop variable is `filenames`, not `files`: a module-level `files` would
# rebind the `google.colab.files` import that other cells rely on.
formats = {}
for root, _, filenames in os.walk(working_train_dir):
    for filename in filenames:
        file_path = os.path.join(root, filename)
        try:
            with Image.open(file_path) as img:
                fmt = img.format
                formats[fmt] = formats.get(fmt, 0) + 1
        except Exception as e:
            # Best-effort report: keep scanning and list unreadable files.
            print("Unreadable:", file_path, e)

print("\n✅ Formats after cleaning:", formats)


🔍 Scanning /kaggle/working/train_dataset ...
✔️ Finished /kaggle/working/train_dataset. Converted 0, Failed 0.

🔍 Scanning /kaggle/working/test_dataset ...
✔️ Finished /kaggle/working/test_dataset. Converted 0, Failed 0.

✅ Formats after cleaning: {'JPEG': 6539, 'PNG': 8}


In [84]:
# Both subsets must be built from the same directory, split fraction and seed
# so the training and validation files do not overlap. Sharing one kwargs
# dict keeps the two calls from drifting apart.
split_kwargs = dict(
    directory=working_train_dir,  # use cleaned dataset
    validation_split=0.2,
    seed=123,
    image_size=img_size,
    batch_size=batch_size,
)

# tf.keras.utils is the current home of this loader; the
# tf.keras.preprocessing path is a deprecated alias.
train_ds = tf.keras.utils.image_dataset_from_directory(
    subset="training", **split_kwargs
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    subset="validation", **split_kwargs
)

# Class labels inferred from the sub-directory names; the model head is sized
# from this list.
class_names = train_ds.class_names
assert val_ds.class_names == class_names, "train/validation class lists differ"

Found 6547 files belonging to 62 classes.
Using 5238 files for training.
Found 6547 files belonging to 62 classes.
Using 1309 files for validation.


In [85]:
# Random augmentation pipeline: horizontal flip, rotation (factor 0.2) and
# zoom (factor 0.1), applied in that order.
data_augmentation = tf.keras.Sequential()
data_augmentation.add(tf.keras.layers.RandomFlip("horizontal"))
data_augmentation.add(tf.keras.layers.RandomRotation(0.2))
data_augmentation.add(tf.keras.layers.RandomZoom(0.1))

In [86]:
from tensorflow.keras import layers, models

# The label list comes straight from the training dataset; relying on a
# `class_names` variable left over from an earlier session raises NameError
# on a fresh kernel.
class_names = train_ds.class_names

# Derive the network input shape from the loader's image size so the two
# cannot disagree.
input_shape = tuple(img_size) + (3,)

# Load pretrained MobileNetV2 base
base_model = tf.keras.applications.MobileNetV2(
    input_shape=input_shape,
    include_top=False,
    weights="imagenet"
)
base_model.trainable = False  # Freeze base for now

# Build the full model
inputs = tf.keras.Input(shape=input_shape)
x = data_augmentation(inputs)               # Apply augmentation
x = tf.keras.applications.mobilenet_v2.preprocess_input(x)
x = base_model(x, training=False)           # Run the frozen base in inference mode
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.3)(x)                  # Regularization
# One softmax output per class.
outputs = layers.Dense(len(class_names), activation="softmax")(x)

model = tf.keras.Model(inputs, outputs)

# Sparse loss: the dataset loader yields integer class indices by default.
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

In [87]:
# Number of passes over the training set; increase if dataset is bigger.
epochs = 15

# Train on the training split and report validation metrics after each epoch.
history = model.fit(x=train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 2s/step - accuracy: 0.2630 - loss: 3.3060 - val_accuracy: 0.5615 - val_loss: 1.6945
Epoch 2/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m348s[0m 2s/step - accuracy: 0.5441 - loss: 1.6766 - val_accuracy: 0.6417 - val_loss: 1.3634
Epoch 3/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 2s/step - accuracy: 0.6236 - loss: 1.3684 - val_accuracy: 0.6585 - val_loss: 1.2550
Epoch 4/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m355s[0m 2s/step - accuracy: 0.6769 - loss: 1.1649 - val_accuracy: 0.6868 - val_loss: 1.1770
Epoch 5/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 2s/step - accuracy: 0.7017 - loss: 1.0511 - val_accuracy: 0.6921 - val_loss: 1.1187
Epoch 6/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 2s/step - accuracy: 0.7092 - loss: 0.9824 - val_accuracy: 0.7021 - val_loss: 1.0881
Epoch 7/15
[1m164/164

In [88]:
# Evaluate on the cleaned copy of the test set (the same copy that
# force_clean_dataset processed), not on the raw download.
test_ds = tf.keras.utils.image_dataset_from_directory(
    working_test_dir,
    image_size=img_size,
    batch_size=batch_size
)

# Labels are integer indices into the inferred class list, so the test folders
# must produce the same class order as training for the metric to be valid.
assert test_ds.class_names == train_ds.class_names, (
    "Test classes do not match training classes; label indices would be misaligned."
)

loss, acc = model.evaluate(test_ds)
print(f"Test accuracy: {acc*100:.2f}%")

Found 310 files belonging to 62 classes.
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1s/step - accuracy: 0.5477 - loss: 1.8307
Test accuracy: 54.19%


In [91]:
from google.colab import files

# Save the trained model as a single .keras file, then trigger a browser
# download of that same file.
model_filename = "plant_classifier_modelv1.keras"
model.save(model_filename)
files.download(model_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>