In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import random
import os
import multiprocessing as mp
import hashlib
from PIL import Image
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam

In [4]:
SPLITS_ID = 1

In [5]:
DATASET_DIR = "/kaggle/input/motocycledataset/Data"
BATCH_SIZE = 64
SEED = 42
WIDTH = 224
HEIGHT = 224

# DATA LOADING

In [6]:
train_csv_path = os.path.join(DATASET_DIR, f"MotocycleDataset-Splits-{SPLITS_ID}-Train.csv")
train_df = pd.read_csv(train_csv_path, header=None, names=["file_path", "class"])
train_df["file_path"] = train_df["file_path"].apply(lambda x: os.path.join(DATASET_DIR, x))
train_df["class"] = train_df["class"].apply(lambda x: str(x))

# PREPROCESSING

In [7]:
image_set = set()
num_duplicates = 0
num_errors = 0

def hash_numpy_array(arr):
    arr_bytes = arr.tobytes()
    hash_obj = hashlib.sha256(arr_bytes)
    hash_hex = hash_obj.hexdigest()
    return hash_hex

def validate_image(image_path, skip_duplicate):
    global num_duplicates, num_errors
    if not os.path.exists(image_path):
        return False
    if not os.path.isfile(image_path):
        return False
    try:
        with Image.open(image_path) as img:
            img.resize((WIDTH, HEIGHT))
            if skip_duplicate:
                hash_value = hash_numpy_array(np.array(img))
                if hash_value in image_set:
                    num_duplicates += 1
                    return False
                else:
                    image_set.add(hash_value)
        return True

    except Exception as e:
        num_errors += 1
        return False
        
def validate_images_multicore(df, num_processes, skip_duplicate=False):
    with mp.Pool(num_processes) as pool:
        results = pool.starmap(
            validate_image, 
            zip(df["file_path"], [skip_duplicate] * len(df))
        )
    return df[results]  

In [8]:
train_df = validate_images_multicore(
    train_df, 
    num_processes=12, 
    skip_duplicate=True
)

  self.pid = os.fork()
  self.pid = os.fork()


In [9]:
# Reset
image_set = set()
num_duplicates = 0
num_errors = 0

# TRAINING

In [11]:
data_generator = ImageDataGenerator(
    rescale=1/255,
    validation_split=0.2,
)

dataframe_config = {
    'dataframe': train_df,
    'x_col': 'file_path',
    'y_col': 'class',
    'target_size': (HEIGHT, WIDTH),
    'batch_size': BATCH_SIZE,
    'class_mode': 'categorical',
    'shuffle': True,
    'seed': SEED,
    'color_mode': 'rgb',
}

train_generator = data_generator.flow_from_dataframe(**dataframe_config, subset='training')
val_generator = data_generator.flow_from_dataframe(**dataframe_config, subset='validation')

Found 21368 validated image filenames belonging to 5 classes.
Found 5341 validated image filenames belonging to 5 classes.


In [12]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetV2S

base_model = EfficientNetV2S(
    include_top=False,  
    weights='imagenet', 
    input_shape=(224, 224, 3) 
)

model = models.Sequential()
model.add(base_model)
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(1024, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(5, activation='softmax'))  

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/efficientnetv2-s_notop.h5
[1m82420632/82420632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [13]:
learning_rate = 0.001
epochs = 10

In [14]:
history = model.fit(
    train_generator, 
    epochs=epochs,
    validation_data = val_generator
)

Epoch 1/10


  self._warn_if_super_not_called()
I0000 00:00:1719636848.173839     234 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1719636848.406160     234 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m 37/334[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m3:19[0m 673ms/step - accuracy: 0.3379 - loss: 1.4886

W0000 00:00:1719637020.091191     233 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.5081 - loss: 1.1713

W0000 00:00:1719637231.094646     234 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m687s[0m 1s/step - accuracy: 0.5084 - loss: 1.1707 - val_accuracy: 0.0502 - val_loss: 2.6923
Epoch 2/10


W0000 00:00:1719637300.068099     231 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 834ms/step - accuracy: 0.7560 - loss: 0.6563 - val_accuracy: 1.8723e-04 - val_loss: 4.4970
Epoch 3/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 846ms/step - accuracy: 0.8440 - loss: 0.4472 - val_accuracy: 0.0000e+00 - val_loss: 3.8359
Epoch 4/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m293s[0m 860ms/step - accuracy: 0.8936 - loss: 0.3208 - val_accuracy: 0.4157 - val_loss: 2.1959
Epoch 5/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 830ms/step - accuracy: 0.9219 - loss: 0.2355 - val_accuracy: 0.2949 - val_loss: 3.2739
Epoch 6/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 831ms/step - accuracy: 0.9353 - loss: 0.1953 - val_accuracy: 0.0071 - val_loss: 3.3953
Epoch 7/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 834ms/step - accuracy: 0.9466 - loss: 0.1597 - val_accuracy: 0.1101 - val_loss: 5.7145
Epoch 8

# TESTING

In [15]:
test_csv_path = os.path.join(DATASET_DIR, f"MotocycleDataset-Splits-{SPLITS_ID}-Test.csv")
test_df = pd.read_csv(test_csv_path, header=None, names=["file_path", "class"])
test_df["file_path"] = test_df["file_path"].apply(lambda x: os.path.join(DATASET_DIR, x))
test_df["class"] = test_df["class"].apply(lambda x: str(x))

In [16]:
test_df = validate_images_multicore(test_df, num_processes=12)

  self.pid = os.fork()
  self.pid = os.fork()


In [17]:
data_generator = ImageDataGenerator(rescale=1/255)
dataframe_config = {
    'dataframe': test_df,
    'x_col': 'file_path',
    'y_col': 'class',
    'target_size': (HEIGHT, WIDTH),
    'batch_size': BATCH_SIZE,
    'class_mode': 'categorical',
    'shuffle': True,
    'seed': SEED,
    'color_mode': 'rgb',
}
test_generator = data_generator.flow_from_dataframe(**dataframe_config)

Found 6929 validated image filenames belonging to 5 classes.


In [19]:
loss, accuracy = model.evaluate(test_generator, steps=len(test_generator))
print(f"Test Accuracy: {accuracy:.2f}")

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 641ms/step - accuracy: 0.5684 - loss: 1.7599
Test Accuracy: 0.57
