In [20]:
import os
import pandas as pd
import multiprocessing
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, models, optimizers, callbacks

In [21]:
SPLITS_ID = 1

In [22]:
DATASET_PATH = "/kaggle/input/motocycledataset/Data"

In [23]:
BATCH_SIZE = 64
WIDTH = 224
HEIGHT = 224
SEED = 42

# DATA LOADING

In [24]:
train_csv = os.path.join(DATASET_PATH, f"MotocycleDataset-Splits-{SPLITS_ID}-Train.csv")
test_csv = os.path.join(DATASET_PATH, f"MotocycleDataset-Splits-{SPLITS_ID}-Test.csv")

train_df = pd.read_csv(train_csv, header=None, names=["file_path", "class"])
test_df = pd.read_csv(test_csv, header=None, names=["file_path", "class"])

train_df["file_path"] = train_df["file_path"].apply(lambda x: os.path.join(DATASET_PATH, x))
test_df["file_path"] = test_df["file_path"].apply(lambda x: os.path.join(DATASET_PATH, x))

train_df["class"] = train_df["class"].astype(str)
test_df["class"] = test_df["class"].astype(str)

# PREPROCESSING

In [25]:
image_set = set()

def hash_numpy_array(arr):
    arr_bytes = arr.tobytes()
    hash_obj = hashlib.sha256(arr_bytes)
    hash_hex = hash_obj.hexdigest()
    return hash_hex

def validate_image(image_path, skip_duplicate):
    if not os.path.exists(image_path):
        return False
    if not os.path.isfile(image_path):
        return False
    try:
        with Image.open(image_path) as img:
            img.resize((WIDTH, HEIGHT))
            if skip_duplicate:
                hash_value = hash_numpy_array(np.array(img))
                if hash_value in image_set:
                    return False
                else:
                    image_set.add(hash_value)
        return True

    except Exception as e:
        print(e)
        return False
        
def validate_images_multicore(df, num_processes, skip_duplicate=False):
    with multiprocessing.Pool(num_processes) as pool:
        results = pool.starmap(
            validate_image, 
            zip(df["file_path"], [skip_duplicate] * len(df))
        )
    return df[results]  

In [26]:
train_df = validate_images_multicore(train_df, num_processes=16, skip_duplicate=False)

  self.pid = os.fork()


image file is truncated (8 bytes not processed)
cannot identify image file '/kaggle/input/motocycledataset/Data/VinFast/22520968-22520996-22520999-22520929-22521373.VinFast.311.jpg'
cannot identify image file '/kaggle/input/motocycledataset/Data/VinFast/22520968-22520996-22520999-22520929-22521373.VinFast.313.jpg'
cannot identify image file '/kaggle/input/motocycledataset/Data/Others/22520968-22520996-22520999-22520929-22521373.Others.567.jpg'


  self.pid = os.fork()


In [27]:
image_set = set()

In [28]:
test_df = validate_images_multicore(test_df, num_processes=16)

  self.pid = os.fork()


cannot identify image file '/kaggle/input/motocycledataset/Data/VinFast/22520968-22520996-22520999-22520929-22521373.VinFast.277.jpg'
cannot identify image file '/kaggle/input/motocycledataset/Data/VinFast/22520968-22520996-22520999-22520929-22521373.VinFast.323.jpg'
cannot identify image file '/kaggle/input/motocycledataset/Data/Others/22520968-22520996-22520999-22520929-22521373.Others.568.jpg'


# TRAINING

In [29]:
train_data_generator = ImageDataGenerator(
    rescale=1/255,
    validation_split=0.2,
)
test_data_generator = ImageDataGenerator(rescale=1/255)

dataframe_config = {
    'x_col': 'file_path',
    'y_col': 'class',
    'target_size': (HEIGHT, WIDTH),
    'batch_size': BATCH_SIZE,
    'class_mode': 'categorical',
    'shuffle': True,
    'seed': SEED,
    'color_mode': 'rgb',
}

train_generator = train_data_generator.flow_from_dataframe(train_df, **dataframe_config, subset='training')
val_generator = train_data_generator.flow_from_dataframe(train_df, **dataframe_config, subset='validation')
test_generator = train_data_generator.flow_from_dataframe(test_df, **dataframe_config)

Found 22155 validated image filenames belonging to 5 classes.
Found 5538 validated image filenames belonging to 5 classes.
Found 6929 validated image filenames belonging to 5 classes.


In [30]:
base_model = ResNet50(
    include_top=False,  
    weights='imagenet', 
    input_shape=(224, 224, 3) 
)

model = models.Sequential()
model.add(base_model)
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(1024, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))  

In [31]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("model.keras", monitor="val_loss", save_best_only=True, verbose=1)
callbacks = [checkpoint]

In [32]:
learning_rate = 0.0001
epochs = 15

In [33]:
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [34]:
history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=val_generator
)

Epoch 1/15


  self._warn_if_super_not_called()
W0000 00:00:1719995545.165700     176 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m332/347[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m11s[0m 761ms/step - accuracy: 0.4857 - loss: 1.2070

W0000 00:00:1719995797.326723     178 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 751ms/step - accuracy: 0.4909 - loss: 1.1968

W0000 00:00:1719995811.717996     177 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 950ms/step - accuracy: 0.4913 - loss: 1.1961 - val_accuracy: 0.0000e+00 - val_loss: 2.1075
Epoch 2/15


W0000 00:00:1719995874.447523     177 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m291s[0m 826ms/step - accuracy: 0.8241 - loss: 0.4854 - val_accuracy: 0.1282 - val_loss: 2.9146
Epoch 3/15
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 827ms/step - accuracy: 0.9236 - loss: 0.2320 - val_accuracy: 0.3407 - val_loss: 3.0365
Epoch 4/15
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m 822ms/step - accuracy: 0.9487 - loss: 0.1558 - val_accuracy: 0.3991 - val_loss: 3.1448
Epoch 5/15
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 843ms/step - accuracy: 0.9625 - loss: 0.1191 - val_accuracy: 0.3523 - val_loss: 3.7030
Epoch 6/15
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 812ms/step - accuracy: 0.9743 - loss: 0.0809 - val_accuracy: 0.4554 - val_loss: 2.6237
Epoch 7/15
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 808ms/step - accuracy: 0.9791 - loss: 0.0616 - val_accuracy: 0.3573 - val_loss: 3.9466
Epoch 8/15
[1m

# TESTING

In [35]:
loss, accuracy = model.evaluate(test_generator, steps=len(test_generator))
print(f"Test Accuracy: {accuracy:.2f}")

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 759ms/step - accuracy: 0.7510 - loss: 1.3153
Test Accuracy: 0.75


W0000 00:00:1720000053.435458     177 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


----------------------------------------------------

In [36]:
# test_df = validate_images_multicore(test_df, num_processes=12)

In [37]:
# data_generator = ImageDataGenerator(rescale=1/255)
# dataframe_config = {
#     'dataframe': test_df,
#     'x_col': 'file_path',
#     'y_col': 'class',
#     'target_size': (HEIGHT, WIDTH),
#     'batch_size': BATCH_SIZE,
#     'class_mode': 'categorical',
#     'shuffle': True,
#     'seed': SEED,
#     'color_mode': 'rgb',
# }
# test_generator = data_generator.flow_from_dataframe(**dataframe_config)

In [38]:
# loss, accuracy = model.evaluate(test_generator, steps=len(test_generator))
# print(f"Test Accuracy: {accuracy:.2f}")