In [1]:
import os
import pandas as pd
import multiprocessing
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, models, optimizers, callbacks

2024-06-30 04:59:30.434409: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-30 04:59:30.434508: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-30 04:59:30.584787: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
SPLITS_ID = 1

In [3]:
DATASET_PATH = "/kaggle/input/motocycledataset/Data"

In [25]:
BATCH_SIZE = 64
WIDTH = 224
HEIGHT = 224
SEED = 42

In [5]:
train_csv = os.path.join(DATASET_PATH, f"MotocycleDataset-Splits-{SPLITS_ID}-Train.csv")
test_csv = os.path.join(DATASET_PATH, f"MotocycleDataset-Splits-{SPLITS_ID}-Test.csv")

train_df = pd.read_csv(train_csv, header=None, names=["file_path", "class"])
test_df = pd.read_csv(test_csv, header=None, names=["file_path", "class"])

train_df["file_path"] = train_df["file_path"].apply(lambda x: os.path.join(DATASET_PATH, x))
test_df["file_path"] = test_df["file_path"].apply(lambda x: os.path.join(DATASET_PATH, x))

train_df["class"] = train_df["class"].astype(str)
test_df["class"] = test_df["class"].astype(str)

In [6]:
train_df

Unnamed: 0,file_path,class
0,/kaggle/input/motocycledataset/Data/Honda/2252...,1
1,/kaggle/input/motocycledataset/Data/Honda/2252...,1
2,/kaggle/input/motocycledataset/Data/Honda/2252...,1
3,/kaggle/input/motocycledataset/Data/Honda/2252...,1
4,/kaggle/input/motocycledataset/Data/Honda/2252...,1
...,...,...
28037,/kaggle/input/motocycledataset/Data/Others/225...,0
28038,/kaggle/input/motocycledataset/Data/Others/225...,0
28039,/kaggle/input/motocycledataset/Data/Others/225...,0
28040,/kaggle/input/motocycledataset/Data/Others/225...,0


In [7]:
image_set = set()

def hash_numpy_array(arr):
    arr_bytes = arr.tobytes()
    hash_obj = hashlib.sha256(arr_bytes)
    hash_hex = hash_obj.hexdigest()
    return hash_hex

def validate_image(image_path, skip_duplicate):
    if not os.path.exists(image_path):
        return False
    if not os.path.isfile(image_path):
        return False
    try:
        with Image.open(image_path) as img:
            img.resize((WIDTH, HEIGHT))
            if skip_duplicate:
                hash_value = hash_numpy_array(np.array(img))
                if hash_value in image_set:
                    return False
                else:
                    image_set.add(hash_value)
        return True

    except Exception as e:
        print(e)
        return False
        
def validate_images_multicore(df, num_processes, skip_duplicate=False):
    with multiprocessing.Pool(num_processes) as pool:
        results = pool.starmap(
            validate_image, 
            zip(df["file_path"], [skip_duplicate] * len(df))
        )
    return df[results]  

In [8]:
train_df = validate_images_multicore(train_df, num_processes=16, skip_duplicate=False)

  self.pid = os.fork()


image file is truncated (8 bytes not processed)
cannot identify image file '/kaggle/input/motocycledataset/Data/VinFast/22520968-22520996-22520999-22520929-22521373.VinFast.311.jpg'
cannot identify image file '/kaggle/input/motocycledataset/Data/VinFast/22520968-22520996-22520999-22520929-22521373.VinFast.313.jpg'
cannot identify image file '/kaggle/input/motocycledataset/Data/Others/22520968-22520996-22520999-22520929-22521373.Others.567.jpg'


In [9]:
image_set = set()

In [10]:
test_df = validate_images_multicore(test_df, num_processes=16)

  self.pid = os.fork()


cannot identify image file '/kaggle/input/motocycledataset/Data/VinFast/22520968-22520996-22520999-22520929-22521373.VinFast.277.jpg'
cannot identify image file '/kaggle/input/motocycledataset/Data/VinFast/22520968-22520996-22520999-22520929-22521373.VinFast.323.jpg'
cannot identify image file '/kaggle/input/motocycledataset/Data/Others/22520968-22520996-22520999-22520929-22521373.Others.568.jpg'


In [11]:
train_df, valid_df = train_test_split(
    train_df, 
    test_size=0.2, 
    stratify=train_df['class'], 
    random_state=SEED
)

In [26]:
train_data_generator = ImageDataGenerator(
    rescale=1/255,
    validation_split=0.2,
)
test_data_generator = ImageDataGenerator(rescale=1/255)

dataframe_config = {
    'x_col': 'file_path',
    'y_col': 'class',
    'target_size': (HEIGHT, WIDTH),
    'batch_size': BATCH_SIZE,
    'class_mode': 'categorical',
    'shuffle': True,
    'seed': SEED,
    'color_mode': 'rgb',
}

train_generator = train_data_generator.flow_from_dataframe(train_df, **dataframe_config, subset='training')
val_generator = train_data_generator.flow_from_dataframe(train_df, **dataframe_config, subset='validation')
test_generator = train_data_generator.flow_from_dataframe(test_df, **dataframe_config)

Found 17724 validated image filenames belonging to 5 classes.
Found 4430 validated image filenames belonging to 5 classes.
Found 6929 validated image filenames belonging to 5 classes.


In [54]:
base_model = ResNet50(
    include_top=False,  
    weights='imagenet', 
    input_shape=(224, 224, 3) 
)

model = models.Sequential()
model.add(base_model)
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(1024, activation='relu'))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))  

In [14]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("model.keras", monitor="val_loss", save_best_only=True, verbose=1)
callbacks = [checkpoint]

In [36]:
epochs = 10
learning_rate = 0.0001

In [37]:
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [38]:
history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=val_generator
)

Epoch 1/10


W0000 00:00:1719728696.913796     180 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m114/277[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m1:49[0m 673ms/step - accuracy: 0.4331 - loss: 1.3221

W0000 00:00:1719728794.934359     180 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 738ms/step - accuracy: 0.5199 - loss: 1.1542

W0000 00:00:1719728906.954676     177 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 929ms/step - accuracy: 0.5203 - loss: 1.1535 - val_accuracy: 0.1865 - val_loss: 1.7855
Epoch 2/10


W0000 00:00:1719728953.718204     180 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 821ms/step - accuracy: 0.8461 - loss: 0.4358 - val_accuracy: 0.2411 - val_loss: 2.1894
Epoch 3/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 810ms/step - accuracy: 0.9482 - loss: 0.1641 - val_accuracy: 0.5686 - val_loss: 1.3088
Epoch 4/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 807ms/step - accuracy: 0.9627 - loss: 0.1155 - val_accuracy: 0.7321 - val_loss: 0.9704
Epoch 5/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 805ms/step - accuracy: 0.9726 - loss: 0.0826 - val_accuracy: 0.7192 - val_loss: 1.1551
Epoch 6/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 813ms/step - accuracy: 0.9760 - loss: 0.0730 - val_accuracy: 0.7510 - val_loss: 1.0377
Epoch 7/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 814ms/step - accuracy: 0.9799 - loss: 0.0589 - val_accuracy: 0.7544 - val_loss: 1.0391
Epoch 8/10
[1m

In [39]:
loss, accuracy = model.evaluate(test_generator, steps=len(test_generator))
print(f"Test Accuracy: {accuracy:.2f}")

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 709ms/step - accuracy: 0.7317 - loss: 1.0393
Test Accuracy: 0.74


W0000 00:00:1719732621.727007     178 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
