In [1]:
import os
import time
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
from keras import models, initializers
from keras.models import Sequential, Model
from keras.layers import Input, Flatten, Dense, Dropout, Conv2D, MaxPooling2D, Lambda, GlobalMaxPooling2D, GlobalAveragePooling2D, BatchNormalization, Activation, AveragePooling2D, Concatenate
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

2025-05-23 15:56:33.002305: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748015793.195399      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748015793.253626      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers

# Constants
IMAGE_SIZE = 128  # Side length (pixels) every image is resized to by the generators
BATCH_SIZE = 32
EPOCHS = 50
PATCH_SIZE = 4  # For MobileViT
# Lowercase alias read by mobilevit_block(); derived from PATCH_SIZE so that the
# two names can never hold different values.
patch_size = PATCH_SIZE
inputShape = (IMAGE_SIZE, IMAGE_SIZE, 3)
# Class labels used by the notebook. The spellings are kept exactly as given
# because they are runtime strings, not documentation.
CLASS_NAMES = [
    'Motorized2wheleer', 'ambasador_taxi', 'autorickshaw', 'bicycle', 'bus', 'car', 'minitruck', 'motarvan', 'rickshaw', 'toto', 'truck', 'van'
]
NUM_CLASSES = len(CLASS_NAMES)
assert len(set(CLASS_NAMES)) == NUM_CLASSES, "CLASS_NAMES contains duplicate entries"

# Data paths
train_data_dir = "/kaggle/input/indian-vehicle-data/Vehicle data/Train"
test_data_dir = "/kaggle/input/indian-vehicle-data/Vehicle data/Test"

In [3]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Random geometric augmentations applied to training images only.
_augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Both pipelines multiply pixel values by 1/255; only the training one augments.
train_datagen = ImageDataGenerator(rescale=1.0 / 255.0, **_augmentation_options)
test_datagen = ImageDataGenerator(rescale=1.0 / 255.0)

# Settings shared by the two directory iterators.
_flow_options = dict(
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)

# Training batches (shuffled, which is the iterator's default).
train_generator = train_datagen.flow_from_directory(train_data_dir, **_flow_options)

# Test batches keep directory order so predictions line up with
# test_generator.classes during evaluation.
test_generator = test_datagen.flow_from_directory(
    test_data_dir,
    shuffle=False,
    **_flow_options,
)


Found 3893 images belonging to 12 classes.
Found 1296 images belonging to 12 classes.


In [4]:
def mlp(x, hidden_units, dropout_rate):
    """Apply a stack of Dense(swish) + Dropout layers to ``x``.

    Args:
        x: Input tensor.
        hidden_units: Iterable of layer widths; one Dense/Dropout pair is
            created per entry, in order.
        dropout_rate: Rate passed to every Dropout layer.

    Returns:
        The output of the last Dropout layer, or ``x`` itself when
        ``hidden_units`` is empty.
    """
    features = x
    for width in hidden_units:
        dense_out = layers.Dense(width, activation=tf.nn.swish)(features)
        features = layers.Dropout(dropout_rate)(dense_out)
    return features


def transformer_block(x, transformer_layers, projection_dim, num_heads=2):
    """Run ``x`` through ``transformer_layers`` encoder layers.

    Each layer normalizes its input, applies multi-head self-attention
    (the normalized tensor is passed as both query and value), adds the
    un-normalized input back, normalizes again, applies a two-layer MLP
    (widths ``2 * C`` and ``C``, where ``C`` is the last dimension of the
    layer input) and adds the first residual back.

    Args:
        x: Input tensor; its last dimension must be statically known because
            it sizes the MLP.
        transformer_layers: Number of encoder layers to stack.
        projection_dim: ``key_dim`` of every attention layer.
        num_heads: Number of attention heads per layer.

    Returns:
        The output of the last layer, or ``x`` unchanged when
        ``transformer_layers`` is 0.
    """
    tokens = x
    for _layer_index in range(transformer_layers):
        channels = tokens.shape[-1]

        # Self-attention sub-layer with a residual connection.
        normed_tokens = layers.LayerNormalization(epsilon=1e-6)(tokens)
        attended = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(normed_tokens, normed_tokens)
        attention_residual = layers.Add()([attended, tokens])

        # Feed-forward sub-layer with a residual connection.
        hidden = layers.LayerNormalization(epsilon=1e-6)(attention_residual)
        hidden = mlp(hidden, hidden_units=[channels * 2, channels], dropout_rate=0.1)
        tokens = layers.Add()([hidden, attention_residual])

    return tokens


def conv_block(x, filters=16, kernel_size=3, strides=2):
    """Apply one swish-activated, same-padded Conv2D layer to ``x``.

    Args:
        x: Input tensor.
        filters: Number of output channels.
        kernel_size: Convolution kernel size.
        strides: Convolution stride.

    Returns:
        The convolved tensor.
    """
    return layers.Conv2D(
        filters,
        kernel_size,
        strides=strides,
        padding="same",
        activation=tf.nn.swish,
    )(x)


def mobilevit_block(x, num_blocks, projection_dim, strides=1):
    """Combine convolutional (local) and transformer (global) features.

    Steps:
      1. Project ``x`` to ``projection_dim`` channels with a 1x1 convolution.
      2. Reshape the projected map to
         ``(patch_size, num_patches, projection_dim)`` and run it through
         ``transformer_block``.
      3. Reshape the result back to the projected map's spatial layout.
      4. Concatenate it with the projected map on the channel axis and fuse
         the two with another 1x1 convolution.

    The module-level ``patch_size`` is read at call time.

    Args:
        x: Feature map indexed as (batch, height, width, channels).
        num_blocks: Number of transformer layers to apply.
        projection_dim: Channel count of the projection, the attention
            ``key_dim`` and the output.
        strides: Stride passed to both 1x1 convolutions.

    Returns:
        The fused feature map with ``projection_dim`` channels.

    Raises:
        ValueError: If the projected map's height or width is not statically
            known, or if ``height * width`` is not a multiple of
            ``patch_size`` (the map could not be unfolded into patches).
    """
    # Local projection with a point-wise convolution.
    local_features = conv_block(
        x, filters=projection_dim, kernel_size=1, strides=strides
    )

    height, width = local_features.shape[1], local_features.shape[2]
    if height is None or width is None:
        raise ValueError(
            "mobilevit_block needs a feature map with static height and width, "
            f"got shape {tuple(local_features.shape)}"
        )
    num_pixels = height * width
    if num_pixels % patch_size != 0:
        raise ValueError(
            f"feature map of {height}x{width} = {num_pixels} positions cannot be "
            f"split evenly with patch_size={patch_size}"
        )

    # Unfold into patches and then pass through Transformers.
    num_patches = num_pixels // patch_size
    non_overlapping_patches = layers.Reshape(
        (patch_size, num_patches, projection_dim)
    )(local_features)
    global_features = transformer_block(
        non_overlapping_patches, num_blocks, projection_dim
    )

    # Fold back into a conv-like feature map.
    folded_feature_map = layers.Reshape((height, width, projection_dim))(
        global_features
    )

    # Concatenate the projected local features with the global features.
    local_global_features = layers.Concatenate(axis=-1)(
        [local_features, folded_feature_map]
    )

    # Fuse the local and global features using a convolution layer.
    local_global_features = conv_block(
        local_global_features, filters=projection_dim, kernel_size=1, strides=strides
    )

    return local_global_features


# MobileNetV2 backbone without its classification head, initialised from the
# 'imagenet' weights. Pass weights=None instead to start from random weights.
base_model = MobileNetV2(
    include_top=False,
    weights='imagenet',
    input_shape=inputShape,
)
# Leave every backbone layer trainable so it is fine-tuned with the new head.
base_model.trainable = True


def create_vit_classifier():
    """Assemble the classifier: backbone -> MobileViT block -> pooled softmax head.

    Reads the module-level ``inputShape``, ``base_model`` and ``CLASS_NAMES``.
    No augmentation happens inside the model.

    Returns:
        An uncompiled ``keras.Model`` mapping an image batch of shape
        ``inputShape`` to one softmax score per entry of ``CLASS_NAMES``.
    """
    image_input = layers.Input(shape=inputShape)

    # Backbone features, then one MobileViT block with a single transformer layer.
    backbone_features = base_model(image_input)
    fused_features = mobilevit_block(backbone_features, num_blocks=1, projection_dim=64)

    # Classification head: global average pooling followed by a softmax layer.
    pooled = layers.GlobalAvgPool2D()(fused_features)
    class_scores = layers.Dense(len(CLASS_NAMES), activation="softmax")(pooled)

    return keras.Model(image_input, class_scores)


I0000 00:00:1748015854.654938      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_128_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [5]:
# Build the network, then configure it for multi-class training:
# Adam at a learning rate of 1e-3, categorical cross-entropy, accuracy metric.
modelv2 = create_vit_classifier()
_adam = optimizers.Adam(learning_rate=0.001)
modelv2.compile(
    optimizer=_adam,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
modelv2.summary()

In [6]:
# Callbacks
# Stop once val_loss has not improved for 10 epochs and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.1 after 5 epochs without val_loss improvement,
# never going below 1e-6.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=5,
    min_lr=1e-6,
)
_training_callbacks = [early_stopping, reduce_lr]

# Train the model
# NOTE(review): validation_data is the test generator, so early stopping, weight
# restoration and the LR schedule are all driven by the same data that is later
# reported as test performance. Consider holding out part of the training set
# for validation instead.
history = modelv2.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=test_generator,
    callbacks=_training_callbacks,
)

  self._warn_if_super_not_called()


Epoch 1/50


I0000 00:00:1748015915.783555     106 service.cc:148] XLA service 0x7a085c002290 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748015915.784479     106 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1748015919.590804     106 cuda_dnn.cc:529] Loaded cuDNN version 90300
E0000 00:00:1748015925.012987     106 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1748015925.213936     106 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m  1/122[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:05:14[0m 62s/step - accuracy: 0.0312 - loss: 2.7584

I0000 00:00:1748015938.145443     106 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m 46/122[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m24s[0m 320ms/step - accuracy: 0.5880 - loss: 1.3673

E0000 00:00:1748015959.971739     107 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1748015960.175207     107 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 627ms/step - accuracy: 0.6904 - loss: 1.0398 - val_accuracy: 0.2955 - val_loss: 5.7157 - learning_rate: 0.0010
Epoch 2/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 235ms/step - accuracy: 0.8558 - loss: 0.4809 - val_accuracy: 0.3966 - val_loss: 5.5509 - learning_rate: 0.0010
Epoch 3/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 234ms/step - accuracy: 0.9098 - loss: 0.3238 - val_accuracy: 0.5015 - val_loss: 2.7070 - learning_rate: 0.0010
Epoch 4/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 234ms/step - accuracy: 0.9181 - loss: 0.2645 - val_accuracy: 0.4066 - val_loss: 4.4338 - learning_rate: 0.0010
Epoch 5/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 230ms/step - accuracy: 0.9223 - loss: 0.2575 - val_accuracy: 0.5486 - val_loss: 2.2774 - learning_rate: 0.0010
Epoch 6/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [7]:
# Evaluate model: time one full pass over the test generator.
start_time = time.time()
test_loss, test_accuracy = modelv2.evaluate(test_generator)
evaluation_time = time.time() - start_time
print(f"Evaluation Time: {evaluation_time:.2f} seconds")
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Metrics
# test_generator was created with shuffle=False, so the rows returned by
# predict() are in the same order as test_generator.classes.
y_true = test_generator.classes
y_pred = np.argmax(modelv2.predict(test_generator), axis=-1)

# Take label names and ids from the generator itself rather than from a
# hand-written list: the integer a class receives is decided by the generator,
# and a list in a different order would silently attach the wrong name to
# every row of the report.
_class_indices = test_generator.class_indices
report_names = sorted(_class_indices, key=_class_indices.get)
report_labels = [_class_indices[name] for name in report_names]

# zero_division=0 scores a class that was never predicted as 0 without
# emitting an UndefinedMetricWarning.
overall_accuracy = accuracy_score(y_true, y_pred)
overall_precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
overall_recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
overall_f1_score = f1_score(y_true, y_pred, average='weighted', zero_division=0)

print("\nOverall Metrics:")
print(f"Accuracy: {overall_accuracy:.2f}")
print(f"Precision: {overall_precision:.2f}")
print(f"Recall: {overall_recall:.2f}")
print(f"F1 Score: {overall_f1_score:.2f}")

# Classification report
# Passing labels= keeps one row per class even when a class is absent from
# both y_true and y_pred, so target_names always has a matching length.
print("\nClassification Report:")
print(
    classification_report(
        y_true,
        y_pred,
        labels=report_labels,
        target_names=report_names,
        zero_division=0,
    )
)


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 131ms/step - accuracy: 0.9856 - loss: 0.0806
Evaluation Time: 5.65 seconds
Test Loss: 0.1420656144618988, Test Accuracy: 0.977623462677002
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 193ms/step

Overall Metrics:
Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98

Classification Report:
                   precision    recall  f1-score   support

Motorized2wheleer       1.00      0.98      0.99       158
   ambasador_taxi       1.00      1.00      1.00       148
     autorickshaw       1.00      0.99      1.00       180
          bicycle       0.91      0.98      0.94        41
              bus       0.99      0.98      0.99       188
              car       0.99      0.99      0.99       148
        minitruck       0.90      0.96      0.93        68
         motarvan       1.00      1.00      1.00         8
         rickshaw       0.96      0.98      0.97       154
             toto       1.00 

In [8]:
# After model.fit(...): write the trained model to disk in two formats,
# first as an HDF5 file (.h5), then in the native Keras format (.keras).
for _model_path in ("vehicle_model.h5", "vehicle_model.keras"):
    modelv2.save(_model_path)
