<a href="https://colab.research.google.com/github/Ruthuja-Gaikwad/DAUP/blob/main/Traffic_sign_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("valentynsichkar/traffic-signs-preprocessed")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/valentynsichkar/traffic-signs-preprocessed/versions/2


In [3]:
ls -l /root/.cache/kagglehub/datasets/valentynsichkar/traffic-signs-preprocessed/versions/2

total 6869484
-rw-r--r-- 1 root root  477332392 Mar 24 13:47 data0.pickle
-rw-r--r-- 1 root root 1279023059 Mar 24 13:47 data1.pickle
-rw-r--r-- 1 root root 1279023059 Mar 24 13:48 data2.pickle
-rw-r--r-- 1 root root 1279023059 Mar 24 13:48 data3.pickle
-rw-r--r-- 1 root root  852920232 Mar 24 13:48 data4.pickle
-rw-r--r-- 1 root root  426817448 Mar 24 13:48 data5.pickle
-rw-r--r-- 1 root root  426817448 Mar 24 13:48 data6.pickle
-rw-r--r-- 1 root root  426817448 Mar 24 13:48 data7.pickle
-rw-r--r-- 1 root root  426817448 Mar 24 13:48 data8.pickle
-rw-r--r-- 1 root root      25282 Mar 24 13:48 datasets_preparing.py
-rw-r--r-- 1 root root        999 Mar 24 13:48 label_names.csv
-rw-r--r-- 1 root root       1129 Mar 24 13:48 labels.pickle
-rw-r--r-- 1 root root       4284 Mar 24 13:48 mean_image_gray.pickle
-rw-r--r-- 1 root root      12475 Mar 24 13:48 mean_image_rgb.pickle
-rw-r--r-- 1 root root       4277 Mar 24 13:48 std_gray.pickle
-rw-r--r-- 1 root root      12468 Mar 24 13:48 std_

In [4]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import cv2
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:
dataset_path = "/root/.cache/kagglehub/datasets/valentynsichkar/traffic-signs-preprocessed/versions/2/"
TRAIN_FILE = dataset_path + "train.pickle"
VALID_FILE = dataset_path + "valid.pickle"
TEST_FILE = dataset_path + "test.pickle"
LABELS_FILE = dataset_path + "labels.pickle"

In [6]:
# Load dataset from pickle files
def load_pickle_data(file_path):
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    # Check if data is a dictionary before accessing 'features' and 'labels'
    if isinstance(data, dict):
        # Assuming data is a dictionary with keys 'features' and 'labels'
        # Adjust keys if your data structure is different.
        return data['features'], data['labels'] # Return features and labels separately
    # If data is not a dictionary, assume it is the label map
    else:
        return data  # Return the label map as is

# Load data
X_train, y_train = load_pickle_data(TRAIN_FILE)
X_valid, y_valid = load_pickle_data(VALID_FILE)
X_test, y_test = load_pickle_data(TEST_FILE)
label_map = load_pickle_data(LABELS_FILE) # label_map is now the loaded data from LABELS_FILE

In [7]:
X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=1000, random_state=42, stratify=y_train)
X_test, _, y_test, _ = train_test_split(X_test, y_test, train_size=1000, random_state=42, stratify=y_test)

In [8]:
# Normalize pixel values
X_train, X_valid, X_test = X_train / 255.0, X_valid / 255.0, X_test / 255.0

In [9]:
print("Original X_train shape:", X_train.shape)

Original X_train shape: (1000, 32, 32, 3)


In [10]:
def resize_images(images, width, height):
    resized_images = []
    for img in images:
        img = cv2.resize(img.astype(np.uint8), (width, height))  # Resize
        img = np.expand_dims(img, axis=-1)  # Add channel dimension
        resized_images.append(img)
    return np.array(resized_images)


In [11]:
# Reduce image size for performance
IMG_WIDTH, IMG_HEIGHT = 128, 100  # Reduced from 256x200

# Resize images first
X_train = resize_images(X_train, IMG_WIDTH, IMG_HEIGHT)
X_valid = resize_images(X_valid, IMG_WIDTH, IMG_HEIGHT)
X_test = resize_images(X_test, IMG_WIDTH, IMG_HEIGHT)

# Normalize pixel values AFTER resizing
X_train, X_valid, X_test = X_train / 255.0, X_valid / 255.0, X_test / 255.0


In [12]:
# Convert to grayscale
X_train = np.mean(X_train, axis=-1, keepdims=True)
X_valid = np.mean(X_valid, axis=-1, keepdims=True)
X_test = np.mean(X_test, axis=-1, keepdims=True)

In [13]:

# Convert labels to categorical
num_classes = len(np.unique(y_train))
y_train = to_categorical(y_train, num_classes)
y_valid = to_categorical(y_valid, num_classes)
y_test = to_categorical(y_test, num_classes)

In [14]:
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

Training samples: 1000, Testing samples: 1000


In [15]:

# Flatten images for ML models
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)


In [16]:
unique, counts = np.unique(np.argmax(y_train, axis=1), return_counts=True)
print(dict(zip(unique, counts)))
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42, k_neighbors=2)  # Reduce k_neighbors
X_train_flat, y_train_flat = smote.fit_resample(X_train_flat, np.argmax(y_train, axis=1))
from collections import Counter
print("Class distribution after resampling:", Counter(y_train_flat))


{np.int64(0): np.int64(5), np.int64(1): np.int64(57), np.int64(2): np.int64(58), np.int64(3): np.int64(36), np.int64(4): np.int64(51), np.int64(5): np.int64(47), np.int64(6): np.int64(10), np.int64(7): np.int64(37), np.int64(8): np.int64(36), np.int64(9): np.int64(38), np.int64(10): np.int64(52), np.int64(11): np.int64(34), np.int64(12): np.int64(54), np.int64(13): np.int64(55), np.int64(14): np.int64(20), np.int64(15): np.int64(16), np.int64(16): np.int64(10), np.int64(17): np.int64(28), np.int64(18): np.int64(31), np.int64(19): np.int64(5), np.int64(20): np.int64(9), np.int64(21): np.int64(8), np.int64(22): np.int64(10), np.int64(23): np.int64(13), np.int64(24): np.int64(7), np.int64(25): np.int64(39), np.int64(26): np.int64(16), np.int64(27): np.int64(6), np.int64(28): np.int64(14), np.int64(29): np.int64(7), np.int64(30): np.int64(11), np.int64(31): np.int64(20), np.int64(32): np.int64(6), np.int64(33): np.int64(17), np.int64(34): np.int64(10), np.int64(35): np.int64(31), np.int64(

In [21]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import cv2
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, Dropout
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from scipy.stats import norm, f_oneway
from imblearn.over_sampling import SMOTE


In [22]:
# Hyperparameter tuning for Random Forest
rf_params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]}
rf_clf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=3, n_jobs=-1)
rf_clf.fit(X_train_flat, y_train_flat)
rf_preds = rf_clf.best_estimator_.predict(X_test_flat)

In [23]:
# Train SVM Classifier
svm_clf = SVC(kernel='linear', probability=True)
svm_clf.fit(X_train_flat, y_train_flat)
svm_preds = svm_clf.predict(X_test_flat)


In [32]:
print("Shape after SMOTE:", X_train_flat.shape)


Shape after SMOTE: (2494, 38400)


In [33]:
print("Original X_train shape before flattening:", X_train.shape)  # Should be (num_samples, 256, 200, 1)
print("Flattened X_train shape:", X_train_flat.shape)  # Should be (num_samples, 51200)


Original X_train shape before flattening: (3000, 128, 100, 1)
Flattened X_train shape: (2494, 38400)


In [37]:
# Cell 34: Correct X_train to have the same shape as y_train before flattening
X_train_resized = resize_images(X_train[:y_train.shape[0]], IMG_WIDTH, IMG_HEIGHT)
X_train_flat = X_train_resized.reshape(X_train_resized.shape[0], -1)

# Cell 35: Should work correctly if X_train_flat and y_train now have the same number of samples
smote = SMOTE(random_state=42)
X_train_flat, y_train_flat = smote.fit_resample(X_train_flat, np.argmax(y_train, axis=1))

In [38]:
# Get new number of samples after SMOTE
num_samples = X_train_flat.shape[0]

# Reshape to CNN input format
X_train = X_train_flat.reshape(num_samples, IMG_HEIGHT, IMG_WIDTH, 1)


In [39]:
print("Final X_train shape:", X_train.shape)  # Should be (2494, 256, 200, 1)
print("Final y_train shape:", y_train.shape)  # Should be (2494, num_classes)


Final X_train shape: (2494, 100, 128, 1)
Final y_train shape: (2494, 43)


In [43]:
print(f"After SMOTE - X_train shape: {X_train_flat.shape}, y_train shape: {y_train_flat.shape}")
print(f"Validation Data - X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")


After SMOTE - X_train shape: (2494, 12800), y_train shape: (2494,)
Validation Data - X_valid shape: (13230, 128, 100, 1), y_valid shape: (4410, 43)


In [50]:
# Flatten images before applying SMOTE
X_train_flat = X_train.reshape(X_train.shape[0], -1)

# Apply SMOTE for class balancing, adjusting k_neighbors
# Ensure k_neighbors <= n_samples - 1 for each class
# Calculate minimum number of samples in any class
min_samples = min(np.bincount(np.argmax(y_train[:X_train_flat.shape[0]], axis=1)))
k_neighbors = min(5, min_samples - 1) # Set k_neighbors to minimum between 5 and min_samples - 1
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
# Use y_train.shape[0] to get consistent number of samples
X_train_flat, y_train_flat = smote.fit_resample(X_train_flat, np.argmax(y_train[:X_train_flat.shape[0]], axis=1))

# Convert labels back to categorical after SMOTE
y_train = to_categorical(y_train_flat, num_classes)

# Reshape X_train back to CNN-compatible format
X_train = X_train_flat.reshape(-1, 100, 128, 1)

# Now, perform train-validation split correctly
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train_flat
)

# Print final shapes to confirm consistency
print(f"Final X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"Final X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")

Final X_train shape: (1995, 100, 128, 1), y_train shape: (1995, 43)
Final X_valid shape: (499, 100, 128, 1), y_valid shape: (499, 43)


In [51]:
# Build CNN Model
cnn_model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(100, 128, 1)),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_valid, y_valid))
cnn_preds = np.argmax(cnn_model.predict(X_test), axis=1)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 97ms/step - accuracy: 0.0177 - loss: 3.7614 - val_accuracy: 0.0220 - val_loss: 3.7612
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.0231 - loss: 3.7614 - val_accuracy: 0.0220 - val_loss: 3.7612
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.0229 - loss: 3.7613 - val_accuracy: 0.0220 - val_loss: 3.7613
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.0283 - loss: 3.7612 - val_accuracy: 0.0220 - val_loss: 3.7613
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.0194 - loss: 3.7612 - val_accuracy: 0.0220 - val_loss: 3.7613
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.0224 - loss: 3.7613 - val_accuracy: 0.0220 - val_loss: 3.7613
Epoch 7/10
[1m63/63[0m [32m━━━━━━━━━━━━━━

In [54]:
# Save trained models
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf_clf.best_estimator_, f)
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_clf, f)
cnn_model.save('cnn_model.keras')


In [55]:
# Evaluate models
rf_acc = accuracy_score(np.argmax(y_test, axis=1), rf_preds)
svm_acc = accuracy_score(np.argmax(y_test, axis=1), svm_preds)
# The issue is likely with the shape of X_test passed to the CNN for prediction.
# Ensure X_test has the same number of samples as y_test (1000).
# You might have modified X_test during reshaping/SMOTE.
# Use the original X_test with 1000 samples if available.
# Assuming X_test_original is your original X_test with 1000 samples:

# Predict using the original X_test with 1000 samples
cnn_preds = np.argmax(cnn_model.predict(X_test[:1000]), axis=1)
cnn_acc = accuracy_score(np.argmax(y_test, axis=1), cnn_preds)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step


In [56]:
print("Random Forest Accuracy:", rf_acc)
print("SVM Accuracy:", svm_acc)
print("CNN Accuracy:", cnn_acc)

Random Forest Accuracy: 0.089
SVM Accuracy: 0.013
CNN Accuracy: 0.015


In [57]:
# Confusion Matrices
rf_cm = confusion_matrix(np.argmax(y_test, axis=1), rf_preds)
svm_cm = confusion_matrix(np.argmax(y_test, axis=1), svm_preds)
cnn_cm = confusion_matrix(np.argmax(y_test, axis=1), cnn_preds)

In [58]:
# Classification Reports
print("\nRandom Forest Classification Report:\n", classification_report(np.argmax(y_test, axis=1), rf_preds))
print("\nSVM Classification Report:\n", classification_report(np.argmax(y_test, axis=1), svm_preds))
print("\nCNN Classification Report:\n", classification_report(np.argmax(y_test, axis=1), cnn_preds))



Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.69      0.19      0.30        57
           2       0.26      0.08      0.13        59
           3       0.05      0.03      0.04        36
           4       0.18      0.04      0.06        52
           5       0.00      0.00      0.00        50
           6       0.00      0.00      0.00        12
           7       0.00      0.00      0.00        36
           8       0.00      0.00      0.00        36
           9       0.14      0.11      0.12        38
          10       0.00      0.00      0.00        52
          11       0.00      0.00      0.00        33
          12       0.25      0.02      0.03        55
          13       0.67      0.32      0.43        57
          14       0.23      0.24      0.23        21
          15       0.00      0.00      0.00        17
          16       0.25      0.08      0.1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [59]:
# Perform ANOVA test
anova_test = f_oneway(rf_preds, svm_preds, cnn_preds)
print("ANOVA test result:", anova_test)

ANOVA test result: F_onewayResult(statistic=np.float64(46.3715564203917), pvalue=np.float64(1.4669272863291534e-20))


In [60]:
# Compute test errors
test1_error = 1 - rf_acc
test2_error = 1 - svm_acc
print("Test 1 Error (RF):", test1_error)
print("Test 2 Error (SVM):", test2_error)

Test 1 Error (RF): 0.911
Test 2 Error (SVM): 0.987


In [62]:
# Perform Z-test
from scipy.stats import zscore # Import the zscore function
z_test_statistic = zscore([rf_acc, svm_acc, cnn_acc])
print("Z-Test Statistics:", z_test_statistic)

Z-Test Statistics: [ 1.41383659 -0.73519503 -0.67864156]


In [65]:
from scipy.stats import norm, f_oneway, zscore, f
# Perform F-test
f_test_statistic = f_oneway(rf_preds, svm_preds, cnn_preds)
print("F-Test Statistics:", f_test_statistic)

F-Test Statistics: F_onewayResult(statistic=np.float64(46.3715564203917), pvalue=np.float64(1.4669272863291534e-20))
