In [None]:
import os
os.environ['KERAS_BACKEND'] = 'jax'

# Data handling and preprocessing
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Image processing
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Deep Learning
import keras
from keras import layers, models
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Data utilities
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Exploratory Data Analysis (EDA) for GTSRB dataset
# All CSV index files live directly under DATA_DIR.
DATA_DIR = 'dataset'
TRAIN_CSV, TEST_CSV, META_CSV = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'Meta.csv')
)

def read_csv_auto(path):
    """Read a CSV file delimited by either commas or semicolons.

    The file is first parsed with the pandas default (comma) separator. If
    that parse yields a single column, or fails with a tokenizing error, the
    file is re-read with ``sep=';'``.

    Args:
        path: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        pandas.errors.ParserError: If the semicolon retry also fails to parse.
    """
    try:
        df = pd.read_csv(path)
    except pd.errors.ParserError:
        # A semicolon-delimited file with commas inside its fields (e.g.
        # decimal commas) gives rows of differing field counts under the
        # comma parse, which raises instead of returning a single column.
        return pd.read_csv(path, sep=';')
    if df.shape[1] == 1:
        # Everything landed in one column: the delimiter is most likely ';'.
        df = pd.read_csv(path, sep=';')
    return df

# Load the three index files in train / test / meta order.
train_df, test_df, meta_df = (
    read_csv_auto(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV, META_CSV)
)

# Each report section below walks the splits in this fixed order. Labels are
# left-padded to the width of the longest one so the colons line up.
splits = (('Train', train_df), ('Test', test_df), ('Meta', meta_df))

print('--- Dataset Shapes ---')
for label, frame in splits:
    print(f'{label:<5}: {frame.shape}')

print('\n--- Column Names ---')
for label, frame in splits:
    print(f"{label + ' columns':<13}:", frame.columns.tolist())

print('\n--- Missing Values (count) ---')
for position, (label, frame) in enumerate(splits):
    # Every entry after the first is preceded by a blank line.
    lead = '\n' if position else ''
    print(f'{lead}{label}:\n', frame.isna().sum())

print('\n--- Duplicate Rows ---')
for label, frame in splits:
    print(f"{label + ' duplicates':<16}:", frame.duplicated().sum())

# Ensure class column naming is consistent
# Prefer an exact (case-insensitive) match on a known label column name;
# otherwise fall back to the first column whose name contains 'class'.
class_candidates = [
    name for name in train_df.columns
    if name.lower() in {'classid', 'class_id', 'label', 'class'}
]
if class_candidates:
    class_col = class_candidates[0]
else:
    class_col = next(
        (name for name in train_df.columns if 'class' in name.lower()),
        None,
    )

if class_col is None:
    print('\nCould not detect class column automatically.')
else:
    print(f'\n--- Number of Classes ({class_col}) ---')
    print('Train unique classes:', train_df[class_col].nunique())

    # Per-class image counts, ordered by class value for a stable x-axis.
    class_counts = train_df[class_col].value_counts().sort_index()
    bar_labels = class_counts.index.astype(str)

    plt.figure(figsize=(14, 4))
    plt.bar(bar_labels, class_counts.values)
    plt.xlabel('Class')
    plt.ylabel('Number of images')
    plt.title('Train Class Distribution')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

# Resolution analysis
if {'Width', 'Height'} <= set(train_df.columns):
    # Adds an 'aspect_ratio' column to train_df in place.
    train_df['aspect_ratio'] = train_df['Width'] / train_df['Height']
    fig, axes = plt.subplots(1, 3, figsize=(16, 4))

    # One histogram per panel: (column, title, x-axis label, bar colour).
    panels = (
        ('Width', 'Width distribution', 'Width', 'steelblue'),
        ('Height', 'Height distribution', 'Height', 'darkorange'),
        ('aspect_ratio', 'Aspect ratio distribution', 'Width / Height', 'seagreen'),
    )
    for ax, (column, title, xlabel, colour) in zip(axes, panels):
        ax.hist(train_df[column], bins=30, color=colour)
        ax.set_title(title)
        ax.set_xlabel(xlabel)

    plt.tight_layout()
    plt.show()

    print('\n--- Resolution Summary (Train) ---')
    resolution_columns = ['Width', 'Height', 'aspect_ratio']
    display(train_df[resolution_columns].describe())

# Show random image samples
def resolve_image_path(rel_path):
    """Return the location of an image, rooted at ``DATA_DIR``.

    The input is converted to ``str`` and its backslashes are replaced with
    forward slashes. A path that already begins with ``DATA_DIR + '/'`` is
    returned in that normalised form; any other path is joined onto
    ``DATA_DIR``.
    """
    normalised = str(rel_path).replace('\\', '/')
    already_rooted = normalised.startswith(f'{DATA_DIR}/')
    return normalised if already_rooted else os.path.join(DATA_DIR, normalised)

if class_col is not None and 'Path' in train_df.columns:
    rng = np.random.default_rng(42)
    unique_classes = np.array(sorted(train_df[class_col].dropna().unique()))
    # At most 12 distinct classes, one per cell of the 3x4 grid below.
    n_show = min(12, len(unique_classes))
    selected_classes = rng.choice(unique_classes, size=n_show, replace=False)

    # Draw one row per selected class, keeping the order rng.choice produced.
    rows_by_class = (train_df[train_df[class_col] == cls] for cls in selected_classes)
    sampled_rows = [
        rows.sample(1, random_state=42) for rows in rows_by_class if len(rows) > 0
    ]

    if sampled_rows:
        sampled = pd.concat(sampled_rows, ignore_index=True)
    else:
        sampled = pd.DataFrame()

    fig, axes = plt.subplots(3, 4, figsize=(14, 10))
    axes = axes.flatten()

    # `sampled` carries a fresh 0..n-1 index, so pairing rows with axes in
    # order is the same as indexing axes by the row label.
    for ax, (_, row) in zip(axes, sampled.iterrows()):
        img_path = resolve_image_path(row['Path'])
        if not os.path.exists(img_path):
            ax.text(0.5, 0.5, 'Image not found', ha='center', va='center')
        else:
            bgr = cv2.imread(img_path)
            if bgr is None:
                ax.text(0.5, 0.5, 'Unreadable image', ha='center', va='center')
            else:
                # OpenCV loads BGR; matplotlib expects RGB.
                ax.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
        ax.set_title(f'Class {row[class_col]}')
        ax.axis('off')

    # Hide any grid cells that did not receive a sample.
    for spare_ax in axes[len(sampled):]:
        spare_ax.axis('off')

    plt.suptitle('Random Training Samples by Class', y=1.02)
    plt.tight_layout()
    plt.show()

# Pixel statistics on a subset for speed
if 'Path' in train_df.columns:
    # Cap the sample at 2000 rows; the fixed seed keeps the subset repeatable.
    sample_n = min(2000, len(train_df))
    sampled_rows = train_df.sample(sample_n, random_state=42)

    means, stds = [], []
    for raw_path in sampled_rows['Path']:
        img_path = resolve_image_path(raw_path)
        # Missing and unreadable files are both skipped silently.
        bgr = cv2.imread(img_path) if os.path.exists(img_path) else None
        if bgr is None:
            continue
        # Convert to RGB and scale to [0, 1] before taking per-channel stats.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB) / 255.0
        means.append(rgb.mean(axis=(0, 1)))
        stds.append(rgb.std(axis=(0, 1)))

    if means:
        # Unweighted average of the per-image channel means and stds.
        mean_rgb = np.mean(np.vstack(means), axis=0)
        std_rgb = np.mean(np.vstack(stds), axis=0)
        print('\n--- Approximate RGB Statistics (subset) ---')
        print('Mean RGB:', np.round(mean_rgb, 4))
        print('Std  RGB:', np.round(std_rgb, 4))

print('\nEDA completed.')