In [2]:
# based on EDA, we do feature engineering at this notebook
import cv2
import numpy as np
from skimage.feature import local_binary_pattern, hog
from skimage import filters, morphology, measure
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [3]:
# extract new feature set (HOG on Digit ROI, Projection Profiles, LBP)
# target to get features which are beneficial to detect speed limit signs
def detect_digit_roi(img, min_area=50):
    """
    detect potential digit regions in traffic signs
    focus on central regions and areas with high contrast
    """
    # convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    
    # apply adaptive thresholding to highlight text/digits
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
                                   cv2.THRESH_BINARY_INV, 11, 2)
    
    # find contours
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    # filter contours by area and aspect ratio (digits tend to be taller than wide)
    digit_rois = []
    img_center_x, img_center_y = img.shape[1] // 2, img.shape[0] // 2
    
    for contour in contours:
        area = cv2.contourArea(contour)
        if area > min_area:
            x, y, w, h = cv2.boundingRect(contour)
            aspect_ratio = h / w if w > 0 else 0
            
            # check if it's in central region and has digit-like aspect ratio
            center_dist = np.sqrt((x + w/2 - img_center_x)**2 + (y + h/2 - img_center_y)**2)
            max_dist = min(img.shape[0], img.shape[1]) * 0.4
            
            if center_dist < max_dist and 0.5 <= aspect_ratio <= 3.0:
                # extract ROI
                roi = gray[y:y+h, x:x+w]
                if roi.size > 0:
                    digit_rois.append(roi)
    
    # if no ROIs found, use central region
    if not digit_rois:
        h, w = gray.shape
        center_h, center_w = h // 3, w // 3
        roi = gray[center_h:2*center_h, center_w:2*center_w]
        if roi.size > 0:
            digit_rois.append(roi)
    
    return digit_rois

def extract_hog_from_digit_roi(img):
    """extract HOG features from detected digit ROIs"""
    digit_rois = detect_digit_roi(img)
    
    hog_features = []
    for roi in digit_rois:
        # resize ROI to standard size for consistent HOG extraction
        if roi.size > 0:
            roi_resized = cv2.resize(roi, (32, 32))
            # extract HOG features
            hog_feat = hog(roi_resized, orientations=9, pixels_per_cell=(8, 8),
                          cells_per_block=(2, 2), block_norm='L2-Hys', 
                          feature_vector=True)
            hog_features.extend(hog_feat)
    
    # if no features extracted, return zeros
    if not hog_features:
        hog_features = [0] * 36  # Standard HOG feature size for 32x32 image
    
    # pad or truncate to fixed size (36 features)
    if len(hog_features) > 36:
        hog_features = hog_features[:36]
    elif len(hog_features) < 36:
        hog_features.extend([0] * (36 - len(hog_features)))
    
    return hog_features

def extract_projection_profiles(img):
    """extract horizontal and vertical projection profiles"""
    # convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    
    # apply edge detection to highlight text/digit boundaries
    edges = cv2.Canny(gray, 50, 150)
    
    # calculate projection profiles
    h_profile = np.sum(edges, axis=1)  # horizontal projection (sum across width)
    v_profile = np.sum(edges, axis=0)  # vertical projection (sum across height)
    
    # normalize profiles
    h_profile = h_profile / (np.sum(h_profile) + 1e-8)
    v_profile = v_profile / (np.sum(v_profile) + 1e-8)
    
    # extract statistical features from profiles
    def profile_stats(profile):
        if len(profile) == 0:
            return [0, 0, 0, 0, 0]
        
        # basic statistics
        mean_val = np.mean(profile)
        std_val = np.std(profile)
        max_val = np.max(profile)
        
        # peak detection (local maxima)
        peaks = []
        for i in range(1, len(profile) - 1):
            if profile[i] > profile[i-1] and profile[i] > profile[i+1]:
                peaks.append(profile[i])
        
        num_peaks = len(peaks)
        peak_intensity = np.mean(peaks) if peaks else 0
        
        return [mean_val, std_val, max_val, num_peaks, peak_intensity]
    
    h_stats = profile_stats(h_profile)
    v_stats = profile_stats(v_profile)
    
    # combine horizontal and vertical statistics
    projection_features = h_stats + v_stats
    
    return projection_features

def extract_lbp_features(img):
    """extract Local Binary Pattern features"""
    # convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    
    # extract LBP
    radius = 1
    n_points = 8 * radius
    lbp = local_binary_pattern(gray, n_points, radius, method='uniform')
    
    # calculate histogram of LBP
    n_bins = n_points + 2  # uniform patterns + non-uniform
    hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins))
    
    # normalize histogram
    hist = hist.astype(np.float32)
    hist /= (hist.sum() + 1e-8)
    
    return hist.tolist()

def extract_new_features_for_images(image_paths, base_path, dataset_name=""):
    """extract all new features for a list of images"""
    all_features = []
    
    for img_path in tqdm(image_paths, desc=f"Extracting new features ({dataset_name})"):
        full_path = os.path.join(base_path, img_path)
        
        try:
            # read image
            img = cv2.imread(full_path)
            if img is None:
                print(f"Could not read image: {full_path}")
                # Create zero features
                hog_digit_features = [0] * 36
                projection_features = [0] * 10
                lbp_features = [0] * 10
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                
                # extract features
                hog_digit_features = extract_hog_from_digit_roi(img)
                projection_features = extract_projection_profiles(img)
                lbp_features = extract_lbp_features(img)
            
            # combine all features
            combined_features = hog_digit_features + projection_features + lbp_features
            combined_features.insert(0, img_path)  # Add image path as first column
            all_features.append(combined_features)
            
        except Exception as e:
            print(f"Error processing {img_path}: {e}")
            # create zero features
            hog_digit_features = [0] * 36
            projection_features = [0] * 10
            lbp_features = [0] * 10
            combined_features = hog_digit_features + projection_features + lbp_features
            combined_features.insert(0, img_path)
            all_features.append(combined_features)
    
    return all_features

# 2: using the pretrained CNN to extract deep features

def preprocess_image_for_cnn(img, target_size=(224, 224)):
    """Preprocess image for CNN feature extraction"""
    # resize image first
    img_resized = cv2.resize(img, target_size)
    
    # convert to array and expand dims for batch processing
    img_array = image.img_to_array(img_resized)
    img_array = np.expand_dims(img_array, axis=0)
    
    # preprocess for ResNet50
    img_array = preprocess_input(img_array)
    
    return img_array

def extract_deep_features_for_images(image_paths, base_path, base_model, dataset_name=""):
    """extract deep features using pre-trained ResNet50"""
    all_features = []
    
    for img_path in tqdm(image_paths, desc=f"Extracting deep features ({dataset_name})"):
        full_path = os.path.join(base_path, img_path)
        
        try:
            # read and preprocess image
            img = cv2.imread(full_path)
            if img is None:
                print(f"Could not read image: {full_path}")
                # create zero features (ResNet50 outputs 2048 features)
                deep_features = [0] * 2048
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img_preprocessed = preprocess_image_for_cnn(img)
                
                # extract features
                features = base_model.predict(img_preprocessed, verbose=0)
                deep_features = features.flatten().tolist()
            
            # add image path as first column
            deep_features.insert(0, img_path)
            all_features.append(deep_features)
            
        except Exception as e:
            print(f"Error processing {img_path}: {e}")
            # create zero features
            deep_features = [0] * 2048
            deep_features.insert(0, img_path)
            all_features.append(deep_features)
    
    return all_features

In [4]:
# start to do extraction for train and test set simultaneously
# load training metadata
print("Loading training metadata...")
train_metadata = pd.read_csv("/Users/xingzhidu/Desktop/ML/ML A2/dataset/train/train_metadata.csv")
print(f"Loaded train metadata for {len(train_metadata)} images")

print("Loading test metadata...")
test_metadata = pd.read_csv("/Users/xingzhidu/Desktop/ML/ML A2/dataset/test/test_metadata.csv")
print(f"Loaded test metadata for {len(test_metadata)} images")

# get image paths
train_image_paths = train_metadata['image_path'].tolist()
test_image_paths = test_metadata['image_path'].tolist()

train_base_path = "/Users/xingzhidu/Desktop/ML/ML A2/dataset/train"
test_base_path = "/Users/xingzhidu/Desktop/ML/ML A2/dataset/test"

print(f"Processing {len(train_image_paths)} training images and {len(test_image_paths)} test images...")

# extract new features for training set
print("Processing training set...")
train_new_features_data = extract_new_features_for_images(train_image_paths, train_base_path, "Train")

# extract new features for test set
print("Processing test set...")
test_new_features_data = extract_new_features_for_images(test_image_paths, test_base_path, "Test")

# create column names for new features
hog_digit_cols = [f'hog_digit_{i}' for i in range(36)]
projection_cols = [f'h_proj_{i}' for i in range(5)] + [f'v_proj_{i}' for i in range(5)]
lbp_cols = [f'lbp_{i}' for i in range(10)]
new_feature_cols = ['image_path'] + hog_digit_cols + projection_cols + lbp_cols

# ceate DataFrames for new features
train_new_features_df = pd.DataFrame(train_new_features_data, columns=new_feature_cols)
test_new_features_df = pd.DataFrame(test_new_features_data, columns=new_feature_cols)

print(f"Train new features shape: {train_new_features_df.shape}")
print(f"Test new features shape: {test_new_features_df.shape}")
print("New features extracted successfully for both sets!")

# extract deep features

print("Loading pre-trained ResNet50 model...")
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# extract deep features for training set
print("Processing training set...")
train_deep_features_data = extract_deep_features_for_images(train_image_paths, train_base_path, base_model, "Train")

# extract deep features for test set
print("Processing test set...")
test_deep_features_data = extract_deep_features_for_images(test_image_paths, test_base_path, base_model, "Test")

# create column names for deep features
deep_feature_cols = ['image_path'] + [f'deep_feature_{i}' for i in range(2048)]

# create DataFrames for deep features
train_deep_features_df = pd.DataFrame(train_deep_features_data, columns=deep_feature_cols)
test_deep_features_df = pd.DataFrame(test_deep_features_data, columns=deep_feature_cols)

print(f"Train deep features shape: {train_deep_features_df.shape}")
print(f"Test deep features shape: {test_deep_features_df.shape}")
print("Deep features extracted successfully for both sets!")


Loading training metadata...
Loaded train metadata for 5488 images
Loading test metadata...
Loaded test metadata for 2353 images
Processing 5488 training images and 2353 test images...
Processing training set...


Extracting new features (Train): 100%|████| 5488/5488 [00:03<00:00, 1725.10it/s]


Processing test set...


Extracting new features (Test): 100%|█████| 2353/2353 [00:01<00:00, 1756.15it/s]


Train new features shape: (5488, 57)
Test new features shape: (2353, 57)
New features extracted successfully for both sets!
Loading pre-trained ResNet50 model...
Processing training set...


Extracting deep features (Train): 100%|█████| 5488/5488 [04:15<00:00, 21.51it/s]


Processing test set...


Extracting deep features (Test): 100%|██████| 2353/2353 [01:49<00:00, 21.49it/s]


Train deep features shape: (5488, 2049)
Test deep features shape: (2353, 2049)
Deep features extracted successfully for both sets!


In [9]:
# save all extracted feature results
# save training features
train_new_features_df.to_csv("/Users/xingzhidu/Desktop/ML/ML A2/dataset/train/new_features.csv", index=False)
train_deep_features_df.to_csv("/Users/xingzhidu/Desktop/ML/ML A2/dataset/train/deep_features.csv", index=False)

# save test features
test_new_features_df.to_csv("/Users/xingzhidu/Desktop/ML/ML A2/dataset/test/new_features.csv", index=False)
test_deep_features_df.to_csv("/Users/xingzhidu/Desktop/ML/ML A2/dataset/test/deep_features.csv", index=False)

print("Training features saved to: /train/new_features.csv and /train/deep_features.csv")
print("Test features saved to: /test/new_features.csv and /test/deep_features.csv")

# display the feature summary and analysis
print("\n=== Feature Summary ===")
print(f"New Features Set:")
print(f"  - HOG on Digit ROI: {len(hog_digit_cols)} features")
print(f"  - Projection Profiles: {len(projection_cols)} features")
print(f"  - Local Binary Patterns: {len(lbp_cols)} features")
print(f"  - Total: {len(hog_digit_cols) + len(projection_cols) + len(lbp_cols)} features")

print(f"\nDeep Features Set:")
print(f"  - ResNet50 features: {2048} features")

print(f"\nDataset sizes:")
print(f"  - Training: {len(train_image_paths)} samples")
print(f"  - Test: {len(test_image_paths)} samples")

Training features saved to: /train/new_features.csv and /train/deep_features.csv
Test features saved to: /test/new_features.csv and /test/deep_features.csv

=== Feature Summary ===
New Features Set:
  - HOG on Digit ROI: 36 features
  - Projection Profiles: 10 features
  - Local Binary Patterns: 10 features
  - Total: 56 features

Deep Features Set:
  - ResNet50 features: 2048 features

Dataset sizes:
  - Training: 5488 samples
  - Test: 2353 samples


In [10]:
# quick analysis of training features
print("\n=== Quick Analysis of Training Features ===")
train_new_features_numeric = train_new_features_df.drop('image_path', axis=1)
train_deep_features_numeric = train_deep_features_df.drop('image_path', axis=1)

print(f"Training New features statistics:")
print(f"Mean: {train_new_features_numeric.mean().mean():.4f}")
print(f"Std: {train_new_features_numeric.std().mean():.4f}")
print(f"Min: {train_new_features_numeric.min().min():.4f}")
print(f"Max: {train_new_features_numeric.max().max():.4f}")

print(f"\nTraining Deep features statistics:")
print(f"Mean: {train_deep_features_numeric.mean().mean():.4f}")
print(f"Std: {train_deep_features_numeric.std().mean():.4f}")
print(f"Min: {train_deep_features_numeric.min().min():.4f}")
print(f"Max: {train_deep_features_numeric.max().max():.4f}")

# check for missing values
missing_train_new = train_new_features_numeric.isnull().sum().sum()
missing_train_deep = train_deep_features_numeric.isnull().sum().sum()
missing_test_new = test_new_features_df.drop('image_path', axis=1).isnull().sum().sum()
missing_test_deep = test_deep_features_df.drop('image_path', axis=1).isnull().sum().sum()

print(f"\nMissing values:")
print(f"  - Train new features: {missing_train_new}")
print(f"  - Train deep features: {missing_train_deep}")
print(f"  - Test new features: {missing_test_new}")
print(f"  - Test deep features: {missing_test_deep}")


=== Quick Analysis of Training Features ===
Training New features statistics:
Mean: 0.4937
Std: 0.3265
Min: 0.0000
Max: 64.0000

Training Deep features statistics:
Mean: 0.4094
Std: 0.4903
Min: 0.0000
Max: 22.7477

Missing values:
  - Train new features: 0
  - Train deep features: 0
  - Test new features: 0
  - Test deep features: 0


In [11]:
# quick analysis of test features same as the training feature
print("\n=== Quick Analysis of test Features ===")
test_new_features_numeric = test_new_features_df.drop('image_path', axis=1)
test_deep_features_numeric = test_deep_features_df.drop('image_path', axis=1)

print(f"Training New features statistics:")
print(f"Mean: {test_new_features_numeric.mean().mean():.4f}")
print(f"Std: {test_new_features_numeric.std().mean():.4f}")
print(f"Min: {test_new_features_numeric.min().min():.4f}")
print(f"Max: {test_new_features_numeric.max().max():.4f}")

print(f"\nTraining Deep features statistics:")
print(f"Mean: {test_deep_features_numeric.mean().mean():.4f}")
print(f"Std: {test_deep_features_numeric.std().mean():.4f}")
print(f"Min: {test_deep_features_numeric.min().min():.4f}")
print(f"Max: {test_deep_features_numeric.max().max():.4f}")


=== Quick Analysis of test Features ===
Training New features statistics:
Mean: 0.4853
Std: 0.3266
Min: 0.0000
Max: 59.0000

Training Deep features statistics:
Mean: 0.4086
Std: 0.4907
Min: 0.0000
Max: 21.1844


In [None]:
# we find extracted features from  training dataset and test set have very similar distribution
# that means our extraction is really great and consistent
# our predicitons on test data in the future will be consistent as well