In [2]:
import pandas as pd

test_df = pd.read_csv('./test/_annotations.csv')
train_df = pd.read_csv('./train/_annotations.csv')
val_df = pd.read_csv('./valid/_annotations.csv')

print('Test shape:', test_df.shape)
print('Train shape:', train_df.shape)
print('Val shape:', val_df.shape)

Test shape: (1264, 8)
Train shape: (25534, 8)
Val shape: (1752, 8)


In [3]:
# Check columns
print('Test columns:', test_df.columns)
print('Train columns:', train_df.columns)
print('Val columns:', val_df.columns)

Test columns: Index(['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'], dtype='object')
Train columns: Index(['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'], dtype='object')
Val columns: Index(['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'], dtype='object')


In [4]:
concat_df = pd.concat([train_df, test_df, val_df], axis=0)

display(concat_df)

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,212_png.rf.3b554672260910904cadfa55a1a89d05.jpg,117,133,Bumps,3,3,117,133
1,IMG_5499_JPG.rf.3b4efc72b6c075076f44d97be0029b...,2048,2048,Traffic from Right Merges Ahead,195,200,777,804
2,205_PNG_jpg.rf.3b47748f9c1824e863c3037cb845eaa...,416,416,Pass either side,0,0,356,341
3,001453_jpg.rf.3b3ac9b7fd0195dbe93ea49058c8a799...,416,416,Speed limit,135,23,211,132
4,43-11-_jpg.rf.3b4639589699e19c666c38cb68434b57...,48,48,Slippery road,13,18,36,41
...,...,...,...,...,...,...,...,...
1747,petunjuk-lokasi-putar-balik-41-_jpg.rf.fc7f3b0...,640,640,U turn,194,169,435,502
1748,IMG_20220401_144938_jpg.rf.fede618723e7305ac71...,1916,1916,Double Bend to Left Ahead,663,139,1236,734
1749,P1840432_1_jpg.rf.ff8076f96e18b205c5a67be79be5...,32,32,U turn,0,0,32,32
1750,IMG_5547_JPG.rf.fee6e330b5f5b49ab5c03d5a7c81a9...,2048,2047,Double Bend to Left Ahead,387,68,821,779


In [5]:
class_counts = concat_df['class'].value_counts()
display(class_counts.head(5))

class
Speed limit         4517
Pass either side    2063
Obstruction         1989
Roadway diverges    1885
Bumps               1612
Name: count, dtype: int64

In [6]:
top_5 = concat_df['class'].value_counts().nlargest(5).index

filtered_df = concat_df[concat_df['class'].isin(top_5)]

sampled_df = filtered_df.groupby('class', group_keys=False).apply(lambda x: x.sample(n=200, random_state=42))

sampled_df.reset_index(drop=True, inplace=True)
sampled_df.to_csv('rs_dataset.csv')

print(sampled_df.head(5))

                                            filename  width  height  class  \
0    315_png.rf.40731859f4c7fc2ac33e95985b8beae1.jpg    180     177  Bumps   
1    285_png.rf.08e8f117bd0809161573634df1d43b4f.jpg    202     222  Bumps   
2  bump16_png.rf.21d4edc826d8dca561ea6c05fb194cf9...    163     235  Bumps   
3    397_png.rf.2a3998bffd08aa62b04c2157ff945ab7.jpg    227     346  Bumps   
4  bump79_png.rf.05fbda7c88ffad4c119c8c089f2fae39...    106     126  Bumps   

   xmin  ymin  xmax  ymax  
0     0     6   172   177  
1     9     8   197   222  
2     0     0   163   228  
3     0     0   227   306  
4     4     0    95   126  


  sampled_df = filtered_df.groupby('class', group_keys=False).apply(lambda x: x.sample(n=200, random_state=42))


In [18]:
import os
import pandas as pd
import shutil

# Paths
sourceFolders = ['./test', './train', './valid']
destinationFolder = './train_dataset'
testDatasetFolder = './test_dataset'

# Create destination folders if they don't exist
os.makedirs(destinationFolder, exist_ok=True)
os.makedirs(testDatasetFolder, exist_ok=True)

# Collect all existing image filenames from source folders into a set
existing_images = set()
for folder in sourceFolders:
    for file in os.listdir(folder):
        existing_images.add(file)

# Check if images exist during sampling
def sample_without_duplicates(filtered_df, n, existing_images):
    """
    Randomly samples `n` rows per class, ensuring filenames exist and are unique.
    """
    sampled_filenames = set()  # To track filenames already added
    sampled_rows = []

    for class_name, group in filtered_df.groupby('class'):
        valid_rows = group[group['filename'].isin(existing_images)]

        # Shuffle and iterate until `n` unique samples are collected
        valid_rows = valid_rows.sample(frac=1, random_state=42).reset_index(drop=True)
        class_sample = []

        for _, row in valid_rows.iterrows():
            if row['filename'] not in sampled_filenames:
                class_sample.append(row)
                sampled_filenames.add(row['filename'])

            if len(class_sample) == n:
                break

        sampled_rows.extend(class_sample)

    return pd.DataFrame(sampled_rows)

# Filter top 5 classes and perform sampling
top_5 = concat_df['class'].value_counts().nlargest(5).index
filtered_df = concat_df[concat_df['class'].isin(top_5)]

# Sample 420 images per class without duplicates
sampled_df = sample_without_duplicates(filtered_df, 420, existing_images)

# Ensure exactly 2000 images in train_dataset
if len(sampled_df) > 2000:
    sampled_df = sampled_df.sample(n=2000, random_state=42).reset_index(drop=True)
elif len(sampled_df) < 2000:
    print(f"Warning: Only {len(sampled_df)} images found instead of 2100!")

# Save dataset
sampled_df.to_csv('train_dataset.csv', index=False)

# Copy sampled images to re_dataset
for filename in sampled_df['filename']:
    for folder in sourceFolders:
        file_path = os.path.join(folder, filename)
        if os.path.exists(file_path):
            shutil.copy2(file_path, destinationFolder)
            break

# Sample 20 images from each class for test dataset (100 images total)
test_sample_df = sampled_df.groupby('class').sample(n=20, random_state=42).reset_index(drop=True)

# Save test dataset
test_sample_df.to_csv('test_dataset.csv', index=False)

# Copy sampled test images to test_dataset
for filename in test_sample_df['filename']:
    for folder in sourceFolders:
        file_path = os.path.join(folder, filename)
        if os.path.exists(file_path):
            shutil.copy2(file_path, testDatasetFolder)
            break

# Verify the number of images copied
re_dataset_files = os.listdir(destinationFolder)
test_dataset_files = os.listdir(testDatasetFolder)

# Count only image files
image_extensions = {'.jpg', '.jpeg', '.png'}
train_images = [f for f in re_dataset_files if os.path.splitext(f)[-1].lower() in image_extensions]
test_images = [f for f in test_dataset_files if os.path.splitext(f)[-1].lower() in image_extensions]

# Print confirmation
print(f"Number of images in train_dataset: {len(train_images)} (Expected: 2000)")
print(f"Number of images in test_dataset: {len(test_images)} (Expected: 100)")


Number of images in train_dataset: 2000 (Expected: 2000)
Number of images in test_dataset: 100 (Expected: 100)


In [22]:
import cv2
import pandas as pd
import os

# Define paths
csv_path = "test_dataset.csv"  # Your original annotation file
input_folder = "./test_dataset"  # Folder containing original images
output_folder = "./resized_test_dataset"  # Folder to save resized images
output_csv_path = "resized_test_dataset.csv"  # Updated annotation file

# Resize parameters
target_width, target_height = 320, 320  # Change based on your model input size

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Load CSV file
df = pd.read_csv(csv_path)

# Initialize list to store new annotation data
updated_annotations = []

# Loop through each row in the CSV
for index, row in df.iterrows():
    filename = row['filename']
    original_width = row['width']
    original_height = row['height']
    
    # Bounding box coordinates
    xmin, ymin, xmax, ymax = row['xmin'], row['ymin'], row['xmax'], row['ymax']

    # Read image
    img_path = os.path.join(input_folder, filename)
    img = cv2.imread(img_path)

    if img is None:
        print(f"Warning: Image {filename} not found, skipping...")
        continue

    # Resize image
    resized_img = cv2.resize(img, (target_width, target_height), interpolation=cv2.INTER_AREA)

    # Save resized image
    output_img_path = os.path.join(output_folder, filename)
    cv2.imwrite(output_img_path, resized_img)

    # Scale bounding box coordinates
    scale_x = target_width / original_width
    scale_y = target_height / original_height

    new_xmin = int(xmin * scale_x)
    new_ymin = int(ymin * scale_y)
    new_xmax = int(xmax * scale_x)
    new_ymax = int(ymax * scale_y)

    # Store updated annotation
    updated_annotations.append([filename, target_width, target_height, row['class'], new_xmin, new_ymin, new_xmax, new_ymax])

# Convert list to DataFrame
new_df = pd.DataFrame(updated_annotations, columns=['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'])

# Save updated CSV
new_df.to_csv(output_csv_path, index=False)

print(f"Resizing complete! Updated annotations saved to {output_csv_path}")

Resizing complete! Updated annotations saved to resized_test_dataset.csv
