# Data Preprocessing for ReID Datasets

## Google Colab Setup

### 1. Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive. The data, model and results roots
# defined later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive

drive.mount('/content/drive')

### 2. Clone Repository

In [None]:
# NOTE(review): the section heading says "Clone Repository", but this cell
# contains no clone command - it only changes directory. The second %cd
# presumably fails unless /content/master-thesis-reid already exists;
# confirm whether the `git clone` step was removed on purpose.
%cd /content

%cd master-thesis-reid

### 3. Install Dependencies

In [None]:
# Quietly (-q) install the packages listed in requirements_colab.txt, resolved
# relative to the current working directory (presumably the repository root
# entered by the preceding %cd cell - confirm).
!pip install -q -r requirements_colab.txt

### 4. Setup Paths

In [None]:
import os
import sys

# Make modules under the repository checkout importable from this notebook.
sys.path.append('/content/master-thesis-reid')

# Persistent storage locations on the mounted Google Drive.
DATA_ROOT = "/content/drive/MyDrive/reid_data"
MODEL_ROOT = "/content/drive/MyDrive/reid_models"
RESULTS_ROOT = "/content/drive/MyDrive/reid_results"

# Create each root if it is missing; existing directories are left untouched.
for _root in (DATA_ROOT, MODEL_ROOT, RESULTS_ROOT):
    os.makedirs(_root, exist_ok=True)

print("Setup completed successfully!")
for _label, _root in (("Data", DATA_ROOT), ("Model", MODEL_ROOT), ("Results", RESULTS_ROOT)):
    print(f"{_label} root: {_root}")

## Import Libraries

In [None]:
import cv2
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm
import random

from utils.data_loader import OpenCVAugmentation
from utils.config_loader import ConfigLoader

print("Libraries imported successfully!")

## Visualize Augmentations

In [None]:
# Load one Market-1501 training image for the augmentation demo, falling back
# to a random dummy image when the file is missing or cannot be decoded.
sample_image_path = f"{DATA_ROOT}/market1501/bounding_box_train/0002_c1s1_000451_03.jpg"

sample_exists = os.path.exists(sample_image_path)
# cv2.imread signals failure by returning None rather than raising, so the
# result must be checked before touching img.shape.
img = cv2.imread(sample_image_path) if sample_exists else None

if img is not None:
    print(f"Loaded image: {sample_image_path}")
    print(f"Image shape: {img.shape}")
else:
    if sample_exists:
        print(f"Sample image could not be decoded: {sample_image_path}")
    else:
        print(f"Sample image not found: {sample_image_path}")
    # randint's upper bound is exclusive; 256 covers the full uint8 range.
    img = np.random.randint(0, 256, (256, 128, 3), dtype=np.uint8)
    print("Using dummy image for demonstration")

In [None]:
# Load the augmentation settings and build the augmentor used for the demo grid.
config_loader = ConfigLoader('config')
augment_config = config_loader.get_augmentation_config()

print("Augmentation config:")
for key, value in augment_config.items():
    print(f"  {key:15s}: {value}")

augmentor = OpenCVAugmentation(augment_config)

# The image is converted BGR -> RGB before being handed to matplotlib.
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# 3x4 grid: panel 0 is the original, panels 1-10 the augmentations, panel 11 unused.
fig, axes = plt.subplots(3, 4, figsize=(16, 12))
axes = axes.flatten()

original_panel = axes[0]
original_panel.imshow(img_rgb)
original_panel.set_title('Original', fontsize=12, fontweight='bold')
original_panel.axis('off')

# (title, callable) pairs; probabilities are forced to 1.0 where the method
# accepts `p`, so the effect is always applied in the demo.
aug_funcs = [
    ('Rotation', lambda x: augmentor.random_rotation(x)),
    ('Horizontal Flip', lambda x: augmentor.random_horizontal_flip(x, p=1.0)),
    ('Brightness', lambda x: augmentor.random_brightness(x)),
    ('Contrast', lambda x: augmentor.random_contrast(x)),
    ('Saturation', lambda x: augmentor.random_saturation(x)),
    ('Hue', lambda x: augmentor.random_hue(x)),
    ('Gaussian Noise', lambda x: augmentor.random_gaussian_noise(x, sigma_range=(5, 15))),
    ('Gaussian Blur', lambda x: augmentor.random_gaussian_blur(x, p=1.0)),
    ('Crop & Resize', lambda x: augmentor.random_crop_and_resize(x)),
    ('Combined (All)', lambda x: augmentor(x))
]

for i, (title, aug_func) in enumerate(aug_funcs, 1):
    panel = axes[i]
    # Each augmentation gets its own copy of the source image.
    aug_img = aug_func(img.copy())
    aug_img_rgb = cv2.cvtColor(aug_img, cv2.COLOR_BGR2RGB)
    panel.imshow(aug_img_rgb)
    panel.set_title(title, fontsize=11)
    panel.axis('off')

# Hide the last (twelfth) panel, which has no image.
axes[-1].axis('off')

figure_path = f'{RESULTS_ROOT}/augmentation_examples.png'

plt.tight_layout()
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"\nVisualization saved to: {figure_path}")

## Preprocessing Pipeline


In [None]:
def preprocess_dataset(
    input_dir,
    output_dir,
    target_height=256,
    target_width=128,
    apply_augmentation=False,
    num_augmentations=1,
    augmentor=None
):
    """
    Resize every image in a flat directory and optionally save augmented copies.

    Only ``*.jpg`` and ``*.png`` files directly inside ``input_dir`` are
    processed (no recursion). Each resized image is written to ``output_dir``
    under its original file name; augmented versions are written next to it as
    ``<stem>_aug<i><suffix>``.

    Args:
        input_dir: Directory containing the source images.
        output_dir: Directory for the preprocessed images (created if missing).
        target_height: Target image height (default: 256)
        target_width: Target image width (default: 128)
        apply_augmentation: Whether to also write augmented versions.
        num_augmentations: Number of augmented versions per image.
        augmentor: Callable taking and returning an image. A default
            ``OpenCVAugmentation()`` is created only when augmentation is
            requested and none is supplied.

    Returns:
        Number of image files successfully written (resized originals plus
        augmented versions); 0 when no input images are found.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Sorted so that runs process files in a reproducible order.
    image_files = sorted(
        img_file
        for pattern in ('*.jpg', '*.png')
        for img_file in input_path.glob(pattern)
    )

    if not image_files:
        print(f"Warning: No images found in {input_dir}")
        return 0

    # Build the default augmentor lazily: it is never used when augmentation
    # is disabled, so there is no reason to construct it in that case.
    if apply_augmentation and augmentor is None:
        augmentor = OpenCVAugmentation()

    print(f"Processing {len(image_files)} images...")

    processed_count = 0
    for img_path in tqdm(image_files):
        img = cv2.imread(str(img_path))
        if img is None:
            print(f"Warning: Could not read {img_path}")
            continue

        # cv2.resize takes the size as (width, height).
        img_resized = cv2.resize(img, (target_width, target_height), interpolation=cv2.INTER_LINEAR)

        output_file = output_path / img_path.name
        # cv2.imwrite reports failure through its boolean return value; only
        # count files that were actually written.
        if cv2.imwrite(str(output_file), img_resized):
            processed_count += 1
        else:
            print(f"Warning: Could not write {output_file}")

        if apply_augmentation:
            for i in range(num_augmentations):
                # Pass a copy so an augmentor that modifies its input in place
                # cannot contaminate the next augmentation of the same image.
                aug_img = augmentor(img_resized.copy())
                aug_filename = output_path / f"{img_path.stem}_aug{i}{img_path.suffix}"
                if cv2.imwrite(str(aug_filename), aug_img):
                    processed_count += 1
                else:
                    print(f"Warning: Could not write {aug_filename}")

    print(f"Preprocessing complete: {processed_count} images saved to {output_dir}")
    return processed_count

print("Preprocessing function defined!")

## Batch Preprocessing for Market-1501

Process all splits (train, test, query) and save to Google Drive.

In [None]:
# Input/output locations for Market-1501 and the target size from the config.
DATASET_NAME = "market1501"
OUTPUT_DATASET_NAME = "market1501_preprocessed"

input_train = f"{DATA_ROOT}/{DATASET_NAME}/bounding_box_train"
input_test = f"{DATA_ROOT}/{DATASET_NAME}/bounding_box_test"
input_query = f"{DATA_ROOT}/{DATASET_NAME}/query"

output_train = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/bounding_box_train"
output_test = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/bounding_box_test"
output_query = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/query"

# target_height / target_width are reused by the later dataset cells.
data_config = config_loader.get_data_loading_config()
target_height = data_config['height']
target_width = data_config['width']

print("Preprocessing configuration:")
print(f"  Dataset: {DATASET_NAME}")
print(f"  Output: {OUTPUT_DATASET_NAME}")
print(f"  Target size: {target_height}x{target_width}")
print(f"  Input train: {input_train}")
print(f"  Output train: {output_train}")
print()

for split, path in [("train", input_train), ("test", input_test), ("query", input_query)]:
    if os.path.exists(path):
        # Count the same extensions that preprocess_dataset processes
        # (*.jpg and *.png), so the pre-check matches what will be converted.
        num_files = sum(len(list(Path(path).glob(pattern))) for pattern in ('*.jpg', '*.png'))
        print(f" {split:6s}: {num_files:5d} images found in {path}")
    else:
        print(f" {split:6s}: NOT FOUND - {path}")

In [None]:
# Resize all three Market-1501 splits (no augmentation) and report the total.
augment_config = config_loader.get_augmentation_config()
augmentor = OpenCVAugmentation(augment_config)

banner = "=" * 60

print(banner)
print("Starting Batch Preprocessing")
print(banner)
print()

total_processed = 0

# (section heading, label used in the skip message, input dir, output dir)
market_splits = (
    ("Preprocessing Training Set", "training", input_train, output_train),
    ("Preprocessing Test Set (Gallery)", "test", input_test, output_test),
    ("Preprocessing Query Set", "query", input_query, output_query),
)

for heading, skip_label, src_dir, dst_dir in market_splits:
    if not os.path.exists(src_dir):
        print(f"\nSkipping {skip_label} set - directory not found: {src_dir}")
        continue

    print("\n" + banner)
    print(heading)
    print(banner)
    count = preprocess_dataset(
        src_dir,
        dst_dir,
        target_height=target_height,
        target_width=target_width,
        apply_augmentation=False,
        augmentor=augmentor
    )
    total_processed += count

print("\n" + banner)
print("Preprocessing Complete!")
print(banner)
print(f"Total images processed: {total_processed}")
print(f"Output directory: {DATA_ROOT}/{OUTPUT_DATASET_NAME}")
print(f"Target size: {target_height}x{target_width}")

## Verify Preprocessed Data

In [None]:
# Spot-check each preprocessed split: count the images and compare the size of
# one sample against the configured target size.
print("Verification of preprocessed dataset:")
print("="*60)

for split_name, split_path in [("Train", output_train), ("Test", output_test), ("Query", output_query)]:
    if os.path.exists(split_path):
        split_dir = Path(split_path)
        # preprocess_dataset keeps the original extension, so both *.jpg and
        # *.png outputs are possible; sort so the sampled file is reproducible.
        images = sorted(list(split_dir.glob('*.jpg')) + list(split_dir.glob('*.png')))
        print(f"\n{split_name} set: {len(images)} images")

        if images:
            sample = cv2.imread(str(images[0]))
            if sample is None:
                # cv2.imread returns None on failure; report it instead of
                # silently skipping the size check.
                print(f"  Warning: could not read sample image {images[0]}")
            else:
                h, w = sample.shape[:2]
                print(f"  Sample image size: {h}x{w}")
                if h == target_height and w == target_width:
                    print(f" Size is correct ({target_height}x{target_width})")
                else:
                    print(f" Size mismatch! Expected {target_height}x{target_width}")
    else:
        print(f"\n{split_name} set: NOT FOUND")

print("\n" + "="*60)
print("Verification complete!")

## Create Augmented Dataset (saved to disk instead of generated on-the-fly)

In [None]:
# Write the resized Market-1501 training images plus augmented copies to a
# separate dataset directory.
OUTPUT_AUGMENTED = f"{DATA_ROOT}/market1501_augmented"
num_augmentations_per_image = 3

print("Creating augmented dataset...")
print(f"Each image will have {num_augmentations_per_image} augmented versions")
print()

if os.path.exists(input_train):
    print("Augmenting training set...")
    preprocess_dataset(
        input_train,
        f"{OUTPUT_AUGMENTED}/bounding_box_train",
        target_height=target_height,
        target_width=target_width,
        apply_augmentation=True,
        num_augmentations=num_augmentations_per_image,
        augmentor=augmentor
    )
    # Report success only when the augmentation step actually ran.
    print("Augmented dataset created!")
else:
    print(f"Skipping augmentation - directory not found: {input_train}")

# DukeMTMC-reID Dataset Preprocessing

In [None]:
# Input/output locations for DukeMTMC-reID. target_height / target_width are
# the values set in the Market-1501 configuration cell.
DATASET_NAME = "DukeMTMC-reID"
OUTPUT_DATASET_NAME = "DukeMTMC-reID_preprocessed"

input_train = f"{DATA_ROOT}/{DATASET_NAME}/bounding_box_train"
input_test = f"{DATA_ROOT}/{DATASET_NAME}/bounding_box_test"
input_query = f"{DATA_ROOT}/{DATASET_NAME}/query"

output_train = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/bounding_box_train"
output_test = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/bounding_box_test"
output_query = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/query"

print("="*60)
print(f"Preprocessing {DATASET_NAME}")
print("="*60)
print(f"Target size: {target_height}x{target_width}")
print()

for split, path in [("train", input_train), ("test", input_test), ("query", input_query)]:
    if os.path.exists(path):
        # Count the same extensions that preprocess_dataset processes
        # (*.jpg and *.png), so the pre-check matches what will be converted.
        num_files = sum(len(list(Path(path).glob(pattern))) for pattern in ('*.jpg', '*.png'))
        print(f"{split:6s}: {num_files:5d} images found")
    else:
        print(f"{split:6s}: NOT FOUND - {path}")

In [None]:
# Resize all available DukeMTMC-reID splits (no augmentation); missing split
# directories are skipped without a message.
total_processed = 0

banner = "=" * 60

duke_splits = (
    ("Preprocessing Training Set", input_train, output_train),
    ("Preprocessing Test Set", input_test, output_test),
    ("Preprocessing Query Set", input_query, output_query),
)

for heading, src_dir, dst_dir in duke_splits:
    if not os.path.exists(src_dir):
        continue

    print("\n" + banner)
    print(heading)
    print(banner)
    count = preprocess_dataset(
        src_dir,
        dst_dir,
        target_height=target_height,
        target_width=target_width,
        apply_augmentation=False,
        num_augmentations=1,
        augmentor=augmentor
    )
    total_processed += count

print("\n" + banner)
print("DukeMTMC-reID Preprocessing Complete!")
print(banner)
print(f"Total images processed: {total_processed}")
print(f"Output directory: {DATA_ROOT}/{OUTPUT_DATASET_NAME}")

### DukeMTMC-reID: Create Augmented Dataset

In [None]:
# Write the resized DukeMTMC-reID training images plus augmented copies to a
# separate dataset directory.
OUTPUT_AUGMENTED = f"{DATA_ROOT}/DukeMTMC-reID_augmented"
num_augmentations_per_image = 3

print("Creating augmented dataset for DukeMTMC-reID...")
print(f"Each image will have {num_augmentations_per_image} augmented versions")
print()

# Re-declared here so the cell does not depend on which dataset cell ran last.
DATASET_NAME = "DukeMTMC-reID"
input_train = f"{DATA_ROOT}/{DATASET_NAME}/bounding_box_train"

if os.path.exists(input_train):
    print("Augmenting training set...")
    preprocess_dataset(
        input_train,
        f"{OUTPUT_AUGMENTED}/bounding_box_train",
        target_height=target_height,
        target_width=target_width,
        apply_augmentation=True,
        num_augmentations=num_augmentations_per_image,
        augmentor=augmentor
    )
    # Report success only when the augmentation step actually ran.
    print("Augmented dataset created!")
else:
    print(f"Skipping augmentation - directory not found: {input_train}")

# VeRi-776 Dataset Preprocessing

In [None]:

# Input/output locations for VeRi-776, whose split folders are named image_*.
# target_height / target_width are the values set in the Market-1501
# configuration cell.
DATASET_NAME = "VeRi-776"
OUTPUT_DATASET_NAME = "VeRi-776_preprocessed"

input_train = f"{DATA_ROOT}/{DATASET_NAME}/image_train"
input_test = f"{DATA_ROOT}/{DATASET_NAME}/image_test"
input_query = f"{DATA_ROOT}/{DATASET_NAME}/image_query"

output_train = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/image_train"
output_test = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/image_test"
output_query = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/image_query"

print("="*60)
print(f"Preprocessing {DATASET_NAME} (Vehicle ReID)")
print("="*60)
print(f"Target size: {target_height}x{target_width}")
print()

for split, path in [("train", input_train), ("test", input_test), ("query", input_query)]:
    if os.path.exists(path):
        # Count the same extensions that preprocess_dataset processes
        # (*.jpg and *.png), so the pre-check matches what will be converted.
        num_files = sum(len(list(Path(path).glob(pattern))) for pattern in ('*.jpg', '*.png'))
        print(f"{split:6s}: {num_files:5d} images found")
    else:
        print(f"{split:6s}: NOT FOUND - {path}")

In [None]:
# Resize all available VeRi-776 splits (no augmentation); missing split
# directories are skipped without a message.
total_processed = 0

banner = "=" * 60

veri_splits = (
    ("Preprocessing Training Set", input_train, output_train),
    ("Preprocessing Test Set", input_test, output_test),
    ("Preprocessing Query Set", input_query, output_query),
)

for heading, src_dir, dst_dir in veri_splits:
    if not os.path.exists(src_dir):
        continue

    print("\n" + banner)
    print(heading)
    print(banner)
    count = preprocess_dataset(
        src_dir,
        dst_dir,
        target_height=target_height,
        target_width=target_width,
        apply_augmentation=False,
        num_augmentations=1,
        augmentor=augmentor
    )
    total_processed += count

print("\n" + banner)
print("VeRi-776 Preprocessing Complete!")
print(banner)
print(f"Total images processed: {total_processed}")
print(f"Output directory: {DATA_ROOT}/{OUTPUT_DATASET_NAME}")

### VeRi-776: Create Augmented Dataset

In [None]:
# Write the resized VeRi-776 training images plus augmented copies to a
# separate dataset directory.
OUTPUT_AUGMENTED = f"{DATA_ROOT}/VeRi-776_augmented"
num_augmentations_per_image = 3

print("Creating augmented dataset for VeRi-776...")
print(f"Each image will have {num_augmentations_per_image} augmented versions")
print()

# Re-declared here so the cell does not depend on which dataset cell ran last.
DATASET_NAME = "VeRi-776"
input_train = f"{DATA_ROOT}/{DATASET_NAME}/image_train"

if os.path.exists(input_train):
    print("Augmenting training set...")
    preprocess_dataset(
        input_train,
        f"{OUTPUT_AUGMENTED}/image_train",
        target_height=target_height,
        target_width=target_width,
        apply_augmentation=True,
        num_augmentations=num_augmentations_per_image,
        augmentor=augmentor
    )
    # Report success only when the augmentation step actually ran.
    print("Augmented dataset created!")
else:
    print(f"Skipping augmentation - directory not found: {input_train}")

# CityFlow Dataset Preprocessing

In [None]:
# Input/output locations for CityFlow, whose split folders are named image_*.
# target_height / target_width are the values set in the Market-1501
# configuration cell.
DATASET_NAME = "CityFlow"
OUTPUT_DATASET_NAME = "CityFlow_preprocessed"

input_train = f"{DATA_ROOT}/{DATASET_NAME}/image_train"
input_test = f"{DATA_ROOT}/{DATASET_NAME}/image_test"
input_query = f"{DATA_ROOT}/{DATASET_NAME}/image_query"

output_train = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/image_train"
output_test = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/image_test"
output_query = f"{DATA_ROOT}/{OUTPUT_DATASET_NAME}/image_query"

print("="*60)
print(f"Preprocessing {DATASET_NAME} (Vehicle ReID)")
print("="*60)
print(f"Target size: {target_height}x{target_width}")
print()

for split, path in [("train", input_train), ("test", input_test), ("query", input_query)]:
    if os.path.exists(path):
        # Count the same extensions that preprocess_dataset processes
        # (*.jpg and *.png), so the pre-check matches what will be converted.
        num_files = sum(len(list(Path(path).glob(pattern))) for pattern in ('*.jpg', '*.png'))
        print(f"{split:6s}: {num_files:5d} images found")
    else:
        print(f"{split:6s}: NOT FOUND - {path}")

In [None]:
# Resize all available CityFlow splits (no augmentation), then copy the XML
# label files next to the preprocessed images.
import shutil

total_processed = 0

if os.path.exists(input_train):
    print("\n" + "="*60)
    print("Preprocessing Training Set")
    print("="*60)
    count = preprocess_dataset(input_train, output_train, target_height, target_width, False, 1, augmentor)
    total_processed += count

if os.path.exists(input_test):
    print("\n" + "="*60)
    print("Preprocessing Test Set")
    print("="*60)
    count = preprocess_dataset(input_test, output_test, target_height, target_width, False, 1, augmentor)
    total_processed += count

if os.path.exists(input_query):
    print("\n" + "="*60)
    print("Preprocessing Query Set")
    print("="*60)
    count = preprocess_dataset(input_query, output_query, target_height, target_width, False, 1, augmentor)
    total_processed += count

# Copy XML label files (required for CityFlow dataset)
print("\n" + "="*60)
print("Copying XML Label Files")
print("="*60)

source_dataset_root = Path(DATA_ROOT) / DATASET_NAME
output_dataset_root = Path(DATA_ROOT) / OUTPUT_DATASET_NAME

xml_files = ['train_label.xml', 'test_label.xml', 'query_label.xml']
copied_xml = 0

for xml_file in xml_files:
    source_xml = source_dataset_root / xml_file
    dest_xml = output_dataset_root / xml_file

    if source_xml.exists():
        # The output root only exists if an image split was processed above;
        # create it on demand so shutil.copy2 cannot fail on a missing
        # destination directory.
        output_dataset_root.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source_xml, dest_xml)
        print(f"Copied: {xml_file}")
        copied_xml += 1
    else:
        print(f"Not found: {xml_file}")

print(f"\nCopied {copied_xml} XML label files")

print("\n" + "="*60)
print("CityFlow Preprocessing Complete!")
print("="*60)
print(f"Total images processed: {total_processed}")
print(f"Total XML files copied: {copied_xml}")
print(f"Output directory: {DATA_ROOT}/{OUTPUT_DATASET_NAME}")

### CityFlow: Create Augmented Dataset

In [None]:
# Write the resized CityFlow training images plus augmented copies to a
# separate dataset directory and copy the XML label files alongside them.
# Imported here as well so the cell also runs without the CityFlow
# preprocessing cell having been executed first.
import shutil

OUTPUT_AUGMENTED = f"{DATA_ROOT}/CityFlow_augmented"
num_augmentations_per_image = 3

print("Creating augmented dataset for CityFlow...")
print(f"Each image will have {num_augmentations_per_image} augmented versions")
print()

# Re-declared here so the cell does not depend on which dataset cell ran last.
DATASET_NAME = "CityFlow"
input_train = f"{DATA_ROOT}/{DATASET_NAME}/image_train"

train_available = os.path.exists(input_train)

if train_available:
    print("Augmenting training set...")
    preprocess_dataset(
        input_train,
        f"{OUTPUT_AUGMENTED}/image_train",
        target_height=target_height,
        target_width=target_width,
        apply_augmentation=True,
        num_augmentations=num_augmentations_per_image,
        augmentor=augmentor
    )
else:
    print(f"Skipping augmentation - directory not found: {input_train}")

# Copy XML label files to augmented dataset
# NOTE(review): the label files are copied unchanged, while the augmented
# images are saved as <stem>_aug<i><suffix>. If the labels are keyed by file
# name, the augmented copies would have no entries - confirm how the CityFlow
# loader resolves labels.
print("\nCopying XML label files to augmented dataset...")
source_dataset_root = Path(DATA_ROOT) / DATASET_NAME
output_augmented_root = Path(OUTPUT_AUGMENTED)

xml_files = ['train_label.xml', 'test_label.xml', 'query_label.xml']
for xml_file in xml_files:
    source_xml = source_dataset_root / xml_file
    dest_xml = output_augmented_root / xml_file

    if source_xml.exists():
        # The augmented root is only created by the augmentation step above;
        # create it on demand so shutil.copy2 cannot fail on a missing
        # destination directory.
        output_augmented_root.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source_xml, dest_xml)
        print(f"Copied: {xml_file}")

# Report success only when the augmentation step actually ran.
if train_available:
    print("Augmented dataset created!")

# Summary

In [None]:
# Print, for every dataset, how many preprocessed images each split contains.
print("="*60)
print("PREPROCESSING SUMMARY")
print("="*60)
print()

# (display name, output directory name under DATA_ROOT, split sub-folders)
datasets = [
    ("Market-1501", "market1501_preprocessed", ["bounding_box_train", "bounding_box_test", "query"]),
    ("DukeMTMC-reID", "DukeMTMC-reID_preprocessed", ["bounding_box_train", "bounding_box_test", "query"]),
    ("VeRi-776", "VeRi-776_preprocessed", ["image_train", "image_test", "image_query"]),
    ("CityFlow", "CityFlow_preprocessed", ["image_train", "image_test", "image_query"])
]

for dataset_name, output_name, folders in datasets:
    dataset_path = Path(DATA_ROOT) / output_name

    if not dataset_path.exists():
        print(f"\n{dataset_name}: NOT PREPROCESSED")
        continue

    print(f"\n{dataset_name}:")
    print(f"  Location: {dataset_path}")

    total_images = 0
    for folder in folders:
        folder_path = dataset_path / folder
        if not folder_path.exists():
            continue

        jpg_count = len(list(folder_path.glob('*.jpg')))
        png_count = len(list(folder_path.glob('*.png')))
        num_images = jpg_count + png_count
        total_images += num_images

        # Strip the dataset-specific folder prefix to get the bare split name.
        split_name = folder.replace("bounding_box_", "")
        split_name = split_name.replace("image_", "")
        print(f"    {split_name:10s}: {num_images:6d} images")

    print(f"  {'TOTAL':>12s}: {total_images:6d} images")

print("\n" + "="*60)
print(f"All preprocessed datasets are saved in: {DATA_ROOT}")
print("="*60)