Image preprocessing for the Potsdam image and mask dataset, available from [LUH Download](https://www.isprs.org/education/benchmarks/UrbanSemLab/Default.aspx). <br> Author: Kiunke

In [36]:
# Global Imports
import os
import zipfile
import shutil
from PIL import Image
from tqdm.notebook import tqdm





In [33]:
# Extract Potsdam dataset and only keep Labels and RGB Images

# Step 1: unpack the top-level archive into the current directory.
with zipfile.ZipFile('Potsdam.zip', 'r') as zip_ref:
    zip_ref.extractall('./')

# Step 2: delete the bundled archives that are not used by this notebook.
# os.remove raises FileNotFoundError if one of them is not present.
for unused_archive in (
    'Potsdam/1_DSM.rar',
    'Potsdam/1_DSM_normalisation.zip',
    'Potsdam/3_Ortho_IRRG.zip',
    'Potsdam/4_Ortho_RGBIR.zip',
    'Potsdam/5_Labels_all_noBoundary.zip',
    'Potsdam/5_Labels_for_participants.zip',
    'Potsdam/5_Labels_for_participants_no_Boundary.zip',
    'Potsdam/assess_classification_reference_implementation.tgz',
):
    os.remove(unused_archive)

# Step 3: unpack the two nested archives that are kept.
# The RGB archive is extracted straight into 'Potsdam/' (presumably it carries
# its own top-level folder -- confirm), the labels get an explicit target folder.
for archive, target in (
    ('Potsdam/2_Ortho_RGB.zip', 'Potsdam/'),
    ('Potsdam/5_Labels_all.zip', 'Potsdam/5_Labels_all'),
):
    with zipfile.ZipFile(archive, 'r') as zip_ref:
        zip_ref.extractall(target)

In [34]:
def mask_to_binary(path):
    """Convert every ``.tif`` image in ``path`` to a black/white mask, in place.

    Pixels whose RGB value is exactly ``(0, 255, 0)`` become white
    ``(255, 255, 255)``; every other pixel becomes black ``(0, 0, 0)``.
    Each file is overwritten with the result, saved as TIFF.

    The colour test is done with whole-image channel operations instead of a
    Python loop over individual pixels, which is prohibitively slow on large
    tiles.

    NOTE: the conversion is destructive and not idempotent -- a second run on
    the same folder finds no ``(0, 255, 0)`` pixels left and turns every mask
    completely black.
    NOTE(review): ``(0, 255, 0)`` is presumably the "tree" class of the ISPRS
    colour coding -- confirm against the dataset description.
    NOTE(review): the mask is written as a freshly built RGB image, so
    ancillary metadata of the source TIFF is not carried over.

    Args:
        path: Directory containing the ``.tif`` files to convert.
    """
    # Local import so this cell works even if the import cell was not re-run;
    # Pillow is already a dependency of this notebook.
    from PIL import ImageChops

    files = [f for f in os.listdir(path) if f.endswith('.tif')]

    for file in tqdm(files, desc='Processing images'):
        file_path = os.path.join(path, file)

        # Read all pixel data and release the file handle before the same
        # path is overwritten below.
        with Image.open(file_path) as source:
            # Normalise to 3-channel RGB so the colour test also works for
            # inputs stored with an alpha channel or a palette.
            rgb = source.convert('RGB')

        red, green, blue = rgb.split()
        # Per-channel lookup: 255 where the channel matches the target colour.
        red_ok = red.point(lambda v: 255 if v == 0 else 0)
        green_ok = green.point(lambda v: 255 if v == 255 else 0)
        blue_ok = blue.point(lambda v: 255 if v == 0 else 0)
        # The pixel-wise minimum of 0/255 masks acts as a logical AND.
        target = ImageChops.darker(ImageChops.darker(red_ok, green_ok), blue_ok)

        binary = Image.merge('RGB', (target, target, target))
        binary.save(file_path, format='TIFF')
        print(f'Converted and saved image to: {file_path}')
        
        
# Binarise the extracted label images; every .tif in the folder is overwritten in place.
mask_to_binary('Potsdam/5_Labels_all')

Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_7_13_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_6_13_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_6_7_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_5_14_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_7_9_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_5_11_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_4_13_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_6_11_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_2_13_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_4_12_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_7_10_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_6_10_label.tif
Converted and saved image to:Potsdam/5_Labels_all/top_potsdam_3_11_label.tif
C

In [35]:
def cut_tif_into_patches_with_hard_boundary(i, d, patch_size):
    """Cut every ``.tif`` image in ``i`` into non-overlapping square patches.

    Patches are laid out on a regular grid starting at the top-left corner.
    Pixels beyond the last full patch on the right and bottom edges are
    discarded. Each patch is saved to ``d`` under the source file name with
    the grid indices inserted before the last ``_``-separated component, e.g.
    ``name_tag.tif`` -> ``name_{x}_{y}_tag.tif``. A name without any ``_``
    becomes ``name_{x}_{y}.tif``.

    Args:
        i: Directory containing the source ``.tif`` images.
        d: Output directory; created if it does not exist.
        patch_size: Edge length of a patch in pixels; must be positive.

    Raises:
        ValueError: If ``patch_size`` is not positive.
    """
    # Validate before touching the file system so a bad call has no side effects.
    if patch_size <= 0:
        raise ValueError(f'patch_size must be positive, got {patch_size}')

    os.makedirs(d, exist_ok=True)

    files = [f for f in os.listdir(i) if f.endswith('.tif')]
    for file in tqdm(files, desc='Processing images'):
        # The name pieces are constant per file, so split once outside the patch loops.
        base, suffix = os.path.splitext(file)
        head, sep, tail = base.rpartition('_')
        if sep:
            prefix, ending = head, f'_{tail}{suffix}'
        else:
            # No underscore in the name: append the grid indices to the whole stem.
            prefix, ending = base, suffix

        # The context manager guarantees the source file is closed, even when
        # the image is smaller than one patch and is therefore never read.
        with Image.open(os.path.join(i, file)) as img:
            width, height = img.size
            for x in range(width // patch_size):
                left = x * patch_size
                for y in range(height // patch_size):
                    upper = y * patch_size
                    patch = img.crop((left, upper, left + patch_size, upper + patch_size))
                    patch.save(os.path.join(d, f'{prefix}_{x}_{y}{ending}'))

def cut_tif_into_patches_with_overlap(i, d, patch_size, overlap=128):
    """Cut every ``.tif`` image in ``i`` into square patches that overlap.

    Consecutive patches are offset by ``patch_size - overlap`` pixels in both
    directions, so neighbouring patches share ``overlap`` pixels. Only patches
    that fit completely inside the image are produced; the remainder on the
    right and bottom edges is discarded. Each patch is saved to ``d`` under
    the source file name with the grid indices inserted before the last
    ``_``-separated component, e.g. ``name_tag.tif`` -> ``name_{x}_{y}_tag.tif``.
    A name without any ``_`` becomes ``name_{x}_{y}.tif``.

    Args:
        i: Directory containing the source ``.tif`` images.
        d: Output directory; created if it does not exist.
        patch_size: Edge length of a patch in pixels; must be positive.
        overlap: Number of pixels shared by adjacent patches; must be smaller
            than ``patch_size``.

    Raises:
        ValueError: If ``patch_size`` is not positive or ``overlap`` is not
            smaller than ``patch_size``.
    """
    # Validate before touching the file system so a bad call has no side effects.
    if patch_size <= 0:
        raise ValueError(f'patch_size must be positive, got {patch_size}')
    step = patch_size - overlap  # Step size is reduced by the overlap
    if step <= 0:
        # A zero step would divide by zero, a negative one would silently yield no patches.
        raise ValueError(
            f'overlap ({overlap}) must be smaller than patch_size ({patch_size})'
        )

    os.makedirs(d, exist_ok=True)

    files = [f for f in os.listdir(i) if f.endswith('.tif')]
    for file in tqdm(files, desc='Processing images'):
        # The name pieces are constant per file, so split once outside the patch loops.
        base, suffix = os.path.splitext(file)
        head, sep, tail = base.rpartition('_')
        if sep:
            prefix, ending = head, f'_{tail}{suffix}'
        else:
            # No underscore in the name: append the grid indices to the whole stem.
            prefix, ending = base, suffix

        # The context manager guarantees the source file is closed, even when
        # the image is smaller than one patch and is therefore never read.
        with Image.open(os.path.join(i, file)) as img:
            width, height = img.size
            # Number of start positions whose patch still ends inside the image.
            x_patches = (width - overlap) // step
            y_patches = (height - overlap) // step

            for x in range(x_patches):
                left = x * step
                for y in range(y_patches):
                    upper = y * step
                    patch = img.crop((left, upper, left + patch_size, upper + patch_size))
                    patch.save(os.path.join(d, f'{prefix}_{x}_{y}{ending}'))
                
                
# Tile the label images and the RGB images into non-overlapping 256x256 patches.
# Both calls use the same patch size, so a label patch and an RGB patch cut
# from same-sized sources share the same {x}_{y} grid indices in their names.
cut_tif_into_patches_with_hard_boundary('Potsdam/5_Labels_all', 'Potsdam/5_Labels_all_Patched', 256)
cut_tif_into_patches_with_hard_boundary('Potsdam/2_Ortho_RGB', 'Potsdam/2_Ortho_RGB_Patched', 256)
  
    

In [37]:
# Group Labels and RGB Images together
# Every RGB patch is copied, together with its label patch, into its own
# sub-folder of the output folder.
images_folder = 'Potsdam/2_Ortho_RGB_Patched'
labels_folder = 'Potsdam/5_Labels_all_Patched'
output_folder = 'Potsdam/Train'

# exist_ok keeps the cell re-runnable after a partial or complete earlier run.
os.makedirs(output_folder, exist_ok=True)

images = sorted(f for f in os.listdir(images_folder) if f.endswith('RGB.tif'))
labels = sorted(f for f in os.listdir(labels_folder) if f.endswith('label.tif'))
print(len(images))
print(len(labels))

# Fail before copying anything if an RGB patch has no label patch, instead of
# stopping halfway through with a partially filled output folder.
available_labels = set(labels)
missing_labels = [
    name for name in images
    if name.replace('_RGB.tif', '_label.tif') not in available_labels
]
if missing_labels:
    raise FileNotFoundError(
        f'{len(missing_labels)} image patch(es) have no matching label in '
        f'{labels_folder}, e.g. {missing_labels[:5]}'
    )

for image in tqdm(images, desc='Processing images'):
    # NOTE(review): "op_potsdam_" leaves the leading "t" of a "top_potsdam_"
    # prefix in place (e.g. 'top_potsdam_7_13_0_0_RGB.tif' -> 't7_13_0_0').
    # Possibly a typo for "top_potsdam_"; kept unchanged because code outside
    # this notebook may rely on the resulting folder names -- confirm.
    folder_name = image.replace("op_potsdam_", "").replace("_RGB.tif", "")
    new_folder_path = os.path.join(output_folder, folder_name)
    # exist_ok: a second run of this cell must not abort on existing folders.
    os.makedirs(new_folder_path, exist_ok=True)
    label_file = image.replace('_RGB.tif', "_label.tif")
    shutil.copy(os.path.join(images_folder, image), os.path.join(new_folder_path, image))
    shutil.copy(os.path.join(labels_folder, label_file), os.path.join(new_folder_path, label_file))
    

20102
20102


Processing images:   0%|          | 0/20102 [00:00<?, ?it/s]