# Prepare images for annotation

In [1]:
import os

import numpy as np

import pandas as pd
import geopandas as gpd

from PIL import Image

from tqdm import tqdm

In [2]:
# Silence Pillow, as we do expect 100000000 pixels; this is not a decompression bomb!
Image.MAX_IMAGE_PIXELS = 10000**2

In [3]:
# Load the whole rooftop dataset
path = "../data/solkat/SOLKAT_DACH.gpkg"
gdf = gpd.read_file(
    path,
    layer="SOLKAT_CH_DACH",
    columns=[],
    engine="pyogrio",
    use_arrow=True,
)

# Simplify as points
centroids = gdf.centroid

# We will slice tiles into 10x10 sub-chunks (i.e. 100x100 meters)
gdf["i"] = (centroids.x / 1000).astype(int)
gdf["j"] = (centroids.y / 1000).astype(int)
gdf["u"] = (centroids.x / 100).astype(int) % 10
gdf["v"] = (centroids.y / 100).astype(int) % 10
gdf

Unnamed: 0,geometry,i,j,u,v
0,"MULTIPOLYGON (((2676475.451 1254000.615, 26764...",2676,1253,4,9
1,"MULTIPOLYGON (((2676475.043 1253999.79, 267647...",2676,1254,4,0
2,"MULTIPOLYGON (((2676475.043 1253999.79, 267646...",2676,1254,4,0
3,"MULTIPOLYGON (((2676552.815 1253992.39, 267655...",2676,1253,5,9
4,"MULTIPOLYGON (((2676556.556 1254000.741, 26765...",2676,1253,5,9
...,...,...,...,...,...
10071750,"MULTIPOLYGON (((2514423.645 1172206.735, 25143...",2514,1172,4,2
10071751,"MULTIPOLYGON (((2514398.35 1172208.981, 251439...",2514,1172,3,2
10071752,"MULTIPOLYGON (((2513394.318 1172545.786, 25133...",2513,1172,3,5
10071753,"MULTIPOLYGON (((2513395.31 1172544.195, 251339...",2513,1172,3,5


In [4]:
# Assuming some images have been downloaded
input_folder = "../data/swissimage/"
input_df = pd.DataFrame()
input_df["path"] = [os.path.join(input_folder, name) for name in os.listdir(input_folder) if name.endswith(".tif")]
ij_df = input_df["path"].str.extract(r"swissimage-dop10_(?P<year>\d+)_(?P<i>\d+)-(?P<j>\d+)").astype(int)
input_df["year"] = ij_df["year"]
input_df["i"] = ij_df["i"]
input_df["j"] = ij_df["j"]
input_df

Unnamed: 0,path,year,i,j
0,../data/swissimage/swissimage-dop10_2022_2756-...,2022,2756,1231
1,../data/swissimage/swissimage-dop10_2022_2687-...,2022,2687,1260
2,../data/swissimage/swissimage-dop10_2022_2752-...,2022,2752,1212
3,../data/swissimage/swissimage-dop10_2022_2703-...,2022,2703,1244
4,../data/swissimage/swissimage-dop10_2022_2677-...,2022,2677,1283
5,../data/swissimage/swissimage-dop10_2022_2722-...,2022,2722,1272
6,../data/swissimage/swissimage-dop10_2023_2580-...,2023,2580,1093
7,../data/swissimage/swissimage-dop10_2022_2716-...,2022,2716,1234
8,../data/swissimage/swissimage-dop10_2022_2754-...,2022,2754,1215
9,../data/swissimage/swissimage-dop10_2022_2681-...,2022,2681,1253


In [5]:
# Generate samples
output_folder = "../data/sample/"
num_patch_per_tile = 10
for _, row in tqdm(input_df.iterrows(), total=len(input_df)):
    sub_gdf = gdf[(gdf["i"] == row["i"]) & (gdf["j"] == row["j"])]

    # Load full tile
    image = Image.open(row["path"])
    assert image.mode == "RGB"
    assert image.size == (10000, 10000)
    image.load()

    # Select chunk coordinates
    # Note: we only take the most "populated" chunks in each tile
    uvs = sub_gdf[["u", "v"]].value_counts().iloc[:num_patch_per_tile].index

    # Generate chunks
    for u, v in uvs:
        sub_name = f"swissimage-dop10_{row['year']}_{row['i']}.{u}-{row['j']}.{v}.jpg"
        sub_path = os.path.join(output_folder, sub_name)
        sub_image = image.crop((u * 1000, (10 - v - 1) * 1000, (u + 1) * 1000, (10 - v) * 1000))
        sub_image.save(sub_path)

100%|██████████| 25/25 [00:30<00:00,  1.21s/it]
