In [4]:
import os
import rasterio
from rasterio.windows import Window
from tqdm import tqdm

In [5]:
pwd

'/home/airg/rabedi/thesis/diffusion'

In [6]:
input_root = "/home/airg/data/imagery/planet/tiles/"
output_root = "./pretrain_data_folder"
chip_size = 224
years_to_process = ["2022"]

In [7]:
def rechip_tile(input_path, output_folder):
    """
    Split a large TIFF into 224x224 chips and save them to output_folder.
    Skip chips that already exist.
    """
    basename = os.path.basename(input_path).replace('.tif', '')
    with rasterio.open(input_path) as src:
        width, height = src.width, src.height
        count = 0
        for row in range(0, height, chip_size):
            for col in range(0, width, chip_size):
                if row + chip_size > height or col + chip_size > width:
                    continue  # Skip edge chips

                chip_name = f"{basename}_{count}_cog.tif"
                chip_path = os.path.join(output_folder, chip_name)

                if os.path.exists(chip_path):
                    count += 1
                    continue  # Skip already processed chip

                window = Window(col, row, chip_size, chip_size)
                transform = src.window_transform(window)
                chip = src.read(window=window)

                profile = src.profile.copy()
                profile.update({
                    "height": chip_size,
                    "width": chip_size,
                    "transform": transform
                })

                os.makedirs(output_folder, exist_ok=True)

                with rasterio.open(chip_path, 'w', **profile) as dst:
                    dst.write(chip)

                count += 1


def process_selected_years(input_root, output_root, years):
    """
    Process a list of years, rechipping all .tif files in each year's folder.
    Skips missing folders automatically.
    """
    for year in years:
        year_path = os.path.join(input_root, str(year))
        if not os.path.isdir(year_path):
            print(f"Skipping missing year folder: {year_path}")
            continue

        out_year_path = os.path.join(output_root, str(year))
        os.makedirs(out_year_path, exist_ok=True)

        tif_files = [f for f in os.listdir(year_path) if f.endswith('.tif')]
        for tif in tqdm(tif_files, desc=f"Processing year {year}"):
            tif_path = os.path.join(year_path, tif)
            rechip_tile(tif_path, out_year_path)


def process_all_years(input_root, output_root):
    """
    Automatically find and process all subfolders (representing years) in input_root.
    """
    year_folders = [
        f for f in os.listdir(input_root)
        if os.path.isdir(os.path.join(input_root, f))
    ]
    process_selected_years(input_root, output_root, year_folders)


In [9]:
process_selected_years(input_root, output_root, years_to_process)

Processing year 2022:   9%|█▎             | 5005/55989 [00:30<05:11, 163.93it/s]


RasterioIOError: Read or write failed. /home/airg/data/imagery/planet/tiles/2022/tile752256_2022-09_buf179_cog.tif, band 1: IReadBlock failed at X offset 1, Y offset 2: TIFFReadEncodedTile() failed.