In [3]:
images_to_save = "/home/pablo.canosa/ssd/datasets_pablo/OpenEarthMap/OpenEarthMap/OpenEarthMap_wo_xBD/xbd_files.csv"
# Print the first 5 lines of the file
with open(images_to_save, 'r') as file:
    for _ in range(5):
        print(file.readline().strip())


pinery-bushfire_00000022_pre_disaster.png,adelaide_1.tif
pinery-bushfire_00000029_pre_disaster.png,adelaide_2.tif
pinery-bushfire_00000062_pre_disaster.png,adelaide_3.tif
pinery-bushfire_00000089_pre_disaster.png,adelaide_4.tif
pinery-bushfire_00000102_pre_disaster.png,adelaide_5.tif


In [1]:
import os
import shutil
import csv
from tqdm import tqdm  # specific for notebook progress bars

# ================= CONFIGURATION =================
# Path to the root OpenEarthMap directory containing the region folders (e.g., adelaide, tokyo)
BASE_DIR = '/home/pablo.canosa/ssd/datasets_pablo/OpenEarthMap/OpenEarthMap/OpenEarthMap_wo_xBD' 

# Paths to the folders where you downloaded/extracted the xBD images (tier1 and tier3 usually)
# The script will look for images in these folders sequentially.
XBD_SOURCE_DIRS = [
    '/home/pablo.canosa/ssd/datasets_pablo/OpenEarthMap/xview_full/geotiffs/hold/images',
    '/home/pablo.canosa/ssd/datasets_pablo/OpenEarthMap/xview_full/geotiffs/test/images',
    '/home/pablo.canosa/ssd/datasets_pablo/OpenEarthMap/xview_full/geotiffs/tier1/images',
    '/home/pablo.canosa/ssd/datasets_pablo/OpenEarthMap/xview_full/geotiffs/tier3/images'
]

# Path to the CSV file
CSV_PATH = os.path.join(BASE_DIR, 'xbd_files.csv')
# =================================================

def organize_xbd_images():
    # 1. Verify CSV exists
    if not os.path.exists(CSV_PATH):
        print(f"Error: CSV file not found at {CSV_PATH}")
        return

    print("Reading CSV and mapping files...")
    
    files_moved = 0
    files_missing = 0
    
    # Read the CSV file
    # Format: source_filename.png, target_filename.tif
    mappings = []
    with open(CSV_PATH, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) >= 2:
                mappings.append((row[0].strip(), row[1].strip()))

    print(f"Found {len(mappings)} file mappings. Starting copy process...")

    # Iterate with a progress bar
    for src_filename, target_filename in tqdm(mappings, desc="Processing Images"):
        
        # 2. Determine the Region Name
        # Logic: "adelaide_1.tif" -> region is "adelaide"
        # Logic: "santa_rosa_5.tif" -> region is "santa_rosa"
        # We split by the *last* underscore to separate region from ID
        file_base = os.path.splitext(target_filename)[0] # remove .tif
        if '_' in file_base:
            region_name = file_base.rsplit('_', 1)[0]
        else:
            # Fallback if no underscore (unlikely based on dataset specs)
            region_name = file_base

        # 3. Construct Destination Path
        # Structure: region/images/target_filename
        dest_folder = os.path.join(BASE_DIR, region_name, 'images')
        dest_path = os.path.join(dest_folder, target_filename)

        # Ensure destination folder exists (it should, based on dataset structure)
        if not os.path.exists(dest_folder):
            # Optional: Create it if it doesn't exist, or skip. 
            # Uncomment the next line to auto-create folders
            os.makedirs(dest_folder, exist_ok=True) 
            pass

        # 4. Find the Source Image
        found_source = False
        source_full_path = ""
        
        for source_dir in XBD_SOURCE_DIRS:
            potential_path = os.path.join(source_dir, src_filename)
            if os.path.exists(potential_path):
                source_full_path = potential_path
                found_source = True
                break
        
        if found_source:
            # 5. Copy the file
            try:
                #shutil.copy2(source_full_path, dest_path)
                files_moved += 1
            except Exception as e:
                print(f"\nError copying {src_filename}: {e}")
        else:
            # Log missing files (useful for debugging downloads)
            # print(f"\nMissing source file: {src_filename}") # Uncomment to see every missing file
            files_missing += 1

    print("-" * 30)
    print("Processing Complete.")
    print(f"Successfully copied: {files_moved}")
    print(f"Files not found:     {files_missing}")
    if files_missing > 0:
        print("Tip: Check if your XBD_SOURCE_DIRS cover all downloaded xBD subfolders (tier1, tier3, etc).")


In [2]:
organize_xbd_images()

Reading CSV and mapping files...
Found 1162 file mappings. Starting copy process...


Processing Images: 100%|██████████| 1162/1162 [00:00<00:00, 37121.98it/s]

------------------------------
Processing Complete.
Successfully copied: 0
Files not found:     1162
Tip: Check if your XBD_SOURCE_DIRS cover all downloaded xBD subfolders (tier1, tier3, etc).



