# THE FOLDER

In [1]:
# Root working directory of the project; later cells build their output
# locations (tile folder, training-chip folder) underneath it.
THEFOLDER = "C:\\Users\\smdur\\OneDrive\\Desktop\\GlobalPCL23"

# Tile the western CONUS PCL raster

In [2]:
import os
from itertools import product
import rasterio as rio
from rasterio import windows

# Folder and file name of the source raster that gets split into tiles.
in_path = "C:\\Users\\smdur\\OneDrive\\Desktop\\PCLCONUS\\Input\\PCL\\"
input_filename = 'pcl_west_wgs.tif'

# Destination folder and file-name template for the tiles; the two `{}`
# placeholders receive the tile's column and row pixel offsets.
out_path = f"{THEFOLDER}\\PCLTILES\\"
output_filename = 'pcltile_{}-{}.tif'

# Nominal tile size in pixels; tiles are clipped to the raster extent, so the
# ones on the right/bottom edge can be smaller.
widthtile = 5000
heighttile = 5000

def get_tiles(ds, width=widthtile, height=heighttile):
    """Yield a ``(window, transform)`` pair for every tile of an open raster.

    The raster is cut into a grid of ``width`` x ``height`` pixel cells. Each
    cell is clipped to the raster extent before it is yielded, together with
    the affine transform that georeferences the clipped window.

    Args:
        ds: Open raster dataset exposing ``meta`` and ``transform``.
        width: Nominal tile width in pixels.
        height: Nominal tile height in pixels.
    """
    raster_cols = ds.meta['width']
    raster_rows = ds.meta['height']
    full_extent = windows.Window(col_off=0, row_off=0, width=raster_cols, height=raster_rows)
    col_starts = range(0, raster_cols, width)
    row_starts = range(0, raster_rows, height)
    for left, top in product(col_starts, row_starts):
        tile = windows.Window(col_off=left, row_off=top, width=width, height=height)
        # Edge tiles would otherwise extend past the raster.
        clipped = tile.intersection(full_extent)
        yield clipped, windows.transform(clipped, ds.transform)

os.makedirs(out_path, exist_ok=True)

tile_numbers = []

with rio.open(os.path.join(in_path, input_filename)) as inds:
    nodata = inds.nodata
    meta = inds.meta.copy()
    for window, transform in get_tiles(inds):
        data = inds.read(window=window)
        # Skip tiles that hold nothing but nodata. When the source declares no
        # nodata value a tile can never be identified as empty, so it is
        # written rather than silently dropped.
        if nodata is not None and (data == nodata).all():
            continue
        # Re-use the source metadata, adjusted to this tile's georeferencing
        # and size.
        meta['transform'] = transform
        meta['width'], meta['height'] = window.width, window.height
        tile_number = f"{int(window.col_off)}-{int(window.row_off)}"
        tile_numbers.append(tile_number)
        outpath = os.path.join(out_path, output_filename.format(int(window.col_off), int(window.row_off)))
        with rio.open(outpath, 'w', **meta) as outds:
            outds.write(data)

# Identifiers ("<col_off>-<row_off>") of every tile written above.
TILENUMBER = tile_numbers

# Drop this cell's temporaries from the notebook namespace.
del in_path, input_filename, tile_numbers
del out_path, output_filename, widthtile, heighttile
del meta, nodata, window, inds, get_tiles

# Download training data and create training samples

In [3]:
# import os
# import glob
# import subprocess
# from concurrent.futures import ThreadPoolExecutor
# from osgeo import gdal
# import rioxarray
# import planetary_computer
# from pystac_client import Client
# import osmnx as ox
# import rasterio
# from rasterio.features import rasterize
# from rasterio.windows import from_bounds, Window
# import numpy as np
# import scipy.ndimage
# from shapely.geometry import box, Point
# from geopandas import GeoDataFrame
# import matplotlib.pyplot as plt
# from rasterio.plot import show
# import json

# TILENUMBER = ['75000-35000', '75000-40000', '75000-45000']
# CHIP_SIZE = 128
# CHECKPOINT_FILE = r"C:\Users\smdur\OneDrive\Desktop\PCLTraining3\checkpoint.json"

# def load_checkpoint():
#     if os.path.exists(CHECKPOINT_FILE):
#         with open(CHECKPOINT_FILE, 'r') as f:
#             return json.load(f)
#     return {}

# def save_checkpoint(tile_number, step, data=None):
#     checkpoint = load_checkpoint()
#     if tile_number not in checkpoint:
#         checkpoint[tile_number] = {}
#     checkpoint[tile_number]['step'] = step
#     if data:
#         checkpoint[tile_number].update(data)
#     with open(CHECKPOINT_FILE, 'w') as f:
#         json.dump(checkpoint, f)

# def delete_non_resampled_files(resampled_files, tif_dir):
#     for file in os.listdir(tif_dir):
#         if file not in resampled_files and file.endswith('.tif'):
#             file_path = os.path.join(tif_dir, file)
#             try:
#                 os.remove(file_path)
#                 print(f"Deleted: {file_path}")
#             except Exception as e:
#                 print(f"Failed to delete {file_path}: {e}")

# def process_dem(tif_path, tif_dir, tile_number):
#     tif_data = rioxarray.open_rasterio(tif_path)
#     bbox_of_interest = tif_data.rio.bounds()
#     catalog = Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
#     search = catalog.search(collections=["cop-dem-glo-30"], bbox=bbox_of_interest)
#     items = list(search.get_items())
    
#     def process_item(item, idx):
#         signed_asset = planetary_computer.sign(item.assets["data"])
#         data = rioxarray.open_rasterio(signed_asset.href).squeeze().drop("band")
#         data.rio.write_crs("EPSG:4326", inplace=True)
#         output_tif_path = os.path.join(tif_dir, f"output_dataDEM_{idx}.tif")
#         data.rio.to_raster(output_tif_path)
    
#     with ThreadPoolExecutor(max_workers=4) as executor:
#         for i, item in enumerate(items):
#             executor.submit(process_item, item, i)

#     output_tif = os.path.join(tif_dir, f"outputtile_DEM_{tile_number}.tif")
#     merge_command = [
#         "python", "C:\\Users\\smdur\\anaconda3\\envs\\globalpcl\\Scripts\\gdal_merge.py",
#         "--config", "CHECK_DISK_FREE_SPACE", "FALSE",
#         "-o", output_tif,
#         "-n", "-9999", "-a_nodata", "-9999"] + glob.glob(os.path.join(tif_dir, "output_dataDEM_*.tif"))

#     process_hag = subprocess.run(merge_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

#     if process_hag.returncode != 0:
#         print(f"Error in merging DEM: {process_hag.stderr}")
#         return None

#     src_ds = gdal.Open(output_tif, gdal.GA_ReadOnly)
#     target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
#     driver = gdal.GetDriverByName('GTiff')
#     output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataDEM_{tile_number}.tif")
#     out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
#     out_ds.SetGeoTransform(target_ds.GetGeoTransform())
#     out_ds.SetProjection(target_ds.GetProjection())
#     gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
#     src_ds, target_ds, out_ds = None, None, None

#     os.remove(output_tif)
#     for tif in glob.glob(os.path.join(tif_dir, "output_dataDEM_*.tif")):
#         try:
#             os.remove(tif)
#         except Exception as e:
#             print(f"Failed to delete {tif}: {e}")

#     return output_resampled_path

#     del tif_data, bbox_of_interest, catalog, search, items
#     del output_tif, merge_command, process_hag
#     del src_ds, target_ds, driver, output_resampled_path


# def process_lidar(tif_path, tif_dir, tile_number):
#     lidar_dir = r"C:\Users\smdur\OneDrive\Desktop\GlobalData\LIDAR2"
#     lidar_tifs = glob.glob(os.path.join(lidar_dir, "*.tif"))

#     # Get the bounding box of the input tif_path
#     with rasterio.open(tif_path) as src:
#         bbox = src.bounds
#         input_geom = box(bbox.left, bbox.bottom, bbox.right, bbox.top)

#     # Find overlapping LIDAR tiles
#     overlapping_tifs = []
#     for tif in lidar_tifs:
#         with rasterio.open(tif) as src:
#             lidar_bbox = src.bounds
#             lidar_geom = box(lidar_bbox.left, lidar_bbox.bottom, lidar_bbox.right, lidar_bbox.top)
#             if input_geom.intersects(lidar_geom):
#                 overlapping_tifs.append(tif)

#     if not overlapping_tifs:
#         print(f"No overlapping LIDAR tiles found for {tile_number}")
#         return None

#     output_tif = os.path.join(tif_dir, f"outputtile_lidar_{tile_number}.tif")
#     merge_command = [
#         "python", "C:\\Users\\smdur\\anaconda3\\envs\\globalpcl\\Scripts\\gdal_merge.py",
#         "--config", "CHECK_DISK_FREE_SPACE", "FALSE",
#         "-o", output_tif,
#         "-n", "255", "-a_nodata", "255"] + overlapping_tifs

#     process_hag = subprocess.run(merge_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

#     if process_hag.returncode != 0:
#         print(f"Error in merging LIDAR: {process_hag.stderr}")
#         return None

#     src_ds = gdal.Open(output_tif, gdal.GA_ReadOnly)
#     target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
#     driver = gdal.GetDriverByName('GTiff')
#     output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataLIDAR_{tile_number}.tif")
#     out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
#     out_ds.SetGeoTransform(target_ds.GetGeoTransform())
#     out_ds.SetProjection(target_ds.GetProjection())
#     gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
#     src_ds, target_ds, out_ds = None, None, None

#     os.remove(output_tif)
#     return output_resampled_path

#     del lidar_tifs, bbox, input_geom, overlapping_tifs, lidar_bbox, lidar_geom
#     del output_tif, merge_command, process_hag
#     del src_ds, target_ds, driver, output_resampled_path


# def process_rivers(tif_path, tif_dir, tile_number):
#     dem_data = rioxarray.open_rasterio(tif_path)
#     bbox = dem_data.rio.bounds()
#     custom_filter = '["waterway"~"river"]'
#     graph = ox.graph_from_bbox(bbox[3], bbox[1], bbox[2], bbox[0], custom_filter=custom_filter, simplify=True, retain_all=True, truncate_by_edge=True)
#     gdf = ox.graph_to_gdfs(graph, nodes=False)

#     with rasterio.open(tif_path) as src:
#         window = from_bounds(*src.bounds, src.transform)
#         transform = rasterio.windows.transform(window, src.transform)
#         raster = np.zeros((int(window.height), int(window.width)), dtype=np.uint8)
#         shapes = ((geom, 1) for geom in gdf['geometry'])
#         burned = rasterize(shapes, out=raster, fill=0, transform=transform, all_touched=True)
#         distance_grid = scipy.ndimage.distance_transform_edt(burned == 0)
#         decay_grid = np.exp(-0.07 * distance_grid)

#         clipped_meta = src.meta.copy()
#         clipped_meta.update({"driver": "GTiff", "height": int(window.height), "width": int(window.width), "transform": transform, "dtype": rasterio.float32, "count": 1, "compress": 'lzw'})
#         output_path = os.path.join(tif_dir, f'exponential_decay_CO_river_{tile_number}.tif')
#         with rasterio.open(output_path, 'w', **clipped_meta) as dst:
#             dst.write(decay_grid.astype(np.float32), 1)

#     src_ds = gdal.Open(output_path, gdal.GA_ReadOnly)
#     target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
#     driver = gdal.GetDriverByName('GTiff')
#     output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataRivers_{tile_number}.tif")
#     out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
#     out_ds.SetGeoTransform(target_ds.GetGeoTransform())
#     out_ds.SetProjection(target_ds.GetProjection())
#     gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
#     src_ds, target_ds, out_ds = None, None, None
#     os.remove(output_path)

#     return output_resampled_path

#     del dem_data, bbox, custom_filter, graph, gdf
#     del window, transform, raster, shapes, burned, distance_grid, decay_grid
#     del clipped_meta, output_path
#     del src_ds, target_ds, driver, output_resampled_path


# def process_roads(tif_path, tif_dir, tile_number):
#     extent_data = rioxarray.open_rasterio(tif_path)
#     bbox = extent_data.rio.bounds()
#     graph = ox.graph_from_bbox(bbox[3], bbox[1], bbox[2], bbox[0], network_type='drive', simplify=True)
#     gdf = ox.graph_to_gdfs(graph, nodes=False)

#     with rasterio.open(tif_path) as src:
#         window = from_bounds(*src.bounds, src.transform)
#         transform = rasterio.windows.transform(window, src.transform)
#         raster = np.zeros((int(window.height), int(window.width)), dtype=np.uint8)
#         shapes = ((geom, 1) for geom in gdf['geometry'])
#         burned = rasterize(shapes, out=raster, fill=0, transform=transform, all_touched=True)
#         distance_grid = scipy.ndimage.distance_transform_edt(burned == 0)
#         decay_grid = np.exp(-0.07 * distance_grid)

#         clipped_meta = src.meta.copy()
#         clipped_meta.update({"driver": "GTiff", "height": int(window.height), "width": int(window.width), "transform": transform, "dtype": rasterio.float32, "count": 1, "compress": 'lzw'})
#         output_path = os.path.join(tif_dir, f'exponential_decay_CO_roads_{tile_number}.tif')
#         with rasterio.open(output_path, 'w', **clipped_meta) as dst:
#             dst.write(decay_grid.astype(np.float32), 1)

#     src_ds = gdal.Open(output_path, gdal.GA_ReadOnly)
#     target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
#     driver = gdal.GetDriverByName('GTiff')
#     output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataRoads_{tile_number}.tif")
#     out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
#     out_ds.SetGeoTransform(target_ds.GetGeoTransform())
#     out_ds.SetProjection(target_ds.GetProjection())
#     gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
#     src_ds, target_ds, out_ds = None, None, None
#     os.remove(output_path)

#     return output_resampled_path

#     del extent_data, bbox, graph, gdf
#     del window, transform, raster, shapes, burned, distance_grid, decay_grid
#     del clipped_meta, output_path
#     del src_ds, target_ds, driver, output_resampled_path


# def generate_random_points(geometry, num_points):
#     points = []
#     min_x, min_y, max_x, max_y = geometry.bounds
#     while len(points) < num_points:
#         random_point = Point(np.random.uniform(min_x, max_x), np.random.uniform(min_y, max_y))
#         if random_point.within(geometry):
#             points.append(random_point)
#     return points


# def process_chips(tif_path, tif_dir, lat_long, chip_size=128):
#     resampled_lidar_path = os.path.join(tif_dir, f"output_resampled_dataLIDAR_{tile_number}.tif")
#     resampled_dem_path = os.path.join(tif_dir, f"output_resampled_dataDEM_{tile_number}.tif")
#     resampled_rivers_path = os.path.join(tif_dir, f"output_resampled_dataRivers_{tile_number}.tif")
#     resampled_roads_path = os.path.join(tif_dir, f"output_resampled_dataRoads_{tile_number}.tif")

#     training_chips_dir = os.path.join(THEFOLDER, "trainingchips")
#     os.makedirs(training_chips_dir, exist_ok=True)

#     for i, (lat, lon) in enumerate(lat_long):
#         try:
#             paths = [resampled_lidar_path, resampled_dem_path, resampled_rivers_path, resampled_roads_path, tif_path]
#             labels = ['lidar', 'dem', 'rivers', 'roads', 'pcllabels']
            
#             for path, label in zip(paths, labels):
#                 with rasterio.open(path) as src:
#                     col, row = src.index(lon, lat)
#                     window = Window(col - chip_size // 2, row - chip_size // 2, chip_size, chip_size)
#                     chip_data = src.read(1, window=window)
                    
#                     out_meta = src.meta.copy()
#                     out_meta.update({
#                         "driver": "GTiff",
#                         "height": chip_size,
#                         "width": chip_size,
#                         "transform": src.window_transform(window)
#                     })

#                     chip_output_dir = os.path.join(training_chips_dir, label)
#                     os.makedirs(chip_output_dir, exist_ok=True)
                    
#                     chip_output_path = os.path.join(chip_output_dir, f"{label.upper()}_Chip_{tile_number}_{i}.tif")

#                     if chip_data.shape == (chip_size, chip_size) and np.any(chip_data != src.nodata):
#                         with rasterio.open(chip_output_path, "w", **out_meta) as dest:
#                             dest.write(chip_data, 1)
#                     else:
#                         print(f"Skipping {label} chip {i} because it is not properly shaped or is filled with nodata.")
#         except Exception as e:
#             print(f"An error occurred while processing chip {i}: {e}")
            
#     del resampled_lidar_path, resampled_dem_path, resampled_rivers_path, resampled_roads_path
#     del training_chips_dir, paths, labels, col, row, window, chip_data, out_meta, chip_output_dir, chip_output_path


# if __name__ == "__main__":
#     checkpoint = load_checkpoint()
#     for tile_number in TILENUMBER:
#         tif_path = f"{THEFOLDER}\\PCLTILES\\pcltile_{tile_number}.tif"
#         tif_dir = f"{THEFOLDER}\\TIFFOUTPUT\\{tile_number}"
#         os.makedirs(tif_dir, exist_ok=True)

#         if tile_number not in checkpoint or checkpoint[tile_number]['step'] < 1:
#             resampled_dem_path = process_dem(tif_path, tif_dir, tile_number)
#             save_checkpoint(tile_number, 1, {'resampled_dem_path': resampled_dem_path})

#         if tile_number not in checkpoint or checkpoint[tile_number]['step'] < 2:
#             resampled_lidar_path = process_lidar(tif_path, tif_dir, tile_number)
#             save_checkpoint(tile_number, 2, {'resampled_lidar_path': resampled_lidar_path})

#         if tile_number not in checkpoint or checkpoint[tile_number]['step'] < 3:
#             resampled_rivers_path = process_rivers(tif_path, tif_dir, tile_number)
#             save_checkpoint(tile_number, 3, {'resampled_rivers_path': resampled_rivers_path})

#         if tile_number not in checkpoint or checkpoint[tile_number]['step'] < 4:
#             resampled_roads_path = process_roads(tif_path, tif_dir, tile_number)
#             save_checkpoint(tile_number, 4, {'resampled_roads_path': resampled_roads_path})

#         resampled_files = [
#             checkpoint[tile_number]['resampled_dem_path'],
#             checkpoint[tile_number]['resampled_lidar_path'],
#             checkpoint[tile_number]['resampled_rivers_path'],
#             checkpoint[tile_number]['resampled_roads_path']
#         ]

#         delete_non_resampled_files([os.path.basename(f) for f in resampled_files], tif_dir)

#         if tile_number not in checkpoint or checkpoint[tile_number]['step'] < 5:
#             # Generate random points within the tile bounds
#             with rasterio.open(tif_path) as src:
#                 bounds = src.bounds
#                 crs = src.crs
#                 img = src.read(1)

#             rect = box(bounds.left, bounds.bottom, bounds.right, bounds.top)
#             buffered_rect = rect.buffer(-0.15)
#             random_points = generate_random_points(buffered_rect, 5000)
#             gdf_points = GeoDataFrame(geometry=random_points, crs=crs).to_crs(crs)
#             gdf_points_wgs84 = gdf_points.to_crs(epsg=4326)
#             lat_long = gdf_points_wgs84.geometry.apply(lambda geom: (geom.y, geom.x)).tolist()

#             save_checkpoint(tile_number, 5, {'lat_long': lat_long})

#         if tile_number in checkpoint and 'lat_long' in checkpoint[tile_number]:
#             lat_long = checkpoint[tile_number]['lat_long']
#         else:
#             with rasterio.open(tif_path) as src:
#                 bounds = src.bounds
#                 crs = src.crs
#                 img = src.read(1)

#             rect = box(bounds.left, bounds.bottom, bounds.right, bounds.top)
#             buffered_rect = rect.buffer(-0.15)
#             random_points = generate_random_points(buffered_rect, 5000)
#             gdf_points = GeoDataFrame(geometry=random_points, crs=crs).to_crs(crs)
#             gdf_points_wgs84 = gdf_points.to_crs(epsg=4326)
#             lat_long = gdf_points_wgs84.geometry.apply(lambda geom: (geom.y, geom.x)).tolist()
#             save_checkpoint(tile_number, 5, {'lat_long': lat_long})

#         if tile_number not in checkpoint or checkpoint[tile_number]['step'] < 6:
#             process_chips(tif_path, tif_dir, lat_long)
#             save_checkpoint(tile_number, 6)

#         print(f"Processing for tile {tile_number} completed.")

#     print("All processing completed.")


In [4]:
import os
import glob
import subprocess
from concurrent.futures import ThreadPoolExecutor
from osgeo import gdal
import rioxarray
import planetary_computer
from pystac_client import Client
import osmnx as ox
import rasterio
from rasterio.features import rasterize
from rasterio.windows import from_bounds, Window
import numpy as np
import scipy.ndimage
from shapely.geometry import box, Point
from geopandas import GeoDataFrame
import matplotlib.pyplot as plt
from rasterio.plot import show

# Tile identifiers ("<col_off>-<row_off>") to process.
# NOTE(review): this hard-coded list replaces whatever TILENUMBER the tiling
# cell computed earlier in the notebook — confirm that is intended.
TILENUMBER = ['75000-35000', '75000-40000', '75000-45000']
# Presumably the training-chip edge length in pixels. NOTE(review):
# process_chips declares its own chip_size=128 default instead of reading this
# constant — verify the two stay in sync.
CHIP_SIZE = 128

def delete_non_resampled_files(resampled_files, tif_dir):
    """Remove every ``.tif`` in ``tif_dir`` that is not listed as a keeper.

    Args:
        resampled_files: File names (not full paths) that must survive.
        tif_dir: Directory to clean up.

    Deletion is best-effort: a file that cannot be removed is reported and
    the clean-up carries on with the next one.
    """
    for entry in os.listdir(tif_dir):
        # Keep listed files and anything that is not a GeoTIFF.
        if entry in resampled_files or not entry.endswith('.tif'):
            continue
        file_path = os.path.join(tif_dir, entry)
        try:
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        except Exception as e:
            print(f"Failed to delete {file_path}: {e}")

def process_dem(tif_path, tif_dir, tile_number):
    """Download, mosaic and resample DEM data onto the grid of a PCL tile.

    Items of the "cop-dem-glo-30" collection that intersect the tile are
    fetched from the Planetary Computer STAC API, written to ``tif_dir`` as
    individual GeoTIFFs, merged with ``gdal_merge.py`` and finally resampled
    (bilinear) onto the tile's size, geotransform and projection.

    Args:
        tif_path: Path of the PCL tile that defines the target grid.
        tif_dir: Working/output directory for this tile.
        tile_number: Tile identifier used in the output file names.

    Returns:
        Path of the resampled DEM raster, or ``None`` when a download or the
        merge step failed (the error is printed).
    """
    # Only the bounds are needed; close the dataset right away so no handle
    # on the tile stays open.
    with rioxarray.open_rasterio(tif_path) as tif_data:
        bbox_of_interest = tif_data.rio.bounds()
    catalog = Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
    search = catalog.search(collections=["cop-dem-glo-30"], bbox=bbox_of_interest)
    items = list(search.get_items())

    def process_item(item, idx):
        """Download one DEM item and store it as output_dataDEM_<idx>.tif."""
        signed_asset = planetary_computer.sign(item.assets["data"])
        data = rioxarray.open_rasterio(signed_asset.href).squeeze().drop("band")
        data.rio.write_crs("EPSG:4326", inplace=True)
        output_tif_path = os.path.join(tif_dir, f"output_dataDEM_{idx}.tif")
        data.rio.to_raster(output_tif_path)

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(process_item, item, i) for i, item in enumerate(items)]

    # Leaving the executor block waits for all downloads. Exceptions raised in
    # a worker are stored on its future, so they must be inspected explicitly;
    # otherwise a failed download would go unnoticed and produce a DEM mosaic
    # with holes.
    download_errors = [f.exception() for f in futures if f.exception() is not None]
    if download_errors:
        for err in download_errors:
            print(f"Error in downloading DEM: {err}")
        return None

    output_tif = os.path.join(tif_dir, f"outputtile_DEM_{tile_number}.tif")
    merge_command = [
        "python", "C:\\Users\\smdur\\anaconda3\\envs\\globalpcl\\Scripts\\gdal_merge.py",
        "--config", "CHECK_DISK_FREE_SPACE", "FALSE",
        "-o", output_tif,
        "-n", "-9999", "-a_nodata", "-9999"] + glob.glob(os.path.join(tif_dir, "output_dataDEM_*.tif"))

    process_hag = subprocess.run(merge_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if process_hag.returncode != 0:
        print(f"Error in merging DEM: {process_hag.stderr}")
        return None

    # Resample the mosaic onto the exact grid of the PCL tile.
    src_ds = gdal.Open(output_tif, gdal.GA_ReadOnly)
    target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
    driver = gdal.GetDriverByName('GTiff')
    output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataDEM_{tile_number}.tif")
    out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
    out_ds.SetGeoTransform(target_ds.GetGeoTransform())
    out_ds.SetProjection(target_ds.GetProjection())
    gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
    # Dropping the references makes GDAL flush and close the datasets.
    src_ds, target_ds, out_ds = None, None, None

    # Remove the mosaic and the per-item downloads; only the resampled raster
    # is kept.
    os.remove(output_tif)
    for tif in glob.glob(os.path.join(tif_dir, "output_dataDEM_*.tif")):
        try:
            os.remove(tif)
        except Exception as e:
            print(f"Failed to delete {tif}: {e}")

    return output_resampled_path


def process_lidar(tif_path, tif_dir, tile_number):
    """Mosaic the local LIDAR rasters that overlap a PCL tile and resample them.

    Every ``.tif`` in the LIDAR folder whose bounding box intersects the
    tile's bounding box is merged with ``gdal_merge.py`` (nodata 255) and the
    mosaic is resampled (bilinear) onto the tile's size, geotransform and
    projection.

    Args:
        tif_path: Path of the PCL tile that defines the target grid.
        tif_dir: Working/output directory for this tile.
        tile_number: Tile identifier used in the output file names.

    Returns:
        Path of the resampled LIDAR raster, or ``None`` when no LIDAR raster
        overlaps the tile or the merge step failed (a message is printed).
    """
    lidar_dir = r"C:\Users\smdur\OneDrive\Desktop\GlobalData\LIDAR2"
    lidar_tifs = glob.glob(os.path.join(lidar_dir, "*.tif"))

    # Get the bounding box of the input tif_path
    with rasterio.open(tif_path) as src:
        bbox = src.bounds
        input_geom = box(bbox.left, bbox.bottom, bbox.right, bbox.top)

    # Find overlapping LIDAR tiles
    overlapping_tifs = []
    for tif in lidar_tifs:
        with rasterio.open(tif) as src:
            lidar_bbox = src.bounds
            lidar_geom = box(lidar_bbox.left, lidar_bbox.bottom, lidar_bbox.right, lidar_bbox.top)
            if input_geom.intersects(lidar_geom):
                overlapping_tifs.append(tif)

    if not overlapping_tifs:
        print(f"No overlapping LIDAR tiles found for {tile_number}")
        return None

    output_tif = os.path.join(tif_dir, f"outputtile_lidar_{tile_number}.tif")
    merge_command = [
        "python", "C:\\Users\\smdur\\anaconda3\\envs\\globalpcl\\Scripts\\gdal_merge.py",
        "--config", "CHECK_DISK_FREE_SPACE", "FALSE",
        "-o", output_tif,
        "-n", "255", "-a_nodata", "255"] + overlapping_tifs

    process_hag = subprocess.run(merge_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if process_hag.returncode != 0:
        print(f"Error in merging LIDAR: {process_hag.stderr}")
        return None

    # Resample the mosaic onto the exact grid of the PCL tile.
    src_ds = gdal.Open(output_tif, gdal.GA_ReadOnly)
    target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
    driver = gdal.GetDriverByName('GTiff')
    output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataLIDAR_{tile_number}.tif")
    out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
    out_ds.SetGeoTransform(target_ds.GetGeoTransform())
    out_ds.SetProjection(target_ds.GetProjection())
    gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
    # Dropping the references makes GDAL flush and close the datasets.
    src_ds, target_ds, out_ds = None, None, None

    # The intermediate mosaic is no longer needed.
    os.remove(output_tif)
    return output_resampled_path


def process_rivers(tif_path, tif_dir, tile_number):
    """Create a river-proximity raster aligned with a PCL tile.

    OSM ways matching ``waterway~river`` inside the tile's bounds are fetched
    with osmnx and burned onto the tile grid. For every pixel the Euclidean
    distance (in pixels) to the nearest burned pixel is turned into a
    proximity score ``exp(-0.07 * distance)``, which is written as float32
    and then resampled (bilinear) onto the tile's size, geotransform and
    projection.

    Args:
        tif_path: Path of the PCL tile that defines the target grid.
        tif_dir: Working/output directory for this tile.
        tile_number: Tile identifier used in the output file names.

    Returns:
        Path of the resampled river-proximity raster.
    """
    # Only the bounds are needed; close the dataset right away so no handle
    # on the tile stays open.
    with rioxarray.open_rasterio(tif_path) as dem_data:
        bbox = dem_data.rio.bounds()
    custom_filter = '["waterway"~"river"]'
    # rio.bounds() is (left, bottom, right, top); the call below passes it in
    # north, south, east, west order.
    graph = ox.graph_from_bbox(bbox[3], bbox[1], bbox[2], bbox[0], custom_filter=custom_filter, simplify=True, retain_all=True, truncate_by_edge=True)
    gdf = ox.graph_to_gdfs(graph, nodes=False)

    with rasterio.open(tif_path) as src:
        window = from_bounds(*src.bounds, src.transform)
        transform = rasterio.windows.transform(window, src.transform)
        raster = np.zeros((int(window.height), int(window.width)), dtype=np.uint8)
        shapes = ((geom, 1) for geom in gdf['geometry'])
        burned = rasterize(shapes, out=raster, fill=0, transform=transform, all_touched=True)
        # Distance of every non-river pixel to the nearest river pixel.
        distance_grid = scipy.ndimage.distance_transform_edt(burned == 0)
        decay_grid = np.exp(-0.07 * distance_grid)

        clipped_meta = src.meta.copy()
        clipped_meta.update({"driver": "GTiff", "height": int(window.height), "width": int(window.width), "transform": transform, "dtype": rasterio.float32, "count": 1, "compress": 'lzw'})
        output_path = os.path.join(tif_dir, f'exponential_decay_CO_river_{tile_number}.tif')
        with rasterio.open(output_path, 'w', **clipped_meta) as dst:
            dst.write(decay_grid.astype(np.float32), 1)

    # Resample the proximity raster onto the exact grid of the PCL tile.
    src_ds = gdal.Open(output_path, gdal.GA_ReadOnly)
    target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
    driver = gdal.GetDriverByName('GTiff')
    output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataRivers_{tile_number}.tif")
    out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
    out_ds.SetGeoTransform(target_ds.GetGeoTransform())
    out_ds.SetProjection(target_ds.GetProjection())
    gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
    # Dropping the references makes GDAL flush and close the datasets.
    src_ds, target_ds, out_ds = None, None, None
    os.remove(output_path)

    return output_resampled_path


def process_roads(tif_path, tif_dir, tile_number):
    """Create a road-proximity raster aligned with a PCL tile.

    The drivable OSM road network inside the tile's bounds is fetched with
    osmnx and burned onto the tile grid. For every pixel the Euclidean
    distance (in pixels) to the nearest burned pixel is turned into a
    proximity score ``exp(-0.07 * distance)``, which is written as float32
    and then resampled (bilinear) onto the tile's size, geotransform and
    projection.

    Args:
        tif_path: Path of the PCL tile that defines the target grid.
        tif_dir: Working/output directory for this tile.
        tile_number: Tile identifier used in the output file names.

    Returns:
        Path of the resampled road-proximity raster.
    """
    # Only the bounds are needed; close the dataset right away so no handle
    # on the tile stays open.
    with rioxarray.open_rasterio(tif_path) as extent_data:
        bbox = extent_data.rio.bounds()
    # rio.bounds() is (left, bottom, right, top); the call below passes it in
    # north, south, east, west order.
    graph = ox.graph_from_bbox(bbox[3], bbox[1], bbox[2], bbox[0], network_type='drive', simplify=True)
    gdf = ox.graph_to_gdfs(graph, nodes=False)

    with rasterio.open(tif_path) as src:
        window = from_bounds(*src.bounds, src.transform)
        transform = rasterio.windows.transform(window, src.transform)
        raster = np.zeros((int(window.height), int(window.width)), dtype=np.uint8)
        shapes = ((geom, 1) for geom in gdf['geometry'])
        burned = rasterize(shapes, out=raster, fill=0, transform=transform, all_touched=True)
        # Distance of every non-road pixel to the nearest road pixel.
        distance_grid = scipy.ndimage.distance_transform_edt(burned == 0)
        decay_grid = np.exp(-0.07 * distance_grid)

        clipped_meta = src.meta.copy()
        clipped_meta.update({"driver": "GTiff", "height": int(window.height), "width": int(window.width), "transform": transform, "dtype": rasterio.float32, "count": 1, "compress": 'lzw'})
        output_path = os.path.join(tif_dir, f'exponential_decay_CO_roads_{tile_number}.tif')
        with rasterio.open(output_path, 'w', **clipped_meta) as dst:
            dst.write(decay_grid.astype(np.float32), 1)

    # Resample the proximity raster onto the exact grid of the PCL tile.
    src_ds = gdal.Open(output_path, gdal.GA_ReadOnly)
    target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
    driver = gdal.GetDriverByName('GTiff')
    output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataRoads_{tile_number}.tif")
    out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
    out_ds.SetGeoTransform(target_ds.GetGeoTransform())
    out_ds.SetProjection(target_ds.GetProjection())
    gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
    # Dropping the references makes GDAL flush and close the datasets.
    src_ds, target_ds, out_ds = None, None, None
    os.remove(output_path)

    return output_resampled_path


def generate_random_points(geometry, num_points):
    """Draw ``num_points`` uniformly distributed random points inside a geometry.

    Uses rejection sampling: candidates are drawn uniformly from the
    geometry's bounding box and kept only when they fall within the geometry.

    Args:
        geometry: Geometry exposing ``bounds`` and ``is_empty`` and usable
            with ``Point.within``.
        num_points: Number of points to generate.

    Returns:
        List of ``Point`` objects; empty when ``num_points`` is not positive.

    Raises:
        ValueError: If ``geometry`` is empty, because no candidate could ever
            be accepted and the sampling loop would not terminate.
    """
    if num_points <= 0:
        return []
    if geometry.is_empty:
        raise ValueError("Cannot generate random points inside an empty geometry.")

    points = []
    min_x, min_y, max_x, max_y = geometry.bounds
    while len(points) < num_points:
        random_point = Point(np.random.uniform(min_x, max_x), np.random.uniform(min_y, max_y))
        if random_point.within(geometry):
            points.append(random_point)
    return points


def process_chips(tif_path, tif_dir, lat_long, chip_size=128):
    """Cut square training chips around each sample point from every layer of a tile.

    For every ``(lat, lon)`` pair a ``chip_size`` x ``chip_size`` window centred
    on the point is read from band 1 of the four resampled feature rasters in
    ``tif_dir`` (LiDAR, DEM, rivers, roads) and of the label raster
    ``tif_path``. Each chip is written to
    ``<THEFOLDER>/trainingchips/<label>/<LABEL>_Chip_<tile_number>_<i>.tif``.

    NOTE: this function reads the module-level names ``THEFOLDER`` and
    ``tile_number`` (the loop variable of the driver cell); both must be set
    before it is called.

    Args:
        tif_path: Path of the label tile.
        tif_dir: Directory holding the ``output_resampled_data*`` rasters.
        lat_long: Iterable of ``(lat, lon)`` pairs. ``lon``/``lat`` are passed
            to ``src.index`` as x/y, so they must be expressed in the rasters'
            coordinate system.
        chip_size: Edge length of a chip in pixels.

    Returns:
        None. Chips whose window does not yield a full
        ``chip_size`` x ``chip_size`` array are skipped with a message; any
        exception while handling a point is reported and the next point is
        processed.
    """
    # Label -> raster path, in the order the chips are produced.
    layer_paths = {
        'lidar': os.path.join(tif_dir, f"output_resampled_dataLIDAR_{tile_number}.tif"),
        'dem': os.path.join(tif_dir, f"output_resampled_dataDEM_{tile_number}.tif"),
        'rivers': os.path.join(tif_dir, f"output_resampled_dataRivers_{tile_number}.tif"),
        'roads': os.path.join(tif_dir, f"output_resampled_dataRoads_{tile_number}.tif"),
        'pcllabels': tif_path,
    }

    training_chips_dir = os.path.join(THEFOLDER, "trainingchips")
    os.makedirs(training_chips_dir, exist_ok=True)

    half_chip = chip_size // 2

    for i, (lat, lon) in enumerate(lat_long):
        try:
            for label, path in layer_paths.items():
                with rasterio.open(path) as src:
                    # rasterio's index(x, y) returns (row, col) -- in that order.
                    row, col = src.index(lon, lat)
                    # Window takes (col_off, row_off, width, height).
                    window = Window(col - half_chip, row - half_chip, chip_size, chip_size)
                    chip_data = src.read(1, window=window)

                    out_meta = src.meta.copy()
                    out_meta.update({
                        "driver": "GTiff",
                        "height": chip_size,
                        "width": chip_size,
                        "transform": src.window_transform(window)
                    })

                    chip_output_dir = os.path.join(training_chips_dir, label)
                    os.makedirs(chip_output_dir, exist_ok=True)

                    chip_output_path = os.path.join(chip_output_dir, f"{label.upper()}_Chip_{tile_number}_{i}.tif")

                    # A window that hangs over the raster edge comes back smaller
                    # than requested; such chips are not written.
                    if chip_data.shape == (chip_size, chip_size):
                        with rasterio.open(chip_output_path, "w", **out_meta) as dest:
                            dest.write(chip_data, 1)
                    else:
                        print(f"Skipping {label} chip {i} because it is not properly shaped or is filled with nodata.")
        except Exception as e:
            # Best effort: one bad point must not abort the whole tile.
            print(f"An error occurred while processing chip {i}: {e}")


if __name__ == "__main__":
    # Driver: for every tile build the feature rasters, sample random points
    # and cut training chips around them.
    # NOTE: ``tile_number`` is deliberately a module-level loop variable --
    # process_chips() reads it as a global.
    for tile_number in TILENUMBER:
        tif_path = f"{THEFOLDER}\\PCLTILES\\pcltile_{tile_number}.tif"
        tif_dir = f"{THEFOLDER}\\TIFFOUTPUT\\{tile_number}"
        os.makedirs(tif_dir, exist_ok=True)

        # Each process_* call is expected to return the path of its resampled
        # raster (process_roads does); everything else in tif_dir is scratch.
        resampled_files = [
            process_dem(tif_path, tif_dir, tile_number),
            process_lidar(tif_path, tif_dir, tile_number),
            process_rivers(tif_path, tif_dir, tile_number),
            process_roads(tif_path, tif_dir, tile_number)
        ]

        delete_non_resampled_files([os.path.basename(f) for f in resampled_files], tif_dir)

        # Generate random points within the tile bounds. Only the
        # georeferencing is needed, so the pixel data is not read.
        with rasterio.open(tif_path) as src:
            bounds = src.bounds
            crs = src.crs

        rect = box(bounds.left, bounds.bottom, bounds.right, bounds.top)
        # Shrink the tile footprint by 0.15 CRS units so sample points stay
        # away from the tile edge (presumably degrees -- confirm the tile CRS).
        buffered_rect = rect.buffer(-0.15)
        random_points = generate_random_points(buffered_rect, 5000)
        gdf_points = GeoDataFrame(geometry=random_points, crs=crs)
        gdf_points_wgs84 = gdf_points.to_crs(epsg=4326)
        lat_long = gdf_points_wgs84.geometry.apply(lambda geom: (geom.y, geom.x)).tolist()

        process_chips(tif_path, tif_dir, lat_long)

        print(f"Processing for tile {tile_number} completed.")

    print("All processing completed.")




Processing for tile 75000-35000 completed.
Processing for tile 75000-40000 completed.
Processing for tile 75000-45000 completed.
All processing completed.


In [5]:
# import os
# import matplotlib.pyplot as plt
# import rasterio as rio
# from rasterio.plot import show

# # Define the directory containing the image chips
# tile_dir = "C:\\Users\\smdur\\OneDrive\\Desktop\\GlobalPCL17\\TIFFOUTPUT\\75000-35000"
# output_path = r"C:\Users\smdur\OneDrive\Desktop\PCLCONUS\figures\trainingtile2.png"

# def plot_stacked_tiles_varying_alpha(tile_dir):
#     # List all .tif files in the directory
#     tile_files = sorted([f for f in os.listdir(tile_dir) if f.endswith('.tif')])
    
#     fig, ax = plt.subplots(figsize=(12, 12))
    
#     # Colormaps for each image to distinguish them
#     colormaps = ['viridis', 'gray', 'inferno', 'magma']
    
#     n = len(tile_files)
    
#     for i, tile_file in enumerate(tile_files):
#         tile_path = os.path.join(tile_dir, tile_file)
#         with rio.open(tile_path) as src:
#             img = src.read(1)
#             transform = src.transform
#             extent = rio.plot.plotting_extent(src)
            
#             # Alternate colormaps for each image
#             cmap = colormaps[i % len(colormaps)]
            
#             # Adjust alpha dynamically (more for top layers, less for bottom)
#             alpha = 0.7 - (0.4 * (i / (n - 1)))  # Alpha ranges from 0.7 to 0.3
            
#             # Plot each image with a different colormap and varying transparency
#             show(img, ax=ax, transform=transform, extent=extent, cmap=cmap, alpha=alpha)

#     # Set labels for x and y axes
#     ax.set_xlabel('Longitude')
#     ax.set_ylabel('Latitude')

#     plt.savefig(output_path, dpi=300)  # Save the plot as a file
#     plt.show()  # Display the plot

# plot_stacked_tiles_varying_alpha(tile_dir)


In [6]:
# import os
# import matplotlib.pyplot as plt
# import rasterio as rio
# from rasterio.plot import show
# import imageio
# import numpy as np
# from rasterio.warp import calculate_default_transform, reproject, Resampling

# # Define the directory containing the image chips
# tile_dir = "C:\\Users\\smdur\\OneDrive\\Desktop\\GlobalPCL17\\TIFFOUTPUT\\75000-35000"

# def reproject_to_utm(src, dst_crs='EPSG:32633'):
#     transform, width, height = calculate_default_transform(
#         src.crs, dst_crs, src.width, src.height, *src.bounds)
#     kwargs = src.meta.copy()
#     kwargs.update({
#         'crs': dst_crs,
#         'transform': transform,
#         'width': width,
#         'height': height
#     })

#     data = np.zeros((height, width), dtype=rio.uint8)
#     reproject(
#         source=rio.band(src, 1),
#         destination=data,
#         src_transform=src.transform,
#         src_crs=src.crs,
#         dst_transform=transform,
#         dst_crs=dst_crs,
#         resampling=Resampling.nearest)
#     return data, transform

# def plot_stacked_tiles_varying_alpha(tile_dir, output_gif='C:\\Users\\smdur\\OneDrive\\Desktop\\GlobalPCL17\\scanning_box2.gif'):
#     # List all .tif files in the directory
#     tile_files = sorted([f for f in os.listdir(tile_dir) if f.endswith('.tif')])
    
#     # Load the first tile to get metadata and use it to set bounds correctly
#     with rio.open(os.path.join(tile_dir, tile_files[0])) as src:
#         meta = src.meta.copy()
#         img, transform = reproject_to_utm(src)  # Reproject first image to get metadata
#         bounds = src.bounds

#     box_width, box_height = 512, 512  # Adjust as necessary
#     step_size = 512  # Adjust as necessary
#     frames = []

#     # Create a plot for each position of the scanning box
#     for x in np.arange(bounds.left, bounds.right, step_size * transform[0]):
#         for y in np.arange(bounds.bottom, bounds.top, step_size * abs(transform[4])):
#             fig, ax = plt.subplots(figsize=(12, 12))
#             # Plot each tile
#             for i, tile_file in enumerate(tile_files):
#                 tile_path = os.path.join(tile_dir, tile_file)
#                 with rio.open(tile_path) as src:
#                     img, transform = reproject_to_utm(src)  # Reproject each image
#                     cmap = ['viridis', 'gray', 'inferno', 'magma'][i % len(tile_files)]
#                     alpha = 0.7 - (0.4 * (i / (len(tile_files) - 1)))
#                     show(img, ax=ax, transform=transform, cmap=cmap, alpha=alpha)

#             rect = plt.Rectangle((x, y), box_width, box_height, linewidth=3, edgecolor='red', facecolor='none')
#             ax.add_patch(rect)
#             ax.set_xticks([])
#             ax.set_yticks([])
#             plt.title("Scanning Spatial Image Chips")
#             fig.canvas.draw()
#             frame = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8')
#             frame = frame.reshape(fig.canvas.get_width_height()[::-1] + (3,))
#             frames.append(frame)
#             plt.close(fig)

#     imageio.mimsave(output_gif, frames, fps=40)

# plot_stacked_tiles_varying_alpha(tile_dir)


# Load chips

In [7]:
# import os
# import rasterio
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# training_chips_dir = os.path.join(THEFOLDER, "trainingchips")

# # Paths to datasets
# featurepath1 = os.path.join(training_chips_dir, "lidar")
# featurepath2 = os.path.join(training_chips_dir, "dem")
# featurepath3 = os.path.join(training_chips_dir, "roads")
# featurepath4 = os.path.join(training_chips_dir, "rivers")
# labelspath = os.path.join(training_chips_dir, "pcllabels")

# # Function to load GeoTIFF images as numpy arrays
# def load_geotiff(path):
#     with rasterio.open(path) as src:
#         return src.read(1)

# # Function to load and print progress
# def load_images(path):
#     files = [f for f in os.listdir(path) if f.endswith('.tif')]
#     images = []
#     for i, f in enumerate(files):
#         images.append(load_geotiff(os.path.join(path, f)))
#         if (i + 1) % 5000 == 0:
#             print(f"Loaded {i + 1} images from {path}")
#     return images

# # Load datasets
# hag_images = load_images(featurepath1)
# dem_images = load_images(featurepath2)
# roads_images = load_images(featurepath3)
# rivers_images = load_images(featurepath4)
# label_images = load_images(labelspath)

# # Convert lists to numpy arrays
# hag_images = np.array(hag_images).astype('float32')
# dem_images = np.array(dem_images).astype('float32')
# roads_images = np.array(roads_images).astype('float32')
# rivers_images = np.array(rivers_images).astype('float32')
# label_images = np.array(label_images).astype('float32')

# # Normalize images independently
# hag_max = hag_images.max()
# dem_max = dem_images.max()
# roads_max = roads_images.max()
# rivers_max = rivers_images.max()

# hag_images /= hag_max
# dem_images /= dem_max
# roads_images /= roads_max
# rivers_images /= rivers_max

# print(f"HAG max value: {hag_max}")
# print(f"DEM max value: {dem_max}")
# print(f"Roads max value: {roads_max}")
# print(f"Rivers max value: {rivers_max}")









import os
import rasterio
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

training_chips_dir = os.path.join(THEFOLDER, "trainingchips")

# One sub-directory of chips per layer: four feature layers plus the labels.
featurepath1, featurepath2, featurepath3, featurepath4, labelspath = (
    os.path.join(training_chips_dir, layer_dir)
    for layer_dir in ("lidar", "dem", "roads", "rivers", "pcllabels")
)

# Function to load GeoTIFF images as numpy arrays
def load_geotiff(path, allow_zeros=False):
    """Read band 1 of a GeoTIFF as a numpy array.

    Args:
        path: Path of the raster file.
        allow_zeros: When False (default) an all-zero band yields ``None``.

    Returns:
        The band as a 2-D array, or ``None`` for an all-zero band when
        ``allow_zeros`` is False.
    """
    with rasterio.open(path) as src:
        data = src.read(1)
    if not allow_zeros and np.all(data == 0):
        return None  # Signal "all zeros" to the caller
    return data


def _chip_key(filename):
    """Return the part of a chip file name that identifies the sample.

    Chip files are written as ``<LAYER>_Chip_<tile>_<index>.tif``; the text
    after ``_Chip_`` is identical for all layers of one sample. Names without
    that marker fall back to the file stem.
    """
    stem = os.path.splitext(filename)[0]
    _, marker, key = stem.partition("_Chip_")
    return key if marker else stem


def load_chips(path, skip_zeros=False):
    """Load every ``.tif`` in ``path`` into a ``{chip key: array}`` dict.

    Args:
        path: Directory containing the chips of one layer.
        skip_zeros: When True, all-zero chips are left out (and reported).

    Returns:
        Dict mapping the sample key (see ``_chip_key``) to the chip array.
    """
    chips = {}
    files = sorted(f for f in os.listdir(path) if f.endswith('.tif'))
    for i, f in enumerate(files):
        image = load_geotiff(os.path.join(path, f), allow_zeros=not skip_zeros)
        if image is None:
            print(f"Skipping {f} because it is all zeros")
        else:
            chips[_chip_key(f)] = image
        if (i + 1) % 5000 == 0:
            print(f"Loaded {i + 1} images from {path}")
    return chips


# Function to load and print progress
def load_images(path, skip_zeros=False):
    """Load every ``.tif`` in ``path`` as a list of arrays.

    All-zero chips are dropped only when ``skip_zeros`` is True. (Previously
    they were always dropped and the flag merely controlled a message.)
    """
    return list(load_chips(path, skip_zeros=skip_zeros).values())


def stack_common_chips(*chip_sets):
    """Stack the chips shared by all ``chip_sets`` into aligned float32 arrays.

    Dropping a chip from one layer must drop the same sample from every other
    layer; truncating each list to a common length would pair features with
    the wrong labels. Only keys present in every dict are kept, in sorted
    order, so index ``k`` refers to the same sample in every returned array.

    Raises:
        ValueError: If no chip is present in all layers.
    """
    common_keys = sorted(set.intersection(*(set(chips) for chips in chip_sets)))
    if not common_keys:
        raise ValueError("No chip is present in every layer; nothing to train on.")
    print(f"Using {len(common_keys)} chips present in all {len(chip_sets)} layers")
    return [
        np.array([chips[key] for key in common_keys], dtype='float32')
        for chips in chip_sets
    ]


# Load datasets (keyed by sample) and keep only samples available in all layers
hag_images, dem_images, roads_images, rivers_images, label_images = stack_common_chips(
    load_chips(featurepath1),
    load_chips(featurepath2, skip_zeros=True),  # Skip DEM images that are all zeros
    load_chips(featurepath3),
    load_chips(featurepath4),
    load_chips(labelspath),
)

# Scale each feature layer by its own maximum (computed before any scaling)
hag_max, dem_max, roads_max, rivers_max = (
    layer_stack.max()
    for layer_stack in (hag_images, dem_images, roads_images, rivers_images)
)

# In-place division keeps the existing arrays instead of allocating new ones
np.divide(hag_images, hag_max, out=hag_images)
np.divide(dem_images, dem_max, out=dem_images)
np.divide(roads_images, roads_max, out=roads_images)
np.divide(rivers_images, rivers_max, out=rivers_images)

print(
    f"HAG max value: {hag_max}",
    f"DEM max value: {dem_max}",
    f"Roads max value: {roads_max}",
    f"Rivers max value: {rivers_max}",
    sep="\n",
)


Loaded 5000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\lidar
Loaded 10000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\lidar
Loaded 15000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\lidar
Loaded 5000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\dem
Loaded 10000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\dem
Loaded 15000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\dem
Loaded 5000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\roads
Loaded 10000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\roads
Loaded 15000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\roads
Loaded 5000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\rivers
Loaded 10000 images from C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\trainingchips\rivers
Loaded 15000 images from C:\U

In [8]:
# import os
# import sys

# # Set the environment variable before importing gdal
# os.environ['USE_PATH_FOR_GDAL_PYTHON'] = 'YES'
# os.add_dll_directory(os.path.join(os.getenv('CONDA_PREFIX'), 'Library', 'bin'))

# import rasterio
# import numpy as np
# from concurrent.futures import ThreadPoolExecutor
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# training_chips_dir = os.path.join(THEFOLDER, "trainingchips")

# # Paths to datasets
# featurepath1 = os.path.join(training_chips_dir, "lidar")
# featurepath2 = os.path.join(training_chips_dir, "dem")
# featurepath3 = os.path.join(training_chips_dir, "roads")
# featurepath4 = os.path.join(training_chips_dir, "rivers")
# labelspath = os.path.join(training_chips_dir, "pcllabels")

# # Function to load GeoTIFF images as numpy arrays
# def load_geotiff(path):
#     with rasterio.open(path) as src:
#         return src.read(1)

# # Function to load images in parallel with progress printing
# def load_images(path):
#     files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.tif')]
#     num_files = len(files)
#     images = []

#     def load_and_count(file):
#         image = load_geotiff(file)
#         if (load_and_count.counter + 1) % 5000 == 0:
#             print(f"Loaded {load_and_count.counter + 1} images from {path}")
#         load_and_count.counter += 1
#         return image

#     load_and_count.counter = 0

#     max_workers = min(32, os.cpu_count() + 4)  # Default value if not specified

#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
#         images = list(executor.map(load_and_count, files))

#     return np.array(images).astype('float32')

# # Load datasets
# hag_images = load_images(featurepath1)
# dem_images = load_images(featurepath2)
# roads_images = load_images(featurepath3)
# rivers_images = load_images(featurepath4)
# label_images = load_images(labelspath)

# # Normalize images independently
# hag_max = hag_images.max()
# dem_max = dem_images.max()
# roads_max = roads_images.max()
# rivers_max = rivers_images.max()

# hag_images /= hag_max
# dem_images /= dem_max
# roads_images /= roads_max
# rivers_images /= rivers_max


# print(f"HAG max value: {hag_max}")
# print(f"DEM max value: {dem_max}")
# print(f"Roads max value: {roads_max}")
# print(f"Rivers max value: {rivers_max}")

# Train the model 

In [9]:
# import gc
# gc.collect()

# # Stack features along the last dimension
# feature_images = np.stack((hag_images, dem_images, roads_images, rivers_images), axis=-1)

# # Free up memory by deleting the original arrays
# # del hag_images
# # del dem_images
# # del roads_images
# # del rivers_images

# # If you want to ensure that the memory is freed immediately
# import gc
# gc.collect()

# CHIP_SIZE=128

# # Normalize labels if they range from 0 to 100
# label_images /= 100

# # Reshape labels for CNN input
# label_images = np.expand_dims(label_images, axis=-1)

# # Define the CNN model
# model = Sequential([
#     #Conv2D(16, (3, 3), activation='relu', input_shape=(128, 128, 4)),
#     Conv2D(16, (3, 3), activation='relu', input_shape=(CHIP_SIZE, CHIP_SIZE, 4)),
#     MaxPooling2D((2, 2)),
#     Dropout(0.25),
#     Conv2D(32, (3, 3), activation='relu'),
#     MaxPooling2D((2, 2)),
#     Dropout(0.25),
#     Conv2D(64, (3, 3), activation='relu'),
#     Flatten(),
#     Dropout(0.5),
#     #Dense(128 * 128, activation='sigmoid'),
#     #tf.keras.layers.Reshape((128, 128, 1))
#     Dense(CHIP_SIZE * CHIP_SIZE, activation='sigmoid'),
#     tf.keras.layers.Reshape((CHIP_SIZE, CHIP_SIZE, 1))
# ])

# # # Define custom weights for each feature
# # weights = np.array([1.0, 0.8, 0.5, 0.3])  
# # sample_weights = np.dot(feature_images, weights)

# lr = 0.0005
# optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# # Compile and train the model with sample weights
# model.compile(optimizer=optimizer, loss='mse')
# model.fit(feature_images, label_images, batch_size=64, epochs=10, validation_split=0.3)#, sample_weight=sample_weights)







import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
import gc

gc.collect()

# Join the four feature layers on a new trailing axis (one channel per layer)
feature_images = np.stack([hag_images, dem_images, roads_images, rivers_images], axis=-1)

# The per-layer arrays are now duplicated inside feature_images; release them
del hag_images, dem_images, roads_images, rivers_images
gc.collect()

CHIP_SIZE = 128

# Scale labels in place, assuming they range from 0 to 100
np.divide(label_images, 100, out=label_images)

# Give the labels a trailing channel axis to match the network output
label_images = label_images[..., np.newaxis]

# Report the shapes of the model inputs and targets
print("Feature images shape:", feature_images.shape)
print("Label images shape:", label_images.shape)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Concatenate, Dropout

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Concatenate, Dropout, Dense, Reshape
from tensorflow.keras.optimizers import Adam

def unet_model(input_size=(128, 128, 4)):
    """Build an (uncompiled) U-Net that maps a multi-channel chip to one output band.

    Layout: four encoder stages (two 3x3 ReLU convolutions, 2x2 max-pooling,
    dropout), a 1024-filter bottleneck, four decoder stages (2x up-sampling,
    concatenation with the matching encoder output, two 3x3 ReLU
    convolutions) and a final linear 1x1 convolution.

    Args:
        input_size: ``(height, width, channels)`` of the input. Height and
            width should be divisible by 16 (four 2x poolings), otherwise the
            up-sampled tensors will not match the skip tensors.

    Returns:
        A Keras ``Model`` whose output has shape ``(height, width, 1)``.
    """
    def conv_pair(tensor, filters):
        # Two stacked 3x3 same-padded ReLU convolutions.
        for _ in range(2):
            tensor = Conv2D(filters, (3, 3), activation='relu', padding='same')(tensor)
        return tensor

    inputs = Input(input_size)

    # Encoder: remember each stage's pre-pooling output for the skip connections.
    skip_tensors = []
    x = inputs
    for filters, drop_rate in ((64, 0.25), (128, 0.25), (256, 0.5), (512, 0.5)):
        x = conv_pair(x, filters)
        skip_tensors.append(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)
        x = Dropout(drop_rate)(x)

    # Bottleneck
    x = conv_pair(x, 1024)

    # Decoder: consume the skip tensors deepest-first.
    for filters in (512, 256, 128, 64):
        x = UpSampling2D(size=(2, 2))(x)
        x = Concatenate()([x, skip_tensors.pop()])
        x = conv_pair(x, filters)

    # Final output layer: one linear value per pixel
    x = Conv2D(1, (1, 1), activation='linear')(x)

    # Reshape layer
    output = Reshape((input_size[0], input_size[1], 1))(x)

    return Model(inputs=[inputs], outputs=[output])

# Instantiate the U-Net model for 4-channel chips.
# NOTE(review): chip_size repeats the value of CHIP_SIZE set earlier in this
# cell; keep the two in sync.
chip_size = 128
model = unet_model(input_size=(chip_size, chip_size, 4))

# Compile the model: Adam optimiser, MSE as loss (also reported as a metric)
lr = 0.001
optimizer = Adam(learning_rate=lr)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_squared_error'])

# Print model summary
model.summary()

# Example training code (you should adjust this as needed)
# history = model.fit(feature_images, label_images, batch_size=64, epochs=10, validation_split=0.3)


# Train the model for a single epoch. With validation_split Keras holds out
# the last 30% of the samples (taken before shuffling) for validation.
history = model.fit(feature_images, label_images, batch_size=64, epochs=1, validation_split=0.3)

# Check if the model is learning (one loss value per epoch)
print("Training loss:", history.history['loss'])
print("Validation loss:", history.history['val_loss'])

# Clean up: the training arrays are deleted here, so the loading cell must be
# re-run before this cell can be executed again. The value of the final
# gc.collect() call is what the cell displays as its output.
del feature_images, label_images
gc.collect()





Feature images shape: (14298, 128, 128, 4)
Label images shape: (14298, 128, 128, 1)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128, 128, 4)]        0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 128, 128, 64)         2368      ['input_1[0][0]']             
                                                                                                  
 conv2d_1 (Conv2D)           (None, 128, 128, 64)         36928     ['conv2d[0][0]']              
                                                                                                  
 max_pooling2d (MaxPooling2  (None, 64, 64, 64)           0         ['conv2d_1[0][0]']            
 D)       

1744

In [10]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset, random_split
# import numpy as np

# # Collect garbage
# import gc
# gc.collect()

# CHIP_SIZE = 128

# # Assuming hag_images, dem_images, roads_images, and rivers_images are numpy arrays
# # Stack features along the last dimension
# feature_images = np.stack((hag_images, dem_images, roads_images, rivers_images), axis=-1)

# # Normalize labels if they range from 0 to 100
# label_images /= 100

# # Reshape labels for CNN input
# label_images = np.expand_dims(label_images, axis=-1)

# # Convert numpy arrays to PyTorch tensors
# feature_images = torch.tensor(feature_images, dtype=torch.float32)
# label_images = torch.tensor(label_images, dtype=torch.float32)

# # Permute the dimensions of feature_images to (N, C, H, W)
# feature_images = feature_images.permute(0, 3, 1, 2)

# # Remove the extra dimension from label_images
# label_images = label_images.squeeze()

# # Define the dataset
# dataset = TensorDataset(feature_images, label_images)

# # Split the dataset into training and validation sets
# train_size = int(0.7 * len(dataset))
# val_size = len(dataset) - train_size
# train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# # Define the dataloaders
# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# # Define the CNN model
# class CNNModel(nn.Module):
#     def __init__(self, chip_size):
#         super(CNNModel, self).__init__()
#         self.conv1 = nn.Conv2d(4, 16, kernel_size=3, padding=1)
#         self.pool = nn.MaxPool2d(2, 2)
#         self.dropout1 = nn.Dropout(0.25)
#         self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
#         self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
#         self.dropout2 = nn.Dropout(0.5)
#         self.fc1 = nn.Linear(64 * (chip_size // 4) * (chip_size // 4), chip_size * chip_size)
#         self.sigmoid = nn.Sigmoid()
#         self.reshape = lambda x: x.view(-1, chip_size, chip_size)

#     def forward(self, x):
#         x = self.pool(nn.ReLU()(self.conv1(x)))
#         x = self.dropout1(x)
#         x = self.pool(nn.ReLU()(self.conv2(x)))
#         x = self.dropout1(x)
#         x = nn.ReLU()(self.conv3(x))
#         x = x.view(-1, 64 * (CHIP_SIZE // 4) * (CHIP_SIZE // 4))
#         x = self.dropout2(x)
#         x = self.sigmoid(self.fc1(x))
#         x = self.reshape(x)
#         return x

# # Instantiate the model, define the optimizer and loss function
# model = CNNModel(CHIP_SIZE)
# optimizer = optim.Adam(model.parameters(), lr=0.0006)
# criterion = nn.MSELoss()

# # Training loop
# epochs = 3
# for epoch in range(epochs):
#     model.train()
#     running_loss = 0.0
#     for i, (inputs, labels) in enumerate(train_loader):
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         # Add an extra dimension to labels to match the output shape
#         labels = labels.unsqueeze(1)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % 100 == 99:  # Print every 100 batches
#             print(f'Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / 100:.4f}')
#             running_loss = 0.0

#     # Validation loop
#     model.eval()
#     val_loss = 0.0
#     with torch.no_grad():
#         for inputs, labels in val_loader:
#             outputs = model(inputs)
#             # Add an extra dimension to labels to match the output shape
#             labels = labels.unsqueeze(1)
#             loss = criterion(outputs, labels)
#             val_loss += loss.item()
#     print(f'Epoch {epoch + 1}, Validation Loss: {val_loss / len(val_loader):.4f}')

# print('Finished Training')


In [11]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset, random_split
# import numpy as np

# # Collect garbage
# import gc
# gc.collect()

# CHIP_SIZE = 128

# # Assuming hag_images, dem_images, roads_images, and rivers_images are numpy arrays
# # Stack features along the last dimension
# feature_images = np.stack((hag_images, dem_images, roads_images, rivers_images), axis=-1)

# # Normalize labels if they range from 0 to 100
# label_images /= 100

# # Reshape labels for CNN input
# label_images = np.expand_dims(label_images, axis=-1)

# # Convert numpy arrays to PyTorch tensors
# feature_images = torch.tensor(feature_images, dtype=torch.float32)
# label_images = torch.tensor(label_images, dtype=torch.float32)

# # Permute the dimensions of feature_images to (N, C, H, W)
# feature_images = feature_images.permute(0, 3, 1, 2)

# # Remove the extra dimension from label_images
# label_images = label_images.squeeze(-1)

# # Define the dataset
# dataset = TensorDataset(feature_images, label_images)

# # Split the dataset into training and validation sets
# train_size = int(0.7 * len(dataset))
# val_size = len(dataset) - train_size
# train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# # Define the dataloaders
# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# # Define the CNN model
# class CNNModel(nn.Module):
#     def __init__(self, chip_size):
#         super(CNNModel, self).__init__()
#         self.conv1 = nn.Conv2d(4, 16, kernel_size=3, padding=1)
#         self.pool = nn.MaxPool2d(2, 2)
#         self.dropout1 = nn.Dropout(0.25)
#         self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
#         self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
#         self.dropout2 = nn.Dropout(0.5)
#         self.fc1 = nn.Linear(64 * (chip_size // 4) * (chip_size // 4), chip_size * chip_size)
#         self.sigmoid = nn.Sigmoid()
#         self.reshape = lambda x: x.view(-1, chip_size, chip_size)

#     def forward(self, x):
#         x = self.pool(nn.ReLU()(self.conv1(x)))
#         x = self.dropout1(x)
#         x = self.pool(nn.ReLU()(self.conv2(x)))
#         x = self.dropout1(x)
#         x = nn.ReLU()(self.conv3(x))
#         x = x.view(-1, 64 * (CHIP_SIZE // 4) * (CHIP_SIZE // 4))
#         x = self.dropout2(x)
#         x = self.sigmoid(self.fc1(x))
#         x = self.reshape(x)
#         return x

# # Instantiate the model, define the optimizer and loss function
# model = CNNModel(CHIP_SIZE)
# optimizer = optim.Adam(model.parameters(), lr=0.0006)
# criterion = nn.MSELoss()

# # Training loop
# epochs = 1
# for epoch in range(epochs):
#     model.train()
#     running_loss = 0.0
#     for i, (inputs, labels) in enumerate(train_loader):
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % 100 == 99:  # Print every 100 batches
#             print(f'Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / 100:.4f}')
#             running_loss = 0.0

#     # Validation loop
#     model.eval()
#     val_loss = 0.0
#     with torch.no_grad():
#         for inputs, labels in val_loader:
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)
#             val_loss += loss.item()
#     print(f'Epoch {epoch + 1}, Validation Loss: {val_loss / len(val_loader):.4f}')

# print('Finished Training')


In [12]:
# I think this ran correctly 

# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset, random_split
# import numpy as np
# import gc
# # Important
# import os
# from osgeo import gdal


# # Function to clear memory
# def clear_memory():
#     gc.collect()
#     torch.cuda.empty_cache()

# # Set chip size
# CHIP_SIZE = 128

# # # Example placeholder arrays (replace with your actual data)
# # hag_images = np.random.rand(10, CHIP_SIZE, CHIP_SIZE)
# # dem_images = np.random.rand(10, CHIP_SIZE, CHIP_SIZE)
# # roads_images = np.random.rand(10, CHIP_SIZE, CHIP_SIZE)
# # rivers_images = np.random.rand(10, CHIP_SIZE, CHIP_SIZE)
# # label_images = np.random.rand(10, CHIP_SIZE, CHIP_SIZE)

# # Normalize labels if they range from 0 to 100
# label_images /= 100

# # Stack features along the last dimension
# feature_images = np.stack((hag_images, dem_images, roads_images, rivers_images), axis=-1)

# # Reshape labels for CNN input
# label_images = np.expand_dims(label_images, axis=-1)

# # Convert numpy arrays to PyTorch tensors
# feature_images = torch.tensor(feature_images, dtype=torch.float32)
# label_images = torch.tensor(label_images, dtype=torch.float32)

# # Permute the dimensions of feature_images to (N, C, H, W)
# feature_images = feature_images.permute(0, 3, 1, 2)

# # Remove the extra dimension from label_images
# label_images = label_images.squeeze(-1)

# # Define the dataset
# dataset = TensorDataset(feature_images, label_images)

# # Split the dataset into training and validation sets
# train_size = int(0.7 * len(dataset))
# val_size = len(dataset) - train_size
# train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# # Define the dataloaders
# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# # Define the CNN model
# class CNNModel(nn.Module):
#     def __init__(self, chip_size):
#         super(CNNModel, self).__init__()
#         self.conv1 = nn.Conv2d(4, 16, kernel_size=3, padding=1)
#         self.pool = nn.MaxPool2d(2, 2)
#         self.dropout1 = nn.Dropout(0.25)
#         self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
#         self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
#         self.dropout2 = nn.Dropout(0.5)
#         self.fc1 = nn.Linear(64 * (chip_size // 4) * (chip_size // 4), chip_size * chip_size)
#         self.sigmoid = nn.Sigmoid()
#         self.reshape = lambda x: x.view(-1, chip_size, chip_size)

#     def forward(self, x):
#         x = self.pool(nn.ReLU()(self.conv1(x)))
#         x = self.dropout1(x)
#         x = self.pool(nn.ReLU()(self.conv2(x)))
#         x = self.dropout1(x)
#         x = nn.ReLU()(self.conv3(x))
#         x = x.view(-1, 64 * (CHIP_SIZE // 4) * (CHIP_SIZE // 4))
#         x = self.dropout2(x)
#         x = self.sigmoid(self.fc1(x))
#         x = self.reshape(x)
#         return x

# # Instantiate the model, define the optimizer and loss function
# model = CNNModel(CHIP_SIZE)
# optimizer = optim.Adam(model.parameters(), lr=0.0006)
# criterion = nn.MSELoss()

# # Training loop
# epochs = 1
# for epoch in range(epochs):
#     model.train()
#     running_loss = 0.0
#     for i, (inputs, labels) in enumerate(train_loader):
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % 100 == 99:  # Print every 100 batches
#             print(f'Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / 100:.4f}')
#             running_loss = 0.0

#     # Validation loop
#     model.eval()
#     val_loss = 0.0
#     with torch.no_grad():
#         for inputs, labels in val_loader:
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)
#             val_loss += loss.item()
#     print(f'Epoch {epoch + 1}, Validation Loss: {val_loss / len(val_loader):.4f}')

# print('Finished Training')

# # Clear memory after training
# clear_memory()


In [13]:
import gc

# Run a full garbage-collection pass; the value echoed as the cell output is
# gc.collect()'s return value (the number of unreachable objects it found).
gc.collect()

0

In [14]:
# MAKE DATA
import os
import glob
import subprocess
from concurrent.futures import ThreadPoolExecutor
from osgeo import gdal
import rioxarray
import planetary_computer
from pystac_client import Client
import osmnx as ox
import rasterio
from rasterio.features import rasterize
from rasterio.windows import from_bounds, Window
import numpy as np
import scipy.ndimage
from shapely.geometry import box
from geopandas import GeoDataFrame
import matplotlib.pyplot as plt
from rasterio.plot import show
import rasterio as rio
from itertools import product
from rasterio import windows

# Tiles to process in this cell, written as '<col_off>-<row_off>' to match the
# 'pcltile_<col_off>-<row_off>.tif' files. This assignment replaces whatever
# TILENUMBER held before this cell ran.
TILENUMBER = ['70000-40000']
# Edge length in pixels of the square chips cut by get_tiles / process_file.
CHIP_SIZE = 128  

def delete_non_resampled_files(resampled_files, tif_dir):
    """Delete every ``.tif`` in ``tif_dir`` that is not listed in ``resampled_files``.

    Args:
        resampled_files: Collection of bare file names (not paths) to keep.
        tif_dir: Directory whose direct entries are inspected.

    Files that do not end in ``.tif`` are never touched. A failed deletion is
    reported on stdout and does not stop the sweep.
    """
    for entry in os.listdir(tif_dir):
        # Skip the files the caller wants to keep and anything that is not a GeoTIFF.
        if entry in resampled_files or not entry.endswith('.tif'):
            continue

        file_path = os.path.join(tif_dir, entry)
        try:
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        except Exception as e:
            print(f"Failed to delete {file_path}: {e}")

def process_dem(tif_path, tif_dir, tile_number):
    """Build a DEM layer resampled onto the grid of the reference tile.

    Searches the Planetary Computer STAC API for ``cop-dem-glo-30`` items that
    intersect the bounds of ``tif_path``, writes each item's ``data`` asset to
    ``tif_dir``, mosaics the pieces with ``gdal_merge.py`` (nodata -9999) and
    bilinearly resamples the mosaic into a new GeoTIFF that has the size,
    geotransform and projection of ``tif_path``.

    Args:
        tif_path: Path of the reference raster that defines the target grid.
        tif_dir: Working/output directory for this tile.
        tile_number: Tile identifier used in the output file names.

    Returns:
        Path of ``output_resampled_dataDEM_{tile_number}.tif``, or ``None`` when
        no DEM item is found, nothing could be downloaded, or the merge fails.
    """
    # Only the bounds are needed from the reference tile, so release it right away.
    tif_data = rioxarray.open_rasterio(tif_path)
    try:
        bbox_of_interest = tif_data.rio.bounds()
    finally:
        tif_data.close()

    catalog = Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
    search = catalog.search(collections=["cop-dem-glo-30"], bbox=bbox_of_interest)
    items = list(search.get_items())
    if not items:
        print(f"No DEM items found for {tile_number}")
        return None

    def process_item(item, idx):
        """Write one signed DEM asset to ``tif_dir`` and return the written path."""
        signed_asset = planetary_computer.sign(item.assets["data"])
        data = rioxarray.open_rasterio(signed_asset.href).squeeze().drop("band")
        data.rio.write_crs("EPSG:4326", inplace=True)
        output_tif_path = os.path.join(tif_dir, f"output_dataDEM_{idx}.tif")
        data.rio.to_raster(output_tif_path)
        return output_tif_path

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(process_item, item, i) for i, item in enumerate(items)]

    # Future.result() re-raises whatever the worker raised; without this step a
    # failed download would go completely unnoticed.
    dem_parts = []
    for idx, future in enumerate(futures):
        try:
            dem_parts.append(future.result())
        except Exception as e:
            print(f"Failed to download DEM item {idx} for {tile_number}: {e}")

    if not dem_parts:
        print(f"No DEM data could be downloaded for {tile_number}")
        return None

    # Merge exactly the files written above rather than globbing the directory,
    # so stale or partially written pieces never end up in the mosaic.
    output_tif = os.path.join(tif_dir, f"outputtile_DEM_{tile_number}.tif")
    merge_command = [
        "python", "C:\\Users\\smdur\\anaconda3\\envs\\globalpcl\\Scripts\\gdal_merge.py",
        "--config", "CHECK_DISK_FREE_SPACE", "FALSE",
        "-o", output_tif,
        "-n", "-9999", "-a_nodata", "-9999"] + dem_parts

    process_hag = subprocess.run(merge_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if process_hag.returncode != 0:
        print(f"Error in merging DEM: {process_hag.stderr}")
        return None

    # Resample the mosaic onto the reference grid (same size, geotransform, projection).
    src_ds = gdal.Open(output_tif, gdal.GA_ReadOnly)
    target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
    driver = gdal.GetDriverByName('GTiff')
    output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataDEM_{tile_number}.tif")
    out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
    out_ds.SetGeoTransform(target_ds.GetGeoTransform())
    out_ds.SetProjection(target_ds.GetProjection())
    gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
    # Dropping the references closes the GDAL datasets, which flushes the output
    # and lets the intermediate mosaic be deleted below.
    src_ds, target_ds, out_ds = None, None, None

    os.remove(output_tif)
    # The glob also sweeps up pieces left behind by failed downloads.
    for tif in glob.glob(os.path.join(tif_dir, "output_dataDEM_*.tif")):
        try:
            os.remove(tif)
        except Exception as e:
            print(f"Failed to delete {tif}: {e}")

    return output_resampled_path

def process_lidar(tif_path, tif_dir, tile_number, lidar_dir=r"C:\Users\smdur\OneDrive\Desktop\GlobalData\LIDAR2"):
    """Build a LIDAR layer resampled onto the grid of the reference tile.

    Collects every ``*.tif`` in ``lidar_dir`` whose bounding box intersects the
    bounding box of ``tif_path``, mosaics them with ``gdal_merge.py`` (nodata
    255) and bilinearly resamples the mosaic into a new GeoTIFF that has the
    size, geotransform and projection of ``tif_path``.

    Args:
        tif_path: Path of the reference raster that defines the target grid.
        tif_dir: Working/output directory for this tile.
        tile_number: Tile identifier used in the output file names.
        lidar_dir: Directory holding the source LIDAR GeoTIFFs.

    Returns:
        Path of ``output_resampled_dataLIDAR_{tile_number}.tif``, or ``None``
        when no LIDAR file overlaps the tile or the merge fails.
    """
    lidar_tifs = glob.glob(os.path.join(lidar_dir, "*.tif"))

    # Bounding box of the reference tile
    with rasterio.open(tif_path) as src:
        bbox = src.bounds
        input_geom = box(bbox.left, bbox.bottom, bbox.right, bbox.top)

    # Keep only the LIDAR files whose bounding box touches the tile.
    # NOTE(review): the boxes are compared as raw coordinates, which presumes the
    # LIDAR files and the tile share a CRS; confirm.
    overlapping_tifs = []
    for tif in lidar_tifs:
        with rasterio.open(tif) as lidar_src:
            lidar_bbox = lidar_src.bounds
        lidar_geom = box(lidar_bbox.left, lidar_bbox.bottom, lidar_bbox.right, lidar_bbox.top)
        if input_geom.intersects(lidar_geom):
            overlapping_tifs.append(tif)

    if not overlapping_tifs:
        print(f"No overlapping LIDAR tiles found for {tile_number}")
        return None

    output_tif = os.path.join(tif_dir, f"outputtile_lidar_{tile_number}.tif")
    merge_command = [
        "python", "C:\\Users\\smdur\\anaconda3\\envs\\globalpcl\\Scripts\\gdal_merge.py",
        "--config", "CHECK_DISK_FREE_SPACE", "FALSE",
        "-o", output_tif,
        "-n", "255", "-a_nodata", "255"] + overlapping_tifs

    process_hag = subprocess.run(merge_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if process_hag.returncode != 0:
        print(f"Error in merging LIDAR: {process_hag.stderr}")
        return None

    # Resample the mosaic onto the reference grid (same size, geotransform, projection).
    src_ds = gdal.Open(output_tif, gdal.GA_ReadOnly)
    target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
    driver = gdal.GetDriverByName('GTiff')
    output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataLIDAR_{tile_number}.tif")
    out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
    out_ds.SetGeoTransform(target_ds.GetGeoTransform())
    out_ds.SetProjection(target_ds.GetProjection())
    gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
    # Dropping the references closes the GDAL datasets, which flushes the output
    # and lets the intermediate mosaic be deleted below.
    src_ds, target_ds, out_ds = None, None, None

    os.remove(output_tif)
    return output_resampled_path

def process_rivers(tif_path, tif_dir, tile_number, decay_rate=0.07):
    """Build a river-proximity layer on the grid of the reference tile.

    Downloads OSM ways matching ``["waterway"~"river"]`` inside the bounds of
    ``tif_path``, burns them into a raster on the tile's grid, and stores
    ``exp(-decay_rate * d)`` where ``d`` is the Euclidean distance, in pixels,
    to the nearest burned cell. The result is then passed through the same GDAL
    bilinear resampling step used for the other layers.

    Args:
        tif_path: Path of the reference raster that defines the target grid.
        tif_dir: Working/output directory for this tile.
        tile_number: Tile identifier used in the output file names.
        decay_rate: Exponential decay per pixel of distance.

    Returns:
        Path of ``output_resampled_dataRivers_{tile_number}.tif``.
    """
    # Only the bounds are needed here, so release the file right away.
    extent_data = rioxarray.open_rasterio(tif_path)
    try:
        bbox = extent_data.rio.bounds()
    finally:
        extent_data.close()

    custom_filter = '["waterway"~"river"]'
    # bbox is (left, bottom, right, top); the indices below reorder it to
    # north, south, east, west. NOTE(review): that positional order matches
    # osmnx 1.x; confirm against the installed version.
    graph = ox.graph_from_bbox(bbox[3], bbox[1], bbox[2], bbox[0], custom_filter=custom_filter, simplify=True, retain_all=True, truncate_by_edge=True)
    gdf = ox.graph_to_gdfs(graph, nodes=False)

    with rasterio.open(tif_path) as src:
        # Use the dataset's own shape and transform. Deriving them from a
        # float-valued from_bounds() window and truncating with int() can lose a
        # row or column to floating-point error.
        height, width, transform = src.height, src.width, src.transform
        # NOTE(review): the temporary raster inherits the reference tile's nodata
        # value through src.meta; confirm it cannot collide with decay values.
        clipped_meta = src.meta.copy()

    raster = np.zeros((height, width), dtype=np.uint8)
    shapes = ((geom, 1) for geom in gdf['geometry'])
    burned = rasterize(shapes, out=raster, fill=0, transform=transform, all_touched=True)
    # Distance (in pixels) from every cell to the nearest burned river cell.
    distance_grid = scipy.ndimage.distance_transform_edt(burned == 0)
    decay_grid = np.exp(-decay_rate * distance_grid)

    clipped_meta.update({"driver": "GTiff", "height": height, "width": width, "transform": transform, "dtype": rasterio.float32, "count": 1, "compress": 'lzw'})
    output_path = os.path.join(tif_dir, f'exponential_decay_CO_river_{tile_number}.tif')
    with rasterio.open(output_path, 'w', **clipped_meta) as dst:
        dst.write(decay_grid.astype(np.float32), 1)

    # Resample onto the reference grid (same size, geotransform, projection).
    src_ds = gdal.Open(output_path, gdal.GA_ReadOnly)
    target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
    driver = gdal.GetDriverByName('GTiff')
    output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataRivers_{tile_number}.tif")
    out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
    out_ds.SetGeoTransform(target_ds.GetGeoTransform())
    out_ds.SetProjection(target_ds.GetProjection())
    gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
    # Dropping the references closes the GDAL datasets, which flushes the output
    # and lets the temporary raster be deleted below.
    src_ds, target_ds, out_ds = None, None, None
    os.remove(output_path)

    return output_resampled_path

def process_roads(tif_path, tif_dir, tile_number, decay_rate=0.07):
    """Build a road-proximity layer on the grid of the reference tile.

    Downloads the OSM ``drive`` network inside the bounds of ``tif_path``,
    burns its edges into a raster on the tile's grid, and stores
    ``exp(-decay_rate * d)`` where ``d`` is the Euclidean distance, in pixels,
    to the nearest burned cell. The result is then passed through the same GDAL
    bilinear resampling step used for the other layers.

    Args:
        tif_path: Path of the reference raster that defines the target grid.
        tif_dir: Working/output directory for this tile.
        tile_number: Tile identifier used in the output file names.
        decay_rate: Exponential decay per pixel of distance.

    Returns:
        Path of ``output_resampled_dataRoads_{tile_number}.tif``.
    """
    # Only the bounds are needed here, so release the file right away.
    extent_data = rioxarray.open_rasterio(tif_path)
    try:
        bbox = extent_data.rio.bounds()
    finally:
        extent_data.close()

    # bbox is (left, bottom, right, top); the indices below reorder it to
    # north, south, east, west. NOTE(review): that positional order matches
    # osmnx 1.x; confirm against the installed version.
    graph = ox.graph_from_bbox(bbox[3], bbox[1], bbox[2], bbox[0], network_type='drive', simplify=True)
    gdf = ox.graph_to_gdfs(graph, nodes=False)

    with rasterio.open(tif_path) as src:
        # Use the dataset's own shape and transform. Deriving them from a
        # float-valued from_bounds() window and truncating with int() can lose a
        # row or column to floating-point error.
        height, width, transform = src.height, src.width, src.transform
        # NOTE(review): the temporary raster inherits the reference tile's nodata
        # value through src.meta; confirm it cannot collide with decay values.
        clipped_meta = src.meta.copy()

    raster = np.zeros((height, width), dtype=np.uint8)
    shapes = ((geom, 1) for geom in gdf['geometry'])
    burned = rasterize(shapes, out=raster, fill=0, transform=transform, all_touched=True)
    # Distance (in pixels) from every cell to the nearest burned road cell.
    distance_grid = scipy.ndimage.distance_transform_edt(burned == 0)
    decay_grid = np.exp(-decay_rate * distance_grid)

    clipped_meta.update({"driver": "GTiff", "height": height, "width": width, "transform": transform, "dtype": rasterio.float32, "count": 1, "compress": 'lzw'})
    output_path = os.path.join(tif_dir, f'exponential_decay_CO_roads_{tile_number}.tif')
    with rasterio.open(output_path, 'w', **clipped_meta) as dst:
        dst.write(decay_grid.astype(np.float32), 1)

    # Resample onto the reference grid (same size, geotransform, projection).
    src_ds = gdal.Open(output_path, gdal.GA_ReadOnly)
    target_ds = gdal.Open(tif_path, gdal.GA_ReadOnly)
    driver = gdal.GetDriverByName('GTiff')
    output_resampled_path = os.path.join(tif_dir, f"output_resampled_dataRoads_{tile_number}.tif")
    out_ds = driver.Create(output_resampled_path, target_ds.RasterXSize, target_ds.RasterYSize, 1, src_ds.GetRasterBand(1).DataType)
    out_ds.SetGeoTransform(target_ds.GetGeoTransform())
    out_ds.SetProjection(target_ds.GetProjection())
    gdal.ReprojectImage(src_ds, out_ds, src_ds.GetProjection(), target_ds.GetProjection(), gdal.GRA_Bilinear)
    # Dropping the references closes the GDAL datasets, which flushes the output
    # and lets the temporary raster be deleted below.
    src_ds, target_ds, out_ds = None, None, None
    os.remove(output_path)

    return output_resampled_path

# OFFSET_X = 20  # Horizontal offset
# OFFSET_Y = 20  # Vertical offset

def get_tiles(ds, width=CHIP_SIZE, height=CHIP_SIZE, offset_x=128, offset_y=128):
    """Yield ``(window, transform)`` pairs that sweep ``ds`` in a regular grid.

    Args:
        ds: Open rasterio dataset; its ``meta`` width/height and ``transform`` are used.
        width: Requested window width in pixels.
        height: Requested window height in pixels.
        offset_x: Horizontal step, in pixels, between consecutive window origins.
        offset_y: Vertical step, in pixels, between consecutive window origins.

    Yields:
        Tuples of the window (clipped to the dataset extent, so windows on the
        right/bottom edge may be smaller than ``width`` x ``height``) and the
        affine transform of that window.

    A step smaller than the window size yields overlapping windows.
    """
    ncols, nrows = ds.meta['width'], ds.meta['height']
    big_window = windows.Window(col_off=0, row_off=0, width=ncols, height=nrows)

    for col_off, row_off in product(range(0, ncols, offset_x), range(0, nrows, offset_y)):
        window = windows.Window(col_off=col_off, row_off=row_off, width=width, height=height).intersection(big_window)
        transform = windows.transform(window, ds.transform)
        yield window, transform

def process_file(label, input_filepath, output_folder, filename_template='tile_{}-{}.tif'):
    """Cut one layer raster into full-size chips and write them to ``output_folder``.

    Args:
        label: Layer name, used only in the completion message.
        input_filepath: Raster to cut into chips.
        output_folder: Existing directory that receives the chip files.
        filename_template: ``str.format`` template filled with the chip's integer
            column and row offsets. It is a parameter so the function does not
            depend on a module-level ``output_filename`` being defined.

    Only windows of exactly ``CHIP_SIZE`` x ``CHIP_SIZE`` are written; clipped
    edge windows are skipped. A chip is also skipped when every value in it
    equals the dataset's nodata value.
    """
    with rio.open(input_filepath) as inds:
        nodata = inds.nodata  # Get the NoData value from the dataset
        meta = inds.meta.copy()

        for window, transform in get_tiles(inds):
            # Partial windows on the right/bottom edge are dropped.
            if window.width != CHIP_SIZE or window.height != CHIP_SIZE:
                continue

            data = inds.read(window=window)
            if nodata is not None:
                # Note: with a NaN nodata value this comparison is True
                # everywhere, so every chip counts as valid.
                valid_data_mask = (data != nodata)
            else:
                # No nodata declared: everything except NaN counts as valid.
                valid_data_mask = (data == data)

            if not valid_data_mask.any():
                continue

            # Per-chip profile, built without mutating the shared dataset meta.
            chip_meta = dict(meta, transform=transform, width=window.width, height=window.height)
            outpath = os.path.join(output_folder, filename_template.format(int(window.col_off), int(window.row_off)))
            with rio.open(outpath, 'w', **chip_meta) as outds:
                outds.write(data)
    print(f"Processing for {label} completed.")

if __name__ == "__main__":
    # Base folder that holds one chip folder per layer; identical for every tile.
    out_base_path = f"{THEFOLDER}\\INFERENCETILES"
    os.makedirs(out_base_path, exist_ok=True)
    output_filename = 'tile_{}-{}.tif'

    for tile_number in TILENUMBER:
        tif_path = f"{THEFOLDER}\\PCLTILES\\pcltile_{tile_number}.tif"
        tif_dir = f"{THEFOLDER}\\INFERENCETILES\\{tile_number}"
        os.makedirs(tif_dir, exist_ok=True)

        # Build the four layers in the order DEM, LIDAR, rivers, roads.
        layer_paths = {
            'dem': process_dem(tif_path, tif_dir, tile_number),
            'lidar': process_lidar(tif_path, tif_dir, tile_number),
            'rivers': process_rivers(tif_path, tif_dir, tile_number),
            'roads': process_roads(tif_path, tif_dir, tile_number),
        }

        # A layer builder returns None when it fails. Report that and skip the
        # tile instead of crashing in os.path.basename(None) below.
        failed_layers = [name for name, path in layer_paths.items() if path is None]
        if failed_layers:
            print(f"Skipping tile {tile_number}: could not build layer(s) {', '.join(failed_layers)}")
            continue

        resampled_files = list(layer_paths.values())
        delete_non_resampled_files([os.path.basename(f) for f in resampled_files], tif_dir)

        # Define input files as a dictionary
        input_files = {
            'lidar': f'output_resampled_dataLIDAR_{tile_number}.tif',
            'dem': f'output_resampled_dataDEM_{tile_number}.tif',
            'roads': f'output_resampled_dataRoads_{tile_number}.tif',
            'rivers': f'output_resampled_dataRivers_{tile_number}.tif'
        }

        # Cut each layer into chips.
        # NOTE: chips from every tile_number land in the same per-layer folder
        # and are named only by their chip offsets, so processing a second tile
        # overwrites chips written for the first.
        for label, filename in input_files.items():
            input_filepath = os.path.join(tif_dir, filename)
            output_folder = os.path.join(out_base_path, label)
            os.makedirs(output_folder, exist_ok=True)
            process_file(label, input_filepath, output_folder)

        print(f"Processing for tile {tile_number} completed.")

    print("All processing completed.")


Processing for lidar completed.
Processing for dem completed.
Processing for roads completed.
Processing for rivers completed.
Processing for tile 70000-40000 completed.
All processing completed.


In [15]:
import os
import re

# Chip file names look like 'tile_<col_off>-<row_off>.tif'; capture the
# '<col_off>-<row_off>' identifier. fullmatch() keeps sidecar files such as
# 'tile_0-0.tif.aux.xml' from being counted.
pattern = re.compile(r'tile_(\d+-\d+)\.tif')

# The prediction cell opens the chip with a given identifier in each of these
# layer folders, so only identifiers present in all four are usable. The folders
# are spelled out explicitly rather than taken from a loop variable left behind
# by an earlier cell.
layer_names = ("lidar", "dem", "roads", "rivers")

identifiers = None
for layer_name in layer_names:
    layer_folder = os.path.join(THEFOLDER, "INFERENCETILES", layer_name)
    layer_identifiers = set()
    for file in os.listdir(layer_folder):
        match = pattern.fullmatch(file)
        if match:
            layer_identifiers.add(match.group(1))
    # Running intersection across the layers processed so far.
    identifiers = layer_identifiers if identifiers is None else identifiers & layer_identifiers

# Sorted list of identifiers consumed by the prediction loop
identifier_list = sorted(identifiers)
print(len(identifier_list))
print("done")


1521
done


In [16]:
# hag_max = 30.0
# dem_max = 4379.1279296875
# roads_max = 1.0
# rivers_max = 1.0

In [17]:
#predict

import os
import rasterio
import numpy as np
import tensorflow as tf

# tilename = '0-0'
# input_hag_path = f"C:\\Users\\smdur\\OneDrive\\Desktop\\GLOBALPCL\\CNNPCLDEMO\\inferencetiles\\hag\\tile_{tilename}.tif"
# input_dem_path = f"C:\\Users\\smdur\\OneDrive\\Desktop\\GLOBALPCL\\CNNPCLDEMO\\inferencetiles\\dem\\tile_{tilename}.tif"
# input_roads_path = f"C:\\Users\\smdur\\OneDrive\\Desktop\\GLOBALPCL\\CNNPCLDEMO\\inferencetiles\\roads\\tile_{tilename}.tif"
# input_rivers_path = f"C:\\Users\\smdur\\OneDrive\\Desktop\\GLOBALPCL\\CNNPCLDEMO\\inferencetiles\\rivers\\tile_{tilename}.tif"

def load_and_preprocess_image(hag_path, dem_path, roads_path, rivers_path):
    """Read the four layer chips and assemble one normalized model input.

    Band 1 of each raster is read, cast to float32 and divided by the matching
    module-level maximum (``hag_max``, ``dem_max``, ``roads_max``,
    ``rivers_max``). The four bands are stacked channel-last in the order
    hag, dem, roads, rivers and a leading batch axis is added.

    Returns:
        Array of shape ``(1, rows, cols, 4)``.
    """
    def _first_band(path):
        # Read band 1 and close the file again.
        with rasterio.open(path) as src:
            return src.read(1)

    hag_raw = _first_band(hag_path)
    dem_raw = _first_band(dem_path)
    roads_raw = _first_band(roads_path)
    rivers_raw = _first_band(rivers_path)

    # Scale every channel by its maximum.
    channels = [
        np.array(hag_raw).astype('float32') / hag_max,
        np.array(dem_raw).astype('float32') / dem_max,
        np.array(roads_raw).astype('float32') / roads_max,
        np.array(rivers_raw).astype('float32') / rivers_max,
    ]

    # Channel-last stack, then prepend the batch dimension.
    stacked = np.stack(channels, axis=-1)
    return np.expand_dims(stacked, axis=0)



# The output folder is the same for every chip, so create it once up front.
predictions_folder = os.path.join(THEFOLDER, "predictions")
os.makedirs(predictions_folder, exist_ok=True)

for tilename in identifier_list:
    # The four input chips that share this identifier, one per layer folder.
    input_hag_path = os.path.join(out_base_path, "lidar", f"tile_{tilename}.tif")
    input_dem_path = os.path.join(out_base_path, "dem", f"tile_{tilename}.tif")
    input_roads_path = os.path.join(out_base_path, "roads", f"tile_{tilename}.tif")
    input_rivers_path = os.path.join(out_base_path, "rivers", f"tile_{tilename}.tif")

    input_image = load_and_preprocess_image(input_hag_path, input_dem_path, input_roads_path, input_rivers_path)
    predicted_image = np.squeeze(model.predict(input_image))

    # Adjust the scaling factor based on how the labels were scaled during training
    predicted_image = predicted_image * 100

    # Casting a float outside 0..255 to uint8 wraps around (or is undefined for
    # negatives), so clamp to the representable range before the cast below.
    predicted_image = np.clip(predicted_image, 0, 255)

    output_image_path = os.path.join(predictions_folder, f"predicted_tile_{tilename}.tif")

    # Georeferencing is copied from the DEM chip. Note that the output file
    # inherits the whole DEM profile, including its dtype.
    with rasterio.open(input_dem_path) as src:
        profile = src.profile

    with rasterio.open(output_image_path, 'w', **profile) as dst:
        dst.write(predicted_image.astype(rasterio.uint8), 1)

print("Done")
# #Function to load and preprocess image pytorch
# def load_and_preprocess_image(hag_path, dem_path, roads_path, rivers_path):
#     with rasterio.open(hag_path) as src:
#         hag_image = src.read(1)
#     with rasterio.open(dem_path) as src:
#         dem_image = src.read(1)
#     with rasterio.open(roads_path) as src:
#         roads_image = src.read(1)
#     with rasterio.open(rivers_path) as src:
#         rivers_image = src.read(1)

#     # Normalize and stack the images
#     hag_image = np.array(hag_image).astype('float32') / hag_max
#     dem_image = np.array(dem_image).astype('float32') / dem_max
#     roads_image = np.array(roads_image).astype('float32') / roads_max
#     rivers_image = np.array(rivers_image).astype('float32') / rivers_max

#     # Stack images along the last dimension
#     combined_image = np.stack([hag_image, dem_image, roads_image, rivers_image], axis=-1)

#     # Convert to PyTorch tensor and add batch dimension
#     combined_image = torch.tensor(combined_image, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)
#     return combined_image

# for i in range(len(identifier_list)):
#     tilename = identifier_list[i]

#     input_hag_path = os.path.join(out_base_path, "lidar", f"tile_{tilename}.tif")
#     input_dem_path = os.path.join(out_base_path, "dem", f"tile_{tilename}.tif")
#     input_roads_path = os.path.join(out_base_path, "roads", f"tile_{tilename}.tif")
#     input_rivers_path = os.path.join(out_base_path, "rivers", f"tile_{tilename}.tif")

#     input_image = load_and_preprocess_image(input_hag_path, input_dem_path, input_roads_path, input_rivers_path)

#     # Set the model to evaluation mode
#     model.eval()
    
#     # Get the prediction from the model
#     with torch.no_grad():
#         predicted_image = model(input_image)
    
#     predicted_image = predicted_image.squeeze().cpu().numpy()
    
#     # Adjust the scaling factor based on how the labels were scaled during training
#     predicted_image *= 100
    
#     predictions_folder = os.path.join(THEFOLDER, "predictions")
#     os.makedirs(predictions_folder, exist_ok=True)
#     output_image_path = os.path.join(predictions_folder, f"predicted_tile_{tilename}.tif")

#     with rasterio.open(input_dem_path) as src: 
#         profile = src.profile
    
#     with rasterio.open(output_image_path, 'w', **profile) as dst:
#         dst.write(predicted_image.astype(rasterio.uint8), 1)

# print("Done")

Done


In [18]:
import os
import glob
import subprocess
TILENUMBER = ['70000-40000']

# Define the base folder and output paths
predictions_folder = os.path.join(THEFOLDER, "predictions")
output_dir = os.path.join(THEFOLDER, "mergedoutput")

# Create necessary directories if they don't exist
os.makedirs(predictions_folder, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

output_base_name = "predMerged_"  # Base name for output files
gdal_merge_script = "C:\\Users\\smdur\\anaconda3\\envs\\globalpcl\\Scripts\\gdal_merge.py"

# Every predicted chip, in a deterministic order
tifs = sorted(glob.glob(os.path.join(predictions_folder, "*.tif")))

# Chips are merged in batches of this many files (presumably to keep each
# gdal_merge command line within the OS length limit; confirm).
chunk_size = 300

# Chunk mosaics that were actually written during THIS run. The final merge uses
# this list instead of globbing output_dir, so chunk files left over from an
# earlier run are never mixed in.
merged_chunk_tifs = []

for chunk_id, start_idx in enumerate(range(0, len(tifs), chunk_size), start=1):
    chunk_tifs = tifs[start_idx:start_idx + chunk_size]
    output_tif = os.path.join(output_dir, f"{output_base_name}{chunk_id}.tif")

    merge_command_hag = [
        "python",
        gdal_merge_script,
        "--config", "CHECK_DISK_FREE_SPACE", "FALSE",
        "-o", output_tif,
        "-n", "-9999",
        "-a_nodata", "-9999",
    ] + chunk_tifs

    # Run the gdal_merge command for the current chunk
    process_hag = subprocess.run(merge_command_hag, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if process_hag.returncode != 0:
        print(f"Error occurred while merging TIFF files for chunk {chunk_id}:")
        print(process_hag.stderr)
    else:
        merged_chunk_tifs.append(output_tif)
        print(f"TIFF files merged successfully for chunk {chunk_id}. Output: {output_tif}")

# Merge all chunks into a final output file. TILENUMBER is a list, so join its
# entries; formatting the list directly would put "['70000-40000']" in the name.
tile_label = "_".join(TILENUMBER)
final_output_tif = os.path.join(THEFOLDER, "FINALOUTPUTTILES", f"predMerged_PCL_{tile_label}.tif")
os.makedirs(os.path.dirname(final_output_tif), exist_ok=True)

if not merged_chunk_tifs:
    # gdal_merge needs at least one input; there is nothing to mosaic.
    print("No chunk mosaics were produced; skipping the final merge.")
else:
    merge_command_final = [
        "python",
        gdal_merge_script,
        "--config", "CHECK_DISK_FREE_SPACE", "FALSE",
        "-o", final_output_tif,
        "-n", "-9999",
        "-a_nodata", "-9999",
    ] + merged_chunk_tifs

    # Run the gdal_merge command for the final merge
    process_final = subprocess.run(merge_command_final, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if process_final.returncode != 0:
        print("Error occurred while merging final TIFF files:")
        print(process_final.stderr)
    else:
        print("Final TIFF files merged successfully.")

# # Clean up temporary chunk files
# for tif in chunk_tifs:
#     try:
#         os.remove(tif)
#         print(f"Deleted {tif}")
#     except Exception as e:
#         print(f"Failed to delete {tif}: {e}")

# print("Done")

# import shutil

# if os.path.exists(predictions_folder):
#     shutil.rmtree(predictions_folder)
# os.makedirs(predictions_folder, exist_ok=True)


TIFF files merged successfully for chunk 1. Output: C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\mergedoutput\predMerged_1.tif
TIFF files merged successfully for chunk 2. Output: C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\mergedoutput\predMerged_2.tif
TIFF files merged successfully for chunk 3. Output: C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\mergedoutput\predMerged_3.tif
TIFF files merged successfully for chunk 4. Output: C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\mergedoutput\predMerged_4.tif
TIFF files merged successfully for chunk 5. Output: C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\mergedoutput\predMerged_5.tif
TIFF files merged successfully for chunk 6. Output: C:\Users\smdur\OneDrive\Desktop\GlobalPCL23\mergedoutput\predMerged_6.tif
Final TIFF files merged successfully.
