In [1]:
import os
import shutil
import geopandas as gpd
import rasterio
from rasterio.features import geometry_mask
from shapely.geometry import box

In [2]:
# Root folders used throughout the notebook: `source_path` is where the raw
# imagery is read from, `target_path` is where the reorganised copy is written.
source_path, target_path = "./perfect_data", "./dataset"

In [3]:
# Reorganise <source_path>/<road>/<date>/*.tif into
# <target_path>/<date>/<road>_<date>.tif so rasters are grouped by date.
for road_folder in sorted(os.listdir(source_path)):
    road_folder_path = os.path.join(source_path, road_folder)
    if not os.path.isdir(road_folder_path):
        continue

    for date in sorted(os.listdir(road_folder_path)):
        date_path = os.path.join(road_folder_path, date)
        # Check the source entry first: stray files next to the date folders
        # must not spawn bogus "date" directories in the dataset.
        if not os.path.isdir(date_path):
            continue

        tif_files = sorted(f for f in os.listdir(date_path) if f.endswith(".tif"))
        if not tif_files:
            continue

        target_date_path = os.path.join(target_path, date)
        os.makedirs(target_date_path, exist_ok=True)

        for index, file in enumerate(tif_files):
            # The first raster keeps the canonical <road>_<date>.tif name.
            # Any further rasters in the same folder get a numeric suffix
            # (_2, _3, ...) instead of silently overwriting one another.
            suffix = "" if index == 0 else "_" + str(index + 1)
            file_path = os.path.join(date_path, file)
            new_name = os.path.join(target_date_path, road_folder + "_" + date + suffix + ".tif")
            shutil.copy(file_path, new_name)

In [9]:
# Load the training shapefile once; `shapes` is filtered per date further down.
shapefile_path = "./train/pac_2024_training.shp"
shapes = gpd.read_file(filename=shapefile_path)

In [10]:
# For every date folder in the dataset, keep only the training shapes that
# intersect the bounding box of at least one raster of that date, and write
# them to ./shapefiles/<date>.shp.
shapefile_dir = "./shapefiles"
os.makedirs(shapefile_dir, exist_ok=True)  # to_file() does not create parent folders

for date in sorted(os.listdir(target_path)):
    date_path = os.path.join(target_path, date)
    # Skip stray files in the dataset root; os.listdir() below would raise on them.
    if not os.path.isdir(date_path):
        continue

    raster_files = [os.path.join(date_path, f) for f in os.listdir(date_path) if f.endswith('.tif')]
    if not raster_files:
        # No rasters for this date, so no shape can match; nothing to write.
        continue

    raster_bounds = []
    for raster_file in raster_files:
        with rasterio.open(raster_file) as src:
            bounds = src.bounds
            raster_bounds.append(box(bounds.left, bounds.bottom, bounds.right, bounds.top))

    # NOTE(review): the boxes are in each raster's own CRS and are compared
    # against `shapes` as-is -- this assumes both share a CRS; confirm.
    # One vectorised intersects() per raster, OR-ed together, replaces the
    # per-shape Python lambda while selecting exactly the same rows.
    intersects_any = None
    for footprint in raster_bounds:
        hits = shapes.geometry.intersects(footprint)
        intersects_any = hits if intersects_any is None else (intersects_any | hits)

    filtered_shapes = shapes[intersects_any]
    filtered_shapefile_path = os.path.join(shapefile_dir, date + ".shp")
    filtered_shapes.to_file(filtered_shapefile_path)

In [5]:
# Delete the ".tif.aux.xml" sidecar files from the dataset.
# The rasters are stored in <target_path>/<date>/ subfolders, so the whole
# tree is walked; listing only the top level would never find a sidecar.
for dir_path, _dir_names, file_names in os.walk(target_path):
    for file in file_names:
        if file.endswith(".tif.aux.xml"):
            os.remove(os.path.join(dir_path, file))

In [6]:
# Collect the pixel width and height of every .tif found at
# <source_path>/<road>/<date>/ and show the smallest values of each.
smallest_width = []
smallest_height = []

for road_name in os.listdir(source_path):
    road_dir = os.path.join(source_path, road_name)
    if not os.path.isdir(road_dir):
        continue
    for date_name in os.listdir(road_dir):
        date_dir = os.path.join(road_dir, date_name)
        if not os.path.isdir(date_dir):
            continue
        for entry in os.listdir(date_dir):
            if not entry.endswith(".tif"):
                continue
            with rasterio.open(os.path.join(date_dir, entry)) as raster:
                smallest_width.append(raster.width)
                smallest_height.append(raster.height)

sorted_width = sorted(smallest_width)
sorted_height = sorted(smallest_height)

# Widths and heights are sorted independently, so the two printed lists are
# not paired per raster: first the 20 smallest widths, then the 20 smallest heights.
print(sorted_width[:20])
print(sorted_height[:20])


[262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 282, 282, 282, 282, 282, 289, 320, 320, 320, 320]
[263, 263, 263, 263, 263, 263, 263, 263, 263, 263, 284, 284, 284, 284, 284, 289, 320, 320, 320, 320]
