In [33]:
import geopandas as gpd
import pandas as pd
import sys
import os
import rasterio
from rasterio.mask import mask
import numpy as np


In [34]:
# Locate the project root: the nearest directory, starting at the current
# working directory and walking up, that contains 'constants.py'.
current_dir = os.path.abspath('')

project_root = current_dir
while not os.path.isfile(os.path.join(project_root, 'constants.py')):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # os.path.dirname() of a filesystem root returns the root itself, so
        # without this check the search would loop forever when the marker
        # file does not exist anywhere up the hierarchy.
        raise FileNotFoundError(
            f"Could not find 'constants.py' in {current_dir!r} or any of its parent directories."
        )
    project_root = parent_dir

# Make the project root importable. Skip the append when it is already listed
# so re-running this cell does not pile up duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [35]:
from constants import DATA_PATH

In [36]:
# Path to the input GeoPackage that is loaded into `gdf` below.
# NOTE(review): absolute, machine-specific path; presumably it lives under
# os.path.join(DATA_PATH, 'processing') — confirm and build it from DATA_PATH.
dissolved_putid = r"C:\Users\bsf31\Documents\post-meds\data\policy-data\processing\clean_dissolved_clean_putid.gpkg"

In [37]:
# Load the GeoPackage referenced by `dissolved_putid` into a GeoDataFrame
gdf = gpd.read_file(dissolved_putid)

In [38]:
# Shrink every geometry by 60 CRS units and grow it back by the same amount,
# using join_style=2 for both passes. Features narrower than the shrink
# distance do not survive the first pass.
gdf.geometry = gdf.buffer(-60, join_style=2).buffer(60, join_style=2)


In [39]:
# Display the layer's CRS. The buffer distances above and the distance
# threshold used for the validation split are expressed in this CRS's units.
gdf.crs

<Projected CRS: EPSG:32721>
Name: WGS 84 / UTM zone 21S
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 60°W and 54°W, southern hemisphere between 80°S and equator, onshore and offshore. Argentina. Bolivia. Brazil. Falkland Islands (Malvinas). Paraguay. Uruguay.
- bounds: (-60.0, -80.0, -54.0, 0.0)
Coordinate Operation:
- name: UTM zone 21S
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [40]:
# Optional export of the buffered layer for a visual check in QGIS.
# The code is disabled by wrapping it in a string literal, which is why the
# cell echoes the text as its output instead of writing a file.
# NOTE(review): the target file name equals the name of the input layer loaded
# above; if DATA_PATH/processing is the same folder, enabling this would
# overwrite the source file — confirm before re-enabling.

'''
output_path = os.path.join(DATA_PATH,'processing')


# Create the directory if it doesn't exist
if not os.path.exists(output_path):
    os.makedirs(output_path)
    # Save the GeoDataFrame as a GeoPackage
# Define the filename for the GeoPackage

filename = os.path.join(output_path, "clean_dissolved_clean_putid.gpkg")
gdf.to_file(filename, driver="GPKG")'''

'\noutput_path = os.path.join(DATA_PATH,\'processing\')\n\n\n# Create the directory if it doesn\'t exist\nif not os.path.exists(output_path):\n    os.makedirs(output_path)\n    # Save the GeoDataFrame as a GeoPackage\n# Define the filename for the GeoPackage\n\nfilename = os.path.join(output_path, "clean_dissolved_clean_putid.gpkg")\ngdf.to_file(filename, driver="GPKG")'

In [41]:
# Raster passed to select_validation_set() to count the pixels each property covers.
# NOTE(review): absolute, machine-specific path — consider deriving it from DATA_PATH.
raster = r"C:\Users\bsf31\Documents\post-meds\data\policy-data\ml_data\rasters\binary_deforestation_raster.tif"

In [42]:
def select_validation_set(gdf, raster_path, distance_threshold=5000, validation_fraction=0.1,
                          random_state=None):
    """
    Randomly pick spatially separated properties until they cover roughly a
    target share of the raster's valid pixels.

    Properties are drawn one at a time without replacement. A draw is accepted
    only if it covers at least one valid pixel and adding it keeps the running
    pixel total at or below the target. After each accepted draw, every
    remaining property that is not farther than ``distance_threshold`` from it
    is removed from the pool. Rejected draws are discarded without affecting
    their neighbours. Progress is reported with ``print``.

    :param gdf: GeoDataFrame of properties; must be in EPSG:32721. Index labels
        are assumed to be unique.
    :param raster_path: Path to the raster whose valid (non-nodata) pixels of
        band 1 are counted.
    :param distance_threshold: Minimum spacing between accepted properties and
        the properties left in the pool, in the units of ``gdf``'s CRS.
    :param validation_fraction: Share of the raster's valid pixels to aim for.
    :param random_state: Optional integer seed (or NumPy random generator) that
        makes the draws reproducible. ``None`` keeps pandas' default behaviour.
    :return: GeoDataFrame holding the accepted rows of ``gdf`` in the order they
        were accepted, keeping their original index labels.
    :raises ValueError: If ``gdf`` is not in EPSG:32721.
    """
    # Ensure the vector data is in its native CRS, which should be EPSG:32721
    if gdf.crs != 'EPSG:32721':
        raise ValueError("The GeoDataFrame should be in EPSG:32721.")

    # Turn an integer seed into ONE generator shared by all draws; passing the
    # same integer to every sample() call would restart the sequence each time.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    remaining_set = gdf.copy()
    selected_labels = []
    accumulated_pixel_count = 0

    # Open the raster once and reuse the handle for every property
    with rasterio.open(raster_path) as src:
        raster_crs = src.crs  # expected to be EPSG:4326 per the original author's note

        # Count valid pixels through the dataset mask so that an undefined or
        # NaN nodata value is handled correctly.
        total_pixels = int(np.count_nonzero(src.read_masks(1)))
        print(f"Total non-NA pixels in raster: {total_pixels}")

        # Calculate the target number of pixels for the validation set
        target_pixel_count = validation_fraction * total_pixels
        print(f"Target pixel count for validation set: {target_pixel_count}")

        while accumulated_pixel_count < target_pixel_count and not remaining_set.empty:
            # Randomly draw one property and take it out of the pool
            selected_property = remaining_set.sample(1, random_state=rng)
            selected_label = selected_property.index[0]
            remaining_set = remaining_set.drop(selected_property.index)
            print(f"Selected property ID: {selected_label}")

            # rasterio expects the shapes in the raster's CRS
            selected_property_reprojected = selected_property.to_crs(raster_crs)

            try:
                # filled=False returns a masked array whose mask combines the
                # pixels outside the shape with the raster's own nodata pixels.
                out_image, _ = mask(src, selected_property_reprojected.geometry,
                                    crop=True, filled=False, indexes=1)
            except ValueError as error:
                # Raised e.g. when the shape does not overlap the raster
                print(f"Skipping property {selected_label}: {error}")
                continue

            selected_pixels = int(np.count_nonzero(~np.ma.getmaskarray(out_image)))
            print(f"Pixels covered by selected property: {selected_pixels}")

            # Reject draws that contribute nothing or would overshoot the target
            if selected_pixels == 0:
                continue
            if accumulated_pixel_count + selected_pixels > target_pixel_count:
                continue

            selected_labels.append(selected_label)
            accumulated_pixel_count += selected_pixels
            print(f"Accumulated pixel count: {accumulated_pixel_count}")

            if not remaining_set.empty:
                # Distances are computed in gdf's CRS (not the raster's)
                distances = remaining_set.distance(selected_property.geometry.iloc[0])
                print(f"Minimum distance to next property: {distances.min()}")

                # Remove properties within the distance threshold from the pool
                remaining_set = remaining_set.loc[distances > distance_threshold]

    if remaining_set.empty:
        print("No more properties available for selection.")

    # Build the result in one step instead of concatenating inside the loop
    validation_set = gdf.loc[selected_labels]
    print(f"Total properties selected: {len(validation_set)}")
    return validation_set

In [43]:
# Run the pixel-based selection with the function's default spacing and target fraction.
# No random seed is passed, so each run can select different properties.
validation_set = select_validation_set(gdf, raster)


Total non-NA pixels in raster: 84378079
Target pixel count for validation set: 8437807.9
Selected property ID: 1474
Pixels covered by selected property: 3922
Accumulated pixel count: 3922
Minimum distance to next property: 13.239111885520593
Selected property ID: 768
Pixels covered by selected property: 63120
Accumulated pixel count: 67042
Minimum distance to next property: 0.0
Selected property ID: 220
Pixels covered by selected property: 42505
Accumulated pixel count: 109547
Minimum distance to next property: 0.0
Selected property ID: 743
Pixels covered by selected property: 10381
Accumulated pixel count: 119928
Minimum distance to next property: 0.0
Selected property ID: 500
Pixels covered by selected property: 66829
Accumulated pixel count: 186757
Minimum distance to next property: 1243.7201760102437
Selected property ID: 1455
Pixels covered by selected property: 56824
Accumulated pixel count: 243581
Minimum distance to next property: 0.0
Selected property ID: 1041
Pixels covered b

In [44]:

# Save validation set to a GeoPackage file.
# NOTE(review): the relative path resolves against the kernel's working
# directory, while the "Process Rasters" section reads
# ...\ml_data\validation_set.gpkg — confirm both refer to the same file.
validation_set.to_file("validation_set.gpkg", driver="GPKG")

In [45]:
# Figures copied by hand from the printed output of the selection run
total_non_na_pixels = 84_378_079      # "Total non-NA pixels in raster"
accumulated_pixel_count = 8_434_934   # final "Accumulated pixel count"

# Share of the raster's valid pixels held by the validation set, in percent
covered_percentage = (accumulated_pixel_count / total_non_na_pixels) * 100

# Shortfall with respect to the 10% goal, in percentage points
missing_percentage = 10 - covered_percentage

print(
    f"Covered Percentage: {covered_percentage}%",
    f"Missing Percentage to reach 10%: {missing_percentage}%",
    sep="\n",
)

Covered Percentage: 9.996594020586793%
Missing Percentage to reach 10%: 0.003405979413207305%


# Process Rasters

In [4]:
# Reload the validation set from disk; this rebinds `validation_set`, replacing
# any in-memory result of select_validation_set().
# NOTE(review): absolute, machine-specific path — consider deriving it from DATA_PATH.
validation_set = gpd.read_file(r"C:\Users\bsf31\Documents\post-meds\data\policy-data\ml_data\validation_set.gpkg")

In [46]:
def _write_masked_raster(src, shapes, output_path, **mask_kwargs):
    """Mask the open dataset ``src`` with ``shapes`` and write the result as a GeoTIFF."""
    masked_data, masked_transform = mask(src, shapes, **mask_kwargs)
    masked_meta = src.meta.copy()
    masked_meta.update({"driver": "GTiff",
                        "height": masked_data.shape[1],
                        "width": masked_data.shape[2],
                        "transform": masked_transform})
    with rasterio.open(output_path, 'w', **masked_meta) as dest:
        dest.write(masked_data)


def process_rasters(raster_folder, validation_set, output_folder1, output_folder2):
    """
    Split every '.tif' raster of a folder into a validation and a training raster.

    For each raster two files are written:
    - ``val_<name>`` in ``output_folder2``: the raster cropped to the validation
      geometries, with pixels outside them masked out.
    - ``train_<name>`` in ``output_folder1``: the raster at its full extent with
      the pixels inside the validation geometries masked out.

    :param raster_folder: Folder scanned (non-recursively) for files ending in '.tif'.
    :param validation_set: GeoDataFrame with the validation geometries; it must
        have a CRS so it can be reprojected.
    :param output_folder1: Destination folder for the training rasters.
    :param output_folder2: Destination folder for the validation rasters.
    """
    # Ensure the output folders exist
    os.makedirs(output_folder1, exist_ok=True)
    os.makedirs(output_folder2, exist_ok=True)

    # Reprojected geometries, cached per raster CRS so the work is done once
    shapes_by_crs = {}

    # Iterate over each raster file in the folder
    for raster_file in os.listdir(raster_folder):
        if not raster_file.endswith('.tif'):  # Check for TIFF files
            continue

        raster_path = os.path.join(raster_folder, raster_file)

        with rasterio.open(raster_path) as src:
            # The shapes must be in the raster's own CRS for the mask to line up.
            # Rasters without CRS information fall back to EPSG:4326.
            target_crs = src.crs if src.crs else 'EPSG:4326'
            crs_key = str(target_crs)
            if crs_key not in shapes_by_crs:
                shapes_by_crs[crs_key] = validation_set.to_crs(target_crs).geometry
            shapes = shapes_by_crs[crs_key]

            # Validation raster: keep only the validation areas, cropped to their extent
            masked_output_path = os.path.join(output_folder2, f'val_{raster_file}')
            _write_masked_raster(src, shapes, masked_output_path, crop=True)

            # Training raster: blank out the validation areas, keep the full extent
            clipped_output_path = os.path.join(output_folder1, f'train_{raster_file}')
            _write_masked_raster(src, shapes, clipped_output_path, invert=True)

                

                 

In [47]:
# Arguments, in order: input raster folder, validation geometries, output folder
# for the 'train_*' rasters (output_folder1), output folder for the 'val_*'
# rasters (output_folder2).
# NOTE(review): absolute, machine-specific paths — consider deriving them from DATA_PATH.
process_rasters(r"C:\Users\bsf31\Documents\post-meds\data\policy-data\ml_data\rasters", validation_set, r"C:\Users\bsf31\Documents\post-meds\data\policy-data\ml_data\training", r"C:\Users\bsf31\Documents\post-meds\data\policy-data\ml_data\validation")


In [None]:
# Disabled earlier draft of select_validation_set(), kept as a string literal so
# it never runs. Unlike the version defined above, it takes no raster, sizes the
# validation set by number of properties (validation_fraction * len(gdf)) and
# returns a (validation_set, remaining_set) tuple.
'''def select_validation_set(gdf, distance_threshold=5000, validation_fraction=0.1):
    # Initialize an empty GeoDataFrame for the validation set
    validation_set = gpd.GeoDataFrame(columns=gdf.columns)
    # Make a copy of the original GeoDataFrame to work as the remaining set
    remaining_set = gdf.copy()
    
    # Calculate the target size of the validation set based on the specified fraction
    target_size = validation_fraction * len(gdf)

    while len(validation_set) < target_size:
        # Randomly select a property from the remaining set
        selected_property = remaining_set.sample(1)

        # Append the selected property to the validation set
        validation_set = pd.concat([validation_set, selected_property], ignore_index=True)
        
        # Calculate the distance from the selected property to all properties in the remaining set
        distances = remaining_set.distance(selected_property.geometry.squeeze())
        
        # Remove properties within the distance threshold from the remaining set
        remaining_set = remaining_set.loc[distances > distance_threshold]

        # If the remaining set is empty, break the loop to prevent infinite iterations
        if remaining_set.empty:
            break

    return validation_set, remaining_set'''


In [None]:
#validation_set, remaining_set = select_validation_set(gdf)


In [None]:
# Save training set to a GeoPackage file
#remaining_set.to_file("training_set.gpkg", driver="GPKG")

# Save validation set to a GeoPackage file
#validation_set.to_file("validation_set.gpkg", driver="GPKG")

In [None]:
# Get validation set. The raster is a required argument: it is used to count
# the pixels each property covers. This draws a fresh random selection, which
# can differ from any validation set saved earlier in the notebook.
validation_gdf = select_validation_set(gdf, raster)

# Get training set by excluding validation set. Rows are matched on geometry
# (WKB bytes) rather than on index labels, so the result does not depend on
# whether validation_gdf kept the original index labels of gdf.
validation_wkb = set(validation_gdf.geometry.to_wkb())
train_gdf = gdf[~gdf.geometry.to_wkb().isin(validation_wkb)]

# Save training set to a GeoPackage file
train_gdf.to_file("training_setb.gpkg", driver="GPKG")

# Save validation set to a GeoPackage file
validation_gdf.to_file("validation_setb.gpkg", driver="GPKG")

In [None]:
# Report how many rows ended up in each split
print(f"Training set size: {len(train_gdf)}")
print(f"Validation set size: {len(validation_gdf)}")

# Chips

In [None]:

def create_chips(raster_path, chip_size, output_dir):
    """
    Create chips from a raster.

    The raster is tiled from its top-left corner into non-overlapping square
    chips. Only complete chips are written: trailing columns and rows that do
    not fill a whole chip are skipped. Each chip is saved as
    'chip_<column>_<row>.tif' with the source raster's band count, CRS and
    nodata value, and a transform matching its position.

    :param raster_path: Path to the input raster file.
    :param chip_size: Size of the square chip (number of pixels).
    :param output_dir: Directory where the chips will be saved.
    :raises ValueError: If chip_size is not positive.
    """
    if chip_size <= 0:
        raise ValueError("chip_size must be a positive number of pixels.")

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    with rasterio.open(raster_path) as src:
        width, height = src.width, src.height
        transform = src.transform

        # Calculate the number of complete chips in x and y directions
        x_chips = width // chip_size
        y_chips = height // chip_size

        for x in range(x_chips):
            for y in range(y_chips):
                # Window arguments are (column offset, row offset, width, height)
                window = rasterio.windows.Window(x * chip_size, y * chip_size, chip_size, chip_size)
                chip = src.read(window=window)

                # Update the transformation for the chip
                new_transform = rasterio.windows.transform(window, transform)

                # Define the output path for the chip
                chip_path = os.path.join(output_dir, f'chip_{x}_{y}.tif')

                # Save the chip, carrying over the nodata value so masked
                # pixels stay recognisable in the chip.
                with rasterio.open(
                    chip_path,
                    'w',
                    driver='GTiff',
                    height=chip_size,
                    width=chip_size,
                    count=src.count,
                    dtype=chip.dtype,
                    crs=src.crs,
                    transform=new_transform,
                    nodata=src.nodata,
                ) as chip_file:
                    chip_file.write(chip)



In [None]:
# Example usage
raster_file = r"C:\Users\bsf31\Documents\post-meds\data\policy-data\ml_data\binary_deforestation_raster.tif"
chip_size = 256  # Define the desired chip size (256x256 pixels)
output_directory = r"C:\Users\bsf31\Documents\post-meds\data\policy-data\ml_data\chips"
create_chips(raster_file, chip_size, output_directory)
