In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import numpy as np
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from shapely.geometry import shape
from shapely.validation import explain_validity

In [2]:
# Locate the project root: the closest directory, starting from the notebook's
# working directory and walking upwards, that contains 'constants.py'.
current_dir = os.path.abspath('')

project_root = current_dir
while not os.path.isfile(os.path.join(project_root, 'constants.py')):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # At the filesystem root dirname() returns its argument unchanged, so
        # without this guard the search would loop forever.
        raise FileNotFoundError(
            f"Could not find 'constants.py' in {current_dir!r} or any of its parent directories"
        )
    project_root = parent_dir

# Make `constants` importable. The membership test keeps sys.path free of
# duplicate entries when this cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from constants import LUP_LABELED, DATA_PATH

In [4]:
# Read the vector layer located at LUP_LABELED (a path imported from constants.py)
# into `lup_labeled`, the table every later cell works on.
lup_labeled = gpd.read_file(LUP_LABELED)



In [5]:
# Display the layer's coordinate reference system; the recorded output reports
# EPSG:32721 (WGS 84 / UTM zone 21S), whose axes are in metres.
lup_labeled.crs

<Projected CRS: EPSG:32721>
Name: WGS 84 / UTM zone 21S
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 60°W and 54°W, southern hemisphere between 80°S and equator, onshore and offshore. Argentina. Bolivia. Brazil. Falkland Islands (Malvinas). Paraguay. Uruguay.
- bounds: (-60.0, -80.0, -54.0, 0.0)
Coordinate Operation:
- name: UTM zone 21S
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [None]:
#hansen = rasterio.open(HANSEN_LOSSYEAR_FILEPATH)

In [None]:
#hansen.meta

In [None]:
# Disabled cell: the reprojection setup below is wrapped in a string literal, so
# none of it executes. It refers to `hansen`, whose `rasterio.open(...)` call is
# likewise commented out in an earlier cell.
'''dst_crs = 'EPSG:32721'
# Calculate the transformation needed for the reprojection
transform, width, height = calculate_default_transform(
    hansen.crs, dst_crs, hansen.width, hansen.height, *hansen.bounds)
kwargs = hansen.meta.copy()
kwargs.update({
    'crs': dst_crs,
    'transform': transform,
    'width': width,
    'height': height
})
'''

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# executes. It would reproject band 1 of `hansen` into 'hansen_reprojected.tiff'
# using `kwargs`, `transform` and `dst_crs` from the (also disabled) setup cell.
'''output_path = os.path.join(DATA_PATH, 'processing','hansen_output', 'hansen_reprojected.tiff')
# Write the new file with the updated metadata
with rasterio.open(output_path, 'w', **kwargs) as dst:
    # Assuming you want to reproject and write the first band
    reproject(
        source=rasterio.band(hansen, 1),
        destination=rasterio.band(dst, 1),
        src_transform=hansen.transform,
        src_crs=hansen.crs,
        dst_transform=transform,
        dst_crs=dst_crs,
        resampling=Resampling.nearest
    )'''
        

In [16]:
# Display (rows, columns) of the layer as it is at the time this cell runs.
# NOTE(review): the recorded output (70842 rows) predates the null-geometry
# filter applied in a later cell — confirm which state is meant to be shown.
lup_labeled.shape

(70842, 5)

In [10]:
# Count the rows whose geometry is flagged as empty by `is_empty`.
lup_labeled.geometry.is_empty.sum()

0

In [22]:
# Shape of the subset of rows that have no geometry at all.
# Author's note: these rows are a by-product of QGIS processing, where sliver
# polygons were removed with negative and positive buffers.
missing_geometry = lup_labeled['geometry'].isnull()
lup_labeled[missing_geometry].shape

(9972, 5)

In [21]:
# Shape of the subset of rows that do carry a geometry.
present_geometry = lup_labeled['geometry'].notnull()
lup_labeled[present_geometry].shape

(60870, 5)

In [6]:
# Drop the rows without a geometry, then expose the surviving geometries as an
# array for the validity check and raster masking that follow.
has_geometry = lup_labeled['geometry'].notnull()
lup_labeled = lup_labeled[has_geometry]
geometries = lup_labeled['geometry'].values

In [7]:
# Print the reason for every invalid geometry; nothing is printed when all
# geometries are valid.
# The entries of `geometries` come straight from the GeoDataFrame's geometry
# column, so they are used as-is: passing them through `shape()` only rebuilt an
# equivalent geometry (twice for each invalid one) without changing the result.
for geom in geometries:
    if not geom.is_valid:
        print(explain_validity(geom))


In [9]:
# Mask the reprojected Hansen raster with the labelled polygons and save the
# result as 'hansen_masked.tiff'.

# `HANSEN_REPROJECTED` is not among the names this notebook imports from
# `constants`, so it is imported here to keep the cell runnable on a fresh kernel.
# NOTE(review): assumes constants.py defines HANSEN_REPROJECTED — confirm.
from constants import HANSEN_REPROJECTED

output_path = os.path.join(DATA_PATH, 'processing','hansen_output', 'hansen_masked.tiff')
# Create the output folder if needed so opening the file for writing cannot fail
# on a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Value written into every masked pixel and declared as the output's nodata value.
nodata_value = 255

# Polygons used for masking.
geometries = lup_labeled['geometry'].values

# Read and mask the source raster; it is closed again before the output is written.
with rasterio.open(HANSEN_REPROJECTED) as src:
    meta = src.meta.copy()

    # crop=False keeps the full raster extent. invert=True sets the pixels that
    # fall INSIDE the polygons to `nodata_value` and leaves the outside untouched.
    # NOTE(review): confirm that masking the polygon interiors (rather than
    # everything outside them) is the intended result.
    out_image, out_transform = mask(src, geometries, crop=False, nodata=nodata_value, invert=True)

# Describe the output: GeoTIFF with the masked array's dimensions, transform and nodata.
meta.update({"driver": "GTiff",
             "height": out_image.shape[1],
             "width": out_image.shape[2],
             "transform": out_transform,
             "nodata": nodata_value})

# Write the masked raster to a new TIFF file.
with rasterio.open(output_path, 'w', **meta) as dst:
    dst.write(out_image)

In [None]:
# Initialize an empty array to store the final accumulated deforestation data
# NOTE(review): `destination_array` is not defined in any cell visible in this
# notebook — confirm where it is supposed to come from.
accumulated_deforestation = np.zeros_like(destination_array, dtype=np.uint8)

# Loop through each polygon
for index, polygon in lup_labeled.iterrows():
    # Determine the ten-year interval for the polygon: [anho_capa, anho_capa + 10)
    start_year = polygon['anho_capa']
    end_year = start_year + 10

    # Mask the 'lossyear' raster with the polygon
    # NOTE(review): the only visible `hansen = rasterio.open(...)` is commented
    # out, so `hansen` is undefined on a fresh kernel.
    masked_lossyear, out_transform = mask(hansen, [polygon['geometry']], crop=True, all_touched=True)

    # Select pixels within the ten-year interval
    # NOTE(review): another cell matches 'anho_capa' against `2000 + <two-digit
    # year>`, which suggests it holds calendar years; verify that the raster
    # values use the same encoding before comparing them directly.
    interval_mask = (masked_lossyear >= start_year) & (masked_lossyear < end_year)

    # Update the accumulated deforestation array with the selected pixels
    # NOTE(review): with crop=True `masked_lossyear` (and so `interval_mask`)
    # covers only the polygon's window, while `accumulated_deforestation` has the
    # shape of `destination_array`; this boolean indexing presumably fails unless
    # the two shapes coincide — verify.
    accumulated_deforestation[interval_mask] = masked_lossyear[interval_mask]

# 'accumulated_deforestation' now contains the deforestation data for the ten-year intervals

In [None]:
# Collect, for every polygon, the deforestation pixels of the ten-year raster
# whose starting year equals the polygon's 'anho_capa'.
# NOTE(review): `deforestation_files` and `HANSEN_TEN_INTERVALS_DIR` are not
# defined in any cell visible in this notebook — confirm where they come from.

# Maps a polygon's index label to its masked pixel array.
accumulated_pixels = {}

# Loop through each file
for deforestation_file in deforestation_files:
    # Construct the full path to the deforestation file
    deforestation_path = os.path.join(HANSEN_TEN_INTERVALS_DIR, deforestation_file)

    # The file name carries the two-digit starting year after the first '_'
    # and before the following '-'; turn it into a four-digit year.
    start_year = 2000 + int(deforestation_file.split('_')[1].split('-')[0])
    print(start_year)

    # Polygons that belong to the current interval; skip the file if there are none.
    polygons_for_interval = lup_labeled[lup_labeled['anho_capa'] == start_year]
    if polygons_for_interval.empty:
        continue

    # mask() needs an open raster dataset, so open the interval's file (the
    # original cell built `deforestation_path` but never opened it).
    with rasterio.open(deforestation_path) as src:
        for index, polygon in polygons_for_interval.iterrows():
            # Mask the raster with the polygon, cropped to the polygon's extent
            out_image, _ = mask(src, [polygon['geometry']], crop=True, all_touched=True)

            if index not in accumulated_pixels:
                # First time this polygon is seen: start from a copy of its pixels
                accumulated_pixels[index] = out_image.copy()
            else:
                # Polygon seen before (several files with the same starting year):
                # add the new pixels to what was already accumulated
                accumulated_pixels[index] += out_image

# 'accumulated_pixels' now contains the accumulated deforestation pixels for each polygon

# Training and Validation sets

In [None]:
# Function to select properties for validation set based on distance constraint
# NOTE(review): a later cell defines another `select_validation_set` that
# measures centroid distances; after "Run All" that later definition replaces
# this one.

def select_validation_set(gdf, distance_threshold=5000, validation_fraction=0.1, random_state=None):
    """Randomly pick a validation subset whose members lie far apart.

    Properties are drawn one at a time. After each draw, every remaining
    property whose boundary lies within ``distance_threshold`` of the drawn
    property's boundary (the drawn property included) is discarded from the
    pool, so no two selected properties are closer than the threshold.

    Parameters
    ----------
    gdf : GeoDataFrame
        Candidate properties.
    distance_threshold : float, default 5000
        Minimum boundary-to-boundary distance between selected properties, in
        the units of the layer's CRS.
    validation_fraction : float, default 0.1
        Target size of the validation set as a fraction of ``len(gdf)``.
        Fewer rows are returned when the pool runs out first.
    random_state : int, optional
        Seed for reproducible sampling. ``None`` keeps pandas' default
        (unseeded) behaviour.

    Returns
    -------
    GeoDataFrame
        The selected rows, carrying their ORIGINAL index labels so that
        ``gdf.index.isin(result.index)`` identifies them. Empty when nothing
        could be selected.
    """
    # One generator for the whole run: re-seeding on every draw would make each
    # draw pick the same position of the pool.
    rng = np.random.RandomState(random_state) if random_state is not None else None

    target_size = validation_fraction * len(gdf)
    validation_set = []
    remaining_set = gdf.copy()

    # Stop when the target is reached or no candidate is left.
    while len(validation_set) < target_size and len(remaining_set) > 0:
        # Randomly select a property
        selected_property = remaining_set.sample(1, random_state=rng)
        validation_set.append(selected_property)

        # Distance from the selected boundary to every remaining boundary
        distances = remaining_set.boundary.distance(selected_property.boundary.squeeze())

        # Keep only properties beyond the threshold; the selected one has
        # distance 0 to itself and is therefore removed as well.
        remaining_set = remaining_set[distances > distance_threshold]

    if not validation_set:
        # pd.concat([]) raises, so return an empty frame with gdf's columns.
        return gdf.iloc[0:0].copy()

    # Keep the original index labels: resetting them would make callers that
    # split on `index.isin(...)` exclude the wrong rows from the training set.
    return gpd.GeoDataFrame(pd.concat(validation_set))

# Draw the validation sample.
validation_gdf = select_validation_set(dissolved_clean_years)

# Every row whose index label does not occur in the validation sample becomes
# training data.
# NOTE(review): this is only correct if `validation_gdf` keeps the index labels
# of `dissolved_clean_years` — verify.
in_validation = dissolved_clean_years.index.isin(validation_gdf.index)
train_gdf = dissolved_clean_years[~in_validation]

print(f"Training set size: {len(train_gdf)}")
print(f"Validation set size: {len(validation_gdf)}")

# Write both splits as GeoPackage files, training set first.
for split_gdf, split_path in ((train_gdf, "training_set.gpkg"), (validation_gdf, "validation_set.gpkg")):
    split_gdf.to_file(split_path, driver="GPKG")

In [None]:
# Function to select properties for validation set based on distance constraint
# NOTE(review): an earlier cell defines a function with the same name that
# measures boundary distances; this definition replaces it once the cell runs.
def select_validation_set(gdf, distance_threshold=5000, validation_fraction=0.1, random_state=None):
    """Randomly pick a validation subset whose members lie far apart.

    Properties are drawn one at a time. After each draw, every remaining
    property whose centroid lies within ``distance_threshold`` of the drawn
    property's centroid (the drawn property included) is discarded from the
    pool, so no two selected centroids are closer than the threshold.

    Parameters
    ----------
    gdf : GeoDataFrame
        Candidate properties.
    distance_threshold : float, default 5000
        Minimum centroid-to-centroid distance between selected properties, in
        the units of the layer's CRS.
    validation_fraction : float, default 0.1
        Target size of the validation set as a fraction of ``len(gdf)``.
        Fewer rows are returned when the pool runs out first.
    random_state : int, optional
        Seed for reproducible sampling. ``None`` keeps pandas' default
        (unseeded) behaviour.

    Returns
    -------
    GeoDataFrame
        The selected rows, carrying their ORIGINAL index labels so that
        ``gdf.index.isin(result.index)`` identifies them. Empty when nothing
        could be selected.
    """
    # One generator for the whole run: re-seeding on every draw would make each
    # draw pick the same position of the pool.
    rng = np.random.RandomState(random_state) if random_state is not None else None

    target_size = validation_fraction * len(gdf)
    validation_set = []
    remaining_set = gdf.copy()

    # Stop when the target is reached or no candidate is left.
    while len(validation_set) < target_size and len(remaining_set) > 0:
        # Randomly select a property
        selected_property = remaining_set.sample(1, random_state=rng)
        validation_set.append(selected_property)

        # Distance from the selected centroid to every remaining centroid
        distances = remaining_set.centroid.distance(selected_property.centroid.squeeze())

        # Keep only properties beyond the threshold; the selected one has
        # distance 0 to itself and is therefore removed as well.
        remaining_set = remaining_set[distances > distance_threshold]

    if not validation_set:
        # pd.concat([]) raises, so return an empty frame with gdf's columns.
        return gdf.iloc[0:0].copy()

    # Keep the original index labels: resetting them would make callers that
    # split on `index.isin(...)` exclude the wrong rows from the training set.
    return gpd.GeoDataFrame(pd.concat(validation_set))

# Draw the validation sample.
# NOTE(review): `gdf` is not defined in any cell visible in this notebook —
# confirm which layer is meant.
validation_gdf = select_validation_set(gdf)

# Every row whose index label does not occur in the validation sample becomes
# training data.
# NOTE(review): this is only correct if `validation_gdf` keeps the index labels
# of `gdf` — verify.
in_validation = gdf.index.isin(validation_gdf.index)
train_gdf = gdf[~in_validation]

print(f"Training set size: {len(train_gdf)}")
print(f"Validation set size: {len(validation_gdf)}")

# Write both splits as GeoPackage files, training set first.
for split_gdf, split_path in ((train_gdf, "training_set.gpkg"), (validation_gdf, "validation_set.gpkg")):
    split_gdf.to_file(split_path, driver="GPKG")
