In [1]:
import os
import rasterio
from rasterio.mask import mask
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import box

In [2]:
# Calculate unique values and counts for the masked data
def calculate_unique_values_and_counts(masked_data):
    unique_values, counts = np.unique(masked_data, return_counts=True)
    return unique_values, counts

raster_dir = r'Z:\z_resources\esa-people\burn_severity'
shapefile_path = r"C:\Users\admin\Downloads\forest_condition_specific_eu_bioregions\forest_condition_bioregions_eu_EPSG4326.shp"
output_path = r"C:\Users\admin\Downloads\unique_values"

# Iterate over each raster in the directory
for raster_filename in os.listdir(raster_dir):
    raster_path = os.path.join(raster_dir, raster_filename)

    # Check if the file is a raster (you may need to adjust the condition)
    if raster_filename.endswith('.tif'):
        # Read the raster file
        with rasterio.open(raster_path) as raster_file:
            # Dataframe for the data
            final_result_df = pd.DataFrame()

            # Read the shapefile
            gdf = gpd.read_file(shapefile_path)
            print("Shapefile opened")

            # Dictionary to accumulate counts for each unique value across tiles and polygons
            cumulative_counts = {}

            for idx, row in gdf.iterrows():
                print("Processing {} polygon {} of {}".format(row["FIRST_code"], idx + 1, len(gdf)))

                geo_row = gpd.GeoSeries(row['geometry']) # This is the polygon geometry.
                out_image, out_transform = rasterio.mask.mask(raster_file, geo_row, crop=True) 

                # Access the value from the "FIRST_code" column
                first_code_value = row['FIRST_code']

                # Obtain the number of tiles in both directions.
                height = out_image.shape[1]
                width  = out_image.shape[2]

                #check the size of the raster image
                if out_image.nbytes > (3* 10**9):
                    print("the polygon {} exceeds 3Gb of memory, we will split the array in tiles of 1000. Current size is GB: {} ".format(row["FIRST_code"], (out_image.nbytes) / np.power(10.0,9)))

                    unique_values = {}
                    tilesize = 1000
                    for i in range(0, width, tilesize): #tilesize marks from where to where in width
                        for j in range(0, height, tilesize):
                             # Read the tile data
                            tile_width = min(tilesize, width - i)
                            tile_height = min(tilesize, height - j)
                            data = out_image[:, j:j+tile_height, i:i+tile_width]

                            # get the unique values and the count
                            unique_values, counts = calculate_unique_values_and_counts(data)

                            for unique_value, count in zip(unique_values, counts):
                                cumulative_counts[unique_value] = cumulative_counts.get(unique_value, 0) + count
                else:
                    
                    unique_values, counts = calculate_unique_values_and_counts(out_image)
                    for unique_value, count in zip(unique_values, counts):
                        cumulative_counts[unique_value] = cumulative_counts.get(unique_value, 0) + count

                # Convert the cumulative_counts dictionary to a DataFrame
                result_df = pd.DataFrame.from_dict(cumulative_counts, orient='index', columns=[first_code_value])

                # Concatenate the current result with the final result DataFrame, ensuring indices match
                final_result_df = pd.concat([final_result_df, result_df], axis=1, join='outer', sort=True)

            # Join output_path and the raster file name (without extension) as the CSV output file name
            csv_output_filename = os.path.join(output_path, f'{os.path.splitext(raster_filename)[0]}_final_result.csv')

            # Save the final result to a CSV file
            final_result_df.to_csv(csv_output_filename, index=True)

Shapefile opened
Processing polygon 1 of 13
Processing polygon 2 of 13
Processing polygon 3 of 13
Processing polygon 4 of 13
Processing polygon 5 of 13
Processing polygon 6 of 13
Processing polygon 7 of 13
Processing polygon 8 of 13
Processing polygon 9 of 13
Processing polygon 10 of 13
the polygon Outside exceeds 3Gb of memory, we will split the array in tiles of 1000. Current size is GB: 6.31570952 
Processing polygon 11 of 13
Processing polygon 12 of 13
Processing polygon 13 of 13
Shapefile opened
Processing polygon 1 of 13
Processing polygon 2 of 13
Processing polygon 3 of 13
Processing polygon 4 of 13
Processing polygon 5 of 13
Processing polygon 6 of 13
Processing polygon 7 of 13
Processing polygon 8 of 13
Processing polygon 9 of 13
Processing polygon 10 of 13
the polygon Outside exceeds 3Gb of memory, we will split the array in tiles of 1000. Current size is GB: 6.31570952 
Processing polygon 11 of 13
Processing polygon 12 of 13
Processing polygon 13 of 13
Shapefile opened
Proce