In [1]:
from osgeo import gdal
import rasterio
from rasterio import features
from rasterio.transform import from_origin
from osgeo import gdal, ogr, osr
import os
from difflib import SequenceMatcher
import numpy as np
import random
import itertools
import pandas as pd
import geopandas as gpd
import re
import csv

In [None]:
def create_folder_if_not_exists(folder_path):
    """
    Create a folder if it doesn't exist.

    Parameters:
    folder_path (str): The path of the folder to be created.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder created at: {folder_path}")
    else:
        print(f"Folder already exists at: {folder_path}")

def get_vector_file_list(path):
    """
    Get a list of the vector files inside the folder
    Parameters:
    - path (str): path of the folder with the resources.

    Returns:
    - File_list (list). list of the resources.
    """
    File_list = [] #f for f in os.listdir(path) if os.isfile(mypath,f)
    for file in os.listdir(path):
        # "anat" is just to get here necessary ones
        if file.endswith(".shp"):
            if file not in File_list:
                File_list.append(os.path.join(path,file))
        else:
            pass
    return File_list


def rasterize_geodataframe_by_column(gdf, value_to_index, resolution, nodata_value, data_type, output_path):
    """
    Rasterizes a geodataframe based on the column field.

    Parameters:
    - gdf (GeoDataFrame): GeoDataFrame to be rasterized.
    - column_name (str): String of the column name.
    - resolution (int): resolution of the raster.
    - output_path (str): output of the raster file.

    Returns:
    - None. Rasterizes the geodataframe.
    """
       
    # Get the bounds of the GeoDataFrame
    xmin, ymin, xmax, ymax = gdf.total_bounds
    # Calculate the number of pixels in x and y directions
    cols = int((xmax - xmin) / resolution)
    rows = int((ymax - ymin) / resolution)
    # Create a transform for the raster
    transform = from_origin(xmin, ymax, resolution, resolution)

    # Create an empty array to hold the rasterized values
    # rasterized_array = np.zeros((rows, cols), dtype=data_type) # if bigger, change the dtype. This is crucial. # np.uint8
    rasterized_array = np.full((rows, cols), nodata_value, dtype=data_type)

    total_values = len(value_to_index)

    # Rasterize each unique value separately
    for idx, (text_value, value) in enumerate(value_to_index.items()):
        print(f"Processing {idx + 1} out of {total_values}")
        mask = gdf['raster_val'] == value
        shapes = gdf.loc[mask, 'geometry']
        if shapes.empty:
            # The value_to_index contains all the possible parameters, but there are some that don't exists in a certain file
            continue

        temp_raster = features.rasterize(
            shapes=shapes,
            out_shape=(rows, cols),
            transform=transform,
            all_touched=True, # Esto asegura que si toca la linea del poligono, se genera el pixel
            default_value=value,
            dtype=data_type, # must be equal to the zeros # np.uint8
        )
        rasterized_array = np.maximum(rasterized_array, temp_raster)

    crs = gdf.crs

    # Define the metadata for the raster
    profile = {
        'driver': 'GTiff',
        'height': rows,
        'width': cols,
        'count': 1,
        'dtype': data_type,
        'crs': crs, #CRS.from_epsg(32628),
        'transform': transform,
        'nodata': nodata_value,  # Set the nodata value in the profile metadata
        'compress': 'deflate',  # Compression method
        'tiled': True,  # Enable tiling
        'legend': {str(key): value for key, value in value_to_index.items()}
    }

    # Write the raster array to a GeoTIFF file
    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(rasterized_array, 1)

        # Set nodata values in the raster
        # rasterized_array[rasterized_array == 0] = nodata_value
        # dst.write(rasterized_array, 1)
    

def gdal_rasterize_from_shapefile(shapefile_path, resolution, nodata_value, data_type, output_path, cols=None, rows=None):
    """
    Rasterizes a GeoDataFrame using GDAL directly.

    Parameters:
    - shapefile_path (string): path of the vector file.
    - resolution (int or float): Resolution of the raster (pixel size).
    - nodata_value: The value to use for no-data pixels.
    - data_type: Data type for the output raster (e.g., gdal.GDT_Float32).
    - output_path (str): Path to save the output raster.
    - cols (int, optional): Number of columns in the output raster.
    - rows (int, optional): Number of rows in the output raster.

    Returns:
    - None. The function writes the raster to the specified output path.
    """

    # Open the Shapefile using OGR
    shapefile = ogr.Open(shapefile_path)
    layer = shapefile.GetLayer()

    # Get the bounds of the Shapefile (same as GeoDataFrame's total_bounds)
    xmin, xmax, ymin, ymax = layer.GetExtent()

    # If cols and rows are not provided, calculate them based on resolution
    if cols is None or rows is None:
        cols = int((xmax - xmin) / resolution)
        rows = int((ymax - ymin) / resolution)

    # Create a new raster dataset
    raster_ds = gdal.GetDriverByName('GTiff').Create(
        output_path, cols, rows, 1, data_type,
        options=['COMPRESS=DEFLATE', 'TILED=YES']
    )

    # Set the geotransform (affine transform for the raster)
    geotransform = (xmin, resolution, 0, ymax, 0, -resolution)
    raster_ds.SetGeoTransform(geotransform)

    # Set the CRS (coordinate reference system) from the Shapefile
    srs = layer.GetSpatialRef()
    if srs:
        raster_ds.SetProjection(srs.ExportToWkt())

    # Create the raster band and set no-data value
    band = raster_ds.GetRasterBand(1)
    band.SetNoDataValue(nodata_value)

    # Rasterize the shapefile
    gdal.RasterizeLayer(
        raster_ds,  # Output raster dataset
        [1],        # Raster band to write to
        layer,      # Input OGR layer to rasterize
        options=['ATTRIBUTE=raster_val', 'ALL_TOUCHED=TRUE']
    )

    # Flush and close the raster dataset
    band.FlushCache()
    raster_ds = None  # Close the file and save

    # Close the shapefile
    shapefile = None

    print(f"Rasterization complete: {output_path}")
    
def update_names_based_on_similarity(unique_names, gdf, column_name, similarity_threshold=0):
    """
    Update names in gdf based on similarity to names in unique list.

    Parameters:
    - unique_names (list): list of the unique names.
    - gdf (GeoDataFrame): GeoDataFrame whose names need to be updated.
    - column_name (str): String of the column.
    - similarity_threshold (float): Threshold for similarity ratio.

    Returns:
    - gdf. Updates gdf in place.
    """
    # Add a new column 'valid_text' with None values
    gdf['valid_text'] = None

    total_elements = len(gdf)  # Get total number of elements

    # Iterate through rows of gdf2
    for index, row in gdf.iterrows():

        # Get the value of the column for the current row
        name_gdf = row[column_name]
        highest_similarity_ratio = 0
        best_matching_name = None
        # Iterate through unique names in gdf1
        for unique_name in unique_names:
            # Calculate similarity ratio between names in gdf2 and gdf1
            similarity_ratio = SequenceMatcher(None, unique_name, name_gdf).ratio()
            # Update best matching name if similarity ratio is higher
            if similarity_ratio > highest_similarity_ratio:
                highest_similarity_ratio = similarity_ratio
                best_matching_name = unique_name

        if highest_similarity_ratio >= similarity_threshold:
            # confirmation = input(f"Similarity found: '{name_gdf2}' -> '{name_gdf1}'Is this okay? (y/n): ").strip().lower()
            # if confirmation == "y":
            # print(f"{highest_similarity_ratio} for {name_gdf1} to {best_matching_name}")
            gdf.at[index, 'valid_text'] = best_matching_name

        print(f"Processing element {index + 1}/{total_elements}", end="\r") # This is to track the process

    return gdf
    
def map_gdf_based_on_column_type(gdf, column_name, names_dictionary, names_list):
    """
    Maps values of a column in a GeoDataFrame (gdf) to unique identifiers based on the data type of the column.
    
    Parameters:
    gdf (GeoDataFrame): The input GeoDataFrame.
    column_name (str): The name of the column to be mapped.
    names_dictionary (dict): A dictionary where keys are valid text and values are unique raster values.

    Returns:
    GeoDataFrame: The modified GeoDataFrame with a new column 'raster_val' representing unique identifiers.
    """
    if gdf[column_name].dtype == object:
        print(f"The column '{column_name}' contains strings.")
        gdf = update_names_based_on_similarity(names_list, gdf, column_name, similarity_threshold=0.5)
        print("names updated")
        # Add a new column to the GeoDataFrame containing the unique identifiers
        gdf['raster_val'] = gdf["valid_text"].map(names_dictionary)
    else:
        print(f"The column '{column_name}' does not contain strings.")
        # Get unique values/strings from the specified column, they are always sorted.
        unique_values = sorted(gdf[column_name].unique())
        # Create a dictionary one to one
        value_to_index = {value: value for value in unique_values}
        gdf['raster_val'] = gdf[column_name].map(value_to_index)
    return gdf



def check_same_dimensions(raster_files):
    """
    Check the if the dimensions of all the input rasters have the same dimensions.

    Parameters:
    - raster_files (list): List of raster files.

    Returns:
    - dimensions_list (list). The list with all the dimensions.
    """
    dimensions_list = []

    # Open the first raster file in the list
    for file_path in raster_files[:]:
        with rasterio.open(file_path) as src:
            shape_dimensions = src.shape
            dimensions_list.append(shape_dimensions)

    if len(set(dimensions_list)) == 1:
        print(f"All the elements have the same dimensions{dimensions_list[0]}")
        dimensions_list
    else:
        print("The dimensions are note the same")
        return dimensions_list

In [None]:
"""Specify all the inputs"""
#"Y:\z_resources\im-nca-senegal\v2_shp_occsol_anat\23-12-22\shp_occsol_anat"
input_path = r"Y:\z_resources\ruben\ladncover_test" # Put the year a the end of the files
output_path = input_path + r"\output_files"

create_folder_if_not_exists(output_path)

# Specify the column of the vector file
column_name = 'leyenda'
# Define the resolution of your raster.
resolution = 30  # in meters
# Define the nodata value of your raster.
nodata_value = 0

# Define the data type of the raster.
data_type = gdal.GDT_UInt16
"""
gdal.GDT_Byte,
gdal.GDT_Int16,
gdal.GDT_UInt16,
gdal.GDT_Int32,
gdal.GDT_UInt32,
gdal.GDT_Float32,
gdal.GDT_Float64
"""

# Define the rows and columns for the rasterization. Assign None if you don't want to specify it
rows = 9999
columns = 9999

Folder already exists at: Y:\z_resources\ruben\ladncover_test\output_files


In [None]:
"""Direct way: Transform the vector files and convert them into rasters""" # Direct way
vector_file_list = get_vector_file_list(input_path)
for file in vector_file_list[:]:
    output_path_file = os.path.join(output_path, os.path.basename(file).replace(".shp", ".tif"))
    print(output_path_file)
    gdf = gpd.read_file(file)
    print(f"{os.path.basename(file)} opened")
    # Prepare the dict to raster based on the column
    gdf = map_gdf_based_on_column_type(gdf, column_name, names_dictionary, names_list)
    print("Start rasterization")
    # Fast method
    gdal_rasterize_from_shapefile(file, resolution, nodata_value, data_type, output_path_file, cols=None, rows=None)


In [8]:
"""Check the dimensions of the rasters"""
raster_list = get_raster_file_list(output_path)
check_same_dimensions(raster_list)

All the elements have the same dimensions(129, 262)
