# Raster optimizer
- We want to import a set of rasters and fully optimize each one of them
- Here we cover the possible issues a raster layer can present
- The process produces an optimized raster as its output

In [1]:
"""This script goes for every tif file in a folder"""

import os
import sys
from sys import path
from osgeo import gdal
from numpy import random
import numpy as np
import time

In [2]:
# Input folder holding the source rasters, and the sub-folder that receives the results.
path = r"C:\Users\admin\Downloads\wetlands"
# Raw string: "\o" is not a valid escape sequence and triggers a warning in a normal literal.
output_path = path + r"\output"

In [3]:
def get_raster_file_list(path):
    """Return the full paths of the GeoTIFF files found directly inside *path*.

    Args:
        path: Folder to scan. Sub-folders are not visited.

    Returns:
        A list with ``os.path.join(path, name)`` for every entry whose name
        ends in ``.tif`` or ``.tiff``, in the order ``os.listdir`` yields them.
    """
    # The comparison is case-insensitive so that "IMAGE.TIF" is picked up too.
    # os.listdir never repeats a name, so no duplicate check is needed.
    return [
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.lower().endswith((".tif", ".tiff"))
    ]

def create_output_folder(output_path):
    """Create the output folder, including missing parents, if it is not there yet.

    Args:
        output_path: Folder to create.

    Returns:
        None. A message is printed telling whether the folder was created or
        already existed.

    Raises:
        OSError: If the folder cannot be created (for example, a regular file
            with the same name already exists).
    """
    if os.path.isdir(output_path):
        print("there is already an output folder")
        return None

    # Create first and report afterwards, so the message is only shown on success.
    os.makedirs(output_path, exist_ok=True)
    print("created the output folder")
    return None


In [4]:
def get_optimal_dtype(raster_path, block_size=1024):
    """Return the smallest GDAL data type able to hold every value of a raster.

    The raster is read in windows of at most ``block_size`` x ``block_size``
    pixels, keeping only the running minimum and maximum.

    Args:
        raster_path: Path of the raster to analyse.
        block_size: Side, in pixels, of the square window read on each step.

    Returns:
        A ``gdal.GDT_*`` constant. Integer rasters get the narrowest of
        Byte / UInt16 / Int16 / Int32 / UInt32 that fits the value range,
        falling back to the data type of band 1 when none of them does.
        Non-integer rasters always get ``gdal.GDT_Float32``.

    Raises:
        ValueError: If the raster cannot be opened, a window cannot be read,
            or the raster contains no pixels.
    """
    dataset = gdal.Open(raster_path)
    if dataset is None:
        raise ValueError("Unable to open the raster file.")

    num_cols = dataset.RasterXSize
    num_rows = dataset.RasterYSize
    # Used as a fallback when the values do not fit any of the types tested below.
    source_type = dataset.GetRasterBand(1).DataType

    min_val = None
    max_val = None
    is_integer = False

    for y in range(0, num_rows, block_size):
        rows = min(block_size, num_rows - y)
        for x in range(0, num_cols, block_size):
            cols = min(block_size, num_cols - x)

            block_data = dataset.ReadAsArray(x, y, cols, rows)
            if block_data is None:
                raise ValueError("Unable to read a block of the raster file.")

            is_integer = np.issubdtype(block_data.dtype, np.integer)

            # min/max of the block directly: sorting it with np.unique is not needed.
            block_min = np.min(block_data)
            block_max = np.max(block_data)
            if min_val is None:
                min_val = block_min
                max_val = block_max
            else:
                min_val = min(min_val, block_min)
                max_val = max(max_val, block_max)

    # Release the dataset handle.
    dataset = None

    if min_val is None:
        raise ValueError("The raster file contains no pixels.")

    if not is_integer:
        # NOTE(review): Float64 sources are narrowed to Float32 here - confirm
        # that the precision loss is acceptable.
        return gdal.GDT_Float32

    if min_val >= 0:
        if max_val <= 255:
            return gdal.GDT_Byte
        if max_val <= 65535:
            return gdal.GDT_UInt16
        if max_val <= 2147483647:
            return gdal.GDT_Int32
        if max_val <= 4294967295:
            # Does not fit a signed 32-bit integer; needs the unsigned type.
            return gdal.GDT_UInt32
        return source_type

    if min_val >= -32768 and max_val <= 32767:
        return gdal.GDT_Int16
    if min_val >= -2147483648 and max_val <= 2147483647:
        return gdal.GDT_Int32
    return source_type

def convert_data_type(input_raster, output_raster, data_type):
    """Write a tiled, DEFLATE-compressed copy of a raster with the given data type.

    Args:
        input_raster: Path of the raster to read.
        output_raster: Path of the raster to write.
        data_type: GDAL data type for the output. When falsy (e.g. ``None``)
            the data type of band 1 of the input is used.

    Returns:
        None. A message is printed once the output has been written.

    Raises:
        ValueError: If the input raster cannot be opened.
    """
    source = gdal.Open(input_raster)
    if source is None:
        raise ValueError("Unable to open the input raster file.")

    # No explicit type requested: keep the one of the first band.
    if not data_type:
        print("Get the data type")
        data_type = source.GetRasterBand(1).DataType

    creation_options = ["TILED=YES", "COMPRESS=DEFLATE", "COPY_SRC_OVERVIEWS=YES"]

    # The output keeps the projection of the input; 0 is written as its nodata value.
    gdal.Warp(
        output_raster,
        source,
        outputType=data_type,
        dstSRS=source.GetProjection(),
        dstNodata=0,
        creationOptions=creation_options,
    )

    # Drop the reference so the input dataset is closed.
    source = None
    print("finished with: " + output_raster)
    return None


In [5]:
File_list = get_raster_file_list(path)
print(File_list)
create_output_folder(output_path)

# Set to True to shrink every raster to the smallest data type that fits its
# values; False keeps the data type of the source raster.
# TODO: this switch should be improved (e.g. decided per raster).
optimize_dtype = False

for file in File_list:
    print(file)
    # Output keeps the source base name, always with a ".tif" extension.
    output_filename = os.path.splitext(os.path.basename(file))[0] + ".tif"
    output_file = os.path.join(output_path, output_filename)

    # Exactly one conversion per file: either with the computed optimal type
    # or with the source type.
    if optimize_dtype:
        dtype = get_optimal_dtype(file)
        convert_data_type(file, output_file, dtype)
    else:
        print("no need to optimize")
        convert_data_type(file, output_file, data_type=None)

print("Raster data type optimization completed.")

['C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E0N10.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E0N15.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E0N20.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E0N35.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E0N40.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E0N45.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E0N50.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E0N55.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E0N65.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E100N0.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E100N10.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E100N15.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E100N20.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_2020_E100N25.tif', 'C:\\Users\\admin\\Downloads\\wetlands\\GWL_FCS30_20

In [None]:
# Create the overviews if necessary
File_list = get_raster_file_list(output_path)

# Overview decimation factors (2x, 4x, 8x).
overview_levels = [2, 4, 8]
# Data types treated as continuous data.
float_types = (gdal.GDT_Float32, gdal.GDT_Float64)

for file in File_list:
    # GA_Update is needed because building overviews modifies the raster.
    dataset = gdal.Open(file, gdal.GA_Update)
    if dataset is None:
        print("Failed to open the dataset.")
        continue

    # Only the first band is inspected (assuming single-band rasters).
    band = dataset.GetRasterBand(1)

    if band.GetOverviewCount() > 0:
        print("Raster band already has overviews.")
    else:
        # Averaging class codes would invent values that are not real classes,
        # so integer rasters are resampled with nearest neighbour; averaging
        # is kept for floating-point rasters.
        resampling = "AVERAGE" if band.DataType in float_types else "NEAREST"
        error_code = dataset.BuildOverviews(resampling, overview_levels)
        if error_code:
            print("Failed to build the overviews.")
        else:
            print("Overviews built successfully.")

    dataset = None  # Close the dataset

Some personal testing

In [None]:
os.chdir(path)

# Go through every raster currently listed in File_list
for file in File_list:
    print(file)
    dataset = gdal.Open(file)

    # Testing only, not really needed for this case: a way to get the metadata
    # of the raster (for a big raster it can take more than 2 minutes).
    first_band = dataset.GetRasterBand(1)
    stats = first_band.GetStatistics(0, 1)  # expected: list(Min, Max, Mean, StdDev)
    Metadata_domain = dataset.GetMetadataDomainList()  # lists 'IMAGE_STRUCTURE' among others
    image_structure = dataset.GetMetadata('IMAGE_STRUCTURE')

    # The file is too big to be read into a single array, so it is split in tiles.

    # Size of the raster
    width = dataset.RasterXSize  # columns
    height = dataset.RasterYSize  # rows

    # Tile size; 256, 512 or 1024 are other options depending on the scope.
    tilesize = 10000

    unique_values_list = []

    # Tiling: only the first tile in each direction is processed ([0:1]).
    for i in range(0, width, tilesize)[0:1]:
        # Clip the tile at the right border so no nodata is read past the edge.
        w = min(tilesize, width - i)
        for j in range(0, height, tilesize)[0:1]:
            # Same clipping for the bottom border.
            h = min(tilesize, height - j)
            try:
                # Cut the window out into an in-memory dataset.
                ds = gdal.Translate(
                    "",
                    dataset,
                    format='MEM',
                    srcWin=[str(i), str(j), str(w), str(h)],
                )
                print("the dataset is: ", ds)

                band = ds.GetRasterBand(1)
                array = np.array(band.ReadAsArray())
                values = np.unique(array)
                # Collect the values not seen before.
                for x in values:
                    print (x)
                    if x in unique_values_list:
                        continue
                    unique_values_list.append(x)
                ds = None
            except RuntimeError:
                print("The script got an error")
                sys.exit(1)
                



In [8]:
# We have to check whether the data is categorical or continuous:
#   - categorical data should be resampled with nearest neighbour
#   - continuous data should be resampled with bicubic / bilinear
#
# A land cover, being a classification, has integer values (nearest
# neighbour); real values are typical of elevations, distances, etc.
# (bilinear).
# TODO: pick the resampling algorithm (gdal.GRIORA_NearestNeighbour or
# gdal.GRIORA_Bilinear) and set the output data type.

# Make sure the destination folder exists before writing into it.
os.makedirs(output_path, exist_ok=True)

# Only the first entry of the directory is processed ([0:1]).
for file in os.listdir(path)[0:1]:
    # Skip anything that is not a .tif file (sub-folders, sidecar files, ...).
    if not file.endswith(".tif"):
        continue

    input_file = os.path.join(path, file)
    output_file = os.path.join(output_path, os.path.splitext(file)[0] + "_optimized.tif")

    # Rewrite the raster compressed and tiled in 128x128 blocks.
    gdal.Translate(
        output_file,
        input_file,
        creationOptions=['COMPRESS=DEFLATE', 'TILED=YES', 'BLOCKXSIZE=128', 'BLOCKYSIZE=128'],
    )

    # Reopen the result in update mode to remove its color palette.
    ds = gdal.Open(output_file, gdal.GA_Update)
    band = ds.GetRasterBand(1)
    band.SetRasterColorTable(None)

    # Release the band and dataset so the changes are flushed to disk.
    del band, ds

print("It's over the optimization")



It's over the optimization
