# Extract box images around tree coordinates

In this notebook we will extract rectangular images around each tree coordinate from the raster file and store them together with the correct label in a `.npz` file. This file will be loaded in the next notebook to train the model.

### Importing needed libraries & packages

In [73]:
# Toolboxes for data handling
import pandas as pd
import numpy as np

# Toolboxes for raster handling
import rasterio

# Toolboxes for warning handling
import warnings

In [74]:
# Ignore warnings for this notebook
warnings.filterwarnings("ignore")

### Loading data
We will load the CSV file that contains the tree coordinates and their labels. The information on how to transform geo coordinates into pixel coordinates is read later from each raster file; we need pixel coordinates to extract box images around each tree coordinate.

In [75]:
# Location of the CSV file that holds the labeled (but uncorrected) gps tree
# coordinates
path = "./data/Laubbäume_utm32_cleaned.csv"

# Read the file, keep only the coordinate and label columns and give them the
# names that the rest of the notebook works with
tc_df = pd.read_csv(path)[['X', 'Y', 'desc']].rename(
    columns={'X': 'x_geo', 'Y': 'y_geo', 'desc': 'label'}
)

Define a function that transforms geo coordinates into pixel coordinates, so that box images of a given pixel size can be extracted later on.

In [76]:
# Define a function to translate geo-coordinates to pixel coordinates
def pixel_coordinate_translation(tc_df, transform):
    """Add the pixel coordinates of every tree coordinate to ``tc_df``.

    Parameters
    ----------
    tc_df : pandas.DataFrame
        Tree coordinates with the columns ``x_geo`` and ``y_geo``. The frame
        is modified in place: the columns ``x_pix`` and ``y_pix`` are added
        (or overwritten) as float columns holding whole pixel numbers.
    transform : affine transform of the raster
        Object that maps pixel to geo coordinates and supports
        ``~transform * (x_geo, y_geo)`` (e.g. ``dataset.transform`` of a
        rasterio dataset).

    Returns
    -------
    pandas.DataFrame
        The same ``tc_df`` object that was passed in.

    Notes
    -----
    The first value returned by the inverse transform is stored in ``y_pix``
    and the second one in ``x_pix``; ``extract_treetop_imageboxes`` relies on
    exactly this assignment (``y_pix`` is used as column offset, ``x_pix`` as
    row offset).

    Fractional pixel positions are rounded down with ``floor``. Truncating
    with ``int`` would round towards zero and move a tree that lies up to one
    pixel outside of the top/left image border onto pixel 0. A NaN geo
    coordinate results in a NaN pixel coordinate.
    """

    # Report that geo coordinates will be translated to pixel coordinates
    print('Started the translation of tree coordinates from geo to pixel.',
          end='\n')

    n_trees = tc_df.shape[0]

    # Invert the transform only once, it is the same for every tree
    inverse_transform = ~transform

    # Collect the pixel coordinates in plain arrays and write them to the
    # data frame in one go after the loop
    x_pix_values = np.full(n_trees, fill_value=np.nan)
    y_pix_values = np.full(n_trees, fill_value=np.nan)

    # Loop through the coordinates of all treetops. The position inside of
    # the data frame is counted with enumerate, so that the progress report
    # does not depend on the index labels of the data frame
    coordinates = zip(tc_df['x_geo'], tc_df['y_geo'])
    for pos, (x_geo, y_geo) in enumerate(coordinates):

        # Translate the coordinate to a pixel coordinate
        y_pix, x_pix = inverse_transform * (x_geo, y_geo)

        # Round down to the pixel that contains the coordinate
        x_pix_values[pos] = np.floor(x_pix)
        y_pix_values[pos] = np.floor(y_pix)

        # Report progress: a dot after every 100 coordinates, a summary line
        # after every 1000 coordinates and after the very last coordinate
        n_done = pos + 1
        if n_done == n_trees:
            print('; ' + str(n_done) + ' tree coordinates have been ' +
                  'translated (' + str(100) + '%)', end='\n')
        elif (n_done % 1000) == 0:
            print('; ' + str(n_done) + ' tree coordinates have been ' +
                  'translated (' + str((100 * n_done) // n_trees) + '%)',
                  end='\n')
        elif (n_done % 100) == 0:
            print('.', end='')

    # Add the pixel coordinates to the data frame (positional assignment)
    tc_df['x_pix'] = x_pix_values
    tc_df['y_pix'] = y_pix_values

    # Report that geo coordinates have been translated to pixel coordinates
    print('Finished the translation of tree coordinates from geo to pixel.',
          end='\n\n')

    # Return the data frame that now contains the pixel coordinates
    return tc_df

Define a function that extracts box images around each tree coordinate. It will be applied to every raster that is of interest later on (spectral images, vegetation height image, ...).

In [77]:
# Define a function to extract image boxes around tree top pixel coordinates
def extract_treetop_imageboxes(tc_df, img_path, box_size, n_chans):
    """Cut a square box out of a raster around every tree coordinate.

    Parameters
    ----------
    tc_df : pandas.DataFrame
        Tree coordinates with the columns ``x_geo`` and ``y_geo``. The frame
        is passed to ``pixel_coordinate_translation``, which adds the columns
        ``x_pix`` and ``y_pix`` to it.
    img_path : str
        Path of the raster file, opened with ``rasterio.open``.
    box_size : int
        Edge length of the box in pixels.
    n_chans : int
        Size of the channel axis of the returned array. All bands of the
        raster are read, so this has to match the number of bands.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_trees, box_size, box_size, n_chans)``. The
        entry at position ``k`` belongs to the ``k``-th row of ``tc_df``.
        Boxes that would reach over the border of the raster are not read
        and stay filled with NaN.

    Notes
    -----
    ``x_pix`` is used as row number and ``y_pix`` as column number of the
    raster. The box starts ``box_size // 2`` pixels above/left of the tree
    pixel and spans ``box_size`` pixels, so the border check uses
    ``start + box_size`` as the end of the box. This also holds for odd box
    sizes such as ``box_size = 1``.
    """

    half_box = box_size // 2

    # Keep the image ready to be loaded but do not load the whole image
    with rasterio.open(img_path) as img_data:

        # Translate geo to pixel coordinates with the transformation
        # information of this raster
        tc_df = pixel_coordinate_translation(tc_df, img_data.transform)

        # Report that image boxes will be extracted from the image data
        print('Started the extraction of image boxes.', end='\n')

        n_trees = tc_df.shape[0]

        # Define an array filled with NaN that will later contain all box
        # images
        boximg_array = np.full((n_trees, box_size, box_size, n_chans),
                               fill_value=np.nan)

        # Define the pixel coordinate limits of the image (rows, columns)
        n_rows_img = int(img_data.shape[0])
        n_cols_img = int(img_data.shape[1])

        # Iterate through the pixel coordinates of all trees. The position
        # is counted with enumerate, so that the output array is filled by
        # position and not by the index labels of the data frame
        pixel_coordinates = zip(tc_df['x_pix'], tc_df['y_pix'])
        for pos, (x_pix, y_pix) in enumerate(pixel_coordinates):

            # Define the pixel limits of the box: [min, max) in both
            # directions
            row_min = int(x_pix) - half_box
            col_min = int(y_pix) - half_box
            row_max = row_min + box_size
            col_max = col_min + box_size

            # Check whether the box stays inside of the image limits
            if (row_min >= 0) and (row_max <= n_rows_img) and \
               (col_min >= 0) and (col_max <= n_cols_img):

                # Define a box around the treetop coordinate that should be
                # extracted (column offset, row offset, width, height)
                box_window = rasterio.windows.Window(col_min, row_min,
                                                     box_size, box_size)

                # Extract the box image and move the band axis to the end:
                # the raster is read as (bands, rows, columns) while we need
                # (rows, columns, bands)
                boximg_array[pos, :, :, :] = \
                    np.transpose(img_data.read(window=box_window), (1, 2, 0))

            # Report progress: a dot after every 10 trees, a summary line
            # after every 100 trees and after the very last tree
            n_done = pos + 1
            if n_done == n_trees:
                print('; ' + str(n_done) + ' trees have been processed (' +
                      str(100) + '%)', end='\n')
            elif (n_done % 100) == 0:
                print('; ' + str(n_done) + ' trees have been processed (' +
                      str((100 * n_done) // n_trees) + '%)', end='\n')
            elif (n_done % 10) == 0:
                print('.', end='')

    # Report that image boxes have been extracted from the image data
    print('Finished the extraction of image boxes.', end='\n\n')

    # Return the array containing all image boxes placed around tree tops
    return boximg_array

Extract box images around the tree coordinates for the spectral data of each year.

In [78]:
# Years for which spectral data are available. Year 2014 is left out, as its
# image has a lower resolution than the images of the other years
years = [2016, 2018, 2020, 2021, 2022]

# Box size in pixel for the spectral data, which have a resolution of 10 cm
# NOTE(review): 38 pixels correspond to 3.8 m, while the box was described
# as 4*4 meters - confirm which of the two is intended
box_size = 38

# Number of channels in the spectral data
n_chans = 4

# Process the spectral data of one year after the other
for year in years:

    year_str = str(year)

    # Announce the start of the extraction for this year
    print('Started the extraction of image boxes for the year ', year_str,
          '.', sep='', end='\n\n')

    # Raster to read the boxes from
    img_path = './data/TDOP/TDOP_' + year_str + '_weg30m.tif'

    # Target file for the box images. np.savez appends ".npz" to this name,
    # because it does not end with ".npz" already
    save_path = './data/TDOP/box_images/TDOP_box_images_' + year_str + '_weg30m.tif'

    # Cut the boxes around all tree coordinates and store them under the
    # key "bia_array"
    bia_spec = extract_treetop_imageboxes(tc_df, img_path, box_size, n_chans)
    np.savez(save_path, bia_array=bia_spec)

    # Announce the end of the extraction for this year
    print('Finished the extraction of image boxes for the year ', year_str,
          '.', sep='', end='\n\n')

Started the extraction of image boxes for the year 2016.

Started the translation of tree coordinates from geo to pixel.
.........; 1000 tree coordinates have been translated (58%)
.......; 1703 tree coordinates have been translated (100%)
Finished the translation of tree coordinates from geo to pixel.

Started the extraction of image boxes.
.........; 100 trees have been processed (5%)
.........; 200 trees have been processed (11%)
.........; 300 trees have been processed (17%)
.........; 400 trees have been processed (23%)
.........; 500 trees have been processed (29%)
.........; 600 trees have been processed (35%)
.........; 700 trees have been processed (41%)
.........; 800 trees have been processed (46%)
.........; 900 trees have been processed (52%)
.........; 1000 trees have been processed (58%)
.........; 1100 trees have been processed (64%)
.........; 1200 trees have been processed (70%)
.........; 1300 trees have been processed (76%)
.........; 1400 trees have been processed 

Extract box images around tree coordinates for the vegetation height

In [79]:
# Define a box size in pixel for the vegetation height data (4*4 meters)
# As the vegetation height has a resolution of 40 cm, we only take 10 pixels
# to get the same degree of information into the box image as in the spectral
# data
box_size = 10

# Define the number of channels in the height data
n_chans = 1

# Define a path to the vegetation height raster
img_path = './data/LiDAR/Lidar2021_vegheight_int16_04m_cm_weg30m.tif'

# Define a path to save the extracted array of box images. np.savez appends
# ".npz" to this name, because it does not end with ".npz" already
save_path = './data/LiDAR/box_images/vegheight_box_images_weg30m.tif'

# Extract box images around each tree coordinate for the height data and
# put them inside of an array
bia_height = extract_treetop_imageboxes(tc_df, img_path, box_size, n_chans)

# Remove only the channel axis (it has length 1) and calculate from cm to m.
# Naming the axis keeps the tree axis in place even if there is only a
# single tree, which a plain np.squeeze would remove as well
bia_height = np.squeeze(bia_height, axis=-1) / 100

# Save the array as an .npz file
np.savez(save_path, bia_array=bia_height)

Started the translation of tree coordinates from geo to pixel.
.........; 1000 tree coordinates have been translated (58%)
.......; 1703 tree coordinates have been translated (100%)
Finished the translation of tree coordinates from geo to pixel.

Started the extraction of image boxes.
.........; 100 trees have been processed (5%)
.........; 200 trees have been processed (11%)
.........; 300 trees have been processed (17%)
.........; 400 trees have been processed (23%)
.........; 500 trees have been processed (29%)
.........; 600 trees have been processed (35%)
.........; 700 trees have been processed (41%)
.........; 800 trees have been processed (46%)
.........; 900 trees have been processed (52%)
.........; 1000 trees have been processed (58%)
.........; 1100 trees have been processed (64%)
.........; 1200 trees have been processed (70%)
.........; 1300 trees have been processed (76%)
.........; 1400 trees have been processed (82%)
.........; 1500 trees have been processed (88%)
....

Extract the slope information for each tree coordinate

In [80]:
# Set the box size for the slope data to 1 so that only the information
# from a single pixel is extracted
box_size = 1

# Define the number of channels in the slope data
n_chans = 1

# Define a path to the slope raster
img_path = './data/LiDAR/Lidar2015_slope_float32_1m_dm_weg30m.tif'

# Define a path to save the extracted array of slope values. np.savez appends
# ".npz" to this name, because it does not end with ".npz" already
save_path = './data/LiDAR/box_images/slope_box_images_weg30m.tif'

# Extract the slope pixel at each tree coordinate and put the values inside
# of an array
bia_slope = extract_treetop_imageboxes(tc_df, img_path, box_size, n_chans)

# Remove the two box axes and the channel axis (all of length 1), so that one
# value per tree remains. Naming the axes keeps the tree axis in place even
# if there is only a single tree, which a plain np.squeeze would remove as
# well
bia_slope = np.squeeze(bia_slope, axis=(1, 2, 3))

# Save the array as an .npz file
np.savez(save_path, bia_array=bia_slope)

Started the translation of tree coordinates from geo to pixel.
.........; 1000 tree coordinates have been translated (58%)
.......; 1703 tree coordinates have been translated (100%)
Finished the translation of tree coordinates from geo to pixel.

Started the extraction of image boxes.
.........; 100 trees have been processed (5%)
.........; 200 trees have been processed (11%)
.........; 300 trees have been processed (17%)
.........; 400 trees have been processed (23%)
.........; 500 trees have been processed (29%)
.........; 600 trees have been processed (35%)
.........; 700 trees have been processed (41%)
.........; 800 trees have been processed (46%)
.........; 900 trees have been processed (52%)
.........; 1000 trees have been processed (58%)
.........; 1100 trees have been processed (64%)
.........; 1200 trees have been processed (70%)
.........; 1300 trees have been processed (76%)
.........; 1400 trees have been processed (82%)
.........; 1500 trees have been processed (88%)
....

In [81]:
# Set the box size for the altitude data to 1 so that only the information
# from a single pixel is extracted
box_size = 1

# Define the number of channels in the altitude data
n_chans = 1

# Define a path to the altitude raster
img_path = './data/LiDAR/Lidar2015_altitude_int16_1m_dm_weg30m.tif'

# Define a path to save the extracted array of altitude values. np.savez
# appends ".npz" to this name, because it does not end with ".npz" already
save_path = './data/LiDAR/box_images/altitude_box_images_weg30m.tif'

# Extract the altitude pixel at each tree coordinate and put the values
# inside of an array
bia_altitude = extract_treetop_imageboxes(tc_df, img_path, box_size, n_chans)

# Remove the two box axes and the channel axis (all of length 1) and
# calculate from dm into m. Naming the axes keeps the tree axis in place even
# if there is only a single tree, which a plain np.squeeze would remove as
# well
bia_altitude = np.squeeze(bia_altitude, axis=(1, 2, 3)) / 10

# Save the array as an .npz file
np.savez(save_path, bia_array=bia_altitude)

Started the translation of tree coordinates from geo to pixel.
.........; 1000 tree coordinates have been translated (58%)
.......; 1703 tree coordinates have been translated (100%)
Finished the translation of tree coordinates from geo to pixel.

Started the extraction of image boxes.
.........; 100 trees have been processed (5%)
.........; 200 trees have been processed (11%)
.........; 300 trees have been processed (17%)
.........; 400 trees have been processed (23%)
.........; 500 trees have been processed (29%)
.........; 600 trees have been processed (35%)
.........; 700 trees have been processed (41%)
.........; 800 trees have been processed (46%)
.........; 900 trees have been processed (52%)
.........; 1000 trees have been processed (58%)
.........; 1100 trees have been processed (64%)
.........; 1200 trees have been processed (70%)
.........; 1300 trees have been processed (76%)
.........; 1400 trees have been processed (82%)
.........; 1500 trees have been processed (88%)
....