# Sentinel-2 Data Preprocessing
## Clipping Sentinel-2 Images around PEP725 Stations

**Inputs**
- parent directory of a S2 image (.SAFE folder)
- `envelope_gdf` (for now I will also include the code to go from buffer to envelope here)

## TODO
- ~~Make the code iterate over different folders. Currently it accesses just a single satellite image~~ **DONE**
- ~~Look into making the code work without extracting the .zip file of the satellite image. If not, try Extract .zip folder --> Clip patches --> Delete extracted folder.~~ **DONE**
- ~~Think about converting the extracted patches to another format, as .jp2 may not be suitable for a training dataset~~ **DONE**
- Find a way to link elements of the GeoDataFrame with the extracted patch to include them as labels such as the class DBL, EC, M

In [1]:
# Imports. To be cleaned as well

import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
from shapely.geometry import box
import rasterio
from rasterio.plot import show
from rasterio.mask import mask
from rasterio.coords import BoundingBox
from rasterio.features import geometry_mask
import json
import numpy as np
import glob
import os
from rasterio.windows import from_bounds
import pathlib
# Hard-coded example path to a single 10 m band (B02) inside an extracted .SAFE folder.
# NOTE(review): `path` does not appear to be referenced by the code that follows - confirm before removing.
path = r'C:\Users\Kostas\Downloads\S2A_MSIL2A_20170420T103021_N0204_R108_T32UNB_20170420T103454.SAFE\GRANULE\L2A_T32UNB_A009543_20170420T103454\IMG_DATA\R10m\L2A_T32UNB_20170420T103021_B02_10m.jp2'

### ------------ Buffer to envelope -----------------

In [2]:
# Load the station buffers from disk
buffers = gpd.read_file(r'C:\Users\Kostas\Desktop\GIMA\Module_7\Data\PEP725\After_2016_sent_from_PEP725\pep725_outputs\pep725_high_count_days\buffers_day_92.geojson')
# GeoPandas loads the file as EPSG:4326 by default, whereas the coordinates are actually in EPSG:32632,
# so the CRS label is overridden (no reprojection takes place).
buffers.set_crs(32632, inplace=True, allow_override=True)

# Envelope (bounding rectangle) of every buffer, kept under its own column name
envelope_series = buffers.geometry.envelope.rename('envelope_geometry')

# Attach the envelopes to the attribute table by index, then swap them in as the
# active geometry column (named 'geometry') in place of the original buffers
buffers_with_envelopes = buffers.merge(envelope_series, left_index=True, right_index=True)
attributes_and_envelopes = buffers_with_envelopes.drop(columns=['geometry'])
envelope_gdf = attributes_and_envelopes.set_geometry('envelope_geometry').rename_geometry('geometry')

# Plain list of the envelope polygons, used later for clipping
envelope_list = envelope_gdf.geometry.tolist()
# List of (index, polygon) tuples that preserves the indexing information of the GeoDataFrame.
# This may be of use later, to get information from the GeoDataFrame and put it in the image,
# e.g., a label such as the class (DBL, EC, M).
envelope_list_with_index = list(zip(envelope_gdf.index, envelope_gdf.geometry))


## Define functions to be used

In [3]:
"""Function to parse features from GeoDataFrame in such a manner that rasterio wants them"""

def getFeatures(gdf):
        return [json.loads(gdf.to_json())['features'][0]['geometry']]

In [4]:
def getContainedEnvelopeCoords(raster, envelope_list):
    """Collect the bounds of all envelopes lying fully inside a raster's extent.

    The raster is opened (JP2OpenJPEG driver) only to read its bounds. Every
    polygon of ``envelope_list`` is then compared against those bounds, and a
    polygon is kept when none of its sides extends beyond the raster extent
    (sides that coincide with the raster border are accepted).

    Returns four lists of equal length, in this order: the minimum x, minimum
    y, maximum x and maximum y coordinates of the kept envelopes.
    """
    with rasterio.open(raster, driver='JP2OpenJPEG') as src:
        raster_extent = src.bounds

    def _is_inside(extent):
        # Both extents are indexed as (minx, miny, maxx, maxy)
        return (extent[0] >= raster_extent[0] and extent[2] <= raster_extent[2] and
                extent[1] >= raster_extent[1] and extent[3] <= raster_extent[3])

    contained = [extent
                 for extent in (poly.bounds for poly in envelope_list)
                 if _is_inside(extent)]

    minx_list = [extent[0] for extent in contained]
    miny_list = [extent[1] for extent in contained]
    maxx_list = [extent[2] for extent in contained]
    maxy_list = [extent[3] for extent in contained]
    return minx_list, miny_list, maxx_list, maxy_list

In [5]:
def exportImage(raster, output_path, minx, miny, maxx, maxy):
    """Clip a raster (.jp2) to a rectangular extent and write the patch as a PNG.

    Parameters:
        raster: path (or GDAL-style URI) of the raster file, opened with the
            JP2OpenJPEG driver.
        output_path: file the clipped patch is written to.
        minx, miny, maxx, maxy: boundary coordinates of the rectangle to clip.
            They are handed to rasterio's ``mask`` unchanged, so they are
            presumably expected in the raster's own CRS - TODO confirm.

    The rectangle has to intersect the raster for the operation to be completed.
    """
    # Create a bounding box from the polygon min-max coordinates
    bbox = box(minx, miny, maxx, maxy)
    # Create a geodataframe with a single polygon so that it can be used with rasterio.
    # NOTE(review): the CRS is hard-coded to 32632; getFeatures() only extracts the
    # geometry, so this label does not seem to reach rasterio - confirm.
    geo = gpd.GeoDataFrame({'geometry': bbox}, index=[0], crs='32632')
    # Transform the geodataframe to a GeoJSON-like object usable by the rasterio mask function
    coords = getFeatures(geo)

    # Open the source raster in a context manager so that the dataset handle is
    # always released, even when masking raises (previously it was never closed).
    with rasterio.open(raster, driver='JP2OpenJPEG') as data:
        # Mask and crop the raster to the area where the polygon overlaps it
        out_img, out_transform = mask(data, shapes=coords, crop=True)
        # Start from the source profile (resolution, dtype, CRS, ...)
        out_profile = data.profile.copy()

    # out_img is indexed (band, row, column), hence height = shape[1], width = shape[2]
    out_profile.update({'driver':'PNG', 'width': out_img.shape[2],'height': out_img.shape[1], 'transform': out_transform})

    # Write the extracted raster patch to a file
    with rasterio.open(output_path, 'w', **out_profile) as dst:
        dst.write(out_img)

In [6]:
def imageNaming(raster_path):
    """Build a short band identifier from a raster file's path.

    The path is split wherever an underscore appears, the last three pieces
    are joined again with underscores, and every '.jp2' occurrence is removed
    from the result. A path with fewer than three underscore-separated pieces
    raises IndexError.
    """
    pieces = raster_path.split("_")
    # Explicit indexing keeps the IndexError for paths that are too short
    last_three = (pieces[-3], pieces[-2], pieces[-1])
    return "_".join(last_three).replace('.jp2', '')

# Example output: 20170420T103021_B02_10m

In [7]:
def pixelMasking(patch_path, clc_path, clc_pixel_values, output_path):
    """Mask an already created patch based on CLC and write the result.

    Steps:
      1. Open the patch and keep its pixel data, profile and bounds.
      2. Open the CLC raster and read only the window that corresponds to the
         patch bounds.
      3. Flag every pixel whose CLC value is NOT in ``clc_pixel_values``.
      4. If the patch profile has no nodata value (None), use the integer 0,
         because the flagged pixels are overwritten with the nodata value.
      5. Write the masked patch to ``output_path``, which should be the same
         as ``patch_path`` if you don't want to keep the original patch.
    """
    # Patch: pixel values plus the metadata needed for writing it back
    with rasterio.open(patch_path) as patch_src:
        patch_profile = patch_src.profile.copy()
        patch_bounds = patch_src.bounds
        patch_pixels = patch_src.read()

    # A numeric nodata value is required; fall back to 0 when none is defined
    if patch_profile["nodata"] is None:
        patch_profile["nodata"] = 0
    fill_value = patch_profile["nodata"]

    # CLC: read only the part that lies within the boundaries of the patch
    with rasterio.open(clc_path) as clc_src:
        clc_window = from_bounds(*patch_bounds, clc_src.transform)
        clc_pixels = clc_src.read(window=clc_window)

    # True wherever the CLC value is not one of the requested values
    outside_selection = np.isin(clc_pixels, clc_pixel_values, invert=True)
    # Those pixels become nodata in the patch
    patch_pixels[outside_selection] = fill_value

    # Save the patch, which now holds values only where CLC has the requested values
    with rasterio.open(output_path, "w", **patch_profile) as dst:
        dst.write(patch_pixels)
   

# Explanation of the program
The program consists of three nested loops, which are explained below.


1. First, the program reads the relevant variables that will be used. 
    - The folder where the Sentinel-2 zip archives are located, from which it creates a list of their paths named `sentinel_2_zip_list`. The zip files of the folder should be only Sentinel-2 zip files.
    - The Corine Land Cover (CLC) info, which is the raster of CLC for Germany, and the pixel values list of the relevant vegetation group (DBL, EC, M). This list is named `clc_pixel_values`.
    - The folder where the patches will be extracted, named `output_path`
    - The envelope list, which is above for now.
<br></br>

2. First loop
    - It starts to iterate the `sentinel_2_zip_list`. Each element of the list represents a different Sentinel-2 image.
    - It creates a list of the rasters of all the bands in the 60m spatial resolution that are contained in the Sentinel-2 image, named `filtered_list`. Bands that are normally 10m or 20m are also included here, but resampled to 60m. This uses another small loop, but it is irrelevant for the explanation, so it is not counted.
    <br></br>
    
3. Second loop. Nested within the first loop
    - It starts to iterate the `filtered_list`. Each element of the list represents a different Sentinel-2 band.
    - It uses the `getContainedEnvelopeCoords()` function to create four lists of the coordinates of the envelopes fully contained within the raster (i.e. those that do not extend beyond the borders of the raster; envelopes that exactly touch a border are still kept). These lists are the minimum and maximum x and y coordinates, named `minx_list, miny_list, maxx_list, maxy_list`.
    - It uses the `imageNaming()` function to slice the name of the path of the band to use it as a part of the exported patches later. Example output: *20170420T103021_B02_10m*. It stores it in a variable called `raster_name`.
    <br></br>    

4. Third loop. Nested within the second loop
    - It starts to iterate over the different patches, using the `minx_list`. The coordinate lists all have the same length, so any one of them can be selected for the loop.
    - It creates an output name for the patch to be exported by concatenating `raster_name`, the suffix '\_Patch_' and an ascending number based on the loop's iterator. Example outputs: *20170420T103021_B01_60m_Patch_1.png*, *20170420T103021_B01_60m_Patch_2.png*, etc.
    - It uses the `exportImage()` function to crop the area given by the coordinate lists `minx_list, miny_list, maxx_list, maxy_list`, where the envelope overlaps the band from the `filtered_list`. It exports it to the `output_path`, mentioned in step 1.
    - Then it reads the patch that was exported, creates a mask based on the Corine Land Cover info provided in step 1 to keep only the pixels that have a value of the `clc_pixel_values` list, and replaces the original patch with its masked version.
    




In [9]:
from zipfile import ZipFile
import fnmatch

# Read clc related info: the CLC pixel values to keep and the CLC raster itself
clc_pixel_values = [10, 23, 25, 29, 24, 18, 26]
clc_path = pathlib.Path(r"C:\Users\Kostas\Desktop\GIMA\Module_7\Data\CLC2018\CLC2018_GER_WGS84UTM32N_60m_ArcPro.tif")

# Output directory for the images that will be extracted.
# It is created up front so that writing the first patch cannot fail on a missing folder.
output_path = pathlib.Path(r'C:\Users\Kostas\Desktop\GIMA\Module_7\Data\filtered_patches')
output_path.mkdir(parents=True, exist_ok=True)

# Create a list of the existing Sentinel-2 zip files (top level of the directory only)
sentinel_2_directory = pathlib.Path(r'C:\Users\Kostas\Desktop\GIMA\Module_7\Data\Sentinel2_images')
sentinel_2_zip_list = glob.glob(str(sentinel_2_directory / '*.zip'))

# Filter for the 60m band rasters inside a Sentinel-2 zip.
# '???' covers the two-character band id plus the underscore, e.g. 'B01_60m.jp2'.
pattern = '*/R60m/*B???60m.jp2'

for s2zip in sentinel_2_zip_list:
    print('---------------')

    # Only the archive's table of contents is needed here, so the zip is closed
    # again before the rasters are read through rasterio.
    with ZipFile(s2zip, 'r') as zipObj:
        filtered_list = fnmatch.filter(zipObj.namelist(), pattern)
    print(f'List of spectral bands for file {s2zip} completed.')

    # Now read each band (each band is a different raster)
    for raster in filtered_list:
        # Address the band inside the zip archive without extracting it.
        # NOTE(review): the 'zip+file:' URI is wrapped in pathlib.Path exactly as before;
        # passing it as a plain string may be more portable - confirm.
        zipped_image = pathlib.Path("zip+file:" + s2zip + '!/' + raster)
        minx_list, miny_list, maxx_list, maxy_list = getContainedEnvelopeCoords(zipped_image, envelope_list)
        raster_name = imageNaming(raster)
        print("\tCreating images around PEP725 stations for the band:", raster_name)

        # For each envelope in the band, export the image and use the clc related info to mask the pixels.
        # Patches are numbered from 1 in the order of the contained envelopes.
        envelope_bounds = zip(minx_list, miny_list, maxx_list, maxy_list)
        for patch_number, (minx, miny, maxx, maxy) in enumerate(envelope_bounds, start=1):
            output_file_name = str(output_path / f'{raster_name}_Patch_{patch_number}.png')
            exportImage(zipped_image, output_file_name, minx, miny, maxx, maxy)
            # The masked patch overwrites the patch that was just exported
            pixelMasking(output_file_name, clc_path, clc_pixel_values, output_file_name)
        print(f'\tPatch pixel masking completed for all patches of the band {raster_name}\n')
    print(f'\tImage extraction completed for the file {s2zip}')
print('Image extraction completed for all the files')
       
    

---------------
List of spectral bands for file C:\Users\Kostas\Desktop\GIMA\Module_7\Data\Sentinel2_images\S2A_MSIL2A_20170420T103021_N0204_R108_T32UNB_20170420T103454.zip completed.
	Creating images around PEP725 stations for the band: 20170420T103021_B01_60m
	Patch pixel masking completed for all patches of the band 20170420T103021_B01_60m

	Creating images around PEP725 stations for the band: 20170420T103021_B02_60m
	Patch pixel masking completed for all patches of the band 20170420T103021_B02_60m

	Creating images around PEP725 stations for the band: 20170420T103021_B03_60m
	Patch pixel masking completed for all patches of the band 20170420T103021_B03_60m

	Creating images around PEP725 stations for the band: 20170420T103021_B04_60m
	Patch pixel masking completed for all patches of the band 20170420T103021_B04_60m

	Creating images around PEP725 stations for the band: 20170420T103021_B05_60m
	Patch pixel masking completed for all patches of the band 20170420T103021_B05_60m

	Creat