In [1]:
import rasterio as rio
import pandas as pd
import geopandas as gpd
from pathlib import Path
from datetime import datetime
import json
import os

In [6]:
s2_image_path = r"C:\Users\Kostas\Downloads\sentinel2_images_mean_2019-04-01_to_2019-05-01-0000000000-0000000000.tif"
envelopes_gdf = gpd.read_file(r"C:\Users\Kostas\Desktop\GIMA\Module_7\Data\PEP725\After_2016_sent_from_PEP725\pep725_outputs\PEP725_envelopes.geojson")

In [3]:
with rio.open(s2_image_path) as src:
    print(src.bounds)
    print(src.crs)

BoundingBox(left=280320.0, bottom=5686440.0, right=695040.0, top=6101160.0)
EPSG:32632


In [7]:
envelopes_gdf.set_crs(32632, inplace=True, allow_override=True)
envelopes_gdf.head(10)

Unnamed: 0,s_id,lon,lat,alt,alt_dem,gss_id,genus,species,phase_id,year,day,date,Label,geometry
0,5363,13.9167,54.0833,2,0,1050100,Alnus,Alnus glutinosa,60,2017,27,2017-01-27,DBL,"POLYGON ((817520.468 5999973.720, 825520.468 5..."
1,1554,7.51667,51.7333,60,72,2210500,Salix,Salix caprea,60,2017,29,2017-01-29,DBL,"POLYGON ((393567.206 5728416.903, 401567.206 5..."
2,3120,8.68333,49.55,140,261,1050100,Alnus,Alnus glutinosa,60,2017,31,2017-01-31,DBL,"POLYGON ((473094.080 5484647.767, 481094.080 5..."
3,2021,8.58333,50.0,100,101,1050100,Alnus,Alnus glutinosa,60,2017,32,2017-02-01,DBL,"POLYGON ((466138.525 5534713.881, 474138.525 5..."
4,1521,7.83333,51.7,60,58,1050100,Alnus,Alnus glutinosa,60,2017,33,2017-02-02,DBL,"POLYGON ((415374.473 5724316.443, 423374.473 5..."
5,1710,8.71667,51.7333,100,99,1050100,Alnus,Alnus glutinosa,60,2017,34,2017-02-03,DBL,"POLYGON ((476433.883 5727413.739, 484433.883 5..."
6,1234,6.6,51.3833,35,34,1050100,Alnus,Alnus glutinosa,60,2017,36,2017-02-05,DBL,"POLYGON ((328995.067 5691184.337, 336995.067 5..."
7,1111,10.85,52.4333,60,57,1050100,Alnus,Alnus glutinosa,60,2017,37,2017-02-06,DBL,"POLYGON ((621768.054 5806842.511, 629768.054 5..."
8,8154,6.66667,51.7667,40,38,1050100,Alnus,Alnus glutinosa,60,2017,37,2017-02-06,DBL,"POLYGON ((334994.639 5733666.257, 342994.639 5..."
9,494,10.2,51.5333,200,175,1050100,Alnus,Alnus glutinosa,60,2017,39,2017-02-08,DBL,"POLYGON ((579233.202 5705815.075, 587233.202 5..."


In [8]:
# Separate the gdfs by year
envelopes_gdf_2019 = envelopes_gdf[envelopes_gdf['year'] == 2019]
envelopes_gdf_2020 = envelopes_gdf[envelopes_gdf['year'] == 2020]

In [9]:
envelopes_gdf_2019.head()

Unnamed: 0,s_id,lon,lat,alt,alt_dem,gss_id,genus,species,phase_id,year,day,date,Label,geometry
46653,5213,12.2667,53.45,75,60,1050100,Alnus,Alnus glutinosa,60,2019,17,2019-01-17,DBL,"POLYGON ((712909.141 5923300.968, 720909.141 5..."
46654,19541,12.0333,51.7333,80,79,1050100,Alnus,Alnus glutinosa,60,2019,18,2019-01-18,DBL,"POLYGON ((705450.172 5731730.559, 713450.172 5..."
46655,1524,8.2,51.8,140,142,1050100,Alnus,Alnus glutinosa,60,2019,20,2019-01-20,DBL,"POLYGON ((440835.572 5735096.682, 448835.572 5..."
46656,1372,6.81667,50.9167,80,65,1050100,Alnus,Alnus glutinosa,60,2019,22,2019-01-22,DBL,"POLYGON ((342530.036 5638831.939, 350530.036 5..."
46657,5845,11.9667,51.35,105,101,1050100,Alnus,Alnus glutinosa,60,2019,22,2019-01-22,DBL,"POLYGON ((702581.818 5688925.917, 710581.818 5..."


## 1. Temporal filter

Now that everything is loaded the temporal filter should be applied

In [None]:
# Function to extract the dates from the filename of GEE S2 images

def imageNamingGEEfiles(raster_path):
    # Example file name: sentinel2_images_mean_2019-07-01_to_2019-08-01-0000006912-0000006912.tif
    string_parts = raster_path.split("_")
    start_date = string_parts[3]
    token = string_parts[5]
    token_string_parts = token.split("-")
    end_date = token_string_parts[0] + "-" + token_string_parts[1] + "-" + token_string_parts[2]
    return start_date, end_date 
    
a, b = imageNamingGEEfiles("sentinel2_images_mean_2019-07-01_to_2019-08-01-0000006912-0000006912.tif")
print(a, b)

In [None]:
# Pathlib stuff: Path methods: anchor, parent, name, stem, suffixes
# This can be used for easier extraction of dates from the filename
print("anchor: ", Path(s2_image_path).anchor)
print("parent: ", Path(s2_image_path).parent)
print("name: ", Path(s2_image_path).name)
print("stem: ", Path(s2_image_path).stem)
print("suffixes: ", Path(s2_image_path).suffixes)
print("Normal print: ", s2_image_path)


In [None]:
# Get the start and end dates of the image from its name
s2_image_start_date, s2_image_end_date = imageNamingGEEfiles(Path(s2_image_path).name)
print(s2_image_start_date, s2_image_end_date)

__________

Datetime stuff in order to do date comparisons and find all the dates that are represented in an image

In [None]:
# Converting the date column to datetime data type

envelopes_gdf['date'] = pd.to_datetime(envelopes_gdf['date'], format='%Y-%m-%d').dt.date

In [None]:
# Converting the outputs to datetime.date dtype
s2_image_start_date = datetime.strptime(s2_image_start_date, '%Y-%m-%d').date()
s2_image_end_date = datetime.strptime(s2_image_end_date, '%Y-%m-%d').date()


In [None]:
# Creating a mask to filter the dates that are needed
temporal_mask = (envelopes_gdf.date > s2_image_start_date) & (envelopes_gdf.date < s2_image_end_date)
display(envelopes_gdf.loc[temporal_mask])
s2_image_gdf = envelopes_gdf.loc[temporal_mask]


## Target extraction

In [11]:
def addMonths(gdf):
    # Convert the date to datetime type to work later
    gdf['date'] = pd.to_datetime(gdf['date'])

    # Create a Series with the month (1-12)
    # It finds the month (int 1-12) based on the .month method of the datetime property
    # It achieves that by mapping a lambda function on each element of the date column. Therefore the result is just the month number

    getmonth = gdf['date'].map(lambda x:x.month)

    # Merge this into the gdf
    gdf = gdf.merge(getmonth, left_index=True, right_index=True)

    # Rename the column
    gdf.rename(columns = {'date_y':'month'}, inplace = True)
    return gdf

In [17]:
envelopes_gdf = addMonths(envelopes_gdf)

In [18]:
envelopes_gdf.tail()

Unnamed: 0,s_id,lon,lat,alt,alt_dem,gss_id,genus,species,phase_id,year,day,date_x,Label,geometry,month
129324,5456,13.75,50.7333,875,864,10000,perm_grass,,131,2021,244,2021-09-01,M,"POLYGON ((831136.405 5626933.921, 839136.405 5...",9
129325,961,7.5,52.7667,30,33,10000,perm_grass,,111,2021,257,2021-09-14,M,"POLYGON ((394795.749 5843373.189, 402795.749 5...",9
129326,20595,8.5,49.4667,95,86,10000,perm_grass,,131,2021,258,2021-09-15,M,"POLYGON ((459771.720 5475458.856, 467771.720 5...",9
129327,19312,8.36667,48.6,490,622,10000,perm_grass,,131,2021,285,2021-10-12,M,"POLYGON ((449306.954 5379184.811, 457306.954 5...",10
129328,8197,10.9833,48.2333,525,504,10000,perm_grass,,111,2021,289,2021-10-16,M,"POLYGON ((643273.716 5340132.606, 651273.716 5...",10


In [36]:
# Group observations by s_id and month, and calculate the label with the maximum frequency for each group
freqresults_df = envelopes_gdf.groupby(['s_id', pd.Grouper(key='month'), pd.Grouper(key='year')])['Label'].apply(lambda x: x.value_counts().index[0]).reset_index()

# Rename the column with the label
freqresults_df = freqresults_df.rename(columns={'Label': 'max_label'})


In [37]:
# Check if it works correctly
freqresults_df.sort_values(['year', 'month']).tail(50)

Unnamed: 0,s_id,month,year,max_label
38759,21468,11,2021,DBL
38792,21469,11,2021,DBL
38839,21471,11,2021,DBL
38872,21472,11,2021,DBL
38904,21474,11,2021,DBL
38935,21475,11,2021,DBL
38986,21512,11,2021,DBL
39008,21513,11,2021,DBL
39033,21514,11,2021,DBL
39126,21519,11,2021,DBL


In [27]:
freqresults_df['max_label'].unique()

array(['DBL', 'M', 'EC'], dtype=object)

In [33]:
freqresults_df[freqresults_df['max_label'] == 'M'].head(10)

Unnamed: 0,s_id,month,year,max_label
18,21,6,2017,M
19,21,6,2018,M
20,21,6,2019,M
59,32,6,2017,M
60,32,6,2018,M
62,32,6,2020,M
113,66,7,2017,M
174,110,3,2021,M
186,110,6,2018,M
187,110,6,2019,M


___________

## Spatial filter

In [None]:
# Convert the envelopes_gdf to a list to work with the functions
s2_image_gdf_list = s2_image_gdf.geometry.tolist()

# This is used to save the indices and then extract the targets directly from the gdf
s2_image_gdf_index = s2_image_gdf.index.tolist()


In [None]:
output_dir = Path(r'C:\Users\Kostas\Desktop\GIMA\Module_7\Data\filtered_patches_GEE')

In [None]:
"""Function to parse features from GeoDataFrame in such a manner that rasterio wants them"""

def getFeatures(gdf):
        return [json.loads(gdf.to_json())['features'][0]['geometry']]

In [None]:
'''
This function reads the envelope list and a raster, checks if the polygons are fully contained in the raster 
and returns 4 lists with the boundary coordinates for all the envelopes that are fully contained in the raster.
'''

def getContainedEnvelopeCoords (raster, envelope_list, full_index_list):
    with rio.open(raster, driver='GTiff') as src:
        raster_extent = src.bounds
        
        # List initialization
        minx_list = []
        miny_list = []
        maxx_list = []
        maxy_list = []
        index_list = []
        for i in range(0, len(envelope_list)):
            poly_extent = envelope_list[i].bounds

            # Check if the polygon is fully inside the raster's extent
            if (poly_extent[0] >= raster_extent[0] and poly_extent[2] <= raster_extent[2] and
                poly_extent[1] >= raster_extent[1] and poly_extent[3] <= raster_extent[3]):
                    minx_list.append(poly_extent[0])
                    miny_list.append(poly_extent[1])
                    maxx_list.append(poly_extent[2])
                    maxy_list.append(poly_extent[3])
                    index_list.append(full_index_list[i])
    return minx_list, miny_list, maxx_list, maxy_list, index_list

In [None]:
for poly in s2_image_gdf_list:
    print(poly)
    break

for i in range(0, len(s2_image_gdf_list)):
    print(i)
    break


In [None]:
'''
This function receives a raster file (.tif) and the boundary coordinates for a polygon. 
It then clips the raster to the extent of the polygon. 
The polygon has to intersect the raster for the operation to be completed
'''

from shapely.geometry import box
from rasterio.mask import mask

def exportImage(raster, output_path, minx, miny, maxx, maxy):
    # open the raster file (Single Band)
    data = rio.open(raster, driver='GTiff')

    # Create a bounding box from the polygon min-max coordinates    
    bbox = box(minx, miny, maxx, maxy)
    # Create a geodataframe with a single polygon so that it can be used with rasterio
    geo = gpd.GeoDataFrame({'geometry': bbox}, index=[0], crs='32632')
    # Transform the geodataframe to a GeoJSON-like object that can be used as an input in the rasterio mask function
    coords = getFeatures(geo)
    #print(coords)
    
    # Mask and crop the raster AOI where polygon overlaps the whole raster
    out_img, out_transform = mask(data, shapes=coords, crop=True)
    # Define resolution and more
    out_profile = data.profile.copy()
    
    out_profile.update({'driver':'GTiff', 'width': out_img.shape[2],'height': out_img.shape[1], 'transform': out_transform})
    
    # Write the extracted raster patch to a file
    with rio.open(output_path, 'w', **out_profile) as dst:
        dst.write(out_img)
    
    # data.close()
    # data = None

In [None]:
minx_list, miny_list, maxx_list, maxy_list, index_list = getContainedEnvelopeCoords(s2_image_path, s2_image_gdf_list, s2_image_gdf_index)

In [None]:
# Test to check if the index list works. It works.

for test in range(0, 10):
    print(minx_list[test], miny_list[test], maxx_list[test], maxy_list[test], index_list[test])
    print("---------")

print("Check if the elements on the index and coord lists are the ones from the actual gdf")
s2_image_gdf.loc[49419]

In [None]:
s2_image_gdf.loc[49410, 's_id']

Somewhere around here the code should calculate the frequency of the label and do the target extraction

In [None]:
print("Creating patches for the image: ", Path(s2_image_path).name)

# Iterating over each envelope in the gdf

for i in range(0, len(minx_list)):
    # Get the station id for the station with the index in the i-th position
    station_id = s2_image_gdf.loc[index_list[i], 's_id']
    output_name = os.path.join(output_dir, Path(s2_image_path).stem + f'_station_{station_id}_index_{index_list[i]}.tif')
    print(f"\t Patch {i+1} out of {len(minx_list) + 1}")
    exportImage(s2_image_path, output_name, minx_list[i], miny_list[i], maxx_list[i], maxy_list[i])
print('Patch creation completed!') 



    