In [None]:
import rasterio as rio
import pandas as pd
import geopandas as gpd
from pathlib import Path
from datetime import datetime
import json
import os

In [None]:
# Two exported files of the same 2019-04-01_to_2019-05-01 image are listed. The second assignment
# overwrites the first, so only the file ending in "-0000006912.tif" is used by the cells below.
s2_image_path = r"C:\Users\Kostas\Downloads\sentinel2_images_mean_2019-04-01_to_2019-05-01-0000000000-0000000000.tif" #OG one
s2_image_path = r"C:\Users\Kostas\Downloads\sentinel2_images_mean_2019-04-01_to_2019-05-01-0000000000-0000006912.tif"
# GeoJSON read into a GeoDataFrame (presumably the PEP725 station envelopes, judging by the file name)
envelopes_gdf = gpd.read_file(r"C:\Users\Kostas\Desktop\GIMA\Module_7\Data\PEP725\After_2016_sent_from_PEP725\pep725_outputs\PEP725_envelopes.geojson")

In [None]:
# Quick look at the raster: spatial extent, CRS and number of bands.
# The `with` block closes the dataset on exit, so no explicit close call is needed
# (the previous bare `src.close` had no parentheses and therefore never called anything).
with rio.open(s2_image_path) as src:
    print(src.bounds)
    print(src.crs)
    print(src.count)

In [None]:
# Label the envelopes as EPSG:32632. set_crs only assigns the CRS; it does not reproject coordinates.
# NOTE(review): allow_override=True replaces whatever CRS was read from the file — confirm that the
# stored coordinates really are in EPSG:32632.
envelopes_gdf.set_crs(32632, inplace=True, allow_override=True)
envelopes_gdf.tail(10)

In [None]:
# Separate the gdfs by year
#envelopes_gdf_2019 = envelopes_gdf[envelopes_gdf['year'] == 2019]
#envelopes_gdf_2020 = envelopes_gdf[envelopes_gdf['year'] == 2020]
#envelopes_gdf_2019.head()

## 1. Temporal filter
Now that everything is loaded, the temporal filter can be applied.

In [None]:
# Function to extract the dates from the filename of GEE S2 images

def imageNamingGEEfiles(raster_path):
    """Extract the date window encoded in the file name of a GEE Sentinel-2 export.

    Example file name:
    sentinel2_images_mean_2019-07-01_to_2019-08-01-0000006912-0000006912.tif

    The dates are located with a ``<YYYY-MM-DD>_to_<YYYY-MM-DD>`` pattern instead of fixed
    positions after splitting on "_", so extra underscores in the prefix or in parent
    directories do not shift the result.

    Parameters
    ----------
    raster_path : str or path-like
        File name or full path (Windows or POSIX separators) of the exported image.

    Returns
    -------
    tuple
        ``(start_date, end_date, s2month, s2year)`` where the two dates are ``'YYYY-MM-DD'``
        strings and ``s2month`` / ``s2year`` are the integer month and year of the start date.

    Raises
    ------
    ValueError
        If the file name does not contain a ``<date>_to_<date>`` pair, or if the start date
        is not a valid calendar date.
    """
    import re  # local import so the function does not depend on the notebook's import cell

    # Keep only the last path component, whichever separator style the path uses
    file_name = re.split(r"[\\/]", str(raster_path))[-1]

    match = re.search(r"(\d{4}-\d{2}-\d{2})_to_(\d{4}-\d{2}-\d{2})", file_name)
    if match is None:
        raise ValueError(
            f"Could not find a '<YYYY-MM-DD>_to_<YYYY-MM-DD>' date range in file name: {file_name!r}"
        )
    start_date, end_date = match.groups()

    # Save the month and year of the start date
    parsed_start = datetime.strptime(start_date, '%Y-%m-%d')
    s2month = parsed_start.month
    s2year = parsed_start.year
    return start_date, end_date, s2month, s2year

# Quick check of the parser on the example file name: prints start date, end date, month and year
a, b, c, d = imageNamingGEEfiles("sentinel2_images_mean_2019-07-01_to_2019-08-01-0000006912-0000006912.tif")
print(a, b, c, d)

In [None]:
# Pathlib reference: the Path attributes anchor, parent, name, stem and suffixes.
# `name` is the attribute used to extract the dates from the file name.
s2_path_object = Path(s2_image_path)
for attribute in ("anchor", "parent", "name", "stem", "suffixes"):
    print(f"{attribute}: ", getattr(s2_path_object, attribute))
print("Normal print: ", s2_image_path)

In [None]:
# Get the start and end dates of the image from its name.
# Only the file name (Path(...).name) is passed, not the full path.
s2_image_start_date, s2_image_end_date, s2month, s2year = imageNamingGEEfiles(Path(s2_image_path).name)
print(s2_image_start_date, s2_image_end_date, s2month, s2year)

__________
Datetime operations needed to compare dates and to find all the dates that fall within the period represented by an image.

In [None]:
# Converting the date column to datetime.date values so it can be compared with the image dates
envelopes_gdf['date'] = pd.to_datetime(envelopes_gdf['date'], format='%Y-%m-%d').dt.date

# Converting the parsed image dates (strings) to datetime.date.
# The isinstance guards keep this cell re-runnable: on a second run the variables already hold
# datetime.date objects, which datetime.strptime would reject with a TypeError.
if isinstance(s2_image_start_date, str):
    s2_image_start_date = datetime.strptime(s2_image_start_date, '%Y-%m-%d').date()
if isinstance(s2_image_end_date, str):
    s2_image_end_date = datetime.strptime(s2_image_end_date, '%Y-%m-%d').date()

In [None]:
# Creating a mask to filter the dates that are needed.
# Both sides are normalised to pandas timestamps so the comparison works whether the column holds
# datetime.date objects or datetime64 values, and whether the image dates are strings or dates.
observation_dates = pd.to_datetime(envelopes_gdf['date'])
window_start = pd.Timestamp(s2_image_start_date)
window_end = pd.Timestamp(s2_image_end_date)

# The start of the window is inclusive and the end exclusive, so observations made on the first day
# of the period are kept (a strict '>' silently dropped them).
# NOTE(review): this assumes the image was exported over [start, end), as Earth Engine's filterDate
# does — confirm against the export script.
temporal_mask = (observation_dates >= window_start) & (observation_dates < window_end)

# .copy() makes the subset an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning or touch envelopes_gdf.
s2_image_gdf = envelopes_gdf.loc[temporal_mask].copy()
display(s2_image_gdf)

## Target extraction

In [None]:
def addMonths(gdf):
    """Return a copy of ``gdf`` with a datetime ``date`` column and an integer ``month`` column.

    The month is taken directly from the datetime accessor. The earlier implementation merged a
    month Series back on the index, which renamed ``date`` to ``date_x``, duplicated rows
    whenever the index was not unique, modified the caller's frame in place and failed when
    called a second time on its own output.

    Parameters
    ----------
    gdf : pandas.DataFrame or geopandas.GeoDataFrame
        Table with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    Same type as ``gdf``
        A new frame; the input is left untouched. ``date`` is converted to datetime and
        ``month`` holds the month number (1-12). An existing ``month`` column is overwritten.
    """
    # Work on a copy so the caller's frame (possibly a slice of another frame) is not modified
    result = gdf.copy()

    # Convert the date to datetime type so the .dt accessor is available
    result['date'] = pd.to_datetime(result['date'])

    # Month number (1-12) of every observation
    result['month'] = result['date'].dt.month
    return result

In [None]:
# Add the 'month' column to the full envelopes table; the per-station grouping further down uses it
envelopes_gdf = addMonths(envelopes_gdf)

In [None]:
# Check for indexing using specific observations
station = 4240
year = 2021
month = 8
# display() is needed here: a cell only renders its last expression automatically, so without it
# the filtered rows were computed and then silently discarded.
display(envelopes_gdf[(envelopes_gdf['s_id'] == station) & (envelopes_gdf['year'] == year) & (envelopes_gdf['month'] == month)])
envelopes_gdf.tail()

In [None]:
# Group observations by s_id, month and year, and keep the most frequent Label and phase_id
# of each group (the first entry of value_counts()).
def _most_frequent_label_and_phase(group):
    """Return the top value_counts() entry of 'Label' and of 'phase_id' for one group."""
    label_counts = group['Label'].value_counts()
    phase_counts = group['phase_id'].value_counts()
    return pd.Series({'Label': label_counts.index[0],
                      'phase_id': phase_counts.index[0]})

grouping_keys = ['s_id', pd.Grouper(key='month'), pd.Grouper(key='year')]
grouped_observations = envelopes_gdf.groupby(grouping_keys)
freqresults_df = grouped_observations.apply(_most_frequent_label_and_phase).reset_index()

In [None]:
# Rename the columns holding the most frequent label and phase_id
freqresults_df = freqresults_df.rename(columns={'Label': 'max_label', 'phase_id': 'max_phase_id'})

# A cell only renders its last expression, so the first two inspections need an explicit display();
# without it they were evaluated and thrown away.
display(freqresults_df.head())
display(freqresults_df['max_label'].value_counts())
freqresults_df['max_phase_id'].value_counts()

___________

## Spatial filter

In [None]:
# Add the 'month' column to the temporally filtered envelopes as well
s2_image_gdf = addMonths(s2_image_gdf)

In [None]:
# Convert the geometries of s2_image_gdf (the temporally filtered envelopes) to a plain list
# so they can be passed to the functions below
s2_image_gdf_list = s2_image_gdf.geometry.tolist()

In [None]:
# This is used to save the indices and then extract the targets directly from the gdf.
# The index labels are listed in the same row order as s2_image_gdf.
s2_image_gdf_index_list = s2_image_gdf.index.values.tolist()

In [None]:
# Spot check: the index labels at positions 150-159 should match the rows shown underneath.
# display() is needed for the first expression because a cell only renders its last one.
display(s2_image_gdf_index_list[150:160])
s2_image_gdf.iloc[150:160]

In [None]:
# Root folder for the exported patches; sub-folders are created inside it further down
output_dir = Path(r'C:\Users\Kostas\Desktop\GIMA\Module_7\Data\filtered_patches_GEE')

In [None]:
"""Function to parse features from GeoDataFrame in such a manner that rasterio wants them"""

def getFeatures(gdf):
    """Return the geometry of the first feature of ``gdf`` as a one-element list.

    The frame is serialised with ``gdf.to_json()`` and parsed back, so the geometry comes out
    as a plain GeoJSON-like dict, which is the form the rasterio functions in this notebook
    are given.

    Parameters
    ----------
    gdf : object with a ``to_json()`` method returning a GeoJSON FeatureCollection string
        In this notebook, a GeoDataFrame.

    Returns
    -------
    list
        ``[geometry_dict]`` for the first feature only; any further features are ignored.
    """
    feature_collection = json.loads(gdf.to_json())
    first_feature = feature_collection['features'][0]
    return [first_feature['geometry']]

In [None]:
'''
This function reads the envelope list and a raster, checks if the polygons are fully contained in the raster 
and returns 5 lists, 4 with the boundary coordinates for all the envelopes that are fully contained in the raster 
and one of their indexes from the full_index_list.
'''

def getContainedEnvelopeCoords(raster, envelope_list, full_index_list):
    """Collect the bounds of every envelope whose bounding box lies fully inside the raster extent.

    Parameters
    ----------
    raster : str or path-like
        Raster file opened with rasterio to read its bounds.
    envelope_list : list
        Geometries exposing a ``.bounds`` tuple ``(minx, miny, maxx, maxy)``.
    full_index_list : list
        Index labels, position-aligned with ``envelope_list``.

    Returns
    -------
    tuple of five lists
        ``(minx_list, miny_list, maxx_list, maxy_list, index_list)`` for the contained
        envelopes only, in their original order. Envelopes touching the raster edge count
        as contained.
    """
    with rio.open(raster, driver='GTiff') as src:
        left, bottom, right, top = src.bounds

        # One (minx, miny, maxx, maxy, index) record per envelope that is fully inside
        contained_records = []
        for position, envelope in enumerate(envelope_list):
            minx, miny, maxx, maxy = envelope.bounds
            inside_x = minx >= left and maxx <= right
            if inside_x and miny >= bottom and maxy <= top:
                contained_records.append((minx, miny, maxx, maxy, full_index_list[position]))

    # Split the records into the five parallel lists the callers expect
    minx_list = [record[0] for record in contained_records]
    miny_list = [record[1] for record in contained_records]
    maxx_list = [record[2] for record in contained_records]
    maxy_list = [record[3] for record in contained_records]
    index_list = [record[4] for record in contained_records]
    return minx_list, miny_list, maxx_list, maxy_list, index_list

In [None]:

'''
This function receives a raster file (.tif) and the boundary coordinates for a polygon. 
It then clips the raster to the extent of the polygon. 
The polygon has to intersect the raster for the operation to be completed
'''

from shapely.geometry import box
from rasterio.mask import mask

def exportImage(raster, output_path, minx, miny, maxx, maxy):
    """Clip ``raster`` to a bounding box and write the clipped patch to ``output_path``.

    Parameters
    ----------
    raster : str or path-like
        Source raster file, opened with rasterio.
    output_path : str or path-like
        Destination file for the clipped patch (written with the GTiff driver).
    minx, miny, maxx, maxy : float
        Bounding-box coordinates of the area to extract.

    Returns
    -------
    None
        The result is written to disk.

    Notes
    -----
    The source dataset is opened in a ``with`` block so its file handle is released after
    every call. It was previously left open, which leaks one handle per exported patch.
    """
    # Create a bounding box from the polygon min-max coordinates
    bbox = box(minx, miny, maxx, maxy)
    # Create a geodataframe with a single polygon so that it can be used with rasterio
    geo = gpd.GeoDataFrame({'geometry': bbox}, index=[0], crs='32632')
    # Transform the geodataframe to a GeoJSON-like object that can be used as an input in the rasterio mask function
    coords = getFeatures(geo)

    with rio.open(raster, driver='GTiff') as data:
        # Mask and crop the raster to the bounding box
        out_img, out_transform = mask(data, shapes=coords, crop=True)
        # Start from the source profile so dtype, band count, CRS etc. carry over
        out_profile = data.profile.copy()

    # The patch has its own size and georeferencing; out_img is indexed (band, row, column)
    out_profile.update({'driver':'GTiff', 'width': out_img.shape[2],'height': out_img.shape[1], 'transform': out_transform})

    # Write the extracted raster patch to a file
    with rio.open(output_path, 'w', **out_profile) as dst:
        dst.write(out_img)

In [None]:
# Bounds and index labels of the temporally filtered envelopes that lie fully inside the image
minx_list, miny_list, maxx_list, maxy_list, contained_index_list = getContainedEnvelopeCoords(s2_image_path, s2_image_gdf_list, s2_image_gdf_index_list)

In [None]:
# Look up the s_id of every contained envelope, in the same order as contained_index_list
contained_s_id_list = [s2_image_gdf.loc[row_label, 's_id'] for row_label in contained_index_list]

## Combination of everything to mine the labels

In [None]:
# Preview of the most-frequent label / phase_id table before using it for lookups
freqresults_df.head()

In [None]:
# All rows of the table for one example station (the fourth contained s_id)
freqresults_df[freqresults_df['s_id'] == contained_s_id_list[3]]

In [None]:
# Look up the label and phase_id of the example station for the image's year and month
condition = (freqresults_df['year'] == s2year) & (freqresults_df['month'] == s2month) & (freqresults_df['s_id'] == contained_s_id_list[3])
#freqresults_df['s_id'] == contained_s_id_list[3]
# .values[0] takes the first matching row and raises IndexError if no row matches
label = freqresults_df.loc[condition, 'max_label'].values[0]
phase_id = freqresults_df.loc[condition, 'max_phase_id'].values[0]
print(f'Station with ID {contained_s_id_list[3]}, for the year {s2year} and month {s2month} has the label {label} and phenophase with id {phase_id}')

In [None]:
# Same as above but iterating over s_ids. Works.
# Note: the loop variable reuses (and overwrites) the global name `station`.
for station in contained_s_id_list:
    condition = (freqresults_df['year'] == s2year) & (freqresults_df['month'] == s2month) & (freqresults_df['s_id'] == station)
    # .values[0] takes the first matching row and raises IndexError if no row matches
    label = freqresults_df.loc[condition, 'max_label'].values[0]
    phase_id = freqresults_df.loc[condition, 'max_phase_id'].values[0]
    print(f'Station with ID {station}, for the year {s2year} and month {s2month} has the label {label} and phenophase with id {phase_id}')

In [None]:
# Another preview of the table before deriving the output folder names from it
freqresults_df.head()

In [None]:
# Distinct most-frequent labels and phase ids; one of the two lists defines the output folders
unique_labels = freqresults_df['max_label'].unique().tolist()
unique_phase_ids = freqresults_df['max_phase_id'].unique().tolist()

# One sub-folder of output_dir per entry. Swap unique_phase_ids for unique_labels to organise by label instead.
for folder in unique_phase_ids:
    p = Path(output_dir) / str(folder)
    path_exists = p.exists()
    if not path_exists:
        print(f'Folder {folder} does not exist, creating it...')
        p.mkdir(parents=True, exist_ok=True)
    else:
        print(f'Folder {folder} already exists, skipping...')

In [None]:
# Debugging to find out why it saves everything as DBL

print("Creating patches for the image: ", Path(s2_image_path).name)

# Values that do not change inside the loop
total_patches = len(minx_list)
image_stem = Path(s2_image_path).stem

# Iterating over each contained envelope
# Reminder, minx_list, contained_index_list and contained_s_id_list have the same length with the same sequence.
for i in range(0, total_patches):
    # Get the index and station id for the station with the index in the i-th position
    station_id = contained_s_id_list[i]
    index = contained_index_list[i]

    # Condition to extract the max_label and max_phase_id from the freqresults_df, based on the s_id of the contained envelope
    condition = (freqresults_df['year'] == s2year) & (freqresults_df['month'] == s2month) & (freqresults_df['s_id'] == station_id)

    # Get the maximum frequency label and phase_id for the current envelope based on the month and year
    label = freqresults_df.loc[condition, 'max_label'].values[0]
    phase_id = freqresults_df.loc[condition, 'max_phase_id'].values[0]

    output_folder = str(phase_id)
    # Include the aforementioned information in the image name.
    # NOTE(review): the image stem and 'index_' are joined without a separator; left unchanged so
    # the generated file names stay the same as before.
    output_name = os.path.join(output_dir, output_folder, image_stem + f'index_{index}_station_{station_id}_label_{label}_phase_id_{phase_id}.tif')
    # The total is total_patches; the previous "+ 1" over-reported it by one
    print(f"\t Patch {i+1} out of {total_patches}")

    # Export the patches
    exportImage(s2_image_path, output_name, minx_list[i], miny_list[i], maxx_list[i], maxy_list[i])
print('Patch creation completed!')

In [None]:
# Check what should be the output values from the freqresults_df of the PEP725 database depending on year and month.
# Only stations contained in the current image are considered; the years are hard-coded.
for i in [2017, 2018, 2019, 2020, 2021]:
    print(f'Checking for year {i}')
    # j runs over the months 1-12
    for j in range(1,13):
        condition = (freqresults_df['year'] == i) & (freqresults_df['month'] == j) & (freqresults_df['s_id'].isin(contained_s_id_list))
        test_df = freqresults_df[condition]
        # Distinct phase ids and labels found for this year/month combination
        print('Month:', j, ' ',test_df.max_phase_id.unique(), test_df.max_label.unique())
    print(' ')