Read the buffers file

In [1]:
import pandas as pd
import geopandas as gpd
import pathlib

In [2]:
path = pathlib.Path(r"C:\Users\Kostas\Desktop\GIMA\Module_7\Data\PEP725\After_2016_sent_from_PEP725\pep725_outputs\PEP725_buffers.geojson")
buffers = gpd.read_file(path).set_crs(32632, inplace=True, allow_override=True)

# Create envelopes for the buffers
envelope_series = buffers.geometry.envelope
envelope_series.rename('envelope_geometry', inplace=True)
envelope_gdf = buffers.merge(envelope_series, left_index=True, right_index=True)
envelope_gdf = envelope_gdf.drop(['geometry'], axis=1).set_geometry('envelope_geometry').rename_geometry('geometry')

# Change the envelope to a list to use it later
envelope_list = envelope_gdf.geometry.tolist()
# Creating a list of tuples that will be used to preserve the indexing information of the GeoDataFrame.
# This may be of use later, to get information from the GeoDataFrame and put it in the image, e.g., a label such as the class (DBL, EC, M).
envelope_list_with_index = []
for index, row in envelope_gdf.iterrows():
    envelope_list_with_index.append((index, row['geometry'], row['s_id']))

In [3]:
print(type(envelope_list_with_index))

<class 'list'>


In [4]:
print(envelope_list_with_index[3])

(3, <shapely.geometry.polygon.Polygon object at 0x000001ECB8AD49D0>, 2021)


In [5]:
envelope_gdf.head()

Unnamed: 0,s_id,lon,lat,alt,alt_dem,gss_id,genus,species,phase_id,year,day,date,Label,geometry
0,5363,13.9167,54.0833,2,0,1050100,Alnus,Alnus glutinosa,60,2017,27,2017-01-27,DBL,"POLYGON ((817520.468 5999973.720, 825520.468 5..."
1,1554,7.51667,51.7333,60,72,2210500,Salix,Salix caprea,60,2017,29,2017-01-29,DBL,"POLYGON ((393567.206 5728416.903, 401567.206 5..."
2,3120,8.68333,49.55,140,261,1050100,Alnus,Alnus glutinosa,60,2017,31,2017-01-31,DBL,"POLYGON ((473094.080 5484647.767, 481094.080 5..."
3,2021,8.58333,50.0,100,101,1050100,Alnus,Alnus glutinosa,60,2017,32,2017-02-01,DBL,"POLYGON ((466138.525 5534713.881, 474138.525 5..."
4,1521,7.83333,51.7,60,58,1050100,Alnus,Alnus glutinosa,60,2017,33,2017-02-02,DBL,"POLYGON ((415374.473 5724316.443, 423374.473 5..."


In [6]:
# Converting the date column to datetime data type

envelope_gdf['date'] = pd.to_datetime(envelope_gdf['date'], format='%Y-%m-%d').dt.date

In [7]:
# Extract a list with the stations.
stations_list = envelope_gdf['s_id'].unique().tolist()

In [8]:
import glob
from zipfile import ZipFile
import fnmatch
import datetime
from datetime import timedelta

sentinel_2_directory = r"C:\Users\Kostas\Desktop\GIMA\Module_7\Data\Sentinel2_images"

# Create a list of all the Sentinel-2 zipfiles
sentinel_2_zip_list = glob.glob(str(sentinel_2_directory) + '/*.zip', recursive=True)

# Get the date from the Sentinel-2 zip archive name. Returns a datetime class object
def getS2Date(s2zip_path):
    string_parts = s2zip_path.split("_")
    band_string = string_parts[-1]
    band_string = band_string.replace('.zip','')
    string_parts = band_string.split("T")
    s2_date = string_parts[0]
    s2_time = string_parts[1]
    s2_date = datetime.datetime.strptime(s2_date, "%Y%m%d").date()
    #print(band_string)
    #print(s2_date, s2_time)
    return s2_date#, s2_time This is in case you want the time as well. It will be returned as a tuple

for s2zip_path in sentinel_2_zip_list:
    s2date = getS2Date(s2zip_path)
    print(s2date)






2017-04-20
2022-10-22
2020-09-21


In [9]:
# Get start and end date
date_start = getS2Date(sentinel_2_zip_list[0])
date_end = getS2Date(sentinel_2_zip_list[1])
print(f"from: {date_start} \tto: {date_end}")


from: 2017-04-20 	to: 2022-10-22


In [33]:
# New try 22feb
for i_id in range(0, len(sentinel_2_zip_list) - 1):
    print(i_id)
    print(sentinel_2_zip_list[i_id])
    image_1 = sentinel_2_zip_list[i_id]
    image_2 = sentinel_2_zip_list[i_id + 1]

    date_start = getS2Date(image_1)
    date_end = getS2Date(image_2)
    # create an empty list to store the unique station results
    results = []
    freqresults = []

    # Create an empty dataframe to store the frequency results
    freqresults_df = pd.DataFrame(columns=['s_id', 'max_label', 'max_phase_id'])

    # iterate over the station IDs
    for s_id in stations_list:
        # Filter the geodataframe to include only the rows with the current station ID
        # mask is a boolean Series, with True in the places where the station IDs match.
        mask = envelope_gdf['s_id'] == s_id
        
        # Filter the geodataframe to include only the rows with dates between 'date_start' and 'date_end'
        # Now, the dates between date_start and date_end are assigned a True value
        # Because &= is used, the ultimate values that kept are the ones that meet both the station matching and the date matching criteria
        # The mask is then updated to be used next
        mask &= (envelope_gdf['date'] >= date_start) & (envelope_gdf['date'] < date_end)
        
        # Extract the relevant columns for the filtered rows. Here, the mask is used as an index because it shares the same indices with envelope_gdf
        # This way the only data that are passed are the ones for which the criteria mentioned above match.
        filtered_gdf = envelope_gdf.loc[mask, ['s_id', 'date', 'Label', 'phase_id']]
        
        # Append the filtered data to the results list. This will be used to concatenate them to the final_df
        results.append(filtered_gdf)
        
        # This part finds the 'Label' that has the highest frequency

        # Check if the filtered geodataframe is not empty
        if not filtered_gdf.empty:
            # Group the filtered geodataframe by station ID and label, and count the frequency of each label
            label_counts = filtered_gdf.groupby(['s_id', 'Label']).size().reset_index(name='count')
            
            # Find the label that has the highest frequency for the current station ID
            max_label = label_counts.loc[label_counts['s_id'] == s_id, 'Label'][label_counts.loc[label_counts['s_id'] == s_id, 'count'].idxmax()]

            phase_id_counts = filtered_gdf.groupby(['s_id', 'phase_id']).size().reset_index(name='count2')

            max_phase_id = phase_id_counts.loc[phase_id_counts['s_id'] == s_id, 'phase_id'][phase_id_counts.loc[phase_id_counts['s_id'] == s_id, 'count2'].idxmax()]
            
            # Add the result to the freqresults dataframe
            freqresults_df = freqresults_df.append({'s_id': s_id, 'max_label': max_label, 'max_phase_id': max_phase_id}, ignore_index=True)

    # Concatenate the filtered data from all stations into a single geodataframe
    final_df = pd.concat(results)
    break

0
C:\Users\Kostas\Desktop\GIMA\Module_7\Data\Sentinel2_images\S2A_MSIL2A_20170420T103021_N0204_R108_T32UNB_20170420T103454.zip


  freqresults_df = freqresults_df.append({'s_id': s_id, 'max_label': max_label, 'max_phase_id': max_phase_id}, ignore_index=True)
  freqresults_df = freqresults_df.append({'s_id': s_id, 'max_label': max_label, 'max_phase_id': max_phase_id}, ignore_index=True)
  freqresults_df = freqresults_df.append({'s_id': s_id, 'max_label': max_label, 'max_phase_id': max_phase_id}, ignore_index=True)
  freqresults_df = freqresults_df.append({'s_id': s_id, 'max_label': max_label, 'max_phase_id': max_phase_id}, ignore_index=True)
  freqresults_df = freqresults_df.append({'s_id': s_id, 'max_label': max_label, 'max_phase_id': max_phase_id}, ignore_index=True)
  freqresults_df = freqresults_df.append({'s_id': s_id, 'max_label': max_label, 'max_phase_id': max_phase_id}, ignore_index=True)
  freqresults_df = freqresults_df.append({'s_id': s_id, 'max_label': max_label, 'max_phase_id': max_phase_id}, ignore_index=True)
  freqresults_df = freqresults_df.append({'s_id': s_id, 'max_label': max_label, 'max_phase

In [34]:
final_df.tail(10)

Unnamed: 0,s_id,date,Label,phase_id
113349,2896,2021-11-19,DBL,95
107677,558,2021-08-31,DBL,286
108791,558,2021-09-23,DBL,286
110667,558,2021-10-12,DBL,205
110968,558,2021-10-15,DBL,205
112013,558,2021-10-24,DBL,205
112549,558,2021-10-30,DBL,95
129290,558,2021-07-16,M,131
113272,685,2021-11-16,DBL,95
127753,2558,2021-04-02,M,182


In [41]:
freqresults_df.tail()

Unnamed: 0,s_id,max_label,max_phase_id
1226,409,DBL,286
1227,2896,DBL,205
1228,558,DBL,205
1229,685,DBL,95
1230,2558,M,182


In [48]:
freqresults_df.loc[freqresults_df['max_label'] == 'DBL']

Unnamed: 0,s_id,max_label,max_phase_id
0,5363,DBL,60
1,3120,DBL,60
2,2021,DBL,60
3,1521,DBL,60
4,1710,DBL,60
...,...,...,...
1225,2509,DBL,60
1226,409,DBL,286
1227,2896,DBL,205
1228,558,DBL,205


In [29]:
unique_dates = pd.unique(final_df['date'])
print(unique_dates[0])

2017-05-11


In [25]:
for d in unique_dates:
    

2017-05-11


In [None]:
### TODO next after meeting with Mahdi 15 Feb

import pandas as pd

final_res = pd.DataFrame()

unique_dates = pd.unique(final_res.date)

def get_image_paths(date):
    image_paths = []
    # load
    return image_paths

def load_image(path):
    pass

for d in unique_dates:
    image_paths = get_image_paths(d)

    for p in image_paths:
         image = load_image(p)

        res_for_the_image = final_res[#Filter based the boundary of the image and date d]

        # iterate over stations within res_for_the_image
        # patch for the station
        # save the patch into the correct folder

In [None]:
def get_data(station, vegetation_class, start_date, end_date):
    data = [station, ]

def calc_target_highest_freq(data):
    pass


stations = []
veg_classes = []
for i_id in range(0, len(sentinel_2_zip_list) - 1):
    image_1 = sentinel_2_zip_list[i_id]
    image_2 = sentinel_2_zip_list[i_id + 1]

    date_start = getS2Date(image_1)
    date_end = getS2Date(image_2)
    dates = [date.strftime("%Y-%m-%d") for date in date_range(date_start, date_end)]
    print(f'Timeslot {i_id+1}:', date_start, date_end)
    data_list = []
    for s in range(0, len(stations_list)):
            for d in range(0, len(dates)):
                if dates[d] == dates_list[d]:
                    data_list = [stations_list[s], veg_classes_list[s], dates_list[s]]
                    print(data_list)
            # data = get_data(s, v, date_start, date_end)
            # target = calc_target_highest_freq(data)

In [None]:
image_1 = sentinel_2_zip_list[0]
image_2 = sentinel_2_zip_list[1]

date_start = getS2Date(image_1)
date_end = getS2Date(image_2)
print(date_start, date_end)
date_range(date_start, date_end)
# Create a list with all the inbetween dates
dates = [date.strftime("%Y-%m-%d") for date in date_range(date_start, date_end)]



In [None]:
date_start = str(getS2Date(sentinel_2_zip_list[0]))
date_end = str(getS2Date(sentinel_2_zip_list[1]))

a = dates[0]
a = datetime.datetime.strptime(a, "%Y-%m-%d").date()
print(a)
print(type(a))