In [30]:
import geopandas as gpd
from datetime import datetime, date
import re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [25]:
# Pattern for product names of the form
#   S2A_MSIL2A_20180409T101031_N0207_R022_T33UUU_20180409T154909
# NOTE(review): this looks like the Sentinel-2 product naming scheme — confirm
# against the data provider's documentation.
#
# Named groups, in order of appearance:
#   mission, product_level, sensing_time, processing_baseline, relative_orbit,
#   utm_code + latitude_band + square (together the tile name after the "T"),
#   and the trailing timestamp split into year / month / day / product_time.
#
# The pattern carries no ^/$ anchors, so with `search` it can match anywhere
# inside a longer string (e.g. a name with an extension appended).
IDENTIFIER_REGEX = re.compile(
    r"""(?P<mission>S2[A-B])_MSI
        (?P<product_level>L[1-2][A-C])_
        (?P<sensing_time>\d{8}T\d{6})_
        (?P<processing_baseline>N\d{4})_
        (?P<relative_orbit>R\d{3})_T
        (?P<utm_code>\d{2})
        (?P<latitude_band>\w{1})
        (?P<square>\w{2})_
        (?P<year>\d{4})
        (?P<month>\d{2})
        (?P<day>\d{2})T
        (?P<product_time>\d{6})""",
    re.VERBOSE,  # whitespace and newlines inside the pattern are layout only
)

In [26]:
def get_tile_and_date(identifier: str):
    """Extract the tile name and the trailing date from a product identifier.

    Args:
        identifier: String searched with ``IDENTIFIER_REGEX``; the pattern may
            match anywhere inside it.

    Returns:
        A ``(tile, tile_date)`` tuple of strings. ``tile`` is the
        concatenation of the ``utm_code``, ``latitude_band`` and ``square``
        groups (e.g. ``"33UUU"``). ``tile_date`` is ``"<year>-<month>-<day>"``
        with month and day written without leading zeros (e.g. ``"2018-4-9"``).
        ``(None, None)`` is returned when the identifier does not match.
    """
    found = IDENTIFIER_REGEX.search(identifier)
    if found is None:
        return None, None

    parts = found.groupdict()

    # NOTE(review): the date is taken from the trailing timestamp groups
    # (year / month / day), not from `sensing_time` — confirm this is the date
    # that should drive downstream season assignment.
    date_fields = [
        parts["year"],
        str(int(parts["month"])),  # int() round-trip drops the leading zero
        str(int(parts["day"])),
    ]

    tile = parts["utm_code"] + parts["latitude_band"] + parts["square"]
    tile_date = "-".join(date_fields)
    return tile, tile_date

In [3]:
# Load the polygon masks from the GeoJSON file (path is relative to the
# notebook's working directory).
masks_path = "data/trn_polygons_germany_tile_names.geojson"
masks_gdf = gpd.read_file(masks_path)

In [4]:
# Number of rows (one per polygon) that were loaded.
masks_gdf.shape[0]

23222

In [5]:
# Preview the first five rows to check the columns that were read.
masks_gdf.head(n=5)

Unnamed: 0,osm_id,tile_name,centroid_of_tile,geometry
0,483534385.0,33UVS,POINT (14.35719383285053 50.955770194506044),"POLYGON ((13.88598 50.89644, 13.88598 50.89642..."
1,359898843.0,33UVT,POINT (14.344460359916278 51.85505191805214),"POLYGON ((13.95746 51.53471, 13.95746 51.53468..."
2,359898992.0,33UVT,POINT (14.344460359916278 51.85505191805214),"POLYGON ((13.95746 51.53479, 13.95746 51.53477..."
3,359899222.0,33UVT,POINT (14.344460359916278 51.85505191805214),"POLYGON ((13.95746 51.53487, 13.95746 51.53484..."
4,359899000.0,33UVT,POINT (14.344460359916278 51.85505191805214),"POLYGON ((13.95745 51.53494, 13.95745 51.53491..."


In [13]:
# Reproject the geometries to EPSG:25832 before measuring, then store the
# resulting area as a new column on the frame.
# NOTE(review): presumably chosen so that `.area` is expressed in square
# metres — confirm the CRS units and that one zone is acceptable for all tiles.
projected_geometry = masks_gdf["geometry"].to_crs("EPSG:25832")
masks_gdf["area"] = projected_geometry.area

In [18]:
# Show the five polygons with the smallest area (ascending sort is the default).
# NOTE(review): the saved output lists a `transformed_geo` column that no
# visible cell creates — it comes from kernel state outside this notebook view.
masks_gdf.sort_values("area").head(n=5)

Unnamed: 0,osm_id,tile_name,centroid_of_tile,geometry,area,transformed_geo
8892,363524413.0,32UMV,POINT (8.380958387924386 49.15731928511489),"POLYGON ((8.52661 49.21162, 8.52663 49.21161, ...",2.166341,"POLYGON ((465521.782 5451088.866, 465523.333 5..."
8890,363524441.0,32UMV,POINT (8.380958387924386 49.15731928511489),"POLYGON ((8.52659 49.21161, 8.52661 49.21160, ...",2.173492,"POLYGON ((465520.739 5451087.366, 465522.283 5..."
8902,363524209.0,32UMV,POINT (8.380958387924386 49.15731928511489),"POLYGON ((8.52669 49.21170, 8.52671 49.21169, ...",2.1747,"POLYGON ((465527.936 5451097.837, 465529.480 5..."
8900,363524685.0,32UMV,POINT (8.380958387924386 49.15731928511489),"POLYGON ((8.52668 49.21169, 8.52670 49.21168, ...",2.177647,"POLYGON ((465526.846 5451096.220, 465528.391 5..."
19241,408820216.0,31UGS,POINT (6.624872784754955 50.90124511611022),"POLYGON ((7.00398 50.80755, 7.00399 50.80753, ...",3.407226,"POLYGON ((359367.161 5630323.100, 359368.129 5..."


In [19]:
# Keep only polygons whose computed area is strictly greater than 100
# (same units as the "area" column), then report how many remain.
is_large = masks_gdf["area"] > 100
large_polygons = masks_gdf.loc[is_large]
len(large_polygons)

16464

In [22]:
# Date windows used to assign a tile to a season of 2018. Both bounds are
# treated as inclusive by the cells that consume this mapping.
# NOTE(review): "autumn" ends on 30 November, so December dates fall into no
# season and are dropped downstream — confirm this is intended.
seasons_dict = dict(
    winter=dict(start_date=date(2018, 1, 1), end_date=date(2018, 3, 31)),
    spring=dict(start_date=date(2018, 4, 1), end_date=date(2018, 6, 30)),
    summer=dict(start_date=date(2018, 7, 1), end_date=date(2018, 9, 30)),
    autumn=dict(start_date=date(2018, 10, 1), end_date=date(2018, 11, 30)),
)

In [31]:
# Count how many entries of the raw training-data folder fall into each season.
# NOTE(review): absolute, user-specific path — the cell only runs on a machine
# with this exact folder layout.
directory = Path(r"C:\Users\Fabian\Documents\Github_Masterthesis\Solarpark-detection\data_local\training_data_raw")
tiles_per_season = dict.fromkeys(("winter", "spring", "summer", "autumn"), 0)

for filename in os.listdir(directory):
    tile, tile_date = get_tile_and_date(filename)
    # Entries whose name does not match the identifier pattern are ignored.
    if tile is None or tile_date is None:
        continue

    # tile_date has no zero padding ("2018-4-9"); strptime accepts that form.
    tile_datetime = datetime.strptime(tile_date, "%Y-%m-%d").date()

    # The first season window containing the date wins; a date outside every
    # window is not counted at all.
    for season_name, season_dates in seasons_dict.items():
        if season_dates["start_date"] <= tile_datetime <= season_dates["end_date"]:
            tiles_per_season[season_name] += 1
            break

# Report the per-season counts and their sum (unassigned entries excluded).
total_tiles = sum(tiles_per_season.values())
for season, count in tiles_per_season.items():
    print(f"{season}: {count} tiles")
print(f"Total: {total_tiles} tiles")

winter: 2 tiles
spring: 60 tiles
summer: 77 tiles
autumn: 53 tiles
Total: 192 tiles


In [51]:
from collections import defaultdict

# Nested mapping: season -> tile -> list of entry names found for that tile.
# NOTE(review): this rebinds `tiles_per_season`, which an earlier cell uses
# for plain integer counts.
tiles_per_season = defaultdict(lambda: defaultdict(list))

for filename in os.listdir(directory):
    tile, tile_date = get_tile_and_date(filename)
    # Entries whose name does not match the identifier pattern are ignored.
    if tile is None or tile_date is None:
        continue

    tile_datetime = datetime.strptime(tile_date, "%Y-%m-%d").date()

    # Record the entry under the first season window that contains its date;
    # dates outside every window are not recorded.
    for season_name, season_dates in seasons_dict.items():
        if season_dates["start_date"] <= tile_datetime <= season_dates["end_date"]:
            tiles_per_season[season_name][tile].append(filename)
            break

# For every season, report the tiles that were seen more than once.
for season, tiles in tiles_per_season.items():
    duplicate_tiles = {
        name: entries for name, entries in tiles.items() if len(entries) > 1
    }
    print(f"{season}: {len(duplicate_tiles)} duplicate tiles: {duplicate_tiles}")

winter: 0 duplicate tiles: {}
spring: 0 duplicate tiles: {}
summer: 0 duplicate tiles: {}
autumn: 0 duplicate tiles: {}


In [50]:
import shutil

# Destination folder for the surplus acquisitions of a tile within one season.
# NOTE(review): absolute, user-specific path — the cell only runs on a machine
# with this exact folder layout.
copy_directory = r'C:\Users\Fabian\Documents\Github_Masterthesis\Solarpark-detection\data_local\training_data_raw_duplicated_seasons'

# Relies on `tiles_per_season` being the season -> tile -> [entries] mapping
# and on `directory` pointing at the raw training-data folder.
for season, tiles in tiles_per_season.items():
    # Tiles that appear more than once within this season.
    duplicate_tiles = {tile: files for tile, files in tiles.items() if len(files) > 1}
    print(f"{season}: {len(duplicate_tiles)} duplicate tiles: {duplicate_tiles}")

    for tile, files in duplicate_tiles.items():
        # The first entry stays in place; every further entry is moved away.
        # NOTE(review): "first" follows the listing order used when the
        # mapping was built, not the acquisition date — confirm that is fine.
        for file in files[1:]:
            src_dir = os.path.join(directory, file)
            dst_dir = os.path.join(copy_directory, file)

            # The mapping may be stale when the cell is re-run: entries moved
            # by a previous run are reported and skipped instead of crashing.
            if not os.path.exists(src_dir):
                print(f"Skipping {file}: not found in {directory}")
                continue

            # shutil.move would nest the source inside an existing destination
            # directory, so refuse explicitly rather than scatter data.
            if os.path.exists(dst_dir):
                raise FileExistsError(f"Destination already exists: {dst_dir}")

            # Create the target folder lazily so nothing is created when there
            # are no duplicates to move.
            os.makedirs(copy_directory, exist_ok=True)

            # A single move replaces copytree + rmtree: it renames in place on
            # the same filesystem (no full copy) and also handles plain files.
            shutil.move(src_dir, dst_dir)

winter: 0 duplicate tiles: {}
spring: 8 duplicate tiles: {'33UUU': ['S2A_MSIL2A_20180409T101031_N0207_R022_T33UUU_20180409T154909', 'S2B_MSIL2A_20180606T102019_N0208_R065_T33UUU_20180606T190659'], '33UVV': ['S2A_MSIL2A_20180409T101031_N0207_R022_T33UVV_20180409T154909', 'S2B_MSIL2A_20180606T102019_N0208_R065_T33UVV_20180606T190659'], '33UUP': ['S2A_MSIL2A_20180419T101031_N0207_R022_T33UUP_20180419T111252', 'S2A_MSIL2A_20180608T101021_N0208_R022_T33UUP_20180608T131739'], '32ULB': ['S2A_MSIL2A_20180508T104031_N0207_R008_T32ULB_20180508T175127', 'S2B_MSIL2A_20180420T103019_N0207_R108_T32ULB_20180420T114307'], '32UNF': ['S2A_MSIL2A_20180607T104021_N0208_R008_T32UNF_20180607T132721', 'S2B_MSIL2A_20180420T103019_N0207_R108_T32UNF_20180420T114307'], '32TPT': ['S2B_MSIL2A_20180407T102019_N0207_R065_T32TPT_20180407T143030', 'S2B_MSIL2A_20180616T102019_N0208_R065_T32TPT_20180616T162410'], '32UQE': ['S2B_MSIL2A_20180507T102019_N0207_R065_T32UQE_20180507T123037', 'S2B_MSIL2A_20180606T102019_N0208_

In [45]:
# Display the current value of `duplicate_tiles`. The name is rebound on every
# iteration of the per-season loops, so this shows only the duplicates of the
# last season processed by whichever loop ran most recently.
duplicate_tiles

{'32ULB': ['S2B_MSIL2A_20181010T104019_N0209_R008_T32ULB_20181010T171128',
  'S2B_MSIL2A_20181116T103309_N0210_R108_T32ULB_20181116T152612']}