### Split the data on the disk into a train, validation and test set

1. Create one folder for each tile, containing the folders from every season for this tile
2. Randomly sample tiles for the train, validation and test sets
3. Copy the tiles from the disk to the train, validation and test folders

In [41]:
from pathlib import Path
import re
import os
import random
import shutil

In [4]:
# Compiled pattern for product identifier strings. It is written with
# re.VERBOSE, so the line breaks and indentation inside the pattern are ignored
# and the parts are matched back to back.
#
# A string the pattern matches looks like
#   S2A_MSIL2A_20180601T102021_N0208_R065_T32UNA_20180601T123456
# NOTE(review): this appears to follow the Sentinel-2 product naming
# convention - confirm against the data source.
#
# Named groups, in order of appearance: mission, product_level, sensing_time,
# processing_baseline, relative_orbit, utm_code, latitude_band, square, year,
# month, day and product_time. utm_code + latitude_band + square together form
# the tile name used by get_tile_and_date.
IDENTIFIER_REGEX = re.compile(
    r"""(?P<mission>S2[A-B])_MSI
        (?P<product_level>L[1-2][A-C])_
        (?P<sensing_time>\d{8}T\d{6})_
        (?P<processing_baseline>N\d{4})_
        (?P<relative_orbit>R\d{3})_T
        (?P<utm_code>\d{2})
        (?P<latitude_band>\w{1})
        (?P<square>\w{2})_
        (?P<year>\d{4})
        (?P<month>\d{2})
        (?P<day>\d{2})T
        (?P<product_time>\d{6})""",
    re.VERBOSE,
)

In [6]:
def get_tile_and_date(identifier: str):
    """Extract the tile name and the product date from an identifier.

    Args:
        identifier: Text that is searched for a match of ``IDENTIFIER_REGEX``.

    Returns:
        A ``(tile, tile_date)`` tuple. ``tile`` is the UTM code, latitude band
        and square joined without separators. ``tile_date`` has the form
        ``year-month-day`` with leading zeros stripped from month and day.
        ``(None, None)`` is returned when the pattern is not found.
    """
    found = IDENTIFIER_REGEX.search(identifier)
    if found is None:
        return None, None

    # Fetch all groups that are used verbatim in a single call.
    utm_code, latitude_band, square, year = found.group(
        "utm_code", "latitude_band", "square", "year"
    )
    # The round trip through int() drops the leading zeros.
    month, day = (str(int(found.group(part))) for part in ("month", "day"))

    return f"{utm_code}{latitude_band}{square}", f"{year}-{month}-{day}"

In [47]:
# Directory that holds the preprocessed image files; the cells below scan it
# for "*.pt" files.
images_prepro_path = Path(r'C:\Users\Fabian\Documents\Github_Masterthesis\Solarpark-detection\data_local\images_only_AOI_test_color_corr_cleaned')
# Directory that holds the mask files. The copy step looks up each mask under
# the same file name as its image.
masks_prepro_path = Path(r'C:\Users\Fabian\Documents\Github_Masterthesis\Solarpark-detection\data_local\masks_only_AOI_test_color_corr_cleaned')

## Create a list of all tiles which are available on the disk

In [None]:
def index_to_filename(image_dir: Path, set_list: list) -> list:
    """Return the paths of all ``*.pt`` files in ``image_dir`` that belong to a set.

    File stems are expected to look like ``<tile>_<number>_<date>``. A file is
    selected when its ``(tile, number)`` pair is contained in ``set_list``.
    Files whose stem does not consist of exactly three ``_``-separated parts
    are skipped, so one stray file no longer aborts the whole scan.

    Args:
        image_dir: Directory that is searched (non-recursively) for ``*.pt``
            files.
        set_list: Collection of ``(tile, number)`` tuples that make up the set.

    Returns:
        The matching file paths as strings, in the order in which
        ``Path.glob`` yields them.
    """
    # Build the lookup once so that every file costs one hash lookup instead of
    # a scan of the whole list. Unhashable entries can never equal a tuple of
    # strings; in that case keep the original container and its semantics.
    try:
        wanted = set(set_list)
    except TypeError:
        wanted = set_list

    set_filenames = []
    for file_path in image_dir.glob("*.pt"):
        parts = file_path.stem.split("_")
        if len(parts) != 3:
            # Not a <tile>_<number>_<date> file name.
            continue
        tile, number, _date = parts
        if (tile, number) in wanted:
            set_filenames.append(str(file_path))
    return set_filenames

In [52]:
# Collect the tile name of every image file on the disk.
tile_list = []
for file_path in images_prepro_path.glob("*.pt"):
    # File stems have the form <tile>_<number>_<date>; only the tile is needed.
    tile, _number, _date = file_path.stem.split("_")
    tile_list.append(tile)

# Reduce to the unique tiles and sort them. The iteration order of a set of
# strings changes between interpreter sessions (hash randomization) and the
# glob order depends on the file system, so without sorting the seeded shuffle
# would produce a different train/val/test split on every run.
tiles_unique = sorted(set(tile_list))

In [None]:
# Shuffle the tiles in place with a fixed seed of the global random generator.
random.seed(42)
random.shuffle(tiles_unique)

# Split sizes: 70 % of the tiles for train and 10 % for val, both rounded down
# by int().
num_total = len(tiles_unique)
num_train, num_val = (int(num_total * share) for share in (0.7, 0.1))

In [54]:
# Cut the shuffled tile list into three consecutive, non-overlapping slices:
# the first num_train tiles are train, the next num_val tiles are val and
# whatever remains is test.
val_end = num_train + num_val
train_list, val_list, test_list = (
    tiles_unique[:num_train],
    tiles_unique[num_train:val_end],
    tiles_unique[val_end:],
)

In [56]:
# Root folder of the split data. Despite its name, train_dir holds all three
# sets; each set gets an "images" and a "masks" sub-folder.
train_dir = Path(r'C:\Users\Fabian\Documents\Github_Masterthesis\Solarpark-detection\data_local\data_splitted')
for set_name in ("train", "val", "test"):
    for subfolder in ("images", "masks"):
        # parents=True also creates train_dir / set_name when it is missing;
        # exist_ok=True makes re-running the cell harmless.
        (train_dir / set_name / subfolder).mkdir(parents=True, exist_ok=True)

In [69]:
# Copy every image together with its mask into the folder of the set its tile
# was assigned to. The sets are checked in the order train, val, test.
split_members = (("train", train_list), ("val", val_list), ("test", test_list))
for image_file in images_prepro_path.glob("*.pt"):
    filename = image_file.name
    # File stems have the form <tile>_<number>_<date>.
    tile, _, _ = image_file.stem.split("_")

    target_set = next(
        (name for name, members in split_members if tile in members), None
    )
    if target_set is None:
        print("Error: Tile not in any set!")
        continue

    # The mask is stored under the same file name as the image.
    for source_dir, kind in ((images_prepro_path, "images"), (masks_prepro_path, "masks")):
        shutil.copyfile(source_dir / filename, train_dir / target_set / kind / filename)

In [73]:
# Sanity check: report how many image and mask files ended up in each set.
def _count_files(pattern):
    """Return the number of paths below train_dir that match the glob pattern."""
    return sum(1 for _ in train_dir.glob(pattern))


print("Number of files in each set:")
print(f'Train images: {_count_files("train/images/*.pt")}')
print(f'Train masks: {_count_files("train/masks/*.pt")}')
print(f'Val images: {_count_files("val/images/*.pt")}')
print(f'Val masks: {_count_files("val/masks/*.pt")}')
print(f'Test images: {_count_files("test/images/*.pt")}')
print(f'Test masks: {_count_files("test/masks/*.pt")}')

Number of files in each set:
Train images: 3832
Train masks: 3832
Val images: 489
Val masks: 489
Test images: 342
Test masks: 342


In [None]:
# Totals over all three sets. Images and masks are counted separately: a
# recursive "**/*.pt" glob would also pick up every mask file, so a single
# figure labelled "Images" would count masks as images.
print("Sum of all files:")
print(f'Images: {len(list(train_dir.glob("*/images/*.pt")))}')
print(f'Masks: {len(list(train_dir.glob("*/masks/*.pt")))}')