## Overview

This notebook moves downloaded images to `prepare_dataset`. The process is summarized as follows:

- Images downloaded from `notebooks/download.ipynb` will be stored in `download_file` folder.
- Under `download_file` folder, images will be grouped according to the polygon they belong to.
- Each polygon folder contains two types of images: a GeoTIFF with band information and a mask file.
- Geotiffs should be moved and stored in `prepare_dataset/images_directory{group_id}` folder.
- Mask files are stored in `prepare_dataset/mask_directory{group_id}` folder.

In [1]:
import ast
import csv
import glob
import json
import os
import shutil
import sys

import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def fast_scandir(dirname: str) -> list:
    """
    Recursively collect the paths of every directory below ``dirname``.

    The direct child directories come first, followed by the descendants of
    each child in turn. Plain files are ignored and ``dirname`` itself is
    not part of the result.
    """
    with os.scandir(dirname) as entries:
        child_dirs = [entry.path for entry in entries if entry.is_dir()]

    descendants = []
    for child in child_dirs:
        descendants += fast_scandir(child)
    return child_dirs + descendants

# Root folder that holds the downloaded Sentinel-2 scenes.
source_path = "../datasets/sentinel2"
subfolders_list = fast_scandir(source_path)
print(f"Number of subfolders: {len(subfolders_list)}")

# Show only a short preview: the scan returns thousands of folders, and
# printing all of them floods the cell output without adding information.
for folder in subfolders_list[:20]:
    print(folder)


Number of subfolders: 4520
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A
../datasets/sentinel2/S2A_10UDU_20220920_0_L2A
../datasets/sentinel2/S2B_9UXR_20221117_0_L2A
../datasets/sentinel2/S2A_10UCA_20220808_0_L2A
../datasets/sentinel2/S2A_9UXR_20220725_0_L2A
../datasets/sentinel2/S2B_9UYQ_20221015_0_L2A
../datasets/sentinel2/S2B_10UCA_20221002_0_L2A
../datasets/sentinel2/S2B_10UCB_20220925_0_L2A
../datasets/sentinel2/S2B_9UWR_20221117_0_L2A
../datasets/sentinel2/S2B_10UCU_20221015_0_L2A
../datasets/sentinel2/S2B_10UDV_20221012_0_L2A
../datasets/sentinel2/S2B_9UXR_20221008_0_L2A
../datasets/sentinel2/S2A_9UWR_20220817_0_L2A
../datasets/sentinel2/S2A_9UYR_20220722_0_L2A
../datasets/sentinel2/S2B_9UYQ_20221117_0_L2A
../datasets/sentinel2/S2A_9UWS_20220817_0_L2A
../datasets/sentinel2/S2B_10UCA_20220730_0_L2A
../datasets/sentinel2/S2A_9UYQ_20220725_0_L2A
../datasets/sentinel2/S2B_10UDA_20220902_0_L2A
../datasets/sentinel2/S2B_9UXR_20221015_0_L2A
../datasets/sentinel2/S2B_9UWS_20221117_0_L2A

In [3]:
def get_subfolders_with_keyword(keyword: str, subfolders_list: list = subfolders_list) -> list:
    """
    Return the entries of ``subfolders_list`` that contain ``keyword``.

    The check is a plain substring match on each path, and the input order is
    preserved. When ``subfolders_list`` is omitted, the list bound to the
    module-level ``subfolders_list`` at definition time is used.
    """
    return [path for path in subfolders_list if keyword in path]

# The trailing "/" restricts the match to folders located inside a "tiles"
# directory, leaving out the "tiles" directories themselves.
subfolders_with_keyword_list = get_subfolders_with_keyword("tiles/")
print(f"Number of tile folders: {len(subfolders_with_keyword_list)}")

# Preview only the first entries instead of printing every tile folder.
for folder in subfolders_with_keyword_list[:20]:
    print(folder)


../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/3_4
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/3_3
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/9_4
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/9_3
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/12_5
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/10_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/12_2
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/10_6
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_1
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/9_2
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/9_5
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/3_2
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/13_10
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/10_0
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/10_7
../datasets/senti

In [4]:
# One row per tile folder; the folder path is the only column at this stage.
raw_df = pd.DataFrame(data=subfolders_with_keyword_list, columns=["path_name"])
raw_df.head()


Unnamed: 0,path_name
0,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...
1,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...
2,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...
3,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...
4,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...


In [5]:
# Break every tile path into its "/"-separated components, one column per level.
path_components = [path.split('/') for path in raw_df['path_name']]
detailed_df = pd.DataFrame(path_components)
# Append the component columns (labelled 0..n-1) to the right of the path column.
raw_df = pd.concat([raw_df, detailed_df], axis=1)
raw_df


Unnamed: 0,path_name,0,1,2,3,4,5,6
0,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,3_4
1,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,3_3
2,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,9_4
3,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,9_3
4,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,12_5
...,...,...,...,...,...,...,...,...
4185,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,..,datasets,sentinel2,S2B_10UCU_20221002_0_L2A,20221002,tiles,3_12
4186,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,..,datasets,sentinel2,S2B_10UCU_20221002_0_L2A,20221002,tiles,3_13
4187,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,..,datasets,sentinel2,S2B_10UCU_20221002_0_L2A,20221002,tiles,0_13
4188,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,..,datasets,sentinel2,S2B_10UCU_20221002_0_L2A,20221002,tiles,1_13


In [6]:
# Discard the path components that carry no information:
# 0 is "..", 1 is "datasets" and 5 is "tiles".
raw_df = raw_df.drop(columns=[0, 1, 5])

In [7]:
# Give the remaining positional columns descriptive names.
column_labels = {
    "path_name": "path_name_sentinel2",
    2: "satellite",
    3: "imagery_id",
    4: "date",
    6: "tile_id",
}
raw_df = raw_df.rename(columns=column_labels)
raw_df

Unnamed: 0,path_name_sentinel2,satellite,imagery_id,date,tile_id
0,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,3_4
1,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,3_3
2,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,9_4
3,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,9_3
4,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,12_5
...,...,...,...,...,...
4185,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,sentinel2,S2B_10UCU_20221002_0_L2A,20221002,3_12
4186,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,sentinel2,S2B_10UCU_20221002_0_L2A,20221002,3_13
4187,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,sentinel2,S2B_10UCU_20221002_0_L2A,20221002,0_13
4188,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,sentinel2,S2B_10UCU_20221002_0_L2A,20221002,1_13


In [8]:
# Build the output folder of each tile from its date and tile_id, rooted at
# "../prepared_dataset". Both columns hold strings produced by splitting the
# path, so plain vectorised concatenation is enough and no row-wise apply
# is needed.
raw_df['saving_path'] = "../prepared_dataset/" + raw_df['date'] + "/" + raw_df['tile_id']

In [9]:
# Sanity check on the first tile: read its footprint, flatten it to the text
# form stored in the table, then parse that text back into tuples.
first_tile_path = raw_df['path_name_sentinel2'].iloc[0]

# Read with an explicit encoding so the result does not depend on the
# platform default, and keep the file open only while it is being parsed.
with open(f"{first_tile_path}/coordinates.json", "r", encoding="utf-8") as f:
    tile_info = json.load(f)

# Only the first element of the coordinates list is used.
coordinates = tile_info['geometry']['coordinates'][0]
coordinates_str = ",".join(str(tuple(coord)) for coord in coordinates)
print(coordinates_str)

# literal_eval turns "(x1, y1),(x2, y2),..." back into a tuple of tuples.
coordinates_tuple = ast.literal_eval(coordinates_str)
print(coordinates_tuple)


(-125.7607154787941, 50.29994118390697),(-125.65306262838969, 50.29688850061719),(-125.65789652279332, 50.22793508254463),(-125.7653945658079, 50.23098034961218)
((-125.7607154787941, 50.29994118390697), (-125.65306262838969, 50.29688850061719), (-125.65789652279332, 50.22793508254463), (-125.7653945658079, 50.23098034961218))


In [10]:
# open a file called coordinates.json from path_name_sentinel2
# save the content of the file into a new column called 'coordinates'

def get_coordinates_from_json(path_name_sentinel2: str) -> str:
    """
    Read ``coordinates.json`` from a tile folder and return its outline as text.

    Parameters
    ----------
    path_name_sentinel2 : str
        Folder that contains a ``coordinates.json`` file.

    Returns
    -------
    str
        The points of the first entry of ``geometry.coordinates``, each
        rendered as a tuple and joined with commas, for example
        ``"(x1, y1),(x2, y2)"``.

    Raises
    ------
    FileNotFoundError
        If the folder has no ``coordinates.json`` file.
    KeyError
        If the JSON document lacks the ``geometry`` or ``coordinates`` key.
    """
    # Explicit UTF-8 keeps the read independent of the platform default encoding.
    with open(f"{path_name_sentinel2}/coordinates.json", "r", encoding="utf-8") as file:
        tile_info = json.load(file)

    coordinates = tile_info['geometry']['coordinates'][0]
    return ",".join(str(tuple(coord)) for coord in coordinates)

# Only the folder path is needed, so map over that single column instead of
# applying a lambda to every full row.
raw_df['coordinates'] = raw_df['path_name_sentinel2'].map(get_coordinates_from_json)

In [11]:
# Save the table as "sentinel2_tiles_path.csv".
# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, so no separate existence check is needed.
os.makedirs("../dataset_tables", exist_ok=True)

raw_df.to_csv("../dataset_tables/sentinel2_tiles_path.csv", index=False)