## Overview

This script will move downloaded images to `prepare_dataset`. The process is briefed as follows:

- Images downloaded from `notebooks/download.ipynb` will be stored in `download_file` folder.
- Under `download_file` folder, images will be grouped according to the polygon they belong to.
- In each polygon, 2 types of images are present: geotiff with band information, and a mask file.
- Geotiffs should be moved and stored in `prepare_dataset/images_directory{group_id}` folder.
- Mask files are stored in `prepare_dataset/mask_directory{group_id}` folder.

## Implementation

In [2]:
import os
import glob
import shutil
import pandas as pd


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Get all subfolders

In [4]:
def fast_scandir(dirname: str) -> list:
    """
    Scan and return all subfolders of a directory.
    """
    subfolders= [f.path for f in os.scandir(dirname) if f.is_dir()]
    for dirname in list(subfolders):
        subfolders.extend(fast_scandir(dirname))
    return subfolders

source_path = "../datasets/sentinel2"
subfolders_list = fast_scandir(source_path)
print(f"Number of subfolders: {len(subfolders_list)}")

for folder in subfolders_list:
    print(folder)


Number of subfolders: 197
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A
../datasets/sentinel2/S2A_10UCA_20210730_1_L2A
../datasets/sentinel2/S2A_9UYR_20170810_0_L2A
../datasets/sentinel2/S2A_9UYR_20190529_0_L2A
../datasets/sentinel2/S2A_9UYR_20220910_0_L2A
../datasets/sentinel2/S2B_9UYR_20180728_0_L2A
../datasets/sentinel2/S2A_10UCA_20190509_0_L2A
../datasets/sentinel2/S2A_9UYR_20210730_1_L2A
../datasets/sentinel2/S2A_10UCA_20170810_0_L2A
../datasets/sentinel2/S2A_10UCA_20220910_0_L2A
../datasets/sentinel2/S2B_9UYR_20170703_0_L2A
../datasets/sentinel2/S2B_10UCA_20180618_0_L2A
../datasets/sentinel2/S2B_10UCA_20220809_0_L2A
../datasets/sentinel2/S2B_10UCA_20170703_0_L2A
../datasets/sentinel2/S2A_10UCA_20210627_0_L2A
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20

### Get folders with restrictions - Regex

In [5]:
def get_subfolders_with_keyword(keyword: str, subfolders_list: list = subfolders_list) -> list:
    subfolders_with_keyword_list = []

    for folder in subfolders_list:
        if keyword in folder:
            subfolders_with_keyword_list.append(folder)

    return subfolders_with_keyword_list

subfolders_with_keyword_list = get_subfolders_with_keyword("tiles/") # note that we need the / to get folders
for folder in subfolders_with_keyword_list:
    print(folder)


../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_7
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/6_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/8_7
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/8_9
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/6_7
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/6_9
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/8_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/7_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/7_9
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/7_7
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/tiles/7_5
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/tiles/7_4
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/tiles/7_3
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/tiles/8_3
../datasets/sentinel2

### Get folders with restrictions - DataFrame

#### 1. Turn paths into columns

In [6]:
raw_df = pd.DataFrame(subfolders_list, columns=["path_name"])
raw_df.head()


Unnamed: 0,path_name
0,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A
1,../datasets/sentinel2/S2B_10UCA_20220806_0_L2A
2,../datasets/sentinel2/S2A_10UCA_20210730_1_L2A
3,../datasets/sentinel2/S2A_9UYR_20170810_0_L2A
4,../datasets/sentinel2/S2A_9UYR_20190529_0_L2A


In [7]:
test_df = pd.DataFrame([x.rsplit('/') for x in raw_df['path_name']])
test_df


Unnamed: 0,0,1,2,3,4,5,6
0,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,,,
1,..,datasets,sentinel2,S2B_10UCA_20220806_0_L2A,,,
2,..,datasets,sentinel2,S2A_10UCA_20210730_1_L2A,,,
3,..,datasets,sentinel2,S2A_9UYR_20170810_0_L2A,,,
4,..,datasets,sentinel2,S2A_9UYR_20190529_0_L2A,,,
...,...,...,...,...,...,...,...
192,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,8_3
193,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,8_4
194,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,8_5
195,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,6_3


#### 2. Filter out rows that have `tile` as part of the path names

In [15]:
df_subfolder_under_tiles = test_df[((test_df[5] == "tiles") & (test_df[5].notnull()))].reset_index(drop=True)
df_subfolder_under_tiles


Unnamed: 0,0,1,2,3,4,5,6
0,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,
1,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,5_8
2,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,5_7
3,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,6_8
4,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,8_7
...,...,...,...,...,...,...,...
160,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,8_3
161,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,8_4
162,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,8_5
163,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,6_3


In [20]:
# drop rows where 6th column is None
df_subfolder_under_tiles = df_subfolder_under_tiles[df_subfolder_under_tiles[6].notnull()].reset_index(drop=True)
df_subfolder_under_tiles

Unnamed: 0,0,1,2,3,4,5,6
0,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,5_8
1,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,5_7
2,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,6_8
3,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,8_7
4,..,datasets,sentinel2,S2B_9UYR_20220809_0_L2A,20220809,tiles,8_9
...,...,...,...,...,...,...,...
144,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,8_3
145,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,8_4
146,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,8_5
147,..,datasets,sentinel2,S2A_10UCA_20210627_0_L2A,20210627,tiles,6_3


#### 3. Reconcatenate to pathnames and indicate date

In [21]:
df_selected_paths_date = df_subfolder_under_tiles
df_selected_paths_date["file_path"] = df_selected_paths_date[[0, 1, 2, 3, 4, 6]].apply(lambda x: "/".join(x), axis=1)
df_selected_paths_date["date"] = df_selected_paths_date[4].apply(lambda x: pd.to_datetime(x, format="%Y%m%d"))
df_selected_paths_date.drop([0, 1, 2, 3, 4, 5, 6], axis=1, inplace=True)
df_selected_paths_date


Unnamed: 0,file_path,date
0,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,2022-08-09
1,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,2022-08-09
2,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,2022-08-09
3,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,2022-08-09
4,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,2022-08-09
...,...,...
144,../datasets/sentinel2/S2A_10UCA_20210627_0_L2A...,2021-06-27
145,../datasets/sentinel2/S2A_10UCA_20210627_0_L2A...,2021-06-27
146,../datasets/sentinel2/S2A_10UCA_20210627_0_L2A...,2021-06-27
147,../datasets/sentinel2/S2A_10UCA_20210627_0_L2A...,2021-06-27


In [22]:
df_selected_paths_date["file_path"].values

array(['../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/5_8',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/5_7',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/6_8',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/8_7',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/8_9',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/6_7',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/6_9',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/8_8',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/7_8',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/7_9',
       '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/7_7',
       '../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/7_5',
       '../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/7_4',
       '../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/7_3',
       '../datasets/sentinel2/S

#### Save and copy selected file paths to a list

In [23]:
selected_path_list = df_selected_paths_date["file_path"].values.tolist()

for path in selected_path_list:
    print(path)


../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/5_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/5_7
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/6_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/8_7
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/8_9
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/6_7
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/6_9
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/8_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/7_8
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/7_9
../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/7_7
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/7_5
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/7_4
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/7_3
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/8_3
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/20220806/8_4
../datasets/sentinel2/S2B_10UCA_20220806_0_L2A/2022

### Given a list of directories, get a list of files inside, and move to the selected directory

For now, I will use the shorter list - `subfolders_with_keyword_list`.

In [24]:
def get_list_of_files_in_directory(directory_name: str, keyword: str = ".tif") -> list:
    return [f"{directory_name}/{f}" for f in os.listdir(directory_name) if f.endswith(keyword)]

test_get_list_of_files_in_directory = get_list_of_files_in_directory(subfolders_with_keyword_list[0])
print(test_get_list_of_files_in_directory)


['../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B08.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B09.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B8A.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/mask.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B02.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B03.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B01.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B04.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B11.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B05.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B07.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B06.tif', '../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B12.tif']


For now, I will use the shorter list - `subfolders_with_keyword_list`.

In [25]:
def move_file(source_path: str, label: str, id: int) -> None:
    """
    label: either "image" or "mask"
    the moved file will be named as `date_original-name`
    """
    file_name = source_path.split("/")[-1]
    file_date = source_path.split("/")[-4]
    destination_folder = f"../prepare_dataset/{label}_directory{id}"
    destination_path = f"../prepare_dataset/{label}_directory{id}/{file_date}_{file_name}"

    if not os.path.isdir(destination_folder):
        os.makedirs(os.path.dirname(destination_path))

    if os.path.isfile(destination_path):
        print("File exists.")
        return

    shutil.copy(source_path, destination_path)
    print(f"File copied to destination: {destination_path}.")

test_move_file = move_file(test_get_list_of_files_in_directory[1], "image", 1)


File copied to destination: ../prepare_dataset/image_directory1/20220809_B09.tif.


#### Batch move files

There are two things we need to do:

1. Construct a set so that geotif under the same folder gets the same `id` when `move_file()` gets executed.
2. `mask.tif` should get the label `mask`, others get `image`.

In [26]:
def batch_move_files(source_path_list: list) -> None:
    path_dict = {}

    for i in range(len(source_path_list)):
        current_path = source_path_list[i]
        current_folder = current_path.rsplit("/", 1)[0] # split on the last occurrence

        if current_folder not in path_dict:
            path_dict[current_folder] = len(path_dict)

        current_id = path_dict[current_folder]

        if "mask" in current_path:
            move_file(current_path, "mask", current_id)
        else:
            print(f"image: {current_path}")
            move_file(current_path, "image", current_id)

batch_move_files(test_get_list_of_files_in_directory)


image: ../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B08.tif
File copied to destination: ../prepare_dataset/image_directory0/20220809_B08.tif.
image: ../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B09.tif
File copied to destination: ../prepare_dataset/image_directory0/20220809_B09.tif.
image: ../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B8A.tif
File copied to destination: ../prepare_dataset/image_directory0/20220809_B8A.tif.
File copied to destination: ../prepare_dataset/mask_directory0/20220809_mask.tif.
image: ../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B02.tif
File copied to destination: ../prepare_dataset/image_directory0/20220809_B02.tif.
image: ../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B03.tif
File copied to destination: ../prepare_dataset/image_directory0/20220809_B03.tif.
image: ../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/20220809/tiles/5_8/B01.tif
File copied to destination: 