## Overview

This script will move downloaded images to `prepare_dataset`. The process is summarized as follows:

- Images downloaded from `notebooks/download.ipynb` will be stored in `download_file` folder.
- Under `download_file` folder, images will be grouped according to the polygon they belong to.
- Each polygon folder contains two types of images: GeoTIFFs with band information, and a mask file.
- GeoTIFFs should be moved and stored in the `prepare_dataset/image_directory{group_id}` folder.
- Mask files are stored in `prepare_dataset/mask_directory{group_id}` folder.

## Implementation

In [1]:
import os
import glob
import shutil
import pandas as pd


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Get all subfolders

In [2]:
def fast_scandir(dirname: str) -> list:
    """
    Recursively collect the paths of every directory nested under `dirname`.

    The immediate child directories come first in the returned list,
    followed by the descendants found under each child in turn.
    """
    children = []
    for entry in os.scandir(dirname):
        if entry.is_dir():
            children.append(entry.path)

    descendants = []
    for child in children:
        descendants += fast_scandir(child)

    return children + descendants

# Root folder that is scanned recursively for sub-directories.
source_path = "../datasets/landsat2"
# Every directory found below source_path; later cells filter this list.
subfolders_list = fast_scandir(source_path)
print(f"Number of subfolders: {len(subfolders_list)}")

# Print each directory path on its own line for a quick visual check.
for folder in subfolders_list:
    print(folder)


Number of subfolders: 90
../datasets/landsat2/LC08_L2SP_049025_20210730_20210804_02_T1_ST
../datasets/landsat2/LE07_L2SP_048025_20190507_20200825_02_T1_ST
../datasets/landsat2/LC08_L2SP_048025_20180715_20200831_02_T1_ST
../datasets/landsat2/LE07_L2SP_048025_20210629_20210725_02_T1_ST
../datasets/landsat2/LE07_L2SP_049025_20220920_20221016_02_T1_ST
../datasets/landsat2/LC08_L2SP_048025_20180426_20200901_02_T1_ST
../datasets/landsat2/LC08_L2SP_048026_20180426_20201015_02_T1_ST
../datasets/landsat2/LC08_L2SP_048026_20210621_20210629_02_T1_ST
../datasets/landsat2/LE07_L2SP_049025_20180425_20200829_02_T1_ST
../datasets/landsat2/LE07_L2SP_049025_20190530_20201008_02_T1_ST
../datasets/landsat2/LC09_L2SP_048025_20220920_20230328_02_T1_ST
../datasets/landsat2/LC08_L2SP_048025_20170914_20200903_02_T1_ST
../datasets/landsat2/LC08_L2SP_048026_20180715_20200831_02_T1_ST
../datasets/landsat2/LC08_L2SP_049025_20170804_20200903_02_T1_ST
../datasets/landsat2/LC08_L2SP_048026_20170914_20200903_02_T1_ST


### Get folders with restrictions - Keyword (substring match)

In [3]:
def get_subfolders_with_keyword(keyword: str, subfolders_list: "list | None" = None) -> list:
    """
    Return the folder paths that contain `keyword` as a substring.

    keyword: text to look for anywhere in each path (plain substring test,
        not a regular expression).
    subfolders_list: paths to filter. When omitted, the notebook-level
        `subfolders_list` is looked up at call time, so the function sees the
        current scan result rather than the list that existed when this cell
        was executed.

    Returns a new list with the matching paths in their original order.
    Raises KeyError if `subfolders_list` is omitted and no notebook-level
    `subfolders_list` exists.
    """
    if subfolders_list is None:
        # The parameter shadows the notebook-level variable of the same name,
        # so fetch that variable explicitly from the module namespace.
        subfolders_list = globals()["subfolders_list"]

    return [folder for folder in subfolders_list if keyword in folder]

# Keep only the folders located below a "tiles" directory. The trailing "/"
# excludes the "tiles" directory itself, whose path ends in "tiles" with no slash.
subfolders_with_keyword_list = get_subfolders_with_keyword("tiles/") # note that we need the / to get folders
# Print the selected folders for a quick visual check.
for folder in subfolders_with_keyword_list:
    print(folder)


../datasets/landsat2/LC08_L2SP_049025_20210730_20210804_02_T1_ST/20210730/tiles/7_5
../datasets/landsat2/LC08_L2SP_049025_20210730_20210804_02_T1_ST/20210730/tiles/7_4
../datasets/landsat2/LC08_L2SP_049025_20210730_20210804_02_T1_ST/20210730/tiles/6_5
../datasets/landsat2/LC08_L2SP_049025_20210730_20210804_02_T1_ST/20210730/tiles/6_4
../datasets/landsat2/LE07_L2SP_048025_20190507_20200825_02_T1_ST/20190507/tiles/5_1
../datasets/landsat2/LE07_L2SP_048025_20190507_20200825_02_T1_ST/20190507/tiles/5_0
../datasets/landsat2/LE07_L2SP_048025_20190507_20200825_02_T1_ST/20190507/tiles/6_1
../datasets/landsat2/LE07_L2SP_048025_20190507_20200825_02_T1_ST/20190507/tiles/6_0
../datasets/landsat2/LC08_L2SP_048025_20180715_20200831_02_T1_ST/20180715/tiles/5_0
../datasets/landsat2/LC08_L2SP_048025_20180715_20200831_02_T1_ST/20180715/tiles/6_0
../datasets/landsat2/LE07_L2SP_048025_20210629_20210725_02_T1_ST/20210629/tiles/5_1
../datasets/landsat2/LE07_L2SP_048025_20210629_20210725_02_T1_ST/20210629/ti

### Get folders with restrictions - DataFrame

#### 1. Turn paths into columns

In [4]:
# One row per selected tile folder; the full path is stored in a single "path_name" column.
raw_df = pd.DataFrame(subfolders_with_keyword_list, columns=["path_name"])
# Preview the first rows.
raw_df.head()


Unnamed: 0,path_name
0,../datasets/landsat2/LC08_L2SP_049025_20210730...
1,../datasets/landsat2/LC08_L2SP_049025_20210730...
2,../datasets/landsat2/LC08_L2SP_049025_20210730...
3,../datasets/landsat2/LC08_L2SP_049025_20210730...
4,../datasets/landsat2/LE07_L2SP_048025_20190507...


In [5]:
# Split every path on "/" so that each path component lands in its own
# integer-labelled column (0, 1, 2, ...). rsplit is called without a maxsplit
# argument, so it produces the same pieces as split.
detailed_df = pd.DataFrame([x.rsplit('/') for x in raw_df['path_name']])
# Append the component columns to raw_df side by side (axis=1).
raw_df = pd.concat([raw_df, detailed_df], axis=1)
raw_df


Unnamed: 0,path_name,0,1,2,3,4,5,6
0,../datasets/landsat2/LC08_L2SP_049025_20210730...,..,datasets,landsat2,LC08_L2SP_049025_20210730_20210804_02_T1_ST,20210730,tiles,7_5
1,../datasets/landsat2/LC08_L2SP_049025_20210730...,..,datasets,landsat2,LC08_L2SP_049025_20210730_20210804_02_T1_ST,20210730,tiles,7_4
2,../datasets/landsat2/LC08_L2SP_049025_20210730...,..,datasets,landsat2,LC08_L2SP_049025_20210730_20210804_02_T1_ST,20210730,tiles,6_5
3,../datasets/landsat2/LC08_L2SP_049025_20210730...,..,datasets,landsat2,LC08_L2SP_049025_20210730_20210804_02_T1_ST,20210730,tiles,6_4
4,../datasets/landsat2/LE07_L2SP_048025_20190507...,..,datasets,landsat2,LE07_L2SP_048025_20190507_20200825_02_T1_ST,20190507,tiles,5_1
5,../datasets/landsat2/LE07_L2SP_048025_20190507...,..,datasets,landsat2,LE07_L2SP_048025_20190507_20200825_02_T1_ST,20190507,tiles,5_0
6,../datasets/landsat2/LE07_L2SP_048025_20190507...,..,datasets,landsat2,LE07_L2SP_048025_20190507_20200825_02_T1_ST,20190507,tiles,6_1
7,../datasets/landsat2/LE07_L2SP_048025_20190507...,..,datasets,landsat2,LE07_L2SP_048025_20190507_20200825_02_T1_ST,20190507,tiles,6_0
8,../datasets/landsat2/LC08_L2SP_048025_20180715...,..,datasets,landsat2,LC08_L2SP_048025_20180715_20200831_02_T1_ST,20180715,tiles,5_0
9,../datasets/landsat2/LC08_L2SP_048025_20180715...,..,datasets,landsat2,LC08_L2SP_048025_20180715_20200831_02_T1_ST,20180715,tiles,6_0


In [6]:
# Drop the integer-labelled columns 0, 1 and 5 from raw_df in place. In the
# table printed by the previous cell they hold "..", "datasets" and "tiles",
# which are identical for every row.
raw_df.drop([0, 1, 5], axis=1, inplace=True)

In [7]:
# Give the remaining columns descriptive names: the full path column becomes
# "path_name_landsat2", and the integer-labelled path components 2, 3, 4 and 6
# become "satellite", "imagery_id", "date" and "tile_id".
raw_df = raw_df.rename(columns={"path_name": "path_name_landsat2", 2: "satellite", 3: "imagery_id", 4: "date", 6: "tile_id"})
raw_df

Unnamed: 0,path_name_landsat2,satellite,imagery_id,date,tile_id
0,../datasets/landsat2/LC08_L2SP_049025_20210730...,landsat2,LC08_L2SP_049025_20210730_20210804_02_T1_ST,20210730,7_5
1,../datasets/landsat2/LC08_L2SP_049025_20210730...,landsat2,LC08_L2SP_049025_20210730_20210804_02_T1_ST,20210730,7_4
2,../datasets/landsat2/LC08_L2SP_049025_20210730...,landsat2,LC08_L2SP_049025_20210730_20210804_02_T1_ST,20210730,6_5
3,../datasets/landsat2/LC08_L2SP_049025_20210730...,landsat2,LC08_L2SP_049025_20210730_20210804_02_T1_ST,20210730,6_4
4,../datasets/landsat2/LE07_L2SP_048025_20190507...,landsat2,LE07_L2SP_048025_20190507_20200825_02_T1_ST,20190507,5_1
5,../datasets/landsat2/LE07_L2SP_048025_20190507...,landsat2,LE07_L2SP_048025_20190507_20200825_02_T1_ST,20190507,5_0
6,../datasets/landsat2/LE07_L2SP_048025_20190507...,landsat2,LE07_L2SP_048025_20190507_20200825_02_T1_ST,20190507,6_1
7,../datasets/landsat2/LE07_L2SP_048025_20190507...,landsat2,LE07_L2SP_048025_20190507_20200825_02_T1_ST,20190507,6_0
8,../datasets/landsat2/LC08_L2SP_048025_20180715...,landsat2,LC08_L2SP_048025_20180715_20200831_02_T1_ST,20180715,5_0
9,../datasets/landsat2/LC08_L2SP_048025_20180715...,landsat2,LC08_L2SP_048025_20180715_20200831_02_T1_ST,20180715,6_0


In [8]:
# Build the saving path for every row from its date and tile_id, in the form
# "../prepared_dataset/<date>/<tile_id>".
# NOTE(review): this cell writes "prepared_dataset" while move_file() below uses
# "prepare_dataset" - confirm which folder name is intended.
raw_df['saving_path'] = raw_df.apply(lambda x: f"../prepared_dataset/{x['date']}/{x['tile_id']}", axis=1)

In [11]:
# Save the dataframe to "../dataset_tables/landsat2_tiles_path.csv".
# exist_ok=True creates the output folder when it is missing and does nothing
# when it is already there, so no separate existence check is needed.
os.makedirs("../dataset_tables", exist_ok=True)

# index=False keeps the row index out of the CSV file.
raw_df.to_csv("../dataset_tables/landsat2_tiles_path.csv", index=False)

#### Save and copy selected file paths to a list

In [None]:
# NOTE(review): `df_selected_paths_date` is not defined in any cell visible in
# this notebook - confirm where it is created before running this cell.
# Extract the "file_path" column as a plain Python list.
selected_path_list = df_selected_paths_date["file_path"].values.tolist()

# Print each selected path on its own line.
for path in selected_path_list:
    print(path)


### Given a list of directories, get a list of files inside, and move to the selected directory

For now, I will use the shorter list - `subfolders_with_keyword_list`.

In [None]:
def get_list_of_files_in_directory(directory_name: str, keyword: str = ".tif") -> list:
    """
    List the entries of `directory_name` whose names end with `keyword`.

    Each returned item is the entry name prefixed with `directory_name` and "/".
    """
    matching_paths = []
    for entry_name in os.listdir(directory_name):
        if entry_name.endswith(keyword):
            matching_paths.append(f"{directory_name}/{entry_name}")
    return matching_paths

# Try the helper on the first selected tile folder (default suffix ".tif")
# and print the resulting list of file paths.
test_get_list_of_files_in_directory = get_list_of_files_in_directory(subfolders_with_keyword_list[0])
print(test_get_list_of_files_in_directory)


For now, I will use the shorter list - `subfolders_with_keyword_list`.

In [None]:
def move_file(source_path: str, label: str, id: int, destination_root: str = "../prepare_dataset") -> None:
    """
    Copy one file into `{destination_root}/{label}_directory{id}`.

    Despite the name, the source file is copied, not moved, so the original
    stays in place. The copy is named `date_original-name`, where the date is
    the fourth "/"-separated component from the end of `source_path`
    (the date folder in `.../<date>/tiles/<tile_id>/<file>`).

    source_path: "/"-separated path of the file to copy.
    label: either "image" or "mask"
    id: group number appended to the destination folder name.
    destination_root: folder under which the `{label}_directory{id}` folders
        are created.

    If the destination file already exists, nothing is copied.
    Raises IndexError when `source_path` has fewer than four components.
    """
    path_parts = source_path.split("/")
    file_name = path_parts[-1]
    file_date = path_parts[-4]
    destination_folder = f"{destination_root}/{label}_directory{id}"
    destination_path = f"{destination_folder}/{file_date}_{file_name}"

    # exist_ok=True avoids the check-then-create race and does nothing when
    # the folder is already present.
    os.makedirs(destination_folder, exist_ok=True)

    # Never overwrite a file that was copied earlier.
    if os.path.isfile(destination_path):
        print("File exists.")
        return

    shutil.copy(source_path, destination_path)
    print(f"File copied to destination: {destination_path}.")

# Smoke test: copy the second listed file (index 1) as an "image" into group 1.
# move_file() returns None, so test_move_file is None afterwards.
test_move_file = move_file(test_get_list_of_files_in_directory[1], "image", 1)


#### Batch move files

There are two things we need to do:

1. Construct a mapping from folder to `id` so that GeoTIFFs under the same folder get the same `id` when `move_file()` gets executed.
2. `mask.tif` should get the label `mask`, others get `image`.

In [None]:
def batch_move_files(source_path_list: list) -> None:
    """
    Copy every file in `source_path_list` via `move_file`, grouped by folder.

    Files that share a parent folder get the same numeric id; ids are handed
    out in order of first appearance, starting at 0. A file whose own name
    contains "mask" is labelled "mask"; every other file is labelled "image".

    source_path_list: "/"-separated file paths to process.
    """
    folder_ids = {}

    for current_path in source_path_list:
        # Split on the last "/" only: parent folder first, file name last.
        path_parts = current_path.rsplit("/", 1)
        current_folder = path_parts[0]
        file_name = path_parts[-1]

        # First time a folder is seen it receives the next free id.
        current_id = folder_ids.setdefault(current_folder, len(folder_ids))

        # Look at the file name only, so a folder that merely has "mask" in
        # its name does not turn every file inside it into a mask.
        if "mask" in file_name:
            move_file(current_path, "mask", current_id)
        else:
            print(f"image: {current_path}")
            move_file(current_path, "image", current_id)

# Run the batch copy on the files listed from the first selected tile folder.
batch_move_files(test_get_list_of_files_in_directory)
