# Final formatting of Dataset
This notebook formats the dataset and uploads it to Kaggle.

# Imports

In [1]:
import pandas as pd
import os, shutil, threading
import kagglehub

from tqdm.notebook import tqdm, trange
from concurrent.futures import ThreadPoolExecutor, as_completed

# Copy dataset files to temp directory for write access

In [2]:
# Copy dataset to writable directory
def get_files_to_copy(src, dst):
    """Collect every file under ``src`` with its mirrored path under ``dst``.

    Args:
        src: Root directory that is scanned recursively.
        dst: Root directory the files are to be mirrored into. It is only
            used to build paths; nothing is created here.

    Returns:
        A ``(pairs, total_bytes)`` tuple. ``pairs`` is a list of
        ``(source_path, destination_path)`` tuples and ``total_bytes`` is the
        summed size in bytes of all source files.
    """
    pairs = []
    for current_dir, _, names in os.walk(src):
        for name in names:
            source_path = os.path.join(current_dir, name)
            # Preserve the layout below ``src`` when mapping into ``dst``.
            target_path = os.path.join(dst, os.path.relpath(source_path, src))
            pairs.append((source_path, target_path))

    total_bytes = sum(os.path.getsize(source_path) for source_path, _ in pairs)
    return pairs, total_bytes

def copy_file(src_dst_tuple, pbar, lock):
    """Copy a single file in 1 MB chunks, advancing a shared progress bar.

    Args:
        src_dst_tuple: ``(source_path, destination_path)`` pair.
        pbar: Progress object; ``pbar.update(n)`` is called with the number
            of bytes written after each chunk.
        lock: Context-manager lock held around every ``pbar.update`` call.
    """
    source_path, target_path = src_dst_tuple
    # Create the destination's parent directories if they do not exist yet.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    chunk_bytes = 1024 * 1024  # 1 MB per read
    with open(source_path, 'rb') as reader, open(target_path, 'wb') as writer:
        # The b'' sentinel ends the iteration at the first empty read (EOF).
        for chunk in iter(lambda: reader.read(chunk_bytes), b''):
            writer.write(chunk)
            with lock:
                pbar.update(len(chunk))

def copy_folder_parallel(src, dst, max_workers=4):
    """Mirror the directory tree ``src`` into ``dst`` using a thread pool.

    Files are discovered with ``get_files_to_copy`` and copied by
    ``copy_file`` workers that share one byte-based progress bar.

    Args:
        src: Source directory to copy from.
        dst: Destination directory to copy into.
        max_workers: Number of worker threads used for copying.

    Raises:
        RuntimeError: If one or more file copies failed. All copy tasks are
            allowed to finish first; the first failure is chained as the cause.
    """
    files_to_copy, total_size = get_files_to_copy(src, dst)
    lock = threading.Lock()
    failures = []

    with tqdm(total=total_size, unit='B', unit_scale=True, desc="Copying files") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(copy_file, file_pair, pbar, lock) for file_pair in files_to_copy]
            for future in as_completed(futures):
                # A future stores any exception raised by its worker; without
                # inspecting it, a failed copy would go completely unnoticed.
                error = future.exception()
                if error is not None:
                    failures.append(error)

    if failures:
        raise RuntimeError(
            f"{len(failures)} of {len(files_to_copy)} file copies from {src} failed; "
            f"first error: {failures[0]!r}"
        ) from failures[0]

## Copy original dataset

In [3]:
# (source, destination) pairs for the original per-season ROI folders.
original_dataset_copies = [
    ("/kaggle/input/tum-sentinel-1-2/Sentinel_fall/ROIs1970_fall", "/kaggle/temp/tum-sentinel-1-2/fall"),
    ("/kaggle/input/tum-sentinel-1-2/Sentinel_spring/ROIs1158_spring", "/kaggle/temp/tum-sentinel-1-2/spring"),
    ("/kaggle/input/tum-sentinel-1-2/Sentinel_winter/ROIs2017_winter", "/kaggle/temp/tum-sentinel-1-2/winter"),
    ("/kaggle/input/tum-sentinel-1-2/Sentinel_summer/ROIs1868_summer", "/kaggle/temp/tum-sentinel-1-2/summer"),
]

# Copy one season after another, each with two worker threads.
for source_dir, target_dir in original_dataset_copies:
    copy_folder_parallel(source_dir, target_dir, 2)

Copying files:   0%|          | 0.00/13.9G [00:00<?, ?B/s]

Copying files:   0%|          | 0.00/12.4G [00:00<?, ?B/s]

Copying files:   0%|          | 0.00/10.9G [00:00<?, ?B/s]

Copying files:   0%|          | 0.00/8.58G [00:00<?, ?B/s]

## Copy Copernicus API extended data

In [6]:
# (source, destination) pairs merging the sar-extension data into each season's working copy.
copernicus_extension_copies = [
    ("/kaggle/input/sar-extension/fall", "/kaggle/temp/tum-sentinel-1-2/fall"),
    ("/kaggle/input/sar-extension/spring", "/kaggle/temp/tum-sentinel-1-2/spring"),
    ("/kaggle/input/sar-extension/winter", "/kaggle/temp/tum-sentinel-1-2/winter"),
    ("/kaggle/input/sar-extension/summer", "/kaggle/temp/tum-sentinel-1-2/summer"),
]

# Copy one season after another, each with two worker threads.
for source_dir, target_dir in copernicus_extension_copies:
    copy_folder_parallel(source_dir, target_dir, 2)

Copying files:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

Copying files:   0%|          | 0.00/210M [00:00<?, ?B/s]

Copying files:   0%|          | 0.00/3.69G [00:00<?, ?B/s]

Copying files:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

## Copy Earth Engine extended data

In [8]:
# (source, destination) pairs merging the sar-extension-ee data into the working copy.
# Only spring, winter and summer are copied from this input.
earth_engine_extension_copies = [
    ("/kaggle/input/sar-extension-ee/Spring", "/kaggle/temp/tum-sentinel-1-2/spring"),
    ("/kaggle/input/sar-extension-ee/Winter", "/kaggle/temp/tum-sentinel-1-2/winter"),
    ("/kaggle/input/sar-extension-ee/Summer", "/kaggle/temp/tum-sentinel-1-2/summer"),
]

# Copy one season after another, each with two worker threads.
for source_dir, target_dir in earth_engine_extension_copies:
    copy_folder_parallel(source_dir, target_dir, 2)

Copying files:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

Copying files:   0%|          | 0.00/28.9M [00:00<?, ?B/s]

Copying files:   0%|          | 0.00/54.6M [00:00<?, ?B/s]

# CSV generation for augmentation

In [9]:
# Season names iterated when generating the per-folder CSVs.
SEASONS = ["fall", "spring", "summer", "winter"]

# Per-season Sentinel-2 folder names flagged for removal.
REMOVAL_FOLDERS = dict(
    fall=["s2_13", "s2_4", "s2_66", "s2_8", "s2_82"],
    spring=["s2_45", "s2_68", "s2_95"],
    winter=["s2_12"],
    summer=["s2_53"],
)

# Writable working-copy directory of each season.
FOLDER_PATHS = dict(
    fall="/kaggle/temp/tum-sentinel-1-2/fall",
    spring="/kaggle/temp/tum-sentinel-1-2/spring",
    winter="/kaggle/temp/tum-sentinel-1-2/winter",
    summer="/kaggle/temp/tum-sentinel-1-2/summer",
)

In [10]:
def genCSV(season: str) -> None:
    """Write one ``info_<id>.csv`` per Sentinel-2 folder listed for ``season``.

    Reads ``/kaggle/input/geotagdata/{season}_ROI_results.csv`` and, for every
    row, lists the PNG files inside the row's ``s2_folder`` under
    ``FOLDER_PATHS[season]``. Each image becomes one CSV row with the columns
    ``s1_fileName``, ``s2_fileName``, ``season`` and ``region`` (taken from the
    row's ``temperature_zone``). File names in the CSV are relative paths of
    the form ``<season>/<folder>/<image>``.

    The CSV for folder ``s2_<id>`` is written to
    ``FOLDER_PATHS[season]/info_<id>.csv``.

    Args:
        season: Season key; must be a key of ``FOLDER_PATHS``.

    Raises:
        FileNotFoundError: If the ROI results CSV or a listed folder is missing.
    """
    df = pd.read_csv(f"/kaggle/input/geotagdata/{season}_ROI_results.csv")
    # Explicit column order so a folder without images still gets a header row.
    columns = ["s1_fileName", "s2_fileName", "season", "region"]

    folder_zone_pairs = zip(df["s2_folder"], df["temperature_zone"])
    for baseFolder, zone in tqdm(folder_zone_pairs, total=df.shape[0]):
        folder = os.path.join(FOLDER_PATHS[season], baseFolder)
        # "s2_13" -> "s1_13": the matching Sentinel-1 folder name.
        s1_folder = "s1" + baseFolder[2:]

        # Sorted so the generated CSV does not depend on directory listing order.
        image_files = sorted(img for img in os.listdir(folder) if img.endswith("png"))

        csv_content = [
            {
                "s1_fileName": f"{season}/{s1_folder}/{_s1_image_name(img)}",
                "s2_fileName": f"{season}/{baseFolder}/{img}",
                "season": season,
                "region": zone,
            }
            for img in tqdm(image_files, leave=False)
        ]

        output_csv = pd.DataFrame(csv_content, columns=columns)
        # "s2_13" -> "info_13.csv"
        output_csv.to_csv(f"{FOLDER_PATHS[season]}/info_{baseFolder[3:]}.csv", index=False)


def _s1_image_name(img):
    """Return the Sentinel-1 file name paired with the Sentinel-2 name ``img``.

    NOTE(review): assumes image names carry an "_s2_" marker, e.g.
    "ROIs1158_spring_s2_45_p100.png" -- confirm against the dataset. Replacing
    the marker works for any season-name length, whereas overwriting the
    character at index 15 only hits the "2" when the prefix before it is
    exactly 15 characters long (as in "ROIs1970_fall_s").
    """
    if "_s2_" in img:
        return img.replace("_s2_", "_s1_", 1)
    # No marker found: keep the previous positional substitution.
    return img[:15] + "1" + img[16:]

In [None]:
# Generate the per-folder info CSVs for every season listed in SEASONS.
for season in SEASONS :
    print(f"Processing {season}")
    genCSV(season)

# Delete folders which couldn't be tagged

In [15]:
def get_all_files_and_folders(path):
    """List all files and sub-directories below ``path`` plus their total size.

    Args:
        path: Root directory to scan recursively.

    Returns:
        A ``(files, dirs, total_size)`` tuple: the full paths of all files,
        the full paths of all sub-directories (children before their
        parents), and the summed file size in bytes. ``path`` itself is not
        included in ``dirs``.
    """
    file_paths = []
    dir_paths = []

    # Bottom-up walk, so nested directories are recorded before their parents.
    for current_dir, child_dirs, names in os.walk(path, topdown=False):
        file_paths.extend(os.path.join(current_dir, name) for name in names)
        dir_paths.extend(os.path.join(current_dir, child) for child in child_dirs)

    byte_count = sum(map(os.path.getsize, file_paths))
    return file_paths, dir_paths, byte_count

def delete_file(path, pbar, lock):
    """Delete one file and advance the progress bar by its size.

    Any exception is caught and reported with ``print`` instead of being
    raised, so one failing file does not abort the surrounding batch.

    Args:
        path: File to delete.
        pbar: Progress object; ``pbar.update(n)`` is called with the size in
            bytes of the deleted file.
        lock: Context-manager lock held around the ``pbar.update`` call.
    """
    try:
        # Read the size first; it is no longer available once the file is gone.
        byte_count = os.stat(path).st_size
        os.unlink(path)
        with lock:
            pbar.update(byte_count)
    except Exception as e:
        print(f"Error deleting file {path}: {e}")

def delete_folder_parallel(path, max_workers=4):
    """Delete the directory tree at ``path``, removing files in parallel.

    Files are removed by ``delete_file`` workers sharing one byte-based
    progress bar. Directories are then removed one by one, and ``path``
    itself last. Directory removal errors are printed rather than raised.

    Args:
        path: Root directory to delete.
        max_workers: Number of worker threads used for file deletion.
    """
    files, dirs, total_size = get_all_files_and_folders(path)
    progress_lock = threading.Lock()

    with tqdm(total=total_size, unit='B', unit_scale=True, desc="Deleting files") as pbar, \
            ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(delete_file, file_path, pbar, progress_lock) for file_path in files]
        # Drain the iterator so we only move on once every deletion has finished.
        for _finished in as_completed(pending):
            continue

    # Directories are removed sequentially (avoids races between workers);
    # ``dirs`` lists children before parents, so each one is empty by its turn.
    for d in dirs:
        try:
            os.rmdir(d)
        except Exception as e:
            print(f"Error deleting folder {d}: {e}")

    # The root folder goes last, once everything beneath it is gone.
    try:
        os.rmdir(path)
    except Exception as e:
        print(f"Error deleting root folder {path}: {e}")

In [16]:
# Sentinel-2 folders to delete from the working copy, keyed by season.
DELETE_LIST = dict(
    fall=["s2_13", "s2_4", "s2_66", "s2_8", "s2_82"],
    spring=["s2_45", "s2_68", "s2_95"],
    winter=["s2_12"],
    summer=["s2_53"],
)

In [17]:
# Remove every listed Sentinel-2 folder from the working copy, season by season.
for season, flagged_folders in DELETE_LIST.items():
    print(f"Processing: {season}")

    for to_delete_folder in flagged_folders:
        delete_folder_parallel(f"/kaggle/temp/tum-sentinel-1-2/{season}/{to_delete_folder}", 2)

Processing: fall


Deleting files:   0%|          | 0.00/107M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/89.0M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/109M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/122M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/84.2M [00:00<?, ?B/s]

Processing: spring


Deleting files:   0%|          | 0.00/117M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/104M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/150M [00:00<?, ?B/s]

Processing: winter


Deleting files:   0%|          | 0.00/131M [00:00<?, ?B/s]

Processing: summer


Deleting files:   0%|          | 0.00/111M [00:00<?, ?B/s]

In [19]:
# Sentinel-1 folders to delete from the working copy, keyed by season.
DELETE_LIST_S1 = dict(
    fall=["s1_13", "s1_4", "s1_66", "s1_8", "s1_82"],
    spring=["s1_45", "s1_68", "s1_95"],
    winter=["s1_12"],
    summer=["s1_53"],
)

In [20]:
# Remove every listed Sentinel-1 folder from the working copy, season by season.
for season, flagged_folders in DELETE_LIST_S1.items():
    print(f"Processing: {season}")

    for to_delete_folder in flagged_folders:
        delete_folder_parallel(f"/kaggle/temp/tum-sentinel-1-2/{season}/{to_delete_folder}", 2)

Processing: fall


Deleting files:   0%|          | 0.00/45.9M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/44.5M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/48.5M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/43.5M [00:00<?, ?B/s]

Processing: spring


Deleting files:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/57.4M [00:00<?, ?B/s]

Deleting files:   0%|          | 0.00/59.4M [00:00<?, ?B/s]

Processing: winter


Deleting files:   0%|          | 0.00/45.4M [00:00<?, ?B/s]

Processing: summer


Deleting files:   0%|          | 0.00/36.0M [00:00<?, ?B/s]

# Dataset upload

In [None]:
# Handle of the Kaggle dataset that receives the upload.
handle = "shambac/augmented-sentinel-1-2"
# Local directory to upload: the writable working copy the earlier cells copy into and clean up.
local_dataset_dir = "/kaggle/temp/tum-sentinel-1-2"

# Upload the local directory to the dataset identified by `handle`.
kagglehub.dataset_upload(handle, local_dataset_dir)