In [1]:
import os
import re

import dvc.api
import numpy as np
import rasterio
import yaml

In [2]:
# Resolve the pipeline parameters through DVC and bind the ones this notebook uses.
params = dvc.api.params_show()

CHIPS_DIR = params['paths']['chips']
N_BANDS = params['number_of_bands']
N_TIMESTEPS = params['number_of_timesteps']
N_FOLDS = params['number_of_folds']
CHIPS_STATS_TARGET = params['paths']['chips_stats']
LABELS_HIERARCHY = params['paths']['labels_hierarchy']

# Echo the resolved configuration, one "NAME: value" line per setting.
print(
    f"CHIPS_DIR: {CHIPS_DIR}",
    f"N_BANDS: {N_BANDS}",
    f"N_TIMESTEPS: {N_TIMESTEPS}",
    f"N_FOLDS: {N_FOLDS}",
    f"CHIPS_STATS_TARGET: {CHIPS_STATS_TARGET}",
    f"LABELS_HIERARCHY: {LABELS_HIERARCHY}",
    sep="\n",
)

CHIPS_DIR: data/chips
N_BANDS: 6
N_TIMESTEPS: 3
N_FOLDS: 6
CHIPS_STATS_TARGET: data/chips_stats.yaml
LABELS_HIERARCHY: data/labels_hierarchy.tif


In [3]:
def compute_stats(files, data_dir):
    """Compute the per-band mean and std over all pixels and timesteps of the given chips.

    Chips are processed one at a time and folded into running per-band
    statistics, so peak memory is a single chip rather than the whole dataset.
    Accumulation is done in float64.

    Args:
        files: Iterable of chip file names, relative to ``data_dir``.
        data_dir: Directory that contains the chip files.

    Returns:
        Tuple ``(mean, std)`` of two lists with ``N_BANDS`` Python floats each.
        ``std`` is the population standard deviation (ddof=0), like ``np.std``.

    Raises:
        ValueError: If ``files`` is empty or the chips contain no pixels, or
            (raised by numpy) if a chip's band count is not
            ``N_TIMESTEPS * N_BANDS``.
    """
    n_values = 0                                  # values seen so far, per band
    mean = np.zeros(N_BANDS, dtype=np.float64)    # running per-band mean
    m2 = np.zeros(N_BANDS, dtype=np.float64)      # running per-band sum of squared deviations

    for file in files:
        img_path = os.path.join(data_dir, file)
        with rasterio.open(img_path) as src:
            # (N_TIMESTEPS * N_BANDS, height, width), e.g. (18, 256, 256)
            img = src.read().astype('float64')

        # Group every band across all timesteps.
        # NOTE(review): assumes the raster stacks bands timestep-major
        # (t0b0..t0bN, t1b0..t1bN, ...) -- confirm against the chip writer.
        img = img.reshape(N_TIMESTEPS, N_BANDS, img.shape[1], img.shape[2])
        per_band = img.transpose(1, 0, 2, 3).reshape(N_BANDS, -1)

        chip_n = per_band.shape[1]
        if chip_n == 0:
            continue  # zero-sized chip contributes nothing

        chip_mean = per_band.mean(axis=1)
        chip_m2 = ((per_band - chip_mean[:, np.newaxis]) ** 2).sum(axis=1)

        # Merge this chip's (n, mean, M2) into the running totals using the
        # pairwise update for combining two sample sets.
        delta = chip_mean - mean
        total = n_values + chip_n
        mean = mean + delta * (chip_n / total)
        m2 = m2 + chip_m2 + delta ** 2 * (n_values * chip_n / total)
        n_values = total

    if n_values == 0:
        raise ValueError(
            f"compute_stats: no pixel data to compute statistics from in {data_dir!r}"
        )

    std = np.sqrt(m2 / n_values)
    return mean.tolist(), std.tolist()

def process_folds(data_dir, fold_range):
    """Compute band statistics separately for the chips that belong to each fold.

    A chip belongs to fold ``k`` when its file name ends with ``_merged.tif``
    and contains ``_fold_k`` not followed by another digit, so fold 1 does not
    also pick up ``_fold_10``.

    NOTE(review): the statistics cover the chips *inside* each fold; nothing
    is excluded here. Combining folds (e.g. leaving out a test fold) is left
    to the consumer, presumably using ``n_chips`` as weights -- confirm.

    Args:
        data_dir: Directory containing the chip files.
        fold_range: Iterable of fold identifiers (e.g. ``range(N_FOLDS)``).

    Returns:
        Dict mapping ``"fold_<k>"`` to ``{"mean": [...], "std": [...], "n_chips": int}``.
    """
    stats = {}
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    all_files = sorted(f for f in os.listdir(data_dir) if f.endswith("_merged.tif"))
    for fold in fold_range:
        # The negative lookahead stops "_fold_1" from matching "_fold_10", "_fold_11", ...
        fold_tag = re.compile(rf"_fold_{re.escape(str(fold))}(?!\d)")
        files = [file for file in all_files if fold_tag.search(file)]
        mean, std = compute_stats(files, data_dir)
        stats[f"fold_{fold}"] = {
            "mean": mean,
            "std": std,
            "n_chips": len(files),
        }
    return stats

def get_num_classes(labels_hierarchy_file):
    """Count the distinct label values in each band (tier) of the labels raster.

    Args:
        labels_hierarchy_file: Path to a raster whose bands each hold one tier
            of the label hierarchy.

    Returns:
        Dict mapping ``"num_classes_tier<i>"`` (1-based) to the number of
        distinct values found in band ``i``, as a plain ``int``.

    NOTE(review): every distinct value is counted, so a nodata/background
    value, if the raster has one, is included in the total -- confirm that is
    what downstream code expects.
    """
    # Context manager so the dataset handle is closed once the data is read.
    with rasterio.open(labels_hierarchy_file) as src:
        labels = src.read()

    stats = {}
    for tier_index, tier in enumerate(labels, start=1):
        # np.unique stays vectorised and (on NumPy >= 1.21) collapses NaNs into
        # one value, whereas a set of numpy scalars counts each NaN separately.
        stats[f"num_classes_tier{tier_index}"] = int(np.unique(tier).size)
    return stats

def save_stats_to_yaml(stats, output_file):
    """Write ``stats`` to ``output_file`` as block-style YAML.

    The target file is opened in write mode, so any existing content is
    replaced.
    """
    with open(output_file, mode='w') as sink:
        # default_flow_style=False -> nested block layout instead of inline {...}/[...]
        yaml.dump(stats, sink, default_flow_style=False)

In [8]:
# Per-fold band statistics plus per-tier class counts, merged into one mapping
# (class-count keys would win on a name clash) and written to the stats file.
fold_stats = process_folds(CHIPS_DIR, range(N_FOLDS))
class_counts = get_num_classes(LABELS_HIERARCHY)

stats = dict(fold_stats)
stats.update(class_counts)

save_stats_to_yaml(stats, CHIPS_STATS_TARGET)