# Analysing EuroSat Dataset

### Config for data paths and terminal colors

In [1]:
class Config:
    DATA_DIR = '/usr/local/share/datasets/euroSat/'
    TRAIN_FILE = DATA_DIR + 'train.csv'
    TEST_FILE = DATA_DIR + 'test.csv'
    SAMPLE_SUBMISSION_FILE = DATA_DIR + 'sample_submission.csv'
    TRAIN_RGB_DIR = DATA_DIR + 'train/EuroSAT_RGB/'
    TRAIN_MS_DIR = DATA_DIR + 'train/EuroSAT_MS/'

    TEST_MS_DIR = DATA_DIR + 'test/NoLabel/'
    MEAN_STD_FILE = "tmp/eurosat_ms_mean_std"
    MEAN_STD_FILE_L2C = "tmp/eurosat_ms_mean_std_L2C"
    MEAN_STD_FILE_L2A = "tmp/eurosat_ms_mean_std_L2A"
    TEST_LABELD_FILE = "labels.csv"

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

c = bcolors()
config = Config()

### Helper functions for loading pickle data

In [2]:
import pickle

def load_object(fname):
    try:
        with open(fname + ".pickle", "rb") as f:
            return pickle.load(f)
    except Exception as ex:
        print("Error during unpickling object (Possibly unsupported):", ex)


def save_object(obj, fname):
    try:
        with open(fname + ".pickle", "wb") as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as ex:
        print("Error during pickling object (Possibly unsupported):", ex)

In [3]:
import os
import numpy as np
import pandas as pd
import rasterio
from joblib import Parallel, delayed

df = pd.read_csv(config.TRAIN_FILE)
chan_L2C = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
chan_L2A = [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12]

data_L2C_files = [
    d + '/' + f for d in os.listdir(config.TRAIN_MS_DIR)
    for f in os.listdir(config.TRAIN_MS_DIR + d)
]

data_L2A_files = os.listdir(config.TEST_MS_DIR)

data_paths_L2C = [os.path.join(config.TRAIN_MS_DIR, f) for f in data_L2C_files]
data_paths_L2A = [os.path.join(config.TEST_MS_DIR, f) for f in data_L2A_files]
print(f"{c.OKBLUE}Number of L2C images: {len(data_paths_L2C)}")
print(f"{c.OKBLUE}Number of L2A images: {len(data_paths_L2A)}")


[94mNumber of L2C images: 27000
[94mNumber of L2A images: 4232


In [4]:
def process_image(idx, d_paths):
    if idx % 1000 == 0:
        print(f"Processing image {idx}...")

    img_path = d_paths[idx]

    if img_path.endswith(".npy"):
        image = np.load(img_path).transpose(2, 0, 1)
        image = image[chan_L2C].astype(np.float32)
    else:
        with rasterio.open(img_path) as src:
            image = np.array(src.read())
        image = image[chan_L2A].astype(np.float32)
    
    # get the min and max of each channel
    min_chan, max_chan = image.min(axis=(1, 2)), image.max(axis=(1, 2))

    mean = np.mean(image, axis=(1, 2))
    std = np.std(image, axis=(1, 2))

    return mean, std, min_chan, max_chan

def get_overall_mean_std(results):
    means = np.array([stat[0] for stat in results])
    stds = np.array([stat[1] for stat in results])

    overall_mean = np.mean(means, axis=0)
    overall_std = np.mean(stds, axis=0)

    return overall_mean, overall_std

def get_overall_min_max(results):
    min_vals = np.array([stat[2] for stat in results])
    max_vals = np.array([stat[3] for stat in results])

    overall_min = np.mean(min_vals, axis=0)
    overall_max = np.mean(max_vals, axis=0)

    return overall_min, overall_max

In [5]:
stats_L2C = Parallel(n_jobs=-4)(
    delayed(process_image)(idx, data_paths_L2C) for idx in range(len(data_paths_L2C))
)

mean_L2C, std_L2C = get_overall_mean_std(stats_L2C)
min_L2C, max_L2C = get_overall_min_max(stats_L2C)

In [6]:
print("Overall Mean L2C:    ", list([round(i, 3) for i in mean_L2C]))
print("Overall Std Dev L2C: ", list([round(i, 3) for i in std_L2C]))

print("Overall Min L2C:     ", list([round(i, 3) for i in min_L2C]))
print("Overall Max L2C:     ", list([round(i, 3) for i in max_L2C]))

Overall Mean L2C:     [1353.732, 1117.196, 1041.886, 946.556, 1199.187, 2003.004, 2374.012, 2301.228, 732.185, 1820.695, 1118.201, 2599.776]
Overall Std Dev L2C:  [65.289, 153.755, 187.676, 278.091, 227.896, 355.89, 455.075, 530.717, 98.918, 378.115, 303.069, 502.103]
Overall Min L2C:      [1236.94, 864.673, 690.741, 470.055, 742.607, 1215.664, 1388.282, 1053.108, 527.87, 1009.013, 519.858, 1495.878]
Overall Max L2C:      [1560.355, 2144.7, 2197.044, 2442.571, 2066.732, 3120.732, 3759.036, 4220.174, 975.034, 3016.484, 2194.036, 4083.037]


In [7]:
save_object(dict(
    mean=mean_L2C,
    std=std_L2C,
    idx_l2c=chan_L2C,
    min=min_L2C,
    max=max_L2C
), "tmp/eurosat_ms_mean_std_L2C")

In [8]:
stats_L2A = Parallel(n_jobs=-4)(
    delayed(process_image)(idx, data_paths_L2A) for idx in range(len(data_paths_L2A))
)

mean_L2A, std_L2A = get_overall_mean_std(stats_L2A)
min_L2A, max_L2A = get_overall_min_max(stats_L2A)

In [9]:
print("Overall Mean L2A:    ", list([round(i, 3) for i in mean_L2A]))
print("Overall Std Dev L2A: ", list([round(i, 3) for i in std_L2A]))

print("Overall Min L2A:     ", list([round(i, 3) for i in min_L2A]))
print("Overall Max L2A:     ", list([round(i, 3) for i in max_L2A]))

Overall Mean L2A:     [380.173, 400.149, 628.864, 578.871, 943.427, 1826.243, 2116.666, 2205.976, 2281.183, 2266.932, 1487.692, 959.235]
Overall Std Dev L2A:  [115.16, 209.123, 241.177, 301.069, 269.481, 420.199, 503.757, 597.968, 529.349, 403.888, 398.094, 342.399]
Overall Min L2A:      [198.793, 100.154, 210.173, 143.857, 425.231, 846.068, 961.567, 724.469, 1050.31, 1380.276, 667.476, 362.698]
Overall Max L2A:      [795.486, 1908.949, 2160.55, 2265.381, 2041.951, 3159.708, 3682.206, 4257.495, 3861.278, 3332.05, 2768.335, 2238.926]


In [10]:
save_object(dict(
    mean=mean_L2A,
    std=std_L2A,
    idx_l2c=chan_L2A,
    min=min_L2A,
    max=max_L2A
), "tmp/eurosat_ms_mean_std_L2A")