In [32]:
# File handling and manipulation
import os  # Provides functions for interacting with the operating system, such as reading and writing files and directories.
import shutil  # Offers high-level file operations like copying and moving files.
import tarfile  # Library for reading and writing tar archive files.
import requests  # Library for making HTTP requests, often used for downloading files.
import zstandard as zstd  # Library for Zstandard compression and decompression.
from PIL import Image  # Python Imaging Library (PIL) for opening, manipulating, and saving image files.

# Data manipulation and analysis
import pandas as pd  # Data analysis and manipulation library providing data structures like DataFrames.
import numpy as np  # Provides support for large arrays and matrices, along with mathematical functions to operate on these arrays.

# Geospatial data processing
import rasterio  # Library for reading and writing geospatial raster data.
from rasterio.warp import calculate_default_transform, reproject  # Functions for transforming and resizing raster data.
from rasterio.enums import Resampling  # Functions for transforming and resizing raster data.
from rasterio.plot import show  # Function for visualizing raster data.

# Visualization and plotting
import matplotlib.pyplot as plt  # Plotting library for creating static, animated, and interactive visualizations in Python.
import seaborn as sns  # Data visualization library based on matplotlib, providing a high-level interface for drawing attractive statistical graphics.

# Utilities
import random  # Provides functions for generating random numbers and performing random operations.
from tqdm import tqdm  # Library for displaying progress bars in loops or processes.
import textwrap  # Utilities for wrapping and formatting text to fit a specific width.


In [5]:
# Load the metadata
#metadata_df = pd.read_parquet(r'C:\Users\isaac\Downloads\metadata.parquet')
# Load the metadata for the snow/cloud/shadow data
#snow_cloud_shadow_metadata_df = pd.read_parquet(r'C:\Users\isaac\Downloads\metadata_for_patches_with_snow_cloud_or_shadow.parquet')

metadata_csv = pd.read_csv(r'C:\Users\isaac\Desktop\BigEarthTests\one_percent_metadata.csv')

# Base directories
source_dir = r'C:\Users\isaac\Desktop\BigEarthTests\BigEarthSubset1'

In [6]:
# Get the number of rows in the DataFrame
num_rows = metadata_df.shape[0]

# Print the number of rows in the DataFrame, followed by a blank line
print(f"The DataFrame contains {num_rows} rows.\n")

# Display the column names in the DataFrame, followed by a blank line
print("Column Names:")
print(metadata_df.columns, "\n")

# Display the first few rows of the DataFrame with a preceding message
print("First few rows of the DataFrame:")
print(metadata_df.head())

The DataFrame contains 480038 rows.

Column Names:
Index(['patch_id', 'labels', 'split', 'country', 's1_name', 's2v1_name',
       'contains_seasonal_snow', 'contains_cloud_or_shadow'],
      dtype='object') 

First few rows of the DataFrame:
                                            patch_id  \
0  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
1  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
2  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
3  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
4  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   

                                              labels split  country  \
0  [Arable land, Broad-leaved forest, Mixed fores...  test  Austria   
1  [Arable land, Broad-leaved forest, Inland wate...  test  Austria   
2  [Arable land, Broad-leaved forest, Coniferous ...  test  Austria   
3  [Broad-leaved forest, Complex cultivation patt...  test  Austria   
4  [Broad-leaved forest, Complex cultivation patt...  test  Austr

In [7]:
# Extract unique labels from the metadata dataframe
unique_labels = metadata_df['labels'].explode().unique()

# Print the unique labels in a numbered table format
print("Unique Labels:")
for i, label in enumerate(unique_labels, start=1):
    print(f"{i}. {label}")

print()

Unique Labels:
1. Arable land
2. Broad-leaved forest
3. Mixed forest
4. Pastures
5. Inland waters
6. Coniferous forest
7. Complex cultivation patterns
8. Land principally occupied by agriculture, with significant areas of natural vegetation
9. Urban fabric
10. Industrial or commercial units
11. Inland wetlands
12. Transitional woodland, shrub
13. Natural grassland and sparsely vegetated areas
14. Moors, heathland and sclerophyllous vegetation
15. Marine waters
16. Coastal wetlands
17. Permanent crops
18. Beaches, dunes, sands
19. Agro-forestry areas



In [30]:
def create_binary_vector(labels, categories):
    vector = [1 if category in labels else 0 for category in categories]
    return vector

# Step 3: Apply the function to each row in the DataFrame
metadata_df['binary_vector'] = metadata_df['labels'].apply(lambda labels: create_binary_vector(labels, unique_labels))

# Display the first few rows to verify the new column
#print(metadata_df.head())

NameError: name 'unique_labels' is not defined

In [9]:
# Get the number of rows in the DataFrame
num_rows = snow_cloud_shadow_metadata_df.shape[0]

# Print the number of rows in the DataFrame, followed by a blank line
print(f"The DataFrame contains {num_rows} rows.\n")

# Display the column names in the DataFrame, followed by a blank line
print("Column Names:")
print(snow_cloud_shadow_metadata_df.columns, "\n")

# Display the first few rows of the DataFrame with a preceding message
print("First few rows of the DataFrame:")
print(snow_cloud_shadow_metadata_df.head())

The DataFrame contains 69450 rows.

Column Names:
Index(['patch_id', 'labels', 'split', 'country', 's1_name', 's2v1_name',
       'contains_seasonal_snow', 'contains_cloud_or_shadow'],
      dtype='object') 

First few rows of the DataFrame:
                                            patch_id  \
0  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_3...   
1  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_3...   
2  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_3...   
3  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_3...   
4  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_4...   

                                              labels  split  country  \
0                   [Arable land, Coniferous forest]   test  Austria   
1  [Arable land, Coniferous forest, Land principa...   test  Austria   
2  [Broad-leaved forest, Coniferous forest, Indus...  train  Austria   
3  [Arable land, Broad-leaved forest, Land princi...   test  Austria   
4  [Arable land, Broad-leaved forest, Land princi...   test  

In [26]:
# Load the metadata DataFrame
snow_cloud_shadow_metadata_df = pd.read_parquet(r'C:\Users\isaac\Downloads\metadata_for_patches_with_snow_cloud_or_shadow.parquet')

# Initialize counters
true_count = 0
false_count = 0

# Base directory path
base_dir = r'C:\Users\isaac\Desktop\BigEarthTests\BigEarthSubset1'

# Iterate through all folders in the base directory
for folder_name in tqdm(os.listdir(base_dir), desc="Processing folders"):
    folder_path = os.path.join(base_dir, folder_name)
    for patch_name in os.listdir(folder_path):
        patch_path = os.path.join(folder_path, patch_name)

        # Filter the DataFrame to get the row with the matching patch_id
        matching_row = snow_cloud_shadow_metadata_df.loc[metadata_df['patch_id'] == patch_name]

         # If a match is found, extract the labels
        if not matching_row.empty:
            contains_cloud_or_shadow = matching_row['contains_cloud_or_shadow'].values[0]
            if contains_cloud_or_shadow:
                true_count += 1
                shutil.rmtree(patch_path)
            else:
                false_count += 1

# Print the results
print(f"Total True (contains_cloud_or_shadow): {true_count}")
print(f"Total False (contains_cloud_or_shadow): {false_count}")

Processing folders: 100%|██████████| 3/3 [05:34<00:00, 111.51s/it]

Total True (contains_cloud_or_shadow): 0
Total False (contains_cloud_or_shadow): 14987





In [45]:
# Base directories
source_dir = r'D:\Datasets\BigEarthNet-S2\100%BigEarthNet'

# Initialize a counter for deleted folders
deleted_folders_count = 0

# Loop through each row in the DataFrame with a progress bar
for index, row in tqdm(snow_cloud_shadow_metadata_df.iterrows(), total=len(snow_cloud_shadow_metadata_df), desc="Processing files"):
    patch_id = row['patch_id']  # Adjust if the column name differs
    # Extract the folder name (all but the last part of the patch_id)
    folder_name = '_'.join(patch_id.split('_')[:-2])
    dest_dir = os.path.join(source_dir, folder_name, patch_id)  # Use os.path.join for better path handling
    
    # If dest_dir exists, delete it
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)  # Remove the directory and its contents
        deleted_folders_count += 1  # Increment the counter

# Print the total number of deleted folders
print(f"Total deleted folders: {deleted_folders_count}")

Processing files: 100%|██████████| 69450/69450 [03:37<00:00, 320.00it/s]

Total deleted folders: 69450





In [49]:
# Base directory containing all data
base_dir = r'D:\Datasets\BigEarthNet-S2\100%BigEarthNet'
# Directories for subsets
subsets = {
    '50%': r'D:\Datasets\BigEarthNet-S2\50%BigEarthNet',
    '10%': r'D:\Datasets\BigEarthNet-S2\10%BigEarthNet',
    '1%': r'D:\Datasets\BigEarthNet-S2\1%BigEarthNet'
}

# Create subset directories if they don't exist
for subset_dir in subsets.values():
    os.makedirs(subset_dir, exist_ok=True)

# Iterate through all folders in the base directory
for folder in tqdm(os.listdir(base_dir), desc="Processing folders"):
    folder_path = os.path.join(base_dir, folder)
    
    # Check if the current path is a directory
    if os.path.isdir(folder_path):
        # List all subfolders
        subfolders = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]
        
        # Calculate number of subfolders for each subset
        num_subfolders = len(subfolders)
        num_50_percent = max(1, num_subfolders // 2)  # Ensure at least one folder is copied
        num_10_percent = max(1, num_subfolders // 10)
        num_1_percent = max(1, num_subfolders // 100)

        # Randomly select subfolders for each subset
        selected_50 = random.sample(subfolders, num_50_percent)
        selected_10 = random.sample(subfolders, num_10_percent)
        selected_1 = random.sample(subfolders, num_1_percent)

        # Copy selected subfolders to the respective subset directories
        for selected in selected_50:
            shutil.copytree(os.path.join(folder_path, selected), os.path.join(subsets['50%'], folder, selected))

        for selected in selected_10:
            shutil.copytree(os.path.join(folder_path, selected), os.path.join(subsets['10%'], folder, selected))

        for selected in selected_1:
            shutil.copytree(os.path.join(folder_path, selected), os.path.join(subsets['1%'], folder, selected))

Processing folders: 100%|██████████| 115/115 [5:06:14<00:00, 159.78s/it] 


In [4]:
def count_subfolders(base_dir, folder):
    # Dictionary to hold folder counts
    folder_counts = {}
    total_subfolders = 0  # Initialize total subfolder counter
    
    # Iterate through all folders in the base directory
    for folder in tqdm(os.listdir(base_dir), desc="Processing folders"):
        folder_path = os.path.join(base_dir, folder)
        
        # Check if the current path is a directory
        if os.path.isdir(folder_path):
            # Count subdirectories within this folder
            subfolder_count = sum(os.path.isdir(os.path.join(folder_path, subfolder)) for subfolder in os.listdir(folder_path))
            folder_counts[folder] = subfolder_count
        
            # Update total subfolder count
            total_subfolders += subfolder_count

    # Print total subfolders
    return total_subfolders, folder

In [22]:
full_subfolder_count, folder = count_subfolders(r'D:\Datasets\BigEarthNet-S2\100%BigEarthNet', '100%BigEarthNet')
half_subfolder_count, folder = count_subfolders(r'D:\Datasets\BigEarthNet-S2\50%BigEarthNet', '50%BigEarthNet' )
tenth_subfolder_count, folder = count_subfolders(r'D:\Datasets\BigEarthNet-S2\10%BigEarthNet', '10%BigEarthNet' )
hundredth_subfolder_count, folder = count_subfolders(r'D:\Datasets\BigEarthNet-S2\1%BigEarthNet', '1%BigEarthNet' )

Processing folders: 100%|██████████| 115/115 [02:46<00:00,  1.44s/it]
Processing folders: 100%|██████████| 115/115 [01:08<00:00,  1.69it/s]
Processing folders: 100%|██████████| 115/115 [00:11<00:00,  9.91it/s]
Processing folders: 100%|██████████| 115/115 [00:00<00:00, 518.06it/s]


In [23]:
# Function to calculate and display subfolder count and percentage
def display_percentage(partial_count, full_count, folder_name):
    percentage = (partial_count / full_count) * 100
    print(f"Folder: {folder_name} | Subfolder Count: {partial_count} | Percentage: {percentage:.2f}%")

# Display the counts and percentages for each folder
print(f"Total subfolder count in full dataset: {full_subfolder_count}\n")
display_percentage(half_subfolder_count, full_subfolder_count, '50%BigEarthNet')
display_percentage(tenth_subfolder_count, full_subfolder_count, '10%BigEarthNet')
display_percentage(hundredth_subfolder_count, full_subfolder_count, '1%BigEarthNet')

Total subfolder count in full dataset: 480038

Folder: 50%BigEarthNet | Subfolder Count: 239988 | Percentage: 49.99%
Folder: 10%BigEarthNet | Subfolder Count: 47948 | Percentage: 9.99%
Folder: 1%BigEarthNet | Subfolder Count: 4750 | Percentage: 0.99%


In [15]:
def count_splits_in_folders(root_dir, df):
    test_count = 0
    train_count = 0
    val_count = 0

    # Get the total number of directories for the progress bar
    total_dirs = sum([len(dirnames) for _, dirnames, _ in os.walk(root_dir)])

    # Walk through the directories with a progress bar
    for dirpath, dirnames, filenames in tqdm(os.walk(root_dir), total=total_dirs, desc="Processing directories"):
        for dirname in dirnames:
            # Extract patch_id from the folder name
            patch_id = dirname

            # Fetch the split from the DataFrame
            split = df.loc[df['patch_id'] == patch_id, 'split'].values
            
            # Check if the patch_id exists in the DataFrame
            if len(split) > 0:
                split = split[0]
                if split == 'test':
                    test_count += 1
                elif split == 'train':
                    train_count += 1
                elif split == 'validation':
                    val_count += 1
    
    # Calculate total count
    total_count = test_count + train_count + val_count

    # Calculate percentages
    test_percentage = (test_count / total_count) * 100 if total_count > 0 else 0
    train_percentage = (train_count / total_count) * 100 if total_count > 0 else 0
    val_percentage = (val_count / total_count) * 100 if total_count > 0 else 0
    
    # Return the counts and percentages
    return {
        'test': {'count': test_count, 'percentage': test_percentage},
        'train': {'count': train_count, 'percentage': train_percentage},
        'validation': {'count': val_count, 'percentage': val_percentage}
    }

metadata_df = pd.read_parquet(r'C:\Users\isaac\Downloads\metadata.parquet')

# Root directory of the subset
root_dir = r'D:\Datasets\BigEarthNet-S2\1%BigEarthNet'

# Count the splits in the folders
split_counts = count_splits_in_folders(root_dir, metadata_df)

print("Number of samples in each split in the subset:")
for split_type, data in split_counts.items():
    print(f"{split_type.capitalize()}: {data['count']} ({data['percentage']:.2f}%)")

Processing directories: 4866it [01:32, 52.63it/s]                          

Number of samples in each split in the subset:
Test: 1165 (24.53%)
Train: 2370 (49.89%)
Validation: 1215 (25.58%)





In [16]:
def count_splits(subset_df):
    # Initialize counters
    test_count = 0
    train_count = 0
    val_count = 0
    
    # Iterate through the subset DataFrame
    for split in subset_df['split']:
        if split == 'test':
            test_count += 1
        elif split == 'train':
            train_count += 1
        elif split == 'validation':
            val_count += 1
    
    # Calculate total count
    total_count = test_count + train_count + val_count
    
    # Calculate percentages
    test_percentage = (test_count / total_count) * 100 if total_count > 0 else 0
    train_percentage = (train_count / total_count) * 100 if total_count > 0 else 0
    val_percentage = (val_count / total_count) * 100 if total_count > 0 else 0
    
    # Return the counts and percentages
    return {
        'test': {'count': test_count, 'percentage': test_percentage},
        'train': {'count': train_count, 'percentage': train_percentage},
        'validation': {'count': val_count, 'percentage': val_percentage}
    }

# Example usage
metadata_df = pd.read_parquet(r'C:\Users\isaac\Downloads\metadata.parquet')

# Count the splits in the subset
split_counts = count_splits(metadata_df)

print("Number of samples in each split in the subset:")
for split_type, data in split_counts.items():
    print(f"{split_type.capitalize()}: {data['count']} ({data['percentage']:.2f}%)")

Number of samples in each split in the subset:
Test: 119825 (24.96%)
Train: 237871 (49.55%)
Validation: 122342 (25.49%)
