# Data Preparation

### Importing Necessary Libraries

In [2]:
import os # For directory and file manipulation
import shutil # For high-level file operations like copying and removal
import pandas as pd # For data manipulation and analysis
from tqdm import tqdm # For displaying a progress bar
import rasterio # For working with raster data (e.g., satellite imagery)
from rasterio.plot import show # For displaying raster data
import numpy as np # For numerical operations on arrays
import matplotlib.pyplot as plt # For plotting data and images

### Loading Metadata and Defining Base Directories

In [3]:
# Load the metadata table from its Parquet file into a pandas DataFrame.
# NOTE(review): the commented-out line below points at a different metadata.parquet
# location; confirm which copy is the authoritative one before re-running.
# metadata_df = pd.read_parquet(r'C:\Users\isaac\datasets\2020-BigEarthNet-S2\metadata.parquet')
metadata_df = pd.read_parquet(r'C:\Users\isaac\Desktop\SampleBigEarth\metadata.parquet')

# Base directories
# source_base_dir: root folder whose sub-folders are scanned for image folders.
# destination_base_dir: root under which one folder per label is created.
# The destination is nested inside the source, which is why the processing
# cells explicitly skip the 'BigEarthNetDataset' entry when walking the source.
source_base_dir = r'D:\datasets\2020-BigEarthNet-S2'
destination_base_dir = r'D:\datasets\2020-BigEarthNet-S2\BigEarthNetDataset'

### Inspecting the Metadata

In [3]:
# Summarise the metadata table: row count, column labels, then a short preview.
num_rows = len(metadata_df.index)
print(f"The DataFrame contains {num_rows} rows.")

print()

# Header line followed by the column index on the next line
print("Column Names:", metadata_df.columns, sep="\n")

print()

# Header line followed by the leading rows of the table
print("First few rows of the DataFrame:", metadata_df.head(), sep="\n")

The DataFrame contains 480038 rows.

Column Names:
Index(['patch_id', 'labels', 'split', 'country', 's1_name', 's2v1_name',
       'contains_seasonal_snow', 'contains_cloud_or_shadow'],
      dtype='object')

First few rows of the DataFrame:
                                            patch_id  \
0  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
1  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
2  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
3  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
4  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   

                                              labels split  country  \
0  [Arable land, Broad-leaved forest, Mixed fores...  test  Austria   
1  [Arable land, Broad-leaved forest, Inland wate...  test  Austria   
2  [Arable land, Broad-leaved forest, Coniferous ...  test  Austria   
3  [Broad-leaved forest, Complex cultivation patt...  test  Austria   
4  [Broad-leaved forest, Complex cultivation patt...  test  Austri

### Extracting Unique Labels and Creating Directories

In [4]:
# Extract unique labels from the metadata.
# Each row's 'labels' entry is a list-like; explode() flattens them to one label per row.
unique_labels = metadata_df['labels'].explode().unique()

# Print the unique labels in a numbered table format
print("Unique Labels:")
for i, label in enumerate(unique_labels, start=1):
    print(f"{i}. {label}")

print()

# Create one directory per label under destination_base_dir, with a progress bar.
for label in tqdm(unique_labels, desc="Creating directories"):
    label_dir = os.path.join(destination_base_dir, label)

    # exist_ok=True keeps the cell safely re-runnable without a separate
    # check-then-create step, and (unlike an os.path.exists() guard) raises
    # if a non-directory already occupies the path instead of silently skipping it.
    os.makedirs(label_dir, exist_ok=True)

print("All directories have been created.")

Unique Labels:
1. Arable land
2. Broad-leaved forest
3. Mixed forest
4. Pastures
5. Inland waters
6. Coniferous forest
7. Complex cultivation patterns
8. Land principally occupied by agriculture, with significant areas of natural vegetation
9. Urban fabric
10. Industrial or commercial units
11. Inland wetlands
12. Transitional woodland, shrub
13. Natural grassland and sparsely vegetated areas
14. Moors, heathland and sclerophyllous vegetation
15. Marine waters
16. Coastal wetlands
17. Permanent crops
18. Beaches, dunes, sands
19. Agro-forestry areas



Creating directories: 100%|██████████| 19/19 [00:00<00:00, 1807.77it/s]

All directories have been created.





### Converting Labels to Binary Vectors and Saving Metadata

In [5]:
# Gather every distinct label, then fix a deterministic (sorted) ordering
unique_labels = metadata_df['labels'].explode().unique()
all_labels = list(unique_labels)
all_labels.sort()

# Lookup from label name to its position in the sorted ordering
label_to_index = dict(zip(all_labels, range(len(all_labels))))

# Function to convert label list to binary vector
def labels_to_vector(label_list):
    """Encode a list of label names as a multi-hot list.

    The result has one entry per label in ``all_labels``; an entry is 1 when
    the label at that position (per ``label_to_index``) appears in
    ``label_list`` and 0 otherwise. Repeated labels have no extra effect.

    Raises:
        KeyError: if a label is not present in ``label_to_index``.
    """
    active_positions = {label_to_index[name] for name in label_list}
    return [1 if position in active_positions else 0 for position in range(len(all_labels))]

# Encode each row's label list element-wise and attach the result as a new column
label_vectors = metadata_df['labels'].map(labels_to_vector)
metadata_df['label_vector'] = label_vectors

# Persist the augmented table (without the index) as a Parquet file
metadata_df.to_parquet('updated_metadata_with_vectors.parquet', index=False)

### Loading and Inspecting Updated Metadata

In [6]:
# Reload the table written by the previous cell. The file is read from the same
# working-directory-relative path it was saved to, so this cell always sees the
# freshly written file rather than depending on one machine's absolute path.
metadata_df = pd.read_parquet('updated_metadata_with_vectors.parquet')

num_rows = metadata_df.shape[0]
print(f"The DataFrame contains {num_rows} rows.")

print()

# Display the column names (should now include 'label_vector')
print("Column Names:")
print(metadata_df.columns)

print()

# Display the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(metadata_df.head())

The DataFrame contains 480038 rows.

Column Names:
Index(['patch_id', 'labels', 'split', 'country', 's1_name', 's2v1_name',
       'contains_seasonal_snow', 'contains_cloud_or_shadow', 'label_vector'],
      dtype='object')

First few rows of the DataFrame:
                                            patch_id  \
0  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
1  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
2  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
3  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   
4  S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_2...   

                                              labels split  country  \
0  [Arable land, Broad-leaved forest, Mixed fores...  test  Austria   
1  [Arable land, Broad-leaved forest, Inland wate...  test  Austria   
2  [Arable land, Broad-leaved forest, Coniferous ...  test  Austria   
3  [Broad-leaved forest, Complex cultivation patt...  test  Austria   
4  [Broad-leaved forest, Complex cultivation patt.

In [7]:
# Function to process each image
def process_image(image_folder_path, labels):
    """Copy one image folder into every label directory, then delete the source.

    For each label, the folder is copied to
    ``<destination_base_dir>/<label>/<folder name>`` unless that destination
    already exists. The source folder is removed only after every label has
    been handled without error.

    Args:
        image_folder_path: Path of the image folder to distribute.
        labels: Iterable of label names (sub-directory names under
            ``destination_base_dir``).

    Returns:
        None. Any exception is caught and reported with ``print`` (best-effort
        processing); in that case the source folder is left untouched.

    Notes:
        * An empty ``labels`` iterable leaves the source in place: nothing
          would be copied, so deleting it would lose the data.
        * Each copy is written to ``<destination>.partial`` and renamed once
          complete, so an interrupted copy is never mistaken for a finished
          one (and the source deleted) on a later run.
    """
    try:
        labels = list(labels)

        # Guard against data loss: with no labels there is no copy to keep.
        if not labels:
            print(f"Skipping folder {image_folder_path}: no labels, source left in place")
            return

        folder_name = os.path.basename(image_folder_path)

        # Process each label
        for label in labels:
            dest_dir = os.path.join(destination_base_dir, label)

            # Create the directory if it does not exist
            os.makedirs(dest_dir, exist_ok=True)

            # Construct the destination path for each label
            dest_folder_path = os.path.join(dest_dir, folder_name)

            # A destination that already exists is treated as a finished copy
            if os.path.exists(dest_folder_path):
                continue

            # Discard leftovers from a previously interrupted copy
            partial_path = dest_folder_path + '.partial'
            if os.path.exists(partial_path):
                shutil.rmtree(partial_path)

            # Copy under a temporary name, then publish with a rename
            shutil.copytree(image_folder_path, partial_path)
            os.rename(partial_path, dest_folder_path)

        # Remove the source directory after copying all the labels
        shutil.rmtree(image_folder_path)
    except Exception as e:
        print(f"Error processing folder {image_folder_path}: {e}")

# Build a one-off patch_id -> labels lookup so each folder is resolved with a
# dictionary access instead of a full scan of metadata_df per folder.
# setdefault keeps the first row for a patch_id, matching the previous
# "take the first matching row" behaviour.
labels_by_patch_id = {}
for known_patch_id, known_labels in zip(metadata_df['patch_id'], metadata_df['labels']):
    labels_by_patch_id.setdefault(known_patch_id, known_labels)

# Collect every image folder in a single pass, excluding 'BigEarthNetDataset'
# (the destination tree lives inside source_base_dir and must not be re-processed).
image_folder_paths = []
for date_folder in os.listdir(source_base_dir):
    if date_folder == 'BigEarthNetDataset':
        continue  # Skip this folder
    date_folder_path = os.path.join(source_base_dir, date_folder)
    if not os.path.isdir(date_folder_path):
        continue
    for image_folder in os.listdir(date_folder_path):
        image_folder_path = os.path.join(date_folder_path, image_folder)
        if os.path.isdir(image_folder_path):
            image_folder_paths.append(image_folder_path)

# Total number of image folders found (drives the progress bar)
total_images = len(image_folder_paths)

# Process each image folder; the bar advances for every folder, including
# those that have no metadata row and are therefore left untouched.
for image_folder_path in tqdm(image_folder_paths, desc="Processing Images"):
    # The folder name is the patch identifier used in the metadata
    patch_id = os.path.basename(image_folder_path)

    if patch_id in labels_by_patch_id:
        labels = labels_by_patch_id[patch_id]
        process_image(image_folder_path, labels)

print("Processing complete.")

Processing Images: 100%|██████████| 422728/422728 [10:04:23<00:00, 11.66it/s]  


Processing complete.


In [8]:
# Map each directory directly under destination_base_dir to the number of
# subdirectories it contains.
folder_subfolder_counts = {}

# Walk the top-level entries with a progress bar; non-directories are ignored
for folder in tqdm(os.listdir(destination_base_dir), desc="Processing Folders", unit="folder"):
    folder_path = os.path.join(destination_base_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    # Tally only the entries that are themselves directories
    folder_subfolder_counts[folder] = sum(
        1
        for entry in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, entry))
    )

# Largest counts first; the sort is stable, so ties keep their insertion order
sorted_folder_counts = sorted(folder_subfolder_counts.items(), key=lambda pair: -pair[1])

# Table header and rule
print(f"{'Folder Name':<70} {'Subfolder Count':>15}")
print("-" * 85)

# One aligned row per folder
for folder, count in sorted_folder_counts:
    print(f"{folder:<70} {count:>15}")

Processing Folders: 100%|██████████| 19/19 [05:44<00:00, 18.14s/folder]

Folder Name                                                            Subfolder Count
-------------------------------------------------------------------------------------
Arable land                                                                     188025
Mixed forest                                                                    165780
Coniferous forest                                                               154941
Transitional woodland, shrub                                                    141150
Broad-leaved forest                                                             135928
Land principally occupied by agriculture, with significant areas of natural vegetation          122709
Complex cultivation patterns                                                     99598
Pastures                                                                         95605
Urban fabric                                                                     63758
Inland waters               




In [None]:
# TODO: Balance the classes further - perform data augmentation and oversampling of the under-represented classes

In [None]:
# Create folder 'dataset_splits' and within this folder create 'test', 'valid' and 'train'. Each of these folders should have the 19 categories

In [None]:
# Move the data to the valid (15%), test (15%) and train (70%) folders