In [1]:
# Import the os module for interacting with the operating system's file system
import os
# Import the shutil module for high-level file operations such as copying and moving files
import shutil
# Import the random module for generating random numbers and making random selections
import random
# Import the gdal module from the osgeo package for working with geospatial data formats
from osgeo import gdal

In [2]:
# Set the path to the datasets directory
# path = "C:/Users/isaac/datasets/eurosat-dataset-tif"
path = "C:/Users/isaac/datasets/eurosat-dataset-jpg"

# Set the random seed for reproducibility, this ensures that the random operations produce the same result every time the code is run
SEED = random.seed(123)

In [3]:
# Initialize empty lists to store category names and file paths
categories = []  # List to hold the names of the categories (subdirectories)
tif_files = []   # List to hold the paths of all .jpg files

# Walk through the directory tree starting from the specified path
# os.walk generates the file names in a directory tree by walking the tree either top-down or bottom-up
for dirpath, dirnames, filenames in os.walk(path):
    # Add the directory names (categories) to the categories list
    categories.extend(dirnames)
    # Iterate over all filenames in the current directory
    for filename in filenames:
        # Check if the file has a .jpg extension
        if filename.endswith('.jpg'):
            # Add the full path of the .jpg file to the tif_files list
            tif_files.append(os.path.join(dirpath, filename))

# Print the list of categories (subdirectory names)
print(categories)


['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River', 'SeaLake']


In [4]:
# Reduce the number of images in each category by 50% to reduce load times and process times
for category in categories:
    # Construct the path to the current category
    category_path = os.path.join(path, category)
    
    # Filter tif_files to get only the files in the current category
    category_files = [file for file in tif_files if file.startswith(category_path)]

    # If there are more than one file in the category, reduce the number by 50%
    if len(category_files) > 1:
        # Calculate the number of files to keep (50% of the total)
        num_files_to_keep = len(category_files) // 2
        
        # Randomly select a subset of files to keep
        files_to_keep = random.sample(category_files, num_files_to_keep)
        
        # Determine the files to remove (those not in the files_to_keep list)
        files_to_remove = set(category_files) - set(files_to_keep)
        
        # Iterate over the files to remove and delete them
        for file_to_remove in files_to_remove:
            os.remove(file_to_remove)  # Delete the file
            print(f"Removed: {file_to_remove}")  # Print the name of the removed file

# Print a completion message
print("Reduction of images in each category by 50% completed.")

Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_2806.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_2814.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_1989.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_1925.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_2962.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_2493.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_633.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_175.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_1805.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_2460.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_1506.jpg
Removed: C:/Users/isaac/datasets/eurosat-dataset-jpg\AnnualCrop\AnnualCrop_103

In [5]:
# List of split names
split_names = ['train', 'test', 'valid']

# Create new folders for train, test, and valid sets with subfolders for all categories
for sp_name in split_names:
    directory = os.path.join(path, 'dataset_splits', sp_name)  # Path to the split folder
    if not os.path.exists(directory):  # Check if the split folder doesn't exist
        os.makedirs(directory)  # Create the split folder if it doesn't exist
        print(f"Created folder: {directory}")
        print()
    # Create category folders within each split
    for category in categories:
        dir_cat = os.path.join(directory, category)  # Path to the category folder within the split
        if not os.path.exists(dir_cat):  # Check if the category folder doesn't exist
            os.makedirs(dir_cat)  # Create the category folder if it doesn't exist
            print(f"Created category folder: {dir_cat}")

print("All folders created successfully.")

Created folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train

Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train\AnnualCrop
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train\Forest
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train\HerbaceousVegetation
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train\Highway
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train\Industrial
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train\Pasture
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train\PermanentCrop
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train\Residential
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\dataset_splits\train\River
Created ca

In [6]:
import os
import shutil
import random
import subprocess
from tqdm import tqdm  # Import tqdm for progress bar
from osgeo import gdal

# Function to convert TIFF to JPEG using GDAL
def convert_tiff_to_jpeg(input_file, output_file, bands=[1, 2, 3]):
    try:
        # Construct the gdal_translate command
        command = ['gdal_translate', '-of', 'JPEG']
        for band in bands:
            command.extend(['-b', str(band)])
        command.extend(['-scale', input_file, output_file])
        # Run the command
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error converting {input_file} to {output_file}: {e}")
        return False
    return True

# Iterate through each category
for category in categories:
    # Define directories for input TIFF files and output JPEG files
    directory = os.path.join(path, category)  # Input TIFF directory
    directory_train = os.path.join(path, 'dataset_splits', 'train', category)  # Output JPEG directory for training set
    directory_valid = os.path.join(path, 'dataset_splits', 'test', category)  # Output JPEG directory for validation set
    directory_test = os.path.join(path, 'dataset_splits', 'valid', category)  # Output JPEG directory for test set
    
    # Create output directories if they do not exist
    os.makedirs(directory_train, exist_ok=True)
    os.makedirs(directory_valid, exist_ok=True)
    os.makedirs(directory_test, exist_ok=True)
    
    # List all files in the input TIFF directory
    try:
        cat_files = os.listdir(directory)
    except FileNotFoundError:
        continue  # Skip the category if the directory does not exist
    
    # Remove any unwanted files (e.g., .DS_Store)
    if '.DS_Store' in cat_files:
        cat_files.remove('.DS_Store')
    
    # Convert TIFF files to JPEG format and move them to the training directory
    with tqdm(total=len(cat_files), desc=f'Converting {category} TIFFs') as pbar:  # Initialize tqdm progress bar
        for file in cat_files:
            file_no_ext = file.split('.')[0]  # Remove file extension
            img_in = os.path.join(directory, file)  # Input TIFF file path
            img_out = os.path.join(directory_train, file_no_ext + '.jpeg')  # Output JPEG file path
                
            # Check if the converted JPEG file already exists in the training directory
            if not os.path.exists(img_out):
                # Convert the TIFF to JPEG
                success = convert_tiff_to_jpeg(img_in, img_out, bands=[1, 2, 3])
                if not success:
                    print(f"Conversion failed for {img_in}")
            
            pbar.update(1)  # Update progress bar
    
    # Remove any XML files that were generated during the conversion process
    for item in os.listdir(directory_train):  # Iterate over files in the training directory
        if item.endswith(".xml"):
            os.remove(os.path.join(directory_train, item))  # Remove XML file
                
    # Sort files into test and validation folders
    filenames = os.listdir(directory_train)  # List JPEG files in the training directory
    filenames.sort()  # Sort filenames alphabetically
    if '.DS_Store' in filenames:
        filenames.remove('.DS_Store')
        
    random.shuffle(filenames)  # Shuffle filenames randomly
    split_1 = int(0.8 * len(filenames))  # Split index for training-validation split
    split_2 = int(0.9 * len(filenames))  # Split index for validation-test split
    train_filenames = filenames[:split_1]  # Filenames for training set
    valid_filenames = filenames[split_1:split_2]  # Filenames for validation set
    test_filenames = filenames[split_2:]  # Filenames for test set
        
    for file in os.listdir(directory_train):
        if file in valid_filenames:
            shutil.move(os.path.join(directory_train, file), os.path.join(directory_valid, file))  # Move to validation directory
        elif file in test_filenames:
            shutil.move(os.path.join(directory_train, file), os.path.join(directory_test, file))  # Move to test directory


Converting AnnualCrop TIFFs: 100%|██████████| 1500/1500 [01:09<00:00, 21.69it/s]
Converting Forest TIFFs: 100%|██████████| 1500/1500 [01:05<00:00, 23.05it/s]
Converting HerbaceousVegetation TIFFs: 100%|██████████| 1500/1500 [01:04<00:00, 23.25it/s]
Converting Highway TIFFs: 100%|██████████| 1250/1250 [00:52<00:00, 23.61it/s]
Converting Industrial TIFFs: 100%|██████████| 1250/1250 [00:54<00:00, 22.77it/s]
Converting Pasture TIFFs: 100%|██████████| 1000/1000 [00:42<00:00, 23.60it/s]
Converting PermanentCrop TIFFs: 100%|██████████| 1250/1250 [00:56<00:00, 22.04it/s]
Converting Residential TIFFs: 100%|██████████| 1500/1500 [01:11<00:00, 20.97it/s]
Converting River TIFFs: 100%|██████████| 1250/1250 [00:56<00:00, 21.98it/s]
Converting SeaLake TIFFs: 100%|██████████| 1500/1500 [01:04<00:00, 23.09it/s]
