In [9]:
import os
import shutil
import random
from osgeo import gdal
import random

In [14]:
# Set the path to the datasets directory
# path = "C:/Users/isaac/datasets/eurosat-dataset-tif"
path = "C:/Users/isaac/datasets/eurosat-dataset-jpg"

# random.seed() returns None, so the seed value is stored first and the
# global generator is seeded in a separate statement.
SEED = 123
random.seed(SEED)

In [16]:
# Initialize lists
categories = []  # names of the immediate sub-folders of `path`
tif_files = []   # full paths of every '.tif' file found below `path`

# Walk through the directory tree starting from the specified path
for dirpath, dirnames, filenames in os.walk(path):
    if dirpath == path:
        # Categories are the top-level folders only. Assigning to dirnames[:]
        # prunes the walk in place, so os.walk neither reports nor descends into
        # the generated 'land-use-jpeg' output tree (its train/test/valid and
        # per-category sub-folders would otherwise be collected as categories
        # when this cell is re-run after the folders were created).
        dirnames[:] = sorted(name for name in dirnames if name != 'land-use-jpeg')
        categories.extend(dirnames)
    for filename in filenames:
        if filename.endswith('.tif'):
            tif_files.append(os.path.join(dirpath, filename))

print(categories)

['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River', 'SeaLake']


In [17]:
# Reduce the number of images in each category by 50% for reducing load times and process times.
# NOTE: destructive -- the files are deleted from disk. `tif_files` is not refreshed
# here, so re-run the directory walk before running this cell a second time.
for category in categories:
    # Trailing separator: without it a category whose name is a prefix of another
    # category's name would also match that other category's files.
    category_prefix = os.path.join(path, category, '')
    # Sorted so the seeded random.sample below selects the same files regardless
    # of the order in which the filesystem listed them.
    category_files = sorted(file for file in tif_files if file.startswith(category_prefix))

    # If there are more than one files, reduce by 50%
    if len(category_files) > 1:
        num_files_to_keep = len(category_files) // 2
        files_to_keep = set(random.sample(category_files, num_files_to_keep))

        # Remove the files that were not sampled to be kept
        for file_to_remove in category_files:
            if file_to_remove in files_to_keep:
                continue
            os.remove(file_to_remove)
            print(f"Removed: {file_to_remove}")

print("Reduction of images in each category by 50% completed.")

Reduction of images in each category by 50% completed.


In [18]:
# Names of the dataset splits; each one becomes a folder under 'land-use-jpeg'
split_names = ['train', 'test', 'valid']

# Build <path>/land-use-jpeg/<split>/<category> for every split and category,
# reporting only the folders that did not exist yet.
for sp_name in split_names:
    directory = os.path.join(os.path.join(path, 'land-use-jpeg'), sp_name)
    split_missing = not os.path.exists(directory)
    if split_missing:
        os.makedirs(directory)
        print(f"Created folder: {directory}")

    # One sub-folder per category inside the split folder
    for category in categories:
        dir_cat = os.path.join(directory, category)
        if os.path.exists(dir_cat):
            continue  # already there, nothing to create or report
        os.makedirs(dir_cat)
        print(f"Created category folder: {dir_cat}")

print("All folders created successfully.")

Created folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train\AnnualCrop
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train\Forest
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train\HerbaceousVegetation
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train\Highway
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train\Industrial
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train\Pasture
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train\PermanentCrop
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train\Residential
Created category folder: C:/Users/isaac/datasets/eurosat-dataset-jpg\land-use-jpeg\train\River
Created category fold

In [19]:
import os
import shutil
import random
import subprocess
from tqdm import tqdm  # Import tqdm for progress bar
from osgeo import gdal

# Function to convert TIFF to JPEG using GDAL
def convert_tiff_to_jpeg(input_file, output_file, bands=(1, 2, 3)):
    """Convert a raster file to JPEG by invoking the ``gdal_translate`` CLI.

    Args:
        input_file: Path of the source raster.
        output_file: Path of the JPEG file to write.
        bands: Band numbers passed to ``gdal_translate``, one ``-b`` option per
            band, in the given order. Any iterable works; the default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        True if ``gdal_translate`` exited with status 0. False if it exited with
        a non-zero status or could not be started at all (for example when the
        executable is not on PATH); the error is printed in both cases.
    """
    # Construct the gdal_translate command
    command = ['gdal_translate', '-of', 'JPEG']
    for band in bands:
        command.extend(['-b', str(band)])
    # '-scale' is passed without explicit ranges, leaving the choice of input
    # and output value ranges to gdal_translate's defaults.
    command.extend(['-scale', input_file, output_file])

    try:
        # Run the command; check=True turns a non-zero exit into CalledProcessError
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable gdal_translate binary, which
        # would otherwise propagate and abort the caller's whole batch.
        print(f"Error converting {input_file} to {output_file}: {e}")
        return False
    return True

# Iterate through each category: convert its images to JPEG, then split them
# 80% / 10% / 10% into the train / valid / test folders.
for category in categories:
    # The output tree lives inside `path`; never treat it as an input category
    if category == 'land-use-jpeg':
        continue

    # Define directories for the input images and the output JPEG files.
    # Each variable points at the folder of the same name (valid -> 'valid',
    # test -> 'test').
    directory = os.path.join(path, category)  # Input image directory
    directory_train = os.path.join(path, 'land-use-jpeg', 'train', category)  # Output JPEG directory for training set
    directory_valid = os.path.join(path, 'land-use-jpeg', 'valid', category)  # Output JPEG directory for validation set
    directory_test = os.path.join(path, 'land-use-jpeg', 'test', category)  # Output JPEG directory for test set
    split_dirs = (directory_train, directory_valid, directory_test)

    # List all files in the input directory
    try:
        cat_files = sorted(os.listdir(directory))
    except FileNotFoundError:
        continue  # Skip the category if the directory does not exist

    # Create output directories if they do not exist
    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

    # Keep regular files only and drop unwanted entries (e.g., .DS_Store)
    cat_files = [
        file for file in cat_files
        if file != '.DS_Store' and os.path.isfile(os.path.join(directory, file))
    ]

    # Convert the images to JPEG format, writing them to the training directory
    for file in tqdm(cat_files, desc=f'Converting {category} TIFFs'):
        # splitext strips only the final extension, so names containing extra
        # dots are not truncated at the first dot
        jpeg_name = os.path.splitext(file)[0] + '.jpeg'

        # Skip images that already exist in ANY split. Checking only the training
        # folder would re-convert images previously moved to valid/test and put
        # the same sample in more than one split on a re-run.
        if any(os.path.exists(os.path.join(split_dir, jpeg_name)) for split_dir in split_dirs):
            continue

        img_in = os.path.join(directory, file)  # Input file path
        img_out = os.path.join(directory_train, jpeg_name)  # Output JPEG file path
        if not convert_tiff_to_jpeg(img_in, img_out, bands=[1, 2, 3]):
            print(f"Conversion failed for {img_in}")

    # Remove any XML files that were generated during the conversion process
    for item in os.listdir(directory_train):
        if item.endswith(".xml"):
            os.remove(os.path.join(directory_train, item))

    # A populated valid/test folder means this category was split before;
    # splitting the remaining training files again would shrink the training set.
    if os.listdir(directory_valid) or os.listdir(directory_test):
        print(f"Skipping split for {category}: valid/test folders already contain files.")
        continue

    # Sort first so the shuffle depends only on the random state, not on the
    # order in which the filesystem listed the files
    filenames = sorted(name for name in os.listdir(directory_train) if name != '.DS_Store')
    random.shuffle(filenames)  # Shuffle filenames randomly
    split_1 = int(0.8 * len(filenames))  # End of the training slice
    split_2 = int(0.9 * len(filenames))  # End of the validation slice
    train_filenames = filenames[:split_1]  # Filenames that stay in the training set
    valid_filenames = filenames[split_1:split_2]  # Filenames for validation set
    test_filenames = filenames[split_2:]  # Filenames for test set

    # Move the validation and test files out of the training directory
    for file in valid_filenames:
        shutil.move(os.path.join(directory_train, file), os.path.join(directory_valid, file))
    for file in test_filenames:
        shutil.move(os.path.join(directory_train, file), os.path.join(directory_test, file))


Converting AnnualCrop TIFFs: 100%|██████████| 3000/3000 [02:29<00:00, 20.03it/s]
Converting Forest TIFFs: 100%|██████████| 3000/3000 [02:51<00:00, 17.47it/s]
Converting HerbaceousVegetation TIFFs: 100%|██████████| 3000/3000 [03:02<00:00, 16.43it/s]
Converting Highway TIFFs: 100%|██████████| 2500/2500 [02:06<00:00, 19.78it/s]
Converting Industrial TIFFs: 100%|██████████| 2500/2500 [01:51<00:00, 22.48it/s]
Converting Pasture TIFFs: 100%|██████████| 2000/2000 [01:29<00:00, 22.28it/s]
Converting PermanentCrop TIFFs: 100%|██████████| 2500/2500 [01:43<00:00, 24.13it/s]
Converting Residential TIFFs: 100%|██████████| 3000/3000 [02:03<00:00, 24.20it/s]
Converting River TIFFs: 100%|██████████| 2500/2500 [01:56<00:00, 21.53it/s]
Converting SeaLake TIFFs: 100%|██████████| 3000/3000 [02:12<00:00, 22.62it/s]
