In [13]:
import os
import random
import shutil
from collections import defaultdict
from tqdm import tqdm

def select_and_copy_images(source_dir, dest_dir, percentage=10):
    # Ensure destination directory exists
    os.makedirs(dest_dir, exist_ok=True)

    # Dictionary to store file paths by tile number
    tiles_dict = defaultdict(list)
    
    # Iterate through files in the source directory
    for filename in os.listdir(source_dir):
        if filename.endswith(".tif"):
            parts = filename.split('_')
            if len(parts) == 4:  # Ensure correct format
                tile_number = parts[3].split('.')[0]
                tiles_dict[tile_number].append(filename)
    
    # Determine the number of tiles to select
    total_tiles = len(tiles_dict)
    num_to_select = max(1, int(total_tiles * percentage / 100))
    
    # Randomly select tile numbers
    selected_tiles = random.sample(list(tiles_dict.keys()), num_to_select)
    
    # Copy the selected files to the destination directory with "copied" added to their names
    for tile_number in tqdm(selected_tiles):
        for file in tiles_dict[tile_number]:
            source_path = os.path.join(source_dir, file)
            dest_filename = f"pos_{os.path.splitext(file)[0]}.tif"
            dest_path = os.path.join(dest_dir, dest_filename)
            shutil.copy2(source_path, dest_path)
#             print(f"Copied {file} to {dest_filename} in {dest_dir}")



In [14]:
# Example usage:
source_directory = "/home/macula/SMATousi/Gullies/ground_truth/organized_data/All_Pos_Neg/all_pos/rgb_images"
destination_directory = "/home/macula/SMATousi/Gullies/ground_truth/organized_data/Eval_Pos_Neg/"
select_and_copy_images(source_directory, destination_directory)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 451/451 [00:20<00:00, 21.95it/s]


In [17]:

def find_duplicate_tile_numbers(directory):
    # Dictionary to store filenames by tile number
    tile_dict = defaultdict(list)
    
    # Iterate through files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".tif"):
            # Extract the tile number from the filename
            parts = filename.split('_')
            if len(parts) >= 4:  # Ensure correct format
                tile_number = parts[-1]  # Extract the tile number (second to last part)
                tile_dict[tile_number].append(filename)
    
    # Check for duplicate tile numbers and print the corresponding filenames
    for tile_number, files in tile_dict.items():
        if len(files) > 1:
            print(f"Tile number {tile_number} has duplicates:")
            for file in files:
                print(f"  - {file}")

In [18]:
find_duplicate_tile_numbers(destination_directory)

Tile number 1263.tif has duplicates:
  - neg_rgb_0_tile_1263.tif
  - neg_rgb_2_tile_1263.tif
  - neg_rgb_3_tile_1263.tif
  - neg_rgb_4_tile_1263.tif
  - neg_rgb_1_tile_1263.tif
  - neg_rgb_5_tile_1263.tif
Tile number 1516.tif has duplicates:
  - neg_rgb_4_tile_1516.tif
  - neg_rgb_3_tile_1516.tif
  - neg_rgb_1_tile_1516.tif
  - neg_rgb_5_tile_1516.tif
  - neg_rgb_2_tile_1516.tif
  - neg_rgb_0_tile_1516.tif
Tile number 1349.tif has duplicates:
  - neg_rgb_4_tile_1349.tif
  - neg_rgb_3_tile_1349.tif
  - neg_rgb_2_tile_1349.tif
  - neg_rgb_1_tile_1349.tif
  - neg_rgb_0_tile_1349.tif
  - neg_rgb_5_tile_1349.tif
Tile number 1482.tif has duplicates:
  - neg_rgb_4_tile_1482.tif
  - neg_rgb_0_tile_1482.tif
  - neg_rgb_2_tile_1482.tif
  - neg_rgb_1_tile_1482.tif
  - neg_rgb_5_tile_1482.tif
  - neg_rgb_3_tile_1482.tif
Tile number 3105.tif has duplicates:
  - pos_rgb_4_tile_3105.tif
  - pos_rgb_3_tile_3105.tif
  - pos_rgb_2_tile_3105.tif
  - pos_rgb_0_tile_3105.tif
  - pos_rgb_5_tile_3105.tif
  -