In [3]:
import os
import shutil
import random
from tqdm import tqdm
import multiprocessing # <-- Added this

# --- ⚙️ CONFIGURATION ---
# The folder with your 4 subfolders (kuusi, mänty, etc.)
SOURCE_PATCH_DIR = r"/mnt/c/Users/Pavelishko/Pictures/Хвоя/NewSet_Patches_for_Culling"

# The new directory we will create for training
CLEANING_DATASET_DIR = r"/mnt/c/Users/Pavelishko/Pictures/Хвоя/Dataset_for_Cleaning"

VALIDATION_SPLIT_RATIO = 0.2 # 20% for validation, 80% for training
# ------------------------

# --- Helper function for multiprocessing ---
def copy_file_worker(task_args):
    """
    Takes a tuple (src_path, dest_path) and copies the file.
    Includes error handling.
    """
    src_path, dest_path = task_args
    try:
        shutil.copy(src_path, dest_path)
        return True
    except Exception as e:
        print(f"Error copying {src_path}: {e}")
        return False
# -------------------------------------------


print("Creating new train/validation directory structure...")
os.makedirs(CLEANING_DATASET_DIR, exist_ok=True)
os.makedirs(os.path.join(CLEANING_DATASET_DIR, 'train'), exist_ok=True)
os.makedirs(os.path.join(CLEANING_DATASET_DIR, 'validation'), exist_ok=True)

class_names = [d for d in os.listdir(SOURCE_PATCH_DIR) if os.path.isdir(os.path.join(SOURCE_PATCH_DIR, d))]

# This will be our main "to-do list" for all copy tasks
tasks = []

# Loop through each class (kuusi, mänty...)
for class_name in class_names:
    print(f"\nScanning class: {class_name}")

    # Create the class subfolders in train/val
    train_dest_dir = os.path.join(CLEANING_DATASET_DIR, 'train', class_name)
    val_dest_dir = os.path.join(CLEANING_DATASET_DIR, 'validation', class_name)
    os.makedirs(train_dest_dir, exist_ok=True)
    os.makedirs(val_dest_dir, exist_ok=True)

    # Get all image files for this class
    src_class_dir = os.path.join(SOURCE_PATCH_DIR, class_name)
    all_files = [f for f in os.listdir(src_class_dir) if f.endswith('.png')]

    random.shuffle(all_files) # Shuffle them randomly

    # Split the list
    split_index = int(len(all_files) * VALIDATION_SPLIT_RATIO)
    val_files = all_files[:split_index]
    train_files = all_files[split_index:]

    print(f"  Total: {len(all_files)}, Train: {len(train_files)}, Val: {len(val_files)}")

    # --- Build the task list (instead of copying) ---
    for f in train_files:
        src = os.path.join(src_class_dir, f)
        dest = os.path.join(train_dest_dir, f)
        tasks.append((src, dest)) # Add the (src, dest) tuple to our list

    for f in val_files:
        src = os.path.join(src_class_dir, f)
        dest = os.path.join(val_dest_dir, f)
        tasks.append((src, dest))

print(f"\n--- Built a 'to-do list' of {len(tasks)} total files to copy. ---")


# --- Execute the copying in parallel ---
print("Starting parallel copy pool...")
success_count = 0

with multiprocessing.Pool() as pool:
    with tqdm(total=len(tasks), desc="Copying files") as pbar:
        # Use imap_unordered to get results as they finish
        for result in pool.imap_unordered(copy_file_worker, tasks):
            if result: # The worker returns True on success
                success_count += 1
            pbar.update(1) # Update the progress bar

print(f"\n\n✅ Splitting complete! Successfully copied {success_count} files.")

Creating new train/validation directory structure...

Scanning class: kuusi
  Total: 9856, Train: 7885, Val: 1971

Scanning class: marjakuusi
  Total: 10112, Train: 8090, Val: 2022

Scanning class: mänty
  Total: 9600, Train: 7680, Val: 1920

Scanning class: thuja
  Total: 10496, Train: 8397, Val: 2099

--- Built a 'to-do list' of 40064 total files to copy. ---
Starting parallel copy pool...


Copying files: 100%|██████████| 40064/40064 [01:27<00:00, 457.32it/s]



✅ Splitting complete! Successfully copied 40064 files.





In [4]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# --- ⚙️ CONFIGURATION ---
TRAIN_DIR = os.path.join(CLEANING_DATASET_DIR, 'train')
VAL_DIR = os.path.join(CLEANING_DATASET_DIR, 'validation')

IMG_SIZE = (256, 256)
BATCH_SIZE = 32 # You can lower this to 16 if you run out of GPU memory
NUM_CLASSES = len(class_names) # Should be 4
NUM_EPOCHS = 10 # 10 epochs is a good start
MODEL_SAVE_PATH = 'draft_model.keras'
# ------------------------

# 1. Set up Data Generators
# Rescale pixel values from [0, 255] to [0, 1] (MobileNetV2 requirement)
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    TRAIN_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical'
)

val_generator = val_datagen.flow_from_directory(
    VAL_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical'
)

# 2. Build the Model
# Load MobileNetV2 without its top classification layer
base_model = MobileNetV2(
    input_shape=IMG_SIZE + (3,),
    include_top=False,
    weights='imagenet'
)
base_model.trainable = False # Freeze the base model

# Add our custom classification head
model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dropout(0.3),
    layers.Dense(NUM_CLASSES, activation='softmax') # 4 outputs
])

# 3. Compile the Model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print(model.summary())

# 4. Train the Model
print("\nStarting model training...")
history = model.fit(
    train_generator,
    epochs=NUM_EPOCHS,
    validation_data=val_generator
)

# 5. Save the Model
model.save(MODEL_SAVE_PATH)
print(f"\n\n✅ Draft model saved to {MODEL_SAVE_PATH}")

Found 32052 images belonging to 4 classes.
Found 8012 images belonging to 4 classes.


  base_model = MobileNetV2(
I0000 00:00:1762805986.055793   10495 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5520 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


None

Starting model training...
Epoch 1/10


2025-11-10 22:19:51.443527: I external/local_xla/xla/service/service.cc:163] XLA service 0x7696f0002700 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-11-10 22:19:51.443576: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9
2025-11-10 22:19:51.545918: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-11-10 22:19:52.249607: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91500
2025-11-10 22:19:54.717012: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-11-10 22:19:54.830226: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel 

[1m 445/1002[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m1:42[0m 185ms/step - accuracy: 0.8000 - loss: 0.5176


2025-11-10 22:21:27.662898: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-11-10 22:21:27.783022: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-11-10 22:21:27.909582: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-11-10 22:21:28.029011: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-11-10 22:21:28.151325: E external/local_xla/xla/stream

[1m1002/1002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step - accuracy: 0.8534 - loss: 0.3868

2025-11-10 22:24:14.480300: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-11-10 22:24:14.609850: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-11-10 22:24:14.729412: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-11-10 22:24:14.847069: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-11-10 22:24:14.961855: E external/local_xla/xla/stream_

[1m1002/1002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 260ms/step - accuracy: 0.9067 - loss: 0.2527 - val_accuracy: 0.9458 - val_loss: 0.1545
Epoch 2/10
[1m1002/1002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 226ms/step - accuracy: 0.9374 - loss: 0.1715 - val_accuracy: 0.9507 - val_loss: 0.1383
Epoch 3/10
[1m1002/1002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 223ms/step - accuracy: 0.9419 - loss: 0.1569 - val_accuracy: 0.9491 - val_loss: 0.1417
Epoch 4/10
[1m1002/1002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 214ms/step - accuracy: 0.9434 - loss: 0.1525 - val_accuracy: 0.9552 - val_loss: 0.1228
Epoch 5/10
[1m1002/1002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 214ms/step - accuracy: 0.9439 - loss: 0.1503 - val_accuracy: 0.9563 - val_loss: 0.1205
Epoch 6/10
[1m1002/1002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 203ms/step - accuracy: 0.9456 - loss: 0.1480 - val_accuracy: 0.9528 - val_loss: 0.1295
Epo

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# --- We no longer need cleanlab ---
import os
import shutil
from tqdm import tqdm

# --- ⚙️ CONFIGURATION ---
# Use the same directory as Script 1
DATA_DIR_TO_SCAN = SOURCE_PATCH_DIR
MODEL_PATH = 'draft_model.keras'
IMG_SIZE = (256, 256)
BATCH_SIZE = 32

# Where to put the bad images:
REVIEW_DIR = os.path.join(CLEANING_DATASET_DIR, "to_review")
os.makedirs(REVIEW_DIR, exist_ok=True)

# How many of the *worst* images to flag.
PERCENT_TO_FLAG = 0.15
# ------------------------

# 1. Load the trained model
print(f"Loading draft model from {MODEL_PATH}...")
model = tf.keras.models.load_model(MODEL_PATH)

# 2. Create a generator for ALL images (in order)
all_datagen = ImageDataGenerator(rescale=1./255)
all_generator = all_datagen.flow_from_directory(
    DATA_DIR_TO_SCAN,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode=None, # No labels needed, just predicting
    shuffle=False
)

# 3. Get all predictions (pred_probs)
print("Generating predictions for all images... (this may take a while)")
pred_probs = model.predict(all_generator)

# 4. Find "confused" images using NumPy
print("Ranking images by model confidence...")

# --- THIS IS THE NEW LOGIC ---
# Get the model's highest probability for each image
self_confidence = np.max(pred_probs, axis=1)

# Get the indices, sorted from lowest confidence (worst) to highest (best)
sorted_indices = np.argsort(self_confidence)
# -----------------------------

# 5. Get the file paths in the same order
filepaths = all_generator.filenames

# 6. Find the images to flag
num_to_flag = int(len(filepaths) * PERCENT_TO_FLAG)
print(f"Total images: {len(filepaths)}. Flagging worst {num_to_flag}...")

# We just take the first 'num_to_flag' from the sorted list
worst_indices = sorted_indices[:num_to_flag]

# 7. Move the flagged files to the 'to_review' folder
print(f"Moving {num_to_flag} images to {REVIEW_DIR}...")
for i in tqdm(worst_indices, desc="Moving files"):
    # Get the original filename (e.g., 'kuusi/image_1.png')
    relative_filepath = filepaths[i]

    # Create the destination folder if it doesn't exist
    # (e.g., .../to_review/kuusi/)
    class_name = os.path.dirname(relative_filepath)
    os.makedirs(os.path.join(REVIEW_DIR, class_name), exist_ok=True)

    # Get full source and destination paths
    src_path = os.path.join(DATA_DIR_TO_SCAN, relative_filepath)
    dest_path = os.path.join(REVIEW_DIR, relative_filepath)

    try:
        shutil.move(src_path, dest_path)
    except Exception as e:
        print(f"\nError moving {src_path}: {e}")

print(f"\n\n✅ Culling complete! Please review {num_to_flag} images in:\n{REVIEW_DIR}")

Loading draft model from draft_model.keras...
Found 40064 images belonging to 4 classes.
Generating predictions for all images... (this may take a while)
[1m1252/1252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m401s[0m 318ms/step
Ranking images by model confidence...
Total images: 40064. Flagging worst 6009...
Moving 6009 images to /mnt/c/Users/Pavelishko/Pictures/Хвоя/Dataset_for_Cleaning/to_review...


Moving files: 100%|██████████| 6009/6009 [00:54<00:00, 110.75it/s]



✅ Culling complete! Please review 6009 images in:
/mnt/c/Users/Pavelishko/Pictures/Хвоя/Dataset_for_Cleaning/to_review



