In [1]:
import zarr
import numpy as np
import cv2
import matplotlib.pyplot as plt
from skimage.exposure import rescale_intensity 
from ipywidgets import HBox, Image
from IPython.display import display
import io
import os
import json

def convert_to_8bit(vol):
    pmin = np.percentile(vol, 0.0)
    pmax = np.percentile(vol, 99.9)
    return rescale_intensity(vol, in_range=(pmin, pmax), out_range=np.uint8)

paths_r = ['TS_5_4', 'TS_6_4', 'TS_6_6', 'TS_69_2', 'TS_73_6', 'TS_86_3' , 'TS_99_9']

In [2]:


# for r in paths_r:
#     set_type = 'train'
#     zarr_path = f'data/{set_type}/static/ExperimentRuns/{r}/VoxelSpacing10.000/denoised.zarr'

#     try:
#         vol = zarr.open(zarr_path, mode='r')
#         vol = vol[0]
#         vol2 = convert_to_8bit(vol)
#         n_imgs = vol2.shape[0]

#         print(f"This is {r} and there are {n_imgs} images")

#         image_widgets = []
#         for i in range(n_imgs):
#             # Create tmp_img directly from the slice with correct dimensions
#             tmp_img = vol2[i]
#             inp_arr = np.stack([tmp_img] * 3, axis=-1)

#             # Remove resizing
#             inp_arr = cv2.resize(inp_arr, (640, 640))

#             # Convert to PNG bytes for display with ipywidgets.Image
#             img_bytes = io.BytesIO()
#             plt.imsave(img_bytes, inp_arr, format='png')
#             img_widget = Image(value=img_bytes.getvalue())
#             image_widgets.append(img_widget)

#         # Display the images using HBox for horizontal scrolling
#         hbox = HBox(image_widgets)
#         print(f"Displaying slices for {r}:")
#         display(hbox)

#     except Exception as e:
#         print(f"An error occurred while processing {r}: {e}")

# print("Finished displaying images.")

In [3]:



output_dir = 'yolo_data/images'

# Create the main output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
if not os.path.exists('yolo_data/images/TS_5_4/0.png'):
    for r in paths_r:
        set_type = 'train'
        zarr_path = f'data/{set_type}/static/ExperimentRuns/{r}/VoxelSpacing10.000/denoised.zarr'
        ts_output_dir = os.path.join(output_dir, r)
        os.makedirs(ts_output_dir, exist_ok=True)  # Create subfolder for each TS

        try:
            vol = zarr.open(zarr_path, mode='r')
            vol = vol[0]
            vol2 = convert_to_8bit(vol)
            n_imgs = vol2.shape[0]

            print(f"Processing {r} and there are {n_imgs} images")

            for i in range(n_imgs):
                tmp_img = vol2[i]
                inp_arr = np.stack([tmp_img] * 3, axis=-1)
                inp_arr = cv2.resize(inp_arr, (640, 640))

                # Save the image to the corresponding subfolder
                image_filename = os.path.join(ts_output_dir, f"{i}.png") # Added formatting for index
                cv2.imwrite(image_filename, inp_arr)

            print(f"Saved {n_imgs} images for {r} in {ts_output_dir}")

        except Exception as e:
            print(f"An error occurred while processing {r}: {e}")

    print("Finished saving images.")
else:
    print('Images are already extracted')

Images are already extracted


In [4]:


def find_min_max_xyz(path):
    """
    Finds the minimum and maximum 'x', 'y', and 'z' values in all JSON files within the given path.

    Args:
        path: The path to the directory containing subfolders with JSON files.

    Returns:
        A tuple containing the minimum and maximum 'x', 'y', and 'z' values found,
        or None if no 'x', 'y', or 'z' values are found.
    """
    min_x = float('inf')
    max_x = float('-inf')
    min_y = float('inf')
    max_y = float('-inf')
    min_z = float('inf')
    max_z = float('-inf')
    found_x = False
    found_y = False
    found_z = False

    for root, _, files in os.walk(path):
        for file in files:
            if file.endswith(".json"):
                filepath = os.path.join(root, file)
                try:
                    with open(filepath, 'r') as f:
                        data = json.load(f)
                        if 'points' in data and isinstance(data['points'], list):
                            for point in data['points']:
                                if 'location' in point and isinstance(point['location'], dict):
                                    location = point['location']
                                    if 'x' in location:
                                        x_value = location['x']
                                        min_x = min(min_x, x_value)
                                        max_x = max(max_x, x_value)
                                        found_x = True
                                    if 'y' in location:
                                        y_value = location['y']
                                        min_y = min(min_y, y_value)
                                        max_y = max(max_y, y_value)
                                        found_y = True
                                    if 'z' in location:
                                        z_value = location['z']
                                        min_z = min(min_z, z_value)
                                        max_z = max(max_z, z_value)
                                        found_z = True
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file: {filepath}")
                except Exception as e:
                    print(f"Error processing file {filepath}: {e}")

    if found_x or found_y or found_z:
        return (min_x if found_x else None,
                max_x if found_x else None,
                min_y if found_y else None,
                max_y if found_y else None,
                min_z if found_z else None,
                max_z if found_z else None)
    else:
        return None

if __name__ == "__main__":
    path_to_data = r"data\train\overlay\ExperimentRuns"
    
    
    result = find_min_max_xyz(path_to_data)

    if result:
        min_x, max_x, min_y, max_y, min_z, max_z = result
        if min_x is not None:
            print(f"Minimum 'x' value found: {min_x}")
        if max_x is not None:
            print(f"Maximum 'x' value found: {max_x}")
            print(50*'-')
        if min_y is not None:
            print(f"Minimum 'y' value found: {min_y}")
        if max_y is not None:
            print(f"Maximum 'y' value found: {max_y}")
            print(50*'-')
        if min_z is not None:
            print(f"Minimum 'z' value found: {min_z}")
        if max_z is not None:
            print(f"Maximum 'z' value found: {max_z}")
    else:
        print(f"No 'x', 'y', or 'z' values found in the JSON files within the path: {path_to_data}")

Minimum 'x' value found: 69.928
Maximum 'x' value found: 6229.592
--------------------------------------------------
Minimum 'y' value found: 86.606
Maximum 'y' value found: 6266.944
--------------------------------------------------
Minimum 'z' value found: 39.085
Maximum 'z' value found: 1549.302


In [5]:
i2p = {
  0 : 'virus-like-particle',
  1 : 'apo-ferritin',
  2 : 'beta-galactosidase',
  3 : 'ribosome',
  4 : 'thyroglobulin'
}

p2i = {
  'virus-like-particle' : 0 ,
  'apo-ferritin': 1 ,
  'beta-galactosidase' : 2 ,
  'ribosome' : 3 ,
  'thyroglobulin' : 4 
}

particle_radius = {
        'virus-like-particle': 140,
        'apo-ferritin': 60,
        'beta-galactosidase': 90,
        'ribosome': 150,
        'thyroglobulin': 130,   
    }

# the radius devided by the voxel spacing, and the result devided by the X shape of the input image (630x630), and then x2 to get the diameter represented in that image
width_height_particles = { 
        'virus-like-particle': ((particle_radius['virus-like-particle']/10)/630) * 2, 
        'apo-ferritin': ((particle_radius['apo-ferritin']/10)/630) * 2,
        'beta-galactosidase': ((particle_radius['beta-galactosidase']/10)/630) * 2,
        'ribosome': ((particle_radius['ribosome']/10)/630) * 2,
        'thyroglobulin': ((particle_radius['thyroglobulin']/10)/630) * 2,   
    
}

In [6]:
base_dir = r'data\train\overlay\ExperimentRuns'

def round_to_nearest_ten(z):
    """Rounds an integer to the nearest multiple of 10."""
    remainder = z % 10
    if remainder < 5:
        return z - remainder
    else:
        return z + (10 - remainder)

for subdir, dirs, files in os.walk(base_dir):
    for filename in files:
        if filename.endswith('.json'):
            filepath = os.path.join(subdir, filename)
            try:
                with open(filepath, 'r') as f:
                    data = json.load(f)

                # Check if 'points' exists and is a list
                if 'points' in data and isinstance(data['points'], list):
                    for point in data['points']:
                        if 'location' in point and 'z' in point['location']:
                            z_value = point['location']['z']
                            rounded_z = round_to_nearest_ten(int(z_value))
                            rounded_z = rounded_z / 10
                            particle_name = filename.replace('.json', '')

                            if particle_name in particle_radius:
                                radius = particle_radius[particle_name]
                                radius_offset = radius / 10

                                lower_range = rounded_z - radius_offset
                                upper_range = rounded_z + radius_offset

                                print(f"File: {filename}")
                                print(f"  Rounded Z: {rounded_z}")
                                print(f"  Particle Radius ({particle_name}): {radius}")
                                print(f"  Range: [{lower_range}, {upper_range}]")
                            else:
                                print(f"Warning: Particle radius not found for {particle_name} in {filename}")
                            # Break after processing the first valid 'location' in 'points'
                            break
                        else:
                            print(f"Warning: 'location' or 'z' key not found in a 'point' in {filename}")
                else:
                    print(f"Warning: 'points' key not found or is not a list in {filename}")

            except json.JSONDecodeError:
                print(f"Error: Could not decode JSON in {filename}")
            except Exception as e:
                print(f"An error occurred while processing {filename}: {e}")

File: apo-ferritin.json
  Rounded Z: 60.0
  Particle Radius (apo-ferritin): 60
  Range: [54.0, 66.0]
File: beta-galactosidase.json
  Rounded Z: 37.0
  Particle Radius (beta-galactosidase): 90
  Range: [28.0, 46.0]
File: ribosome.json
  Rounded Z: 60.0
  Particle Radius (ribosome): 150
  Range: [45.0, 75.0]
File: thyroglobulin.json
  Rounded Z: 28.0
  Particle Radius (thyroglobulin): 130
  Range: [15.0, 41.0]
File: virus-like-particle.json
  Rounded Z: 64.0
  Particle Radius (virus-like-particle): 140
  Range: [50.0, 78.0]
File: apo-ferritin.json
  Rounded Z: 109.0
  Particle Radius (apo-ferritin): 60
  Range: [103.0, 115.0]
File: beta-galactosidase.json
  Rounded Z: 91.0
  Particle Radius (beta-galactosidase): 90
  Range: [82.0, 100.0]
File: ribosome.json
  Rounded Z: 81.0
  Particle Radius (ribosome): 150
  Range: [66.0, 96.0]
File: thyroglobulin.json
  Rounded Z: 55.0
  Particle Radius (thyroglobulin): 130
  Range: [42.0, 68.0]
File: virus-like-particle.json
  Rounded Z: 80.0
  Parti

In [7]:


output_base_dir = r'yolo_data\labels'  # Use raw string to avoid issues with backslashes

if not os.path.exists(r'yolo_data\labels\TS_5_4'):
    for folder_name in paths_r:
        # Construct the path for the subfolder
        subfolder_path = os.path.join(output_base_dir, folder_name)

        # Create the subfolder if it doesn't exist
        os.makedirs(subfolder_path, exist_ok=True)  # exist_ok=True avoids errors if the folder already exists

        # Create the .txt files within the subfolder
        for i in range(184):  # range(184) generates numbers from 0 to 183
            file_name = f"{i}.txt"
            file_path = os.path.join(subfolder_path, file_name)

            # Create an empty .txt file
            with open(file_path, 'w') as f:
                pass  # 'pass' does nothing, effectively creating an empty file

    print("Subfolders and files created successfully.")
else:
    print('Empty .txt files has already been created')

Empty .txt files has already been created


In [8]:


base_dir = r'data\train\overlay\ExperimentRuns'
output_base_dir = r'yolo_data\labels'



def round_to_nearest_ten(z):
    """Rounds an integer to the nearest multiple of 10."""
    remainder = z % 10
    if remainder < 5:
        return z - remainder
    else:
        return z + (10 - remainder)

rerun_file_path = r"yolo_data\labels\TS_5_4\54.txt"
file_path =  r"yolo_data\labels\TS_5_4"

if os.path.exists(file_path) and os.path.getsize(rerun_file_path) == 0:
    for subdir, dirs, files in os.walk(base_dir):
        for filename in files:
            if filename == 'beta-amylase.json':
                continue  # Skip beta-amylase.json files

            if filename.endswith('.json'):
                filepath = os.path.join(subdir, filename)
                try:
                    with open(filepath, 'r') as f:
                        data = json.load(f)

                    # Extract TS folder name
                    parts = subdir.split(os.sep)
                    if 'ExperimentRuns' in parts and 'Picks' in parts:
                        ts_folder_index = parts.index('ExperimentRuns') + 1
                        if ts_folder_index < len(parts):
                            ts_folder_name = parts[ts_folder_index]
                            output_dir = os.path.join(output_base_dir, ts_folder_name)
                            os.makedirs(output_dir, exist_ok=True)
                        else:
                            print(f"Warning: Could not determine TS folder name for {filename}")
                            continue
                    else:
                        print(f"Warning: Unexpected subdirectory structure for {filename}")
                        continue

                    particle_name = filename.replace('.json', '')

                    if particle_name not in p2i:
                        print(f"Warning: Particle name '{particle_name}' not found in p2i dictionary for {filename}")
                        continue

                    if particle_name not in particle_radius:
                        print(f"Warning: Particle radius not found for {particle_name} in {filename}")
                        continue

                    radius = particle_radius[particle_name]
                    radius_offset = radius / 10

                    # Check if 'points' exists and is a list
                    if 'points' in data and isinstance(data['points'], list):
                        for point in data['points']:
                            if 'location' in point and 'x' in point['location'] and 'y' in point['location'] and 'z' in point['location']:
                                x_value = point['location']['x']
                                y_value = point['location']['y']
                                z_value = point['location']['z']

                                rounded_z = round_to_nearest_ten(int(z_value))
                                rounded_z_div_10 = rounded_z // 10

                                lower_range = max(0 , round_to_nearest_ten(int(rounded_z - radius)) // 10)
                                upper_range = min(183, round_to_nearest_ten(int(rounded_z + radius)) // 10)

                                class_id = p2i[particle_name]
                                x_center = x_value / 10 / 630 # devided by the original shape of the 2D sliced image to get the ratio coordonates on both X and Y
                                y_center = y_value / 10 / 630
                                width = width_height_particles[particle_name]
                                height = width_height_particles[particle_name]

                                label_line = f"{class_id} {x_center} {y_center} {width} {height}\n"

                                for i in range(lower_range, upper_range + 1):
                                    output_filepath = os.path.join(output_dir, f"{i}.txt")
                                    with open(output_filepath, 'a') as outfile:
                                        outfile.write(label_line)

                            else:
                                print(f"Warning: Missing 'location', 'x', 'y', or 'z' key in a 'point' in {filename}")
                    else:
                        print(f"Warning: 'points' key not found or is not a list in {filename}")

                except json.JSONDecodeError:
                    print(f"Error: Could not decode JSON in {filename}")
                except Exception as e:
                    print(f"An error occurred while processing {filename}: {e}")
else:
    print('Locations are already extracted')

Locations are already extracted


In [9]:
import os
import shutil

# Define the base directory
base_dir = 'yolo_data'
images_dir = os.path.join(base_dir, 'images')
labels_dir = os.path.join(base_dir, 'labels')
training_set_dir = os.path.join(base_dir, 'training_set')
train_images_dir = os.path.join(training_set_dir, 'images', 'train')
train_labels_dir = os.path.join(training_set_dir, 'labels', 'train')
val_images_dir = os.path.join(training_set_dir, 'images', 'val')
val_labels_dir = os.path.join(training_set_dir, 'labels', 'val')

# Create the destination directories if they don't exist, and clear them if they do
for dest_dir in [train_images_dir, train_labels_dir, val_images_dir, val_labels_dir]:
    os.makedirs(dest_dir, exist_ok=True)
    # Clear the directory
    for filename in os.listdir(dest_dir):
        file_path = os.path.join(dest_dir, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")

paths_r = ['TS_5_4', 'TS_6_4', 'TS_6_6', 'TS_69_2', 'TS_73_6', 'TS_86_3' , 'TS_99_9']

# Split the TS folders into train and validation
train_ts_folders = paths_r[:5]
val_ts_folders = paths_r[5:]

# Process training data
print("Processing training data...")
for i, ts_folder in enumerate(train_ts_folders):
    print(f"Processing TS folder: {ts_folder}")
    current_offset = i * 184

    # Process images for the current TS folder
    source_image_dir = os.path.join(images_dir, ts_folder)
    if os.path.isdir(source_image_dir):
        for filename in sorted(os.listdir(source_image_dir), key=lambda x: int(os.path.splitext(x)[0])):
            source_path = os.path.join(source_image_dir, filename)
            base_name = os.path.splitext(filename)[0]
            extension = os.path.splitext(filename)[1]
            try:
                new_base_name = int(base_name) + current_offset
                new_filename = f"{new_base_name}{extension}"
                destination_path = os.path.join(train_images_dir, new_filename)
                shutil.copy2(source_path, destination_path)
                # print(f"  Copied image: {filename} -> {new_filename}")
            except ValueError:
                print(f"  Skipping image with non-numeric name: {filename}")

    # Process labels for the current TS folder
    source_label_dir = os.path.join(labels_dir, ts_folder)
    if os.path.isdir(source_label_dir):
        for filename in sorted(os.listdir(source_label_dir), key=lambda x: int(os.path.splitext(x)[0])):
            source_path = os.path.join(source_label_dir, filename)
            base_name = os.path.splitext(filename)[0]
            extension = os.path.splitext(filename)[1]
            try:
                new_base_name = int(base_name) + current_offset
                new_filename = f"{new_base_name}{extension}"
                destination_path = os.path.join(train_labels_dir, new_filename)
                shutil.copy2(source_path, destination_path)
                # print(f"  Copied label: {filename} -> {new_filename}")
            except ValueError:
                print(f"  Skipping label with non-numeric name: {filename}")

# Process validation data
print("Processing validation data...")
for i, ts_folder in enumerate(val_ts_folders):
    print(f"Processing TS folder: {ts_folder}")
    current_offset = i * 184 + (len(train_ts_folders) * 184) # Offset starts after the last training file

    # Process images for the current TS folder
    source_image_dir = os.path.join(images_dir, ts_folder)
    if os.path.isdir(source_image_dir):
        for filename in sorted(os.listdir(source_image_dir), key=lambda x: int(os.path.splitext(x)[0])):
            source_path = os.path.join(source_image_dir, filename)
            base_name = os.path.splitext(filename)[0]
            extension = os.path.splitext(filename)[1]
            try:
                new_base_name = int(base_name) + current_offset
                new_filename = f"{new_base_name}{extension}"
                destination_path = os.path.join(val_images_dir, new_filename)
                shutil.copy2(source_path, destination_path)
                # print(f"  Copied image: {filename} -> {new_filename}")
            except ValueError:
                print(f"  Skipping image with non-numeric name: {filename}")

    # Process labels for the current TS folder
    source_label_dir = os.path.join(labels_dir, ts_folder)
    if os.path.isdir(source_label_dir):
        for filename in sorted(os.listdir(source_label_dir), key=lambda x: int(os.path.splitext(x)[0])):
            source_path = os.path.join(source_label_dir, filename)
            base_name = os.path.splitext(filename)[0]
            extension = os.path.splitext(filename)[1]
            try:
                new_base_name = int(base_name) + current_offset
                new_filename = f"{new_base_name}{extension}"
                destination_path = os.path.join(val_labels_dir, new_filename)
                shutil.copy2(source_path, destination_path)
                # print(f"  Copied label: {filename} -> {new_filename}")
            except ValueError:
                print(f"  Skipping label with non-numeric name: {filename}")

print("Data processing complete.")
print("Train folders:", train_ts_folders)
print("Validation folders:", val_ts_folders)

Processing training data...
Processing TS folder: TS_5_4
Processing TS folder: TS_6_4
Processing TS folder: TS_6_6
Processing TS folder: TS_69_2
Processing TS folder: TS_73_6
Processing validation data...
Processing TS folder: TS_86_3
Processing TS folder: TS_99_9
Data processing complete.
Train folders: ['TS_5_4', 'TS_6_4', 'TS_6_6', 'TS_69_2', 'TS_73_6']
Validation folders: ['TS_86_3', 'TS_99_9']


In [10]:
for i in range(5):
  print(width_height_particles[i2p[i]])

0.044444444444444446
0.01904761904761905
0.02857142857142857
0.047619047619047616
0.04126984126984127


In [11]:
i2p

{0: 'virus-like-particle',
 1: 'apo-ferritin',
 2: 'beta-galactosidase',
 3: 'ribosome',
 4: 'thyroglobulin'}

In [12]:
from ultralytics import YOLO
model = YOLO(r"models\Kaggle_1st_try.pt")
model.info()

YOLO11x summary: 631 layers, 56,879,551 parameters, 0 gradients, 195.5 GFLOPs


(631, 56879551, 0, 195.47489280000002)

In [137]:

results = model(r'yolo_data\images\TS_5_4\92.png')
results[0].show()

In [13]:
# Get the training parameters (most common for tweaking)
print("--- Training Parameters (via model.train()) ---")
help(model.train)

# Get the prediction parameters (for inference tweaking)
print("\n--- Prediction Parameters (via model.predict()) ---")
help(model.predict)

# Get the validation parameters
print("\n--- Validation Parameters (via model.val()) ---")
help(model.val)

--- Training Parameters (via model.train()) ---
Help on method train in module ultralytics.engine.model:

train(trainer=None, **kwargs: Any) method of ultralytics.models.yolo.model.YOLO instance
    Trains the model using the specified dataset and training configuration.

    This method facilitates model training with a range of customizable settings. It supports training with a
    custom trainer or the default training approach. The method handles scenarios such as resuming training
    from a checkpoint, integrating with Ultralytics HUB, and updating model and configuration after training.

    When using Ultralytics HUB, if the session has a loaded model, the method prioritizes HUB training
    arguments and warns if local arguments are provided. It checks for pip updates and combines default
    configurations, method-specific defaults, and user-provided arguments to configure the training process.

    Args:
        trainer (BaseTrainer | None): Custom trainer instance for model

In [14]:
%%writefile czii_conf.yaml

path: E:/ML Projects/Kaggle/CryoET  # Update this to the directory containing 'training_set'
train: yolo_data/training_set/images/train
val: yolo_data/training_set/images/val

# Classes
names:
  0: virus-like-particle
  1: apo-ferritin
  2: beta-galactosidase
  3: ribosome
  4: thyroglobulin

Overwriting czii_conf.yaml


In [15]:
import os
import yaml

# Define the root directory of your data
root_dir = 'yolo_data'
images_dir = os.path.join(root_dir, 'images')
labels_dir = os.path.join(root_dir, 'labels')

# Define the validation folders
val_folders = ['TS_86_3', 'TS_99_9']

# Define the class names
class_names = {
    0: 'virus-like-particle',
    1: 'apo-ferritin',
    2: 'beta-galactosidase',
    3: 'ribosome',
    4: 'thyroglobulin'
}

# Create lists for training and validation image and label paths
train_images = []
train_labels = []
val_images = []
val_labels = []

# Iterate through the subfolders in the images directory
for folder_name in os.listdir(images_dir):
    image_folder_path = os.path.join(images_dir, folder_name)
    label_folder_path = os.path.join(labels_dir, folder_name)

    # Check if it's a directory
    if os.path.isdir(image_folder_path) and os.path.isdir(label_folder_path):
        # Get all image files in the current image folder
        image_files = sorted([os.path.join(image_folder_path, f) for f in os.listdir(image_folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
        # Get all label files in the current label folder
        label_files = sorted([os.path.join(label_folder_path, f) for f in os.listdir(label_folder_path) if f.lower().endswith('.txt')])

        # Ensure the number of image and label files matches (optional but good practice)
        if len(image_files) != len(label_files):
            print(f"Warning: Number of image and label files does not match in folder {folder_name}")
            continue

        # Assign to train or validation based on the folder name
        if folder_name in val_folders:
            val_images.extend(image_files)
            val_labels.extend(label_files)
        else:
            train_images.extend(image_files)
            train_labels.extend(label_files)

# Create a YAML configuration file for YOLO
data_yaml = {
    'train': train_images,
    'val': val_images,
    'nc': len(class_names),
    'names': list(class_names.values())
}

# Save the YAML file
yaml_file_path = os.path.join(root_dir, 'data.yaml')
with open(yaml_file_path, 'w') as outfile:
    yaml.dump(data_yaml, outfile, default_flow_style=False)

print(f"YOLO data configuration file created at: {yaml_file_path}")
print("\nNow you can use this 'data.yaml' file to train your YOLO model.")
print("For example, if you are using YOLOv5, you can train with a command like:")
print("```bash")
print(f"python train.py --data {yaml_file_path} --cfg yolov5s.yaml --weights '' --epochs 100") # Replace yolov5s.yaml and epochs as needed
print("```")

YOLO data configuration file created at: yolo_data\data.yaml

Now you can use this 'data.yaml' file to train your YOLO model.
For example, if you are using YOLOv5, you can train with a command like:
```bash
python train.py --data yolo_data\data.yaml --cfg yolov5s.yaml --weights '' --epochs 100
```


In [128]:
import zarr
import matplotlib.pyplot as plt

# Define the path to your Zarr file
zarr_path = r"data\train\static\ExperimentRuns\TS_5_4\VoxelSpacing10.000\denoised.zarr"

# Open the Zarr array
try:
    tomogram = zarr.open(zarr_path, mode='r')

    # The tomogram is a 3D array, and you want the 5th slice.
    # Remember that Python uses 0-based indexing, so the 5th image is at index 4.
    # Assuming the slices are along the first dimension (axis 0), access the 5th slice.
    fifth_slice = tomogram[4]

    # Display the 5th image using matplotlib
    plt.imshow(fifth_slice, cmap='gray')  # Use 'gray' colormap for grayscale images
    plt.title("5th Slice of the Tomogram")
    plt.colorbar()  # Add a colorbar to visualize intensity values
    plt.show()

except Exception as e:
    print(f"Error reading or displaying the Zarr file: {e}")

Error reading or displaying the Zarr file: 4


# Submissions using a trained YOLO MODEL

In [17]:
import os
import zarr
import numpy as np
import cv2
import pandas as pd
from ultralytics import YOLO
from PIL import Image as PILImage
import time  # Import time module

def convert_to_8bit(vol):
    pmin = np.percentile(vol, 0.0)
    pmax = np.percentile(vol, 99.9)
    return rescale_intensity(vol, in_range=(pmin, pmax), out_range=np.uint8)

# Load your trained YOLO model
model = YOLO(r'models\Kaggle_1st_try.pt')  # Replace 'path/to/your/best.pt' with the actual path to your trained model

set_type = 'test'
base_dir = f'data/{set_type}/static/ExperimentRuns/'
results_df = pd.DataFrame(columns=['id', 'experiment', 'particle', 'x', 'y', 'z', 'Confidence_Score']) # Added 'Confidence_Score' column
row_id = 0
reverse_ratio = 63/64

# 1. Get the list of folder names dynamically
try:
    paths_r = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
    print(f"Found experiment runs: {paths_r}")
except FileNotFoundError:
    print(f"Error: Directory not found: {base_dir}")
    paths_r = []

start_time = time.time() # Track the total time
# 2. Loop through the dynamically retrieved paths
for r in paths_r:
    zarr_path = os.path.join(base_dir, r, 'VoxelSpacing10.000', 'denoised.zarr')
    
    try:
        
        vol = zarr.open(zarr_path, mode='r')
        vol = vol[0]
        vol2 = convert_to_8bit(vol)
        n_imgs = vol2.shape[0]
        print(f"Processing {r} with {n_imgs} images...")

        for i in range(n_imgs):
            
            # Create tmp_img directly from the slice with correct dimensions
            tmp_img = vol2[i]
            inp_arr = np.stack([tmp_img] * 3, axis=-1)
            inp_arr = cv2.resize(inp_arr, (640, 640))
            if r == 'TS_5_4' and i == 5:
                # Resize before passing it to yolo, keep same processing as in the display
                
                cv2.imwrite('Try.png', inp_arr)



            results = model(inp_arr, verbose=False) # Run YOLO inference, suppress output

            for result in results:
                boxes = result.boxes.xyxy.float()
                confidences = result.boxes.conf
                class_ids = result.boxes.cls.float()
                class_names = result.names

                for j in range(len(boxes)):
                    xmin, ymin, xmax, ymax = boxes[j]
                    x_mean = np.mean([xmin, xmax]) * reverse_ratio * 10
                    y_mean = np.mean([ymin, ymax]) * reverse_ratio * 10
                    class_name = class_names[int(class_ids[j])]
                    confidence = confidences[j]  # Retrieve confidence score


                    global row_id
                    results_df.loc[len(results_df)] = {
                        'id': row_id,
                        'experiment': r,
                        'particle': class_name,
                        'x': x_mean,
                        'y': y_mean,
                        'z': i * 10,
                        'Confidence_Score': confidence.item() # Add confidence score
                    }
                    row_id += 1
            
        print(f"Finished processing {r}")
    except Exception as e:
        print(f"An error occurred while processing {r}: {e}")


end_time = time.time()
total_time = end_time - start_time
print("Finished processing all volumes.")

print("YOLO Results DataFrame:")
print(results_df.head())
print(f"Shape of the dataframe is : {results_df.shape}")
print(f"Total processing time: {total_time:.2f} seconds")

Found experiment runs: ['TS_5_4', 'TS_69_2', 'TS_6_4']
Processing TS_5_4 with 184 images...
Finished processing TS_5_4
Processing TS_69_2 with 184 images...
Finished processing TS_69_2
Processing TS_6_4 with 184 images...
Finished processing TS_6_4
Finished processing all volumes.
YOLO Results DataFrame:
   id experiment      particle            x            y   z  Confidence_Score
0   0     TS_5_4      ribosome  6075.860352  2228.152832  20          0.314776
1   1     TS_5_4  apo-ferritin  5870.116699  5125.254395  40          0.441963
2   2     TS_5_4  apo-ferritin  5872.069824  5127.577148  50          0.555828
3   3     TS_5_4      ribosome  6086.560059  2229.816650  50          0.310630
4   4     TS_5_4  apo-ferritin  5700.407227  5001.157715  50          0.267982
Shape of the dataframe is : (9923, 7)
Total processing time: 327.73 seconds


In [209]:
# Define the particles with the lower threshold
low_threshold_particles = ['thyroglobulin', 'beta-galactosidase']

# Create a boolean mask for rows to keep
mask = (
    ((results_df['particle'].isin(low_threshold_particles)) & (results_df['Confidence_Score'] >= 0.2)) |
    ((~results_df['particle'].isin(low_threshold_particles)) & (results_df['Confidence_Score'] >= 0.5))
)

# Filter the dataframe to keep only the rows where the mask is True
filtered_df = results_df[mask]
filtered_df

Unnamed: 0,id,experiment,particle,x,y,z,Confidence_Score
2,2,TS_5_4,apo-ferritin,5872.069824,5127.577148,50,0.555828
5,5,TS_5_4,apo-ferritin,5869.651367,5130.336426,60,0.573386
10,10,TS_5_4,thyroglobulin,5085.006348,590.016846,60,0.252252
11,11,TS_5_4,apo-ferritin,5867.575684,5131.310547,70,0.604671
16,16,TS_5_4,apo-ferritin,5866.863281,5131.467773,80,0.654792
...,...,...,...,...,...,...,...
9884,9884,TS_6_4,ribosome,2745.919434,6053.562500,1420,0.571500
9885,9885,TS_6_4,ribosome,3280.358887,6082.302246,1420,0.549957
9886,9886,TS_6_4,ribosome,2440.175049,5151.458008,1420,0.504395
9893,9893,TS_6_4,ribosome,2746.192383,6055.903320,1430,0.530952


In [218]:
import pandas as pd

def find_3d_particles_v3(df):
    """
    Identifies potential 3D particles by grouping 2D detections based on proximity,
    creating new ranges if a detection doesn't fall into existing ones.
    This version processes data grouped by 'experiment'.

    Args:
        df (pd.DataFrame): DataFrame containing 'id', 'experiment', 'particle', 'x', 'y', 'z', 'Confidence_Score'.

    Returns:
        list: A list of dictionaries, where each dictionary represents an experiment.
              The first element is the experiment value, and the second is a list of
              dictionaries representing potential 3D particles within that experiment.
    """
    all_experiments_data = []
    for experiment_value in df['experiment'].unique():
        experiment_df = df[df['experiment'] == experiment_value].copy()
        three_d_particles = []
        processed_indices = set()

        for index, row in experiment_df.iterrows():
            if row['id'] not in processed_indices:
                added_to_existing = False
                for particle in three_d_particles:
                    if particle['x_range'][0] <= row['x'] <= particle['x_range'][1] and \
                       particle['y_range'][0] <= row['y'] <= particle['y_range'][1]:
                        particle['detections'].append(row.to_dict())
                        # Update the range to encompass the new detection
                        particle['x_range'] = (min(particle['x_range'][0], row['x']), max(particle['x_range'][1], row['x']))
                        particle['y_range'] = (min(particle['y_range'][0], row['y']), max(particle['y_range'][1], row['y']))
                        processed_indices.add(row['id'])
                        added_to_existing = True
                        break

                if not added_to_existing:
                    # Create a new 3D particle and its initial range
                    selected_range = 70
                    x_center = row['x']
                    y_center = row['y']
                    x_range = (max(0, x_center - selected_range), min(x_center + selected_range, 6300))
                    y_range = (max(0, y_center - selected_range), min(y_center + selected_range, 6300))

                    new_3d_particle = {'detections': [row.to_dict()], 'x_range': x_range, 'y_range': y_range}
                    three_d_particles.append(new_3d_particle)
                    processed_indices.add(row['id'])

        all_experiments_data.append([experiment_value, three_d_particles])

    return all_experiments_data

three_d_particles_found_v3 = find_3d_particles_v3(filtered_df.copy())

# Print the results with the dynamic ranges, now organized by experiment
for experiment_data in three_d_particles_found_v3:
    experiment_value = experiment_data[0]
    particles = experiment_data[1]
    print(f"Experiment: {experiment_value}")
    for i, particle in enumerate(particles):
        print(f"  Potential 3D Particle {i+1}:")
        print(f"    Dynamic X Range: {particle['x_range']}")
        print(f"    Dynamic Y Range: {particle['y_range']}")
        for detection in particle['detections']:
            print(f"    - id: {detection['id']}, particle: {detection['particle']}, x: {detection['x']}, y: {detection['y']}, z: {detection['z']}, Confidence: {detection['Confidence_Score']}")
        print("  " + "-" * 28)
    print("-" * 40)

Experiment: TS_5_4
  Potential 3D Particle 1:
    Dynamic X Range: (5802.06982421875, 5942.06982421875)
    Dynamic Y Range: (5057.5771484375, 5197.5771484375)
    - id: 2, particle: apo-ferritin, x: 5872.06982421875, y: 5127.5771484375, z: 50, Confidence: 0.5558279752731323
    - id: 5, particle: apo-ferritin, x: 5869.6513671875, y: 5130.33642578125, z: 60, Confidence: 0.5733864307403564
    - id: 11, particle: apo-ferritin, x: 5867.57568359375, y: 5131.310546875, z: 70, Confidence: 0.604671061038971
    - id: 16, particle: apo-ferritin, x: 5866.86328125, y: 5131.4677734375, z: 80, Confidence: 0.6547917723655701
    - id: 23, particle: apo-ferritin, x: 5865.494140625, y: 5131.0498046875, z: 90, Confidence: 0.6871874332427979
    - id: 28, particle: apo-ferritin, x: 5866.45068359375, y: 5129.2783203125, z: 100, Confidence: 0.7152366638183594
    - id: 36, particle: apo-ferritin, x: 5869.13037109375, y: 5125.623046875, z: 110, Confidence: 0.551835298538208
  ----------------------------

In [None]:
"""TO REMEMBER :
three_d_particles_found_v3[i] :  i being the index for the all stored results per experiment name ex TS_5_4
three_d_particles_found_v3[i][j] : j = 0 : indicates name of the experiment itself, j = 1 indicates all the identified ranges of potential particles, (ranges of X & Y through the Z axis)
three_d_particles_found_v3[i][j][k] : k being the index corresponding for each set of particles within a range
"""

In [265]:
three_d_particles_found_v3_filtered= []
for experiment_data in three_d_particles_found_v3:
    experiment_value = experiment_data[0]
    particles = experiment_data[1]
    filtered_particles = [particle for particle in particles if len(particle['detections']) >= 10]
    if filtered_particles:  # Only add the experiment if it has filtered particles
        three_d_particles_found_v3_filtered.append([experiment_value, filtered_particles])

In [266]:
three_d_particles_found_v3[1][1][0]

{'detections': [{'id': 2510,
   'experiment': 'TS_69_2',
   'particle': 'thyroglobulin',
   'x': 789.898681640625,
   'y': 2643.764892578125,
   'z': 490,
   'Confidence_Score': 0.25976359844207764},
  {'id': 2512,
   'experiment': 'TS_69_2',
   'particle': 'thyroglobulin',
   'x': 787.455810546875,
   'y': 2655.350341796875,
   'z': 500,
   'Confidence_Score': 0.3763004243373871},
  {'id': 2518,
   'experiment': 'TS_69_2',
   'particle': 'thyroglobulin',
   'x': 788.66796875,
   'y': 2646.2685546875,
   'z': 510,
   'Confidence_Score': 0.42095595598220825},
  {'id': 2527,
   'experiment': 'TS_69_2',
   'particle': 'thyroglobulin',
   'x': 787.6170654296875,
   'y': 2639.7939453125,
   'z': 520,
   'Confidence_Score': 0.44528743624687195},
  {'id': 2538,
   'experiment': 'TS_69_2',
   'particle': 'thyroglobulin',
   'x': 790.2908325195312,
   'y': 2636.570068359375,
   'z': 530,
   'Confidence_Score': 0.44900721311569214},
  {'id': 2554,
   'experiment': 'TS_69_2',
   'particle': 'thyr

In [261]:
three_d_particles_found_v3[1][1][13]

{'detections': [{'id': 2562,
   'experiment': 'TS_69_2',
   'particle': 'ribosome',
   'x': 3194.569091796875,
   'y': 3039.49267578125,
   'z': 550,
   'Confidence_Score': 0.5619056224822998},
  {'id': 2573,
   'experiment': 'TS_69_2',
   'particle': 'ribosome',
   'x': 3186.05908203125,
   'y': 3047.7958984375,
   'z': 560,
   'Confidence_Score': 0.6671814918518066},
  {'id': 2595,
   'experiment': 'TS_69_2',
   'particle': 'ribosome',
   'x': 3192.69091796875,
   'y': 3045.79345703125,
   'z': 570,
   'Confidence_Score': 0.7062949538230896},
  {'id': 2619,
   'experiment': 'TS_69_2',
   'particle': 'ribosome',
   'x': 3195.08740234375,
   'y': 3043.994140625,
   'z': 580,
   'Confidence_Score': 0.7335548996925354},
  {'id': 2646,
   'experiment': 'TS_69_2',
   'particle': 'ribosome',
   'x': 3194.97265625,
   'y': 3048.161865234375,
   'z': 590,
   'Confidence_Score': 0.7574697136878967},
  {'id': 2675,
   'experiment': 'TS_69_2',
   'particle': 'ribosome',
   'x': 3197.142578125,
 

In [239]:
print(f"RAW : \nTomogram {three_d_particles_found_v3[0][0]} has {len(three_d_particles_found_v3[0][1])} detected particles, Tomogram {three_d_particles_found_v3[1][0]} has {len(three_d_particles_found_v3[1][1])} detected particles, Tomogram {three_d_particles_found_v3[2][0]} has {len(three_d_particles_found_v3[2][1])} detected particles,  and the total is {len(three_d_particles_found_v3[0][1]) + len(three_d_particles_found_v3[1][1]) + len(three_d_particles_found_v3[2][1])}")

Tomogram TS_5_4 has 157 detected particles, Tomogram TS_69_2 has 177 detected particles, Tomogram TS_6_4 has 236 detected particles,  and the total is 570


In [267]:
print(f"FILETERED BY 10 PARTICLES + : \nTomogram {three_d_particles_found_v3_filtered[0][0]} has {len(three_d_particles_found_v3_filtered[0][1])} detected particles, Tomogram {three_d_particles_found_v3_filtered[1][0]} has {len(three_d_particles_found_v3_filtered[1][1])} detected particles, Tomogram {three_d_particles_found_v3_filtered[2][0]} has {len(three_d_particles_found_v3_filtered[2][1])} detected particles,  and the total is {len(three_d_particles_found_v3_filtered[0][1]) + len(three_d_particles_found_v3_filtered[1][1]) + len(three_d_particles_found_v3_filtered[2][1])}")

Tomogram TS_5_4 has 63 detected particles, Tomogram TS_69_2 has 75 detected particles, Tomogram TS_6_4 has 144 detected particles,  and the total is 282
