
# CZII: Convert czii10441 to NumPy arrays and annotations

[Forked from here](https://www.kaggle.com/code/sersasj/czii-making-datasets-for-yolo-synthetic-data)

My goal is to denoise the volumes and convert the format and file structure into something simpler to work with: each volume is saved as a .npy array, and the annotations are saved as one JSON file per particle type.

Denoised volumes are saved as: Volumes/TS_<n>.npy
Annotations are saved as: Annotations/TS_<n>/<particle>.json (e.g. apo-ferritin.json, beta-amylase.json, etc.)

# Install and Import Modules

In [1]:
!pip install zarr opencv-python

Collecting zarr
  Downloading zarr-2.18.3-py3-none-any.whl.metadata (5.7 kB)
Collecting asciitree (from zarr)
  Downloading asciitree-0.3.3.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l- done
Collecting numcodecs>=0.10.0 (from zarr)
  Downloading numcodecs-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.9 kB)
Downloading zarr-2.18.3-py3-none-any.whl (210 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numcodecs-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: asciitree
  Building wheel for asciitree (setup.py) ... [?25l- \ done
[?25h  Created wheel for asciitree: filename=asciitree-0.3.3-py3-none-any.whl size=5034 sha256=aac3fa8e3970d999c08d963e

In [2]:
import json
import numpy as np
import matplotlib.pyplot as plt
import zarr
import glob, os
import cv2
import shutil
import pandas as pd
from tqdm.notebook import tqdm
from scipy.ndimage import gaussian_filter, median_filter
from joblib import Parallel, delayed
from tqdm import tqdm
from pathlib import Path

In [3]:
# Discover every run directory whose name starts with "T" in the synthetic dataset.
# The listing is sorted lexicographically, so e.g. TS_10 comes before TS_2.
synthetic_runs = sorted(glob.glob('/kaggle/input/czii10441/10441/T*'))
print(synthetic_runs)

# A run's name is the last component of its directory path.
runs = list(map(os.path.basename, synthetic_runs))

# Two-way lookup between a run's position in the sorted listing and its name.
i2r_dict = dict(enumerate(runs))
r2t_dict = {name: idx for idx, name in i2r_dict.items()}
print("Runs:", i2r_dict)

['/kaggle/input/czii10441/10441/TS_0', '/kaggle/input/czii10441/10441/TS_1', '/kaggle/input/czii10441/10441/TS_10', '/kaggle/input/czii10441/10441/TS_11', '/kaggle/input/czii10441/10441/TS_12', '/kaggle/input/czii10441/10441/TS_13', '/kaggle/input/czii10441/10441/TS_14', '/kaggle/input/czii10441/10441/TS_15', '/kaggle/input/czii10441/10441/TS_16', '/kaggle/input/czii10441/10441/TS_17', '/kaggle/input/czii10441/10441/TS_18', '/kaggle/input/czii10441/10441/TS_19', '/kaggle/input/czii10441/10441/TS_2', '/kaggle/input/czii10441/10441/TS_20', '/kaggle/input/czii10441/10441/TS_21', '/kaggle/input/czii10441/10441/TS_22', '/kaggle/input/czii10441/10441/TS_23', '/kaggle/input/czii10441/10441/TS_24', '/kaggle/input/czii10441/10441/TS_25', '/kaggle/input/czii10441/10441/TS_26', '/kaggle/input/czii10441/10441/TS_3', '/kaggle/input/czii10441/10441/TS_4', '/kaggle/input/czii10441/10441/TS_5', '/kaggle/input/czii10441/10441/TS_6', '/kaggle/input/czii10441/10441/TS_7', '/kaggle/input/czii10441/10441/T

# Normalize Function
Clip the volume to its 0.5th–99.5th percentile range, then rescale it to 8-bit values between 0 and 255.

In [4]:
def convert_to_8bit(x):
    """Rescale an array to ``uint8`` after clipping away extreme values.

    Values are clipped to the 0.5th-99.5th percentile range of ``x`` and the
    clipped range is then mapped linearly onto 0-255.

    Parameters:
        x: Numeric array accepted by ``np.percentile`` and ``np.clip``.

    Returns:
        np.ndarray: Rounded result cast to dtype ``uint8``.
    """
    low_cut, high_cut = np.percentile(x, (0.5, 99.5))
    clipped = np.clip(x, low_cut, high_cut)

    floor = clipped.min()
    # The small epsilon keeps the division defined when the input is constant.
    span = clipped.max() - floor + 1e-12
    scaled = (clipped - floor) / span * 255
    return np.round(scaled).astype("uint8")

# Information about Labels

In [5]:
# Particle classes in label order; a name's position in this list is its class index.
particle_names = [
    'apo-ferritin',
    'beta-amylase',
    'beta-galactosidase',
    'ribosome',
    'thyroglobulin',
    'virus-like-particle',
]

# Particle name -> class index, and the inverse lookup.
p2i_dict = {name: idx for idx, name in enumerate(particle_names)}
i2p = dict(enumerate(particle_names))

# Radius per particle type.
# NOTE(review): units are not stated in this file - presumably Angstroms; confirm.
particle_radius = {
    'apo-ferritin': 60,
    'beta-amylase': 65,
    'beta-galactosidase': 90,
    'ribosome': 150,
    'thyroglobulin': 130,
    'virus-like-particle': 135,
}

In [6]:
def denoise_tomogram(tomogram, method='gaussian', **kwargs):
    """
    Smooth a tomogram with one of the supported SciPy filters.

    Parameters:
        tomogram (np.ndarray): The input tomogram to denoise.
        method (str): Filter to apply, either 'gaussian' or 'median'.
        kwargs: Filter settings. 'sigma' (default 1) is used by the gaussian
            filter and 'size' (default 3) by the median filter; any other
            keys are ignored.

    Returns:
        np.ndarray: The denoised tomogram.

    Raises:
        ValueError: If ``method`` is neither 'gaussian' nor 'median'.
    """
    if method not in ('gaussian', 'median'):
        raise ValueError(f"Unsupported denoising method: {method}")

    if method == 'median':
        window = kwargs.get('size', 3)
        return median_filter(tomogram, size=window)

    smoothing = kwargs.get('sigma', 1)
    return gaussian_filter(tomogram, sigma=smoothing)

In [7]:
# Maps each particle name used in this notebook to the substring that is
# searched for in the paths of the dataset's annotation files.
name_map = {
    "apo-ferritin": "ferritin_complex",
    "beta-amylase": "beta_amylase",
    "beta-galactosidase": "beta_galactosidase",
    "ribosome": "cytosolic_ribosome",
    "thyroglobulin": "thyroglobulin",
    "virus-like-particle": "pp7_vlp",
}

In [8]:
def ndjson_to_json(ndjson_path):
    """Read a newline-delimited JSON file into a ``{"points": [...]}`` dict.

    Each non-blank line is parsed as one JSON value; blank lines are skipped.

    Parameters:
        ndjson_path: Path of the NDJSON file to read (opened as UTF-8).

    Returns:
        dict: ``{"points": records}`` where ``records`` holds the parsed
        lines in file order.

    Raises:
        FileNotFoundError: If ``ndjson_path`` is not an existing regular file.
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the 1-based line number within the file, and the parser's
            original error is chained as ``__cause__``.
    """
    if not os.path.isfile(ndjson_path):
        raise FileNotFoundError(f"The file {ndjson_path} does not exist.")

    records = []
    with open(ndjson_path, 'r', encoding='utf-8') as ndjson_file:
        for line_number, line in enumerate(ndjson_file, start=1):
            stripped_line = line.strip()
            if not stripped_line:
                continue
            try:
                records.append(json.loads(stripped_line))
            except json.JSONDecodeError as e:
                # Re-raise with the file line number, keeping the original
                # error attached so the full parse context is not lost.
                raise json.JSONDecodeError(
                    f"Error decoding JSON on line {line_number}: {e.msg}",
                    e.doc,
                    e.pos,
                ) from e

    # Wrap the records to match the JSON structure of the real samples.
    return {"points": records}

def make_denoised_vol(run_name):
    """Export one synthetic run as a denoised 8-bit volume plus per-particle JSON.

    The first ``*.zarr`` tomogram found under the run directory is opened, its
    element ``0`` is loaded, truncated to the first 184 entries along the
    leading axis, smoothed with a Gaussian filter (sigma=1), converted to
    ``uint8`` and saved as ``Volumes/<run_name>.npy``. For every particle in
    ``particle_names``, the first NDJSON annotation file whose path contains
    the mapped name from ``name_map`` is rewritten as
    ``Annotations/<run_name>/<particle>.json``.

    Both outputs are written relative to the current working directory, and
    the output folders are created on demand, so the function does not rely
    on the folders having been prepared beforehand.

    Parameters:
        run_name (str): Name of the run directory inside the dataset,
            e.g. ``TS_0``.

    Returns:
        None. Missing inputs are reported with ``print`` and skipped.
    """
    run_dir = f'/kaggle/input/czii10441/10441/{run_name}'

    # Path to the volume
    vol_candidates = glob.glob(f'{run_dir}/**/Tomograms/**/*.zarr', recursive=True)
    if not vol_candidates:
        print(f"No volume found for run {run_name} in synthetic data.")
        return
    vol_path = vol_candidates[0]

    print(f"Volume path: {vol_path}")
    if not os.path.exists(vol_path):
        print(f"Volume file not found: {vol_path}")
        return

    # Read the volume
    store = zarr.open(vol_path, mode='r')
    # NOTE(review): element 0 is presumably the full-resolution level of a
    # multiscale store, and 184 presumably matches the depth of the real
    # tomograms - confirm both against the dataset.
    raw = np.array(store[0])[:184]
    denoised = denoise_tomogram(raw, method='gaussian', sigma=1)  # Apply denoise
    vol_8bit = convert_to_8bit(denoised)

    # Write next to the annotations (relative to the working directory) and
    # make sure the target folder exists even when this runs in a fresh worker.
    os.makedirs('Volumes', exist_ok=True)
    np.save(Path(f'Volumes/{run_name}.npy'), vol_8bit)

    # Process each particle type
    os.makedirs(f'Annotations/{run_name}', exist_ok=True)

    # The annotation listing does not depend on the particle, so scan once.
    ndjson_files = glob.glob(f'{run_dir}/**/Annotations/**/*.ndjson', recursive=True)

    for particle in particle_names:

        particle_name_in_file = name_map.get(particle)
        if not particle_name_in_file:
            print(f"Particle name mapping not found for: {particle}")
            continue

        if not ndjson_files:
            print(f"No NDJSON files found for particle: {particle} in run: {run_name}")
            continue

        filtered_ndjson_files = [f for f in ndjson_files if particle_name_in_file in f]
        if not filtered_ndjson_files:
            print(f"No NDJSON files match the particle: {particle} for run: {run_name}")
            continue

        # NOTE: the records are written exactly as parsed; no coordinate
        # scaling or filtering is applied here.
        json_each_particle = ndjson_to_json(filtered_ndjson_files[0])

        json_file_path = Path(f'Annotations/{run_name}/{particle}.json')
        with json_file_path.open("w") as json_file:
            json.dump(json_each_particle, json_file, indent=4)

# Prepare Folders

In [9]:
# Create the output folders in the working directory (no-op if they already exist).
for _out_dir in ("Volumes", "Annotations"):
    os.makedirs(_out_dir, exist_ok=True)

# Create Dataset

In [10]:
def parallel_denoising(runs, n_jobs=-1):
    """Call ``make_denoised_vol`` once per run through a joblib ``Parallel`` pool.

    Parameters:
        runs: Iterable of run names, each passed to ``make_denoised_vol``.
        n_jobs (int): Forwarded unchanged to ``joblib.Parallel``.

    Returns:
        None. The per-run results are discarded.
    """
    pool = Parallel(n_jobs=n_jobs)
    job = delayed(make_denoised_vol)
    # Keep this a lazy generator: the tqdm bar advances as joblib pulls tasks.
    tasks = (job(run_name) for run_name in tqdm(runs, desc="Processing Runs"))
    pool(tasks)

# Convert every discovered run, using the default n_jobs=-1.
parallel_denoising(runs)

Processing Runs: 100%|██████████| 27/27 [01:05<00:00,  2.41s/it]
