gg-napari-env

In [1]:
import numpy as np
import pandas as pd
import os 

In [2]:
raw_data_dirs = os.listdir('../raw-data/')
raw_data_p14 = [d for d in raw_data_dirs if '14' in d]
raw_data_p14

['20250307 B1 P14 U34-B3-546 Chymotrypsin-B2-647 DAPI',
 '20250328 1 P14 T79-intergenic-b2-647 T79-exonic-b1-546 DAPI',
 '20250328 5 P14 LOC603-b3-488 9E108-b1-546 9E116-b2-647 DAPI',
 '20250325 5 p14 g1-b1-546 lnc7-b2-647 dapi',
 '20250328 2 P14 R2-b3-488 Q1-b1-546 Lnc6-b2-647 DAPI',
 '20250328 4 P14 9E129-b3-488 LOC104-b1-546 9E116-b2-647 dapi',
 '20250325 4 p14 u34-b3-488 lnc4-b1-546 u21-b5-647 dapi',
 '20250328 3 P14 Lnc3-b3-488 L16-b2-594 Lnc2-b5-647 DAPI']

In [3]:
input = 'Q1'
input = [d for d in raw_data_dirs if input in d][0]
print(f'Using {input} as input directory')
input_dir = f'../raw-data/{input}/'
assert os.path.exists(input_dir), 'Input directory does not exist'
czi_files = [f for f in os.listdir(input_dir) if f.endswith('.czi')]
print(f"Found {len(czi_files)} czi files in {input_dir}")
print(czi_files)

Using 20250328 2 P14 R2-b3-488 Q1-b1-546 Lnc6-b2-647 DAPI as input directory
Found 6 czi files in ../raw-data/20250328 2 P14 R2-b3-488 Q1-b1-546 Lnc6-b2-647 DAPI/
['20250328 2 Q1lnc sample 6.czi', '20250328 2 Q1lnc sample 1.czi', '20250328 2 Q1lnc sample 4.czi', '20250328 2 Q1lnc sample 3.czi', '20250328 2 Q1lnc sample 2.czi', '20250328 2 Q1lnc sample 5.czi']


In [4]:
def parse_marker_file_z_major(xml_file_path, num_channels):
    """
    Parses an ImageJ Cell Counter XML marker file where Z indices are flattened
    assuming Z-major ordering (Z slices vary fastest, channels slowest), and
    the flattened indices (MarkerZ) are 1-based.

    MarkerZ was calculated in ImageJ as:
        MarkerZ = (z - 1) * num_channels + (channel - 1) + 1
                = 1-based linear index assuming Z-major ordering.

    This function reverses that logic to recover the true Z slice and channel
    as 0-based indices:

        z = (MarkerZ - 1) // num_channels
        channel = (MarkerZ - 1) % num_channels

    Parameters:
    -----------
    xml_file_path : str
        Path to the XML file exported from the ImageJ Cell Counter plugin.

    num_channels : int
        The number of image channels (e.g., 4 if using DAPI, 488, 546, 647).

    Returns:
    --------
    pandas.DataFrame
        A DataFrame with the following columns:
            - 'name'    : The marker type name from the XML (e.g., "488C")
            - 'x'       : Marker X coordinate
            - 'y'       : Marker Y coordinate
            - 'z'       : Recovered Z slice index (0-based)
            - 'channel' : Recovered channel index (0-based)
            - 'image'   : Inferred image filename from XML name
            - 'cytoplasmic' : Derived label from marker name (e.g., "AF488")

    Notes:
    ------
    This has been validated with a test placing markers on:
        - z=1, c=1 → MarkerZ = 1
        - z=1, c=2 → MarkerZ = 2
        - z=1, c=3 → MarkerZ = 3
        - z=2, c=1 → MarkerZ = 4
        - z=2, c=2 → MarkerZ = 5
        - z=2, c=3 → MarkerZ = 6

    The function correctly recovers:
        - z = 0-based z index (e.g., MarkerZ = 4 → z=1)
        - channel = 0-based channel index (e.g., MarkerZ = 4 → channel=0)
    """
    import xml.etree.ElementTree as ET
    import pandas as pd

    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    data = []
    for marker_type in root.findall(".//Marker_Type"):
        name = marker_type.findtext("Name")
        for marker in marker_type.findall("Marker"):
            x = int(marker.findtext("MarkerX"))
            y = int(marker.findtext("MarkerY"))
            z_raw = int(marker.findtext("MarkerZ"))

            # Adjust for 1-based MarkerZ
            z_index = z_raw - 1
            z = z_index // num_channels        # 0-based Z slice
            channel = z_index % num_channels   # 0-based channel index

            data.append({
                'name': name,
                'x': x,
                'y': y,
                'z': z,
                'channel': channel
            })

    df = pd.DataFrame(data)
    df['image'] = xml_file_path.split('CellCounter_')[1].replace('.xml', '.czi')
    df['cytoplasmic'] = df['name'].apply(lambda x: "AF" + x.split("C")[0])
    return df


In [5]:
results_dir = f'../results/{input}'
results_files = os.listdir(results_dir)
results_files = [f for f in results_files if f.endswith('with_borders.csv')]

# Load all results 
results = pd.DataFrame() 
for f in results_files:
    results_path = os.path.join(results_dir, f)
    results_individual = pd.read_csv(results_path)
    results_individual.columns = results_individual.columns.str.split('-T').str[0]
    results = pd.concat([results, results_individual], axis=0) 
results = results.reset_index(drop=True)

num_channels = sum(results.columns.str.contains("-nucleus-sum"))+1 # number of total channels including DAPI 

cell_counts_dir = f'../cytoplasmic-markers/{input}'
cell_counts_files = os.listdir(cell_counts_dir)
cellcounts = [] 
for f in cell_counts_files:
    xml_path = os.path.join(cell_counts_dir, f)
    df = parse_marker_file_z_major(xml_path, num_channels=num_channels)
    cellcounts.append(df)
cellcounts = pd.concat(cellcounts, axis=0)

results['cytoplasmic'] = None

# Iterate through each image
for image in results['image'].unique(): 
    # Load ROIs 
    all_rois_path = f'../results/{input}/{image.replace(".czi", "_rois.npy")}'
    all_rois = np.load(all_rois_path)
    
    # Get subset of cellcounts for this image
    cellcounts_image = cellcounts[cellcounts['image'] == image]

    # Iterate through each cell 
    for i, cell in cellcounts_image.iterrows():
        rois_z = all_rois[cell['z'], :, :]

        # Get the region label at the cell's coordinates
        region_z_id = rois_z[cell['y'], cell['x']]  # Note: numpy uses row (y), col (x)

        if region_z_id > 0: 
            # Get the results row for this cell 
            results_row = results.loc[(results['image'] == cell['image']) & 
                                    (results['z'] == cell['z']) & 
                                    (results['z_id'] == region_z_id)]
            if len(results_row) == 1: 
                results.loc[results_row.index[0], 'cytoplasmic'] = cell['cytoplasmic']
            if len(results_row) > 1: 
                print(f'Found {len(results_row)} results rows for {cell["image"]} z={cell["z"]}')

# Save the updated results
results_path = os.path.join(results_dir, 'all_results_nuclei_cytoplasm_markers.csv')
results.to_csv(results_path, index=False)

## Run on all images

In [36]:
raw_data_dirs = os.listdir('../raw-data/')
raw_data_p14 = [d for d in raw_data_dirs if '14' in d]
raw_data_p14 = [d for d in raw_data_p14 if 'T79' not in d]
raw_data_p14

['20250307 B1 P14 U34-B3-546 Chymotrypsin-B2-647 DAPI',
 '20250328 5 P14 LOC603-b3-488 9E108-b1-546 9E116-b2-647 DAPI',
 '20250325 5 p14 g1-b1-546 lnc7-b2-647 dapi',
 '20250328 2 P14 R2-b3-488 Q1-b1-546 Lnc6-b2-647 DAPI',
 '20250328 4 P14 9E129-b3-488 LOC104-b1-546 9E116-b2-647 dapi',
 '20250325 4 p14 u34-b3-488 lnc4-b1-546 u21-b5-647 dapi',
 '20250328 3 P14 Lnc3-b3-488 L16-b2-594 Lnc2-b5-647 DAPI']

In [37]:
for input in raw_data_p14: 

    results_dir = f'../results/{input}'
    results_files = os.listdir(results_dir)
    results_files = [f for f in results_files if f.endswith('with_borders.csv')]

    # Load all results 
    results = pd.DataFrame() 
    for f in results_files:
        results_path = os.path.join(results_dir, f)
        results_individual = pd.read_csv(results_path)
        results_individual.columns = results_individual.columns.str.split('-T').str[0]
        results = pd.concat([results, results_individual], axis=0) 
    results = results.reset_index(drop=True)

    num_channels = sum(results.columns.str.contains("-nucleus-sum"))+1 # number of total channels including DAPI 

    cell_counts_dir = f'../cytoplasmic-markers/{input}'
    cell_counts_files = os.listdir(cell_counts_dir)
    cellcounts = [] 
    for f in cell_counts_files:
        xml_path = os.path.join(cell_counts_dir, f)
        df = parse_marker_file_z_major(xml_path, num_channels=num_channels)
        cellcounts.append(df)
    cellcounts = pd.concat(cellcounts, axis=0)

    results['cytoplasmic'] = None

    # Iterate through each image
    for image in results['image'].unique(): 
        # Load ROIs 
        all_rois_path = f'../results/{input}/{image.replace(".czi", "_rois.npy")}'
        all_rois = np.load(all_rois_path)
        
        # Get subset of cellcounts for this image
        cellcounts_image = cellcounts[cellcounts['image'] == image]

        # Iterate through each cell 
        for i, cell in cellcounts_image.iterrows():
            rois_z = all_rois[cell['z'], :, :]

            # Get the region label at the cell's coordinates
            region_z_id = rois_z[cell['y'], cell['x']]  # Note: numpy uses row (y), col (x)

            if region_z_id > 0: 
                # Get the results row for this cell 
                results_row = results.loc[(results['image'] == cell['image']) & 
                                        (results['z'] == cell['z']) & 
                                        (results['z_id'] == region_z_id)]
                if len(results_row) == 1: 
                    results.loc[results_row.index[0], 'cytoplasmic'] = cell['cytoplasmic']
                if len(results_row) > 1: 
                    print(f'Found {len(results_row)} results rows for {cell["image"]} z={cell["z"]}')

    # Save the updated results
    results_path = os.path.join(results_dir, 'all_results_nuclei_cytoplasm_markers.csv')
    results.to_csv(results_path, index=False)