In [12]:
import pandas as pd
import numpy as np
import os
from scipy.spatial import KDTree
from bisect import insort
from collections import defaultdict
import math

In [13]:
def match(parents, childs, arity, thresholds=None):

    kd_tree = KDTree(data=parents, leafsize=10)

    child_to_parent = {}
    visited_child = {}
    parent_to_child = defaultdict(list)

    num_parents = len(parents) + 1 # 1-based for KDTree query

    for child_idx, child_coords in enumerate(childs):

        # Initialize child_to_parent record in dictionary
        child_to_parent[child_idx] = {}
        child_to_parent[child_idx]["path_length"] = float("inf") # The length of the shortest path
        child_to_parent[child_idx]["parent"] = None # The index of the cell to which the shortest path corresponds

        # Record coordinates in visited_child dictionary
        visited_child[child_idx] = child_coords

        # Target child_idx to lookup in KDTree
        lookup_child_idx = child_idx

        for k in range(1, num_parents):

            # Query closest parent
            dist_arr, parent_idx_arr = kd_tree.query(x=visited_child[lookup_child_idx], k=[k], workers=1)
            dist = float(dist_arr)
            parent_idx = int(parent_idx_arr)

            # Get threshold
            if thresholds:
                threshold = 2.5 * thresholds[parent_idx]
            else:
                threshold = float("inf")

            # If closest parent distance is greater than threshold, child is automatically invalidated
            if dist > threshold:
                child_to_parent[lookup_child_idx]["path_length"] = -1
                child_to_parent[lookup_child_idx]["parent"] = -1
                break

            # Add parent information to child_to_parent dictionary
            child_to_parent[lookup_child_idx]["path_length"] = dist
            child_to_parent[lookup_child_idx]["parent"] = parent_idx

            # Add child information to parent_to_child dictionary
            insort(parent_to_child[parent_idx], (dist, lookup_child_idx))

            # Check if parent has number of childs exceeding arity
            if len(parent_to_child[parent_idx]) > arity:

                # Remove the furthest child
                _, child_to_remove = parent_to_child[parent_idx].pop()

                # Case of no possible match - no more parents left
                if k == num_parents:
                    child_to_parent[child_to_remove]["path_length"] = -1
                    child_to_parent[child_to_remove]["parent"] = -1
                else: 
                # Proceed to match with next possible closest parent

                    # Reintialize child_to_parent record in dictionary
                    child_to_parent[child_to_remove]["path_length"] = float("inf")
                    child_to_parent[child_to_remove]["parent"] = None

                    # Set removed child as lookup target to match with next available neighbor
                    lookup_child_idx = child_to_remove
            else:
                # If insertion suceed, proceed to next child in list
                break

    return child_to_parent      


In [14]:
cellprofiler_path = "/home/krarm/AutomatedCiliaMeasurements/sample_csvs"

In [15]:
# Targeted fields to read from cellprofiler csvs
fields = ["ImageNumber", "ObjectNumber", "Location_Center_X", "Location_Center_Y", "AreaShape_MeanRadius"]

In [16]:
nucleus_df = pd.read_csv(
    os.path.join(cellprofiler_path, "MyExpt_Nucleus.csv"), 
    skipinitialspace=True, 
    usecols=fields
)
nucleus_df.rename(columns={"ObjectNumber": "Nucleus"}, inplace=True)
nucleus_df

Unnamed: 0,ImageNumber,Nucleus,AreaShape_MeanRadius,Location_Center_X,Location_Center_Y
0,1,1,3.816239,364.820513,12.092593
1,1,2,4.293767,814.714777,19.994845
2,1,3,4.409043,782.897168,22.812221
3,1,4,4.164916,670.829710,36.637681
4,1,5,5.487574,422.262814,38.510360
...,...,...,...,...,...
7068,25,334,3.597257,441.176606,997.100917
7069,25,335,6.336996,704.082414,991.824140
7070,25,336,4.316920,38.092421,1007.036969
7071,25,337,3.565770,989.922353,1008.974118


In [17]:
centriole_df = pd.read_csv(
    os.path.join(cellprofiler_path, "MyExpt_Centriole.csv"), 
    skipinitialspace=True, 
    usecols=fields
)
centriole_df.rename(columns={"ObjectNumber": "Centriole"}, inplace=True)
centriole_df

Unnamed: 0,ImageNumber,Centriole,AreaShape_MeanRadius,Location_Center_X,Location_Center_Y
0,1,1,1.027614,741.066667,14.466667
1,1,2,1.096233,955.210526,19.157895
2,1,3,1.112132,998.350000,30.150000
3,1,4,1.031863,1018.076923,35.538462
4,1,5,1.140382,32.793103,41.517241
...,...,...,...,...,...
6838,25,319,1.082843,291.500000,1006.200000
6839,25,320,1.051777,2.125000,1007.875000
6840,25,321,1.046024,466.222222,1016.666667
6841,25,322,1.216176,654.666667,1017.266667


In [18]:
cilia_df = pd.read_csv(
    os.path.join(cellprofiler_path, "MyExpt_Cilia.csv"), 
    skipinitialspace=True, 
    usecols=fields
)
cilia_df.rename(columns={"ObjectNumber": "Cilia"}, inplace=True)
cilia_df

Unnamed: 0,ImageNumber,Cilia,AreaShape_MeanRadius,Location_Center_X,Location_Center_Y
0,1,1,1.128565,129.636364,3.090909
1,1,2,1.000000,873.000000,4.000000
2,1,3,1.000000,645.333333,6.222222
3,1,4,1.092047,427.333333,6.888889
4,1,5,1.000000,256.000000,9.000000
...,...,...,...,...,...
8071,25,223,1.000000,821.000000,1003.000000
8072,25,224,1.316908,641.661765,1008.397059
8073,25,225,1.176777,532.093750,1016.625000
8074,25,226,1.287819,905.607843,1017.568627


In [19]:
# Create location dictionary (for easy distance calculation)
nucleus_loc_dict = nucleus_df.groupby("ImageNumber")[["Nucleus", "Location_Center_X", "Location_Center_Y"]].apply(lambda x : x.set_index("Nucleus").to_dict(orient="index")).to_dict()
centriole_loc_dict = centriole_df.groupby("ImageNumber")[["Centriole", "Location_Center_X", "Location_Center_Y"]].apply(lambda x : x.set_index("Centriole").to_dict(orient="index")).to_dict()
cilia_loc_dict = cilia_df.groupby("ImageNumber")[["Cilia", "Location_Center_X", "Location_Center_Y"]].apply(lambda x : x.set_index("Cilia").to_dict(orient="index")).to_dict()

In [20]:
grouped_nucleus = nucleus_df.groupby("ImageNumber")
grouped_centriole = centriole_df.groupby("ImageNumber")
grouped_cilia = cilia_df.groupby("ImageNumber")

In [21]:
# Initialize c2c_df
c2c_df = pd.DataFrame(columns=['ImageNumber', 'Nucleus', 'Centriole1', 'Centriole2', 'Cilia', 'Nuc_Cent1', 'Nuc_Cent2', 'Nuc_Cil'])

# Iterate over groups 
# Note: groups in grouped_nucleus, grouped_centriole and grouped_cilia are expected to be aligned
for key in grouped_nucleus.groups.keys():
    
    # Fetch respective group
    nucleus_group = grouped_nucleus.get_group(key)
    centriole_group = grouped_centriole.get_group(key)
    cilia_group = grouped_cilia.get_group(key)

    coord_fields = ["Location_Center_X", "Location_Center_Y"]
    threshold_field = "AreaShape_MeanRadius"

    #region : Nucleus - Centriole Matching

    # Match nucleus (parent) with closest 2 centrioles (child) 
    nucleus_centriole_match_dict = match(
        parents=nucleus_group.loc[:, coord_fields].values, 
        childs=centriole_group.loc[:, coord_fields].values, 
        arity=2,
        thresholds=nucleus_group[threshold_field].to_list()
    )

    # Make df from dict and rename columns
    nucleus_centriole_match_df = pd.DataFrame.from_dict(nucleus_centriole_match_dict, orient='index')
    nucleus_centriole_match_df.rename(columns={"path_length":"Nuc_Cent", "parent":"Nucleus"}, inplace=True)
    nucleus_centriole_match_df.reset_index(inplace=True, names="Centriole") 

    # Drop unmatched/invalid centriole
    nucleus_centriole_match_df.drop(nucleus_centriole_match_df[nucleus_centriole_match_df.Nucleus == -1].index, inplace=True)   
    
    # Increment Centriole and Nucleus number since they are 1-based
    nucleus_centriole_match_df["Centriole"] += 1
    nucleus_centriole_match_df["Nucleus"] += 1

    # Sort values by nucleus number and distance from nucleus
    nucleus_centriole_match_df = nucleus_centriole_match_df.sort_values(by=['Nucleus', 'Nuc_Cent']).groupby(['Nucleus'], as_index=False).agg(list)

    # Split Centriole number and distances from nucleus
    try:
        nucleus_centriole_split_centriole_df = pd.DataFrame(nucleus_centriole_match_df['Centriole'].to_list(), columns = ['Centriole1', 'Centriole2'], dtype=pd.Int64Dtype())
    except ValueError:
        nucleus_centriole_split_centriole_df = pd.DataFrame(nucleus_centriole_match_df['Centriole'].to_list(), columns = ['Centriole1'], dtype=pd.Int64Dtype())
        nucleus_centriole_split_centriole_df['Centriole2'] = pd.NA

    try:
        nucleus_centriole_split_nc_df = pd.DataFrame(nucleus_centriole_match_df['Nuc_Cent'].to_list(), columns = ['Nuc_Cent1', 'Nuc_Cent2'])
    except ValueError:
        nucleus_centriole_split_nc_df = pd.DataFrame(nucleus_centriole_match_df['Nuc_Cent'].to_list(), columns = ['Nuc_Cent1'])
        nucleus_centriole_split_nc_df['Nuc_Cent2'] = np.nan

    nucleus_centriole_match_df = pd.concat([nucleus_centriole_match_df, nucleus_centriole_split_centriole_df, nucleus_centriole_split_nc_df], axis=1)
    nucleus_centriole_match_df.drop(['Centriole', 'Nuc_Cent'], axis=1, inplace=True)
    nucleus_centriole_match_df.drop_duplicates(inplace=True)

    #endregion

    #region : Nucleus - Cilia Matching
    
    # Match cilia (child) with closest nucleus (parent) 
    nucleus_cilia_match_dict = match(
        parents=nucleus_group.loc[:, coord_fields].values, 
        childs=cilia_group.loc[:, coord_fields].values, 
        arity=1
    )

    # Make df from dict and rename columns
    nucleus_cilia_match_df = pd.DataFrame.from_dict(nucleus_cilia_match_dict, orient='index')
    nucleus_cilia_match_df.rename(columns={"path_length":"Nuc_Cil", "parent":"Nucleus"}, inplace=True)
    nucleus_cilia_match_df.reset_index(inplace=True, names="Cilia")

    # Drop unmatched/invalid cilia
    nucleus_cilia_match_df.drop(nucleus_cilia_match_df[nucleus_cilia_match_df.Nucleus == -1].index, inplace=True)   
    
    # Increment Cilia and Nucleus number since they are 1-based
    nucleus_cilia_match_df["Cilia"] += 1
    nucleus_cilia_match_df["Nucleus"] += 1

    #endregion

    # Merge two matching dataframes
    nucleus_centriole_cilia_df = nucleus_centriole_match_df.merge(right=nucleus_cilia_match_df, how='left', on=['Nucleus'])

    # Set ImageNumber 
    nucleus_centriole_cilia_df["ImageNumber"] = key

    # Concat in c2c output
    c2c_df = pd.concat([c2c_df, nucleus_centriole_cilia_df], ignore_index=True)

# Ensure all columns are in appropriate datatypes
c2c_type_dict = {'ImageNumber': pd.Int64Dtype(), 'Nucleus': pd.Int64Dtype(), 'Centriole1': pd.Int64Dtype(), 'Centriole2': pd.Int64Dtype(), 'Cilia': pd.Int64Dtype()}
c2c_df = c2c_df.astype(c2c_type_dict)

c2c_df


Unnamed: 0,ImageNumber,Nucleus,Centriole1,Centriole2,Cilia,Nuc_Cent1,Nuc_Cent2,Nuc_Cil
0,1,11,9,,20,3.759711,,17.476649
1,1,17,12,,27,7.812985,,33.773248
2,1,22,15,18,39,6.353778,10.841737,95.237758
3,1,30,22,,58,11.429326,,32.988965
4,1,43,30,,72,4.197301,,29.567108
...,...,...,...,...,...,...,...,...
929,25,301,290,,183,7.540813,,28.705936
930,25,309,292,,196,15.195512,,21.685798
931,25,311,299,,,7.458743,,
932,25,331,311,,220,5.764670,,8.138111


In [23]:
c2c_df["Cent1_Cil"] = c2c_df.apply(lambda x : math.dist(
    [centriole_loc_dict[x["ImageNumber"]][x["Centriole1"]]["Location_Center_X"], centriole_loc_dict[x["ImageNumber"]][x["Centriole1"]]["Location_Center_Y"]], 
    [cilia_loc_dict[x["ImageNumber"]][x["Cilia"]]["Location_Center_X"], cilia_loc_dict[x["ImageNumber"]][x["Cilia"]]["Location_Center_Y"]]
    ) if pd.notna(x["Centriole1"]) and pd.notna(x["Cilia"]) else pd.NA, axis=1)
c2c_df["Cent2_Cil"] = c2c_df.apply(lambda x : math.dist(
    [centriole_loc_dict[x["ImageNumber"]][x["Centriole2"]]["Location_Center_X"], centriole_loc_dict[x["ImageNumber"]][x["Centriole2"]]["Location_Center_Y"]], 
    [cilia_loc_dict[x["ImageNumber"]][x["Cilia"]]["Location_Center_X"], cilia_loc_dict[x["ImageNumber"]][x["Cilia"]]["Location_Center_Y"]]
    ) if pd.notna(x["Centriole2"]) and pd.notna(x["Cilia"]) else np.NaN, axis=1)
c2c_df["Cent1_Cent2"] = c2c_df.apply(lambda x : math.dist(
    [centriole_loc_dict[x["ImageNumber"]][x["Centriole1"]]["Location_Center_X"], centriole_loc_dict[x["ImageNumber"]][x["Centriole1"]]["Location_Center_Y"]], 
    [centriole_loc_dict[x["ImageNumber"]][x["Centriole2"]]["Location_Center_X"], centriole_loc_dict[x["ImageNumber"]][x["Centriole2"]]["Location_Center_Y"]]
    ) if pd.notna(x["Centriole1"]) and pd.notna(x["Centriole2"]) else np.NaN, axis=1)
c2c_df

KeyError: 256