# Examples of using the PairwiseCompareManager
There are two primary steps:
1. Organize your data into a tidy pandas dataframe.
2. Compute the comparisons.

In [1]:
import pathlib
import sys

import pandas as pd

## Find the root of the git repo on the host system

In [2]:
# Search the current working directory first, then each ancestor in turn,
# and keep the first directory that holds a ".git" directory.
cwd = pathlib.Path.cwd()
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# Stop here when the notebook is not being run from inside a Git repository
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

# Custom Imports

In [3]:
# Append the repository's utils directory (built from the Git root found
# earlier) to the module search path so the imports below can be resolved.
sys.path.append(f"{root_dir}/0.data_analysis_and_processing/utils")

# Project-local modules; presumably located in the utils directory added above
from MIC import MIC
from PairwiseCompareManager import PairwiseCompareManager

# Inputs

In [4]:
# Paths to original nuclear speckle data.
# resolve(strict=True) raises FileNotFoundError if a directory does not exist.
data_dir = root_dir / "nuclear_speckles_data"
nuclear_mask_dir = (data_dir / "Nuclear_masks").resolve(strict=True)
sc_profiles_dir = (data_dir / "Preprocessed_data/single_cell_profiles").resolve(strict=True)

# Sort the matches: directory listings come back in an arbitrary,
# filesystem-dependent order, and the load order carries through to the
# combined dataframe built from these profiles.
sc_profiles_path = sorted(sc_profiles_dir.glob("*feature_selected*.parquet"))

# Load single-cell profile data (regular files only)
scdfs = [pd.read_parquet(sc_path) for sc_path in sc_profiles_path if sc_path.is_file()]

# Fail with a clear message instead of an IndexError further down
if not scdfs:
    raise FileNotFoundError(
        f"No '*feature_selected*.parquet' single-cell profiles found in {sc_profiles_dir}"
    )

# Outputs

In [5]:
# Output locations, relative to the directory the notebook is run from
distribution_figures_path = pathlib.Path("well_sirna_mic_distribution_figures")
mic_comparisons_path = pathlib.Path("mic_comparisons_data")

# Create each output directory (and any missing parents); existing ones are fine
for output_dir in (distribution_figures_path, mic_comparisons_path):
    output_dir.mkdir(parents=True, exist_ok=True)

# Processing

## Combine Common Data
The single-cell profiles are concatenated, keeping only the columns (matched by name) that are present in every profile.

In [6]:
# Start from the first profile's columns and keep only those that also
# appear in every later profile.
common_columns = scdfs[0].columns
for later_scdf in scdfs[1:]:
    shared_with_later = common_columns.intersection(later_scdf.columns)
    common_columns = shared_with_later

# Stack all profiles row-wise, then restrict to the shared columns.
# Note: this rebinds scdfs from a list of dataframes to a single dataframe.
scdfs = pd.concat(scdfs, axis=0)
scdfs = scdfs[common_columns]

In [7]:
# Discard every row that has a missing value in any column
scdfs = scdfs.dropna()

print(scdfs)

      Metadata_CellLine Metadata_Condition  Metadata_ImageNumber  \
0                  786O                NTC                     1   
1                  786O                NTC                     1   
2                  786O                NTC                     1   
3                  786O                NTC                     1   
4                  786O                NTC                     1   
...                 ...                ...                   ...   
63354              293T          untreated                   407   
63355              293T          untreated                   407   
63356              293T          untreated                   407   
63357              293T          untreated                   407   
63358              293T          untreated                   407   

      Metadata_Plate Metadata_Well Metadata_Site  Metadata_Nuclei_Site_Count  \
0             slide3            A1           M14                          40   
1             slide3   

## Separate GOLD, A647, and DAPI Features

In [8]:
# Split the combined profiles into one dataframe per stain. Each dataframe
# keeps every column whose name contains "Metadata" plus the columns whose
# name contains that stain's label.
gold_scdfs = scdfs.loc[:, scdfs.columns.str.contains("GOLD|Metadata", regex=True)]
a647_scdfs = scdfs.loc[:, scdfs.columns.str.contains("A647|Metadata", regex=True)]
dapi_scdfs = scdfs.loc[:, scdfs.columns.str.contains("DAPI|Metadata", regex=True)]

# Strip the stain suffix so a given feature has the same column name in every
# stain dataframe. Each dataframe is renamed from its OWN columns; renaming
# A647 from the GOLD columns would mislabel the A647 features (or raise a
# length-mismatch error when the column counts differ).
gold_scdfs.columns = gold_scdfs.columns.str.replace("_GOLD", "", regex=False)
a647_scdfs.columns = a647_scdfs.columns.str.replace("_A647", "", regex=False)
dapi_scdfs.columns = dapi_scdfs.columns.str.replace("_DAPI", "", regex=False)

## Combine Separated Stain Features

In [9]:
# Tag every row with the stain its features were measured in
gold_scdfs, dapi_scdfs, a647_scdfs = (
    stain_scdf.assign(Metadata_Stain=stain_label)
    for stain_label, stain_scdf in (
        ("GOLD", gold_scdfs),
        ("DAPI", dapi_scdfs),
        ("A647", a647_scdfs),
    )
)

# Keep only the columns present in all three stain dataframes, then stack
# them row-wise in GOLD, DAPI, A647 order.
labelled_scdfs = (gold_scdfs, dapi_scdfs, a647_scdfs)
common_cols = labelled_scdfs[0].columns
for stain_scdf in labelled_scdfs[1:]:
    common_cols = common_cols.intersection(stain_scdf.columns)

scdfs = pd.concat([stain_scdf[common_cols] for stain_scdf in labelled_scdfs], axis=0)

## Specify Feature (Non-Metadata) Columns

In [10]:
# Feature columns are all columns whose name does not contain "Metadata"
is_metadata_col = scdfs.columns.str.contains("Metadata")
feat_cols = scdfs.columns[~is_metadata_col]

## Mean Aggregation to the Well Level

In [11]:
# Metadata to retain
agg_funcs = {
    "Metadata_Condition": "first",
}

agg_funcs |= {feat_col: "mean" for feat_col in feat_cols}
staindf = scdfs.groupby(["Metadata_Plate", "Metadata_Well", "Metadata_Stain"]).agg(agg_funcs).reset_index()

In [12]:
# Preview the first rows of the well-level aggregated table
staindf.head()

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Stain,Metadata_Condition,Nuclei_Granularity_1,Nuclei_Intensity_MassDisplacement,Nuclei_Intensity_StdIntensityEdge,Nuclei_RadialDistribution_FracAtD_4of4,Nuclei_RadialDistribution_MeanFrac_1of4,Nuclei_RadialDistribution_MeanFrac_4of4,...,Nuclei_RadialDistribution_ZernikePhase_8_8,Nuclei_RadialDistribution_ZernikePhase_9_1,Nuclei_RadialDistribution_ZernikePhase_9_3,Nuclei_RadialDistribution_ZernikePhase_9_5,Nuclei_RadialDistribution_ZernikePhase_9_7,Nuclei_RadialDistribution_ZernikePhase_9_9,Nuclei_Texture_Correlation_3_00_256,Nuclei_Texture_Correlation_3_01_256,Nuclei_Texture_Correlation_3_02_256,Nuclei_Texture_Correlation_3_03_256
0,slide1,A1,A647,NTC,0.171474,0.477067,1.159588,-0.746587,-0.269572,0.163854,...,-0.015042,0.018605,-0.058823,-0.03001,-0.012822,-0.053868,-0.329853,-0.279762,-0.340624,-0.32691
1,slide1,A1,DAPI,NTC,-0.217538,0.719026,0.331419,0.609352,0.146927,0.117445,...,-0.006253,0.021927,-0.07589,-0.050538,-0.030914,-0.023099,0.37765,0.40396,0.40906,0.397982
2,slide1,A1,GOLD,NTC,-0.382057,0.474737,1.114561,0.885131,-0.330499,0.59085,...,0.006725,0.008139,-0.054102,-0.042331,0.003489,-0.040262,0.237765,0.097259,0.193286,0.059259
3,slide1,A2,A647,ALY kd8,0.035772,0.765541,1.081828,-0.434666,-0.045196,0.463068,...,-0.000985,0.00611,0.018343,-0.002243,0.002698,-0.014026,0.205998,0.206157,0.231749,0.189171
4,slide1,A2,DAPI,ALY kd8,-0.449247,0.681245,0.724933,-0.116966,0.847064,-0.687609,...,0.005417,-0.016053,-0.000847,0.018493,0.008329,-0.033349,0.771391,0.715149,0.782281,0.70023


# MIC Comparisons
In this dataset, each instance (row) corresponds to a stain for a well.
This means that multiple rows reference the same wells, but different stain features.
When using the PairwiseCompareManager, the `_same_columns` and `_different_columns` parameters must satisfy two conditions:
1. `_same_columns` must include at least one list element if `_different_columns` has fewer than two list elements.
2. `_different_columns` must contain one or more list elements.

## Compare between the same well from the same plate across different stains

In [13]:
# Compare rows that share a plate and well but differ in stain
plate_well_columns = ["Metadata_Plate", "Metadata_Well"]
stain_column = ["Metadata_Stain"]

mic_comparator = MIC()
comparer = PairwiseCompareManager(
    _df=staindf.copy(),  # pass a copy of the aggregated table
    _comparator=mic_comparator,
    _same_columns=plate_well_columns,
    _different_columns=stain_column,
    _feat_cols=feat_cols,
)

# Calling the manager produces the comparison table previewed in the next cell
micdf = comparer()

In [14]:
# Preview the first rows of the same-plate, same-well comparison results
micdf.head()

Unnamed: 0,Metadata_Plate__antehoc_group0,Metadata_Plate__antehoc_group1,Metadata_Well__antehoc_group0,Metadata_Well__antehoc_group1,Metadata_Stain__posthoc_group0
0,slide1,slide1,A1,A1,A647
1,slide1,slide1,A1,A1,A647
2,slide1,slide1,A1,A1,DAPI
3,slide1,slide1,A2,A2,A647
4,slide1,slide1,A2,A2,A647


## Compare between different stains and conditions

In [15]:
# Compare rows that differ in both stain and condition. No _same_columns
# are given here; two _different_columns are supplied instead.
stain_condition_columns = ["Metadata_Stain", "Metadata_Condition"]

mic_comparator = MIC()
comparer = PairwiseCompareManager(
    _df=staindf.copy(),  # pass a copy of the aggregated table
    _comparator=mic_comparator,
    _different_columns=stain_condition_columns,
    _feat_cols=feat_cols,
)

# Calling the manager produces the comparison table previewed in the next cell
micdf = comparer()

In [16]:
# Preview the first rows of the stain-and-condition comparison results
micdf.head()

Unnamed: 0,mic_e,Metadata_Stain__antehoc_group0,Metadata_Stain__antehoc_group1,Metadata_Condition__posthoc_group0,Metadata_Condition__posthoc_group1
0,0.767101,A647,DAPI,ALY kd5,ALY kd8
1,0.678485,A647,DAPI,ALY kd5,DDX39A kd1
2,0.830519,A647,DAPI,ALY kd5,DDX39A kd4
3,0.72797,A647,DAPI,ALY kd5,FIBP kd6
4,0.718055,A647,DAPI,ALY kd5,FIBP kd7


## Compare between different stains, plates, and wells
Excludes the well columns from the output dataframe (via `_drop_cols`).

In [17]:
# Compare rows that differ in stain, plate, and well
stain_plate_well_columns = ["Metadata_Stain", "Metadata_Plate", "Metadata_Well"]
# NOTE(review): per this section's description, _drop_cols is meant to leave
# the well columns out of the output; confirm against PairwiseCompareManager.
dropped_columns = ["Metadata_Well"]

mic_comparator = MIC()
comparer = PairwiseCompareManager(
    _df=staindf.copy(),  # pass a copy of the aggregated table
    _comparator=mic_comparator,
    _different_columns=stain_plate_well_columns,
    _feat_cols=feat_cols,
    _drop_cols=dropped_columns,
)

# Calling the manager produces the comparison table previewed in the next cell
micdf = comparer()

In [18]:
# Preview the first rows of the stain, plate, and well comparison results
micdf.head()

Unnamed: 0,mic_e,Metadata_Stain__antehoc_group0,Metadata_Stain__antehoc_group1,Metadata_Plate__posthoc_group0,Metadata_Plate__posthoc_group1,Metadata_Well__posthoc_group0,Metadata_Well__posthoc_group1
0,0.857133,A647,DAPI,slide1,slide2,A1,A2
1,0.818074,A647,DAPI,slide1,slide2,A1,A3
2,0.693374,A647,DAPI,slide1,slide2,A1,A4
3,0.558206,A647,DAPI,slide1,slide2,A1,B1
4,0.782552,A647,DAPI,slide1,slide2,A1,B2
