In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu, pearsonr
from tqdm import tqdm
from cell_paint_seg.utils import get_id_to_path
from cell_paint_seg.image_io import read_ims, read_seg
from skimage import exposure, measure
import umap
import umap.plot
from graspologic.plot import heatmap
import pickle
from ast import literal_eval
import random

In [None]:
# Channel names; the plotting helpers below look up a channel's position in the image
# stack returned by read_ims via channels.index(...).
channels = ["Brightfield", "ER", "AGP", "Mito", "DNA", "RNA"]

# Get remote info for plotting

In [None]:
# Each get_id_to_path(...) result is used below as a pair: element [0] is indexed by image ID
# to get file paths, and element [1] is passed to read_ims as sftp_client.

# Plate 0 (BR00142687): raw images and segmentations.
id_to_path_im_0 = get_id_to_path(
    "/imagestore/Aneesh/Assay Dev 20230329/BR00142687__2024-03-29T18_18_57-Measurement 1/Images/",
    tag=".tif",
    remote=True,
)
id_to_path_seg_0 = get_id_to_path(
    "/imagestore/Aneesh/Assay Dev 20230329/BR00142687__2024-03-29T18_18_57-Measurement 1/segmentations/",
    tag=".tif",
    remote=True,
)

# Plate 1 (BR00142688): raw images and segmentations.
id_to_path_im_1 = get_id_to_path(
    "/imagestore/Aneesh/Assay Dev 20230329/BR00142688__2024-03-29T19_57_13-Measurement 1/Images/",
    tag=".tif",
    remote=True,
)
# NOTE(review): plate-1 segmentations are read from "segmentations_v1/" here, but the cell that
# rebuilds these mappings in the "Individual Cells" section uses "segmentations/" — confirm
# which directory is intended.
id_to_path_seg_1 = get_id_to_path(
    "/imagestore/Aneesh/Assay Dev 20230329/BR00142688__2024-03-29T19_57_13-Measurement 1/segmentations_v1/",
    tag=".tif",
    remote=True,
)

In [None]:
def plot(
    id,
    id_to_path_im,
    id_to_path_seg,
    save_path="/Users/thomasathey/Documents/shavit-lab/fraenkel/presentation/answer-als/example/hierch-example-colors.svg",
):
    """Show one image as raw composite, contrast-adjusted composite and segmentation overlay.

    Parameters
    ----------
    id : str
        Image identifier, used as the key into both path mappings.
    id_to_path_im, id_to_path_seg : sequence
        Results of ``get_id_to_path`` for images and segmentations. Element 0 is
        indexed by ``id`` to get file paths; element 1 is passed to ``read_ims`` as
        ``sftp_client``.
    save_path : str or None, optional
        File the figure is written to. The default is the path that used to be
        hard-coded in this function, so existing calls behave as before. Pass
        ``None`` to skip saving; otherwise every call overwrites the same file.

    Returns
    -------
    None
        The figure is left as the current matplotlib figure.
    """
    paths_ims = id_to_path_im[0][id]
    paths_segs = id_to_path_seg[0][id]

    images = read_ims(paths_ims, sftp_client=id_to_path_im[1])
    segs = read_ims(paths_segs, sftp_client=id_to_path_seg[1])

    def scaled(channel):
        # Divide by the 16-bit maximum (assumes 16-bit input images — TODO confirm).
        return images[channels.index(channel)].astype("float") / (2**16 - 1)

    image_dna = scaled("DNA")
    image_rna = scaled("RNA")
    image_agp = scaled("AGP")

    # The DNA channel uses an explicit kernel of 1/64 of the image size per axis; the
    # other two channels use equalize_adapthist's default kernel size.
    image_dna_adj = exposure.equalize_adapthist(
        image_dna, clip_limit=0.03, kernel_size=[s // 64 for s in image_dna.shape]
    )
    image_rna_adj = exposure.equalize_adapthist(image_rna, clip_limit=0.03)
    image_agp_adj = exposure.equalize_adapthist(image_agp, clip_limit=0.03)

    # Composite channel order is (AGP, RNA, DNA) -> (R, G, B).
    image_rgb = np.stack([image_agp, image_rna, image_dna], axis=2)
    image_rgb_adj = np.stack([image_agp_adj, image_rna_adj, image_dna_adj], axis=2)

    seg_cell, seg_soma, seg_nuc = segs
    # Cytoplasm = inside a soma but outside every nucleus.
    seg_cyto = np.logical_and(seg_soma > 0, seg_nuc == 0)

    # Mask the background (label 0) so the image underneath stays visible.
    seg_cell_masked = np.ma.masked_array(seg_cell, mask=seg_cell == 0)
    seg_nuc_masked = np.ma.masked_array(seg_nuc, mask=seg_nuc == 0)
    seg_cyto_masked = np.ma.masked_array(seg_cyto, mask=seg_cyto == 0)

    f, axs = plt.subplots(nrows=1, ncols=3, dpi=300)

    axs[0].imshow(image_rgb, cmap="gray")
    axs[0].set_title(f"Image {id}")
    axs[1].imshow(image_rgb_adj, cmap="gray")
    axs[2].imshow(image_dna, cmap="gray")
    # Cell labels are folded into 20 values to match the 20 colours of "tab20".
    axs[2].imshow(seg_cell_masked % 20, cmap="tab20", alpha=1)
    axs[2].imshow(seg_cyto_masked, cmap="Accent", alpha=1)
    axs[2].imshow(seg_nuc_masked > 0, cmap="Set1", alpha=1)

    f.set_figheight(10)
    f.set_figwidth(30)

    for ax in axs.flatten():
        ax.axis("off")

    if save_path is not None:
        f.savefig(save_path)

In [None]:
def plot_fields(
    id, id_to_path_im, id_to_path_seg, max_nuc_soma_ratio=0.9, min_soma_area=278
):
    """Plot the nine fields of one well in a 3x3 grid with the accepted somas overlaid.

    Parameters
    ----------
    id : str
        Image identifier of any field in the well. Only the field number is replaced,
        so this assumes 12-character IDs such as ``"r08c17f08p01"`` with single-digit
        field numbers 1-9.
    id_to_path_im, id_to_path_seg : sequence
        Results of ``get_id_to_path``. Element 0 is indexed by image ID to get file
        paths; element 1 is passed to ``read_ims`` as ``sftp_client``.
    max_nuc_soma_ratio : float, optional
        A soma is hidden when its nucleus-area / soma-area ratio is at least this value.
    min_soma_area : int, optional
        A soma is hidden when it covers fewer pixels than this.
        NOTE(review): the data-frame filter later in this notebook keeps somas with
        area >= 268, not 278 — confirm which threshold is intended.
    """
    fig, axs = plt.subplots(nrows=3, ncols=3, dpi=300, figsize=(8, 8))

    for f in range(9):
        # Swap in field number f + 1, keeping the well prefix and the trailing "pXX".
        new_id = f"{id[:8]}{f+1}{id[-3:]}"

        paths_ims = id_to_path_im[0][new_id]
        paths_segs = id_to_path_seg[0][new_id]

        images = read_ims(paths_ims, sftp_client=id_to_path_im[1])
        segs = read_ims(paths_segs, sftp_client=id_to_path_seg[1])

        image_dna = images[channels.index("DNA")].astype("float") / (2**16 - 1)
        image_dna_adj = exposure.equalize_adapthist(
            image_dna, clip_limit=0.03, kernel_size=[s // 64 for s in image_dna.shape]
        )
        image_rna = images[channels.index("RNA")].astype("float") / (2**16 - 1)
        image_rna_adj = exposure.equalize_adapthist(image_rna, clip_limit=0.03)
        image_agp = images[channels.index("AGP")].astype("float") / (2**16 - 1)
        image_agp_adj = exposure.equalize_adapthist(image_agp, clip_limit=0.03)
        image_rgb_adj = np.stack([image_agp_adj, image_rna_adj, image_dna_adj], axis=2)

        _, seg_soma, seg_nuc = segs

        # Collect the labels to hide; nucleus and soma are matched by label value.
        rejected = []
        for lbl in np.unique(seg_nuc):
            if lbl == 0:
                continue
            soma_area = np.sum(seg_soma == lbl)
            if soma_area == 0:
                # Nucleus label with no soma pixels: nothing to hide, and skipping it
                # avoids a division by zero below.
                continue
            ratio = np.sum(seg_nuc == lbl) / soma_area
            if ratio >= max_nuc_soma_ratio or soma_area < min_soma_area:
                rejected.append(lbl)

        # Hide the background and every rejected soma in one mask.
        hidden = (seg_soma == 0) | np.isin(seg_soma, rejected)
        seg_soma_masked = np.ma.masked_array(seg_soma, mask=hidden)
        # Unmask one pixel with value 0 — presumably so the colour scale always
        # includes 0 even when few somas remain (TODO confirm).
        seg_soma_masked[0, 0] = 0

        axs[f // 3, f % 3].imshow(image_rgb_adj, cmap="gray")
        axs[f // 3, f % 3].imshow(seg_soma_masked % 20, cmap="tab20")

    for ax in axs.flatten():
        ax.axis("off")

    fig.subplots_adjust(wspace=0.01, hspace=0.01)
    plt.show()

In [None]:
def plot_feature(id, id_to_path_im, id_to_path_seg, feature_name="soma_area"):
    """Show one image next to a map that colours every soma by a per-soma feature.

    Parameters
    ----------
    id : str
        Image identifier, used as the key into both path mappings.
    id_to_path_im, id_to_path_seg : sequence
        Results of ``get_id_to_path``. Element 0 is indexed by ``id`` to get file
        paths; element 1 is passed to ``read_ims`` as ``sftp_client``.
    feature_name : {"soma_area", "nuc_soma_ratio"}, optional
        ``"soma_area"`` (default, the value this function has always displayed) colours
        each soma by its pixel count. ``"nuc_soma_ratio"`` colours it by nucleus area
        divided by soma area; the old code computed this and then overwrote it.

    Raises
    ------
    ValueError
        If ``feature_name`` is not one of the supported names.
    """
    if feature_name not in ("soma_area", "nuc_soma_ratio"):
        raise ValueError(
            f"feature_name must be 'soma_area' or 'nuc_soma_ratio', got {feature_name!r}"
        )

    paths_ims = id_to_path_im[0][id]
    paths_segs = id_to_path_seg[0][id]

    images = read_ims(paths_ims, sftp_client=id_to_path_im[1])
    segs = read_ims(paths_segs, sftp_client=id_to_path_seg[1])

    image_dna = images[channels.index("DNA")].astype("float") / (2**16 - 1)
    image_dna_adj = exposure.equalize_adapthist(
        image_dna, clip_limit=0.03, kernel_size=[s // 64 for s in image_dna.shape]
    )
    image_rna = images[channels.index("RNA")].astype("float") / (2**16 - 1)
    image_rna_adj = exposure.equalize_adapthist(image_rna, clip_limit=0.03)
    image_agp = images[channels.index("AGP")].astype("float") / (2**16 - 1)
    image_agp_adj = exposure.equalize_adapthist(image_agp, clip_limit=0.03)
    image_rgb_adj = np.stack([image_agp_adj, image_rna_adj, image_dna_adj], axis=2)

    _, seg_soma, seg_nuc = segs

    # Paint every soma pixel with that soma's feature value; 0 means "no value".
    features = np.zeros_like(seg_soma, dtype="float")
    for lbl in np.unique(seg_nuc):
        if lbl == 0:
            continue
        soma_pixels = seg_soma == lbl
        soma_area = np.sum(soma_pixels)
        if soma_area == 0:
            # Nucleus label without a soma: nothing to paint (and no division by zero).
            continue
        if feature_name == "soma_area":
            feature = soma_area
        else:
            feature = np.sum(seg_nuc == lbl) / soma_area
        features[soma_pixels] = feature

    features_masked = np.ma.masked_array(features, mask=features == 0)

    f, axs = plt.subplots(nrows=1, ncols=2, dpi=300)

    axs[0].imshow(image_rgb_adj, cmap="gray")
    axs[0].set_title(f"Image {id}")
    axs[1].imshow(image_dna, cmap="gray")
    im = axs[1].imshow(features_masked, cmap="autumn", alpha=1)
    plt.colorbar(im, ax=axs[1])
    f.set_figheight(10)
    f.set_figwidth(20)

    for ax in axs.flatten():
        ax.axis("off")

In [None]:
def plot_crop(id, id_to_path_im, id_to_path_seg):
    """Show one image next to a crop around a randomly chosen soma.

    The crop is the bounding box of the first connected component of the chosen soma
    label, with background pixels (label 0) blacked out.

    Parameters
    ----------
    id : str
        Image identifier, used as the key into both path mappings.
    id_to_path_im, id_to_path_seg : sequence
        Results of ``get_id_to_path``. Element 0 is indexed by ``id`` to get file
        paths; element 1 is passed to ``read_ims`` as ``sftp_client``.

    Raises
    ------
    ValueError
        If the soma segmentation contains no non-zero label.
    """
    paths_ims = id_to_path_im[0][id]
    paths_segs = id_to_path_seg[0][id]

    images = read_ims(paths_ims, sftp_client=id_to_path_im[1])
    segs = read_ims(paths_segs, sftp_client=id_to_path_seg[1])

    image_dna = images[channels.index("DNA")].astype("float") / (2**16 - 1)
    image_dna_adj = exposure.equalize_adapthist(
        image_dna, clip_limit=0.03, kernel_size=[s // 64 for s in image_dna.shape]
    )
    image_rna = images[channels.index("RNA")].astype("float") / (2**16 - 1)
    image_rna_adj = exposure.equalize_adapthist(image_rna, clip_limit=0.03)
    image_agp = images[channels.index("AGP")].astype("float") / (2**16 - 1)
    image_agp_adj = exposure.equalize_adapthist(image_agp, clip_limit=0.03)
    image_rgb_adj = np.stack([image_agp_adj, image_rna_adj, image_dna_adj], axis=2)

    _, seg_soma, _ = segs

    # Choose among real somas only: np.unique also returns the background label 0,
    # which previously could be picked and produced a crop of the background.
    soma_labels = np.unique(seg_soma)
    soma_labels = soma_labels[soma_labels != 0]
    if soma_labels.size == 0:
        raise ValueError(f"Image {id} has no segmented somas to crop")
    # Use a plain list: random.choice on an ndarray is not reliable on every Python version.
    soma_id = random.choice(soma_labels.tolist())

    mask = measure.label(seg_soma == soma_id)
    bbox = measure.regionprops(mask)[0].bbox
    rows = slice(bbox[0], bbox[2])
    cols = slice(bbox[1], bbox[3])

    # Copy before blanking: a plain slice is a view, so zeroing it would also black
    # out this region in the full image shown in the left panel.
    crop = image_rgb_adj[rows, cols, :].copy()
    crop_seg = seg_soma[rows, cols]
    # The 2-D boolean mask selects whole pixels, so all three colour channels are zeroed.
    crop[crop_seg == 0] = 0

    f, axs = plt.subplots(nrows=1, ncols=2, dpi=300)

    axs[0].imshow(image_rgb_adj, cmap="gray")
    axs[0].set_title(f"Image {id}")
    axs[1].imshow(crop, cmap="gray")

    f.set_figheight(10)
    f.set_figwidth(30)

    for ax in axs.flatten():
        ax.axis("off")

# Image averages

## Get data

In [None]:
# Per-image statistics tables (Image.csv), one per plate. The list position is used as
# the plate number when the files are loaded below.
data_paths = [
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/first-sample/Assay Dev 20230329/BR00142687__2024-03-29T18_18_57-Measurement 1/stats/Image.csv",
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/first-sample/Assay Dev 20230329/BR00142688__2024-03-29T19_57_13-Measurement 1/stats/Image.csv",
]

In [None]:
# Substrings that mark a column as bookkeeping or position information rather than a
# phenotype feature; include_feat() rejects any column whose name contains one of them.
# Repetitive families are generated from their pattern; the resulting list has the same
# 54 entries in the same order as the previous hand-written list.
feats_to_exclude = [
    *(
        f"Metadata_{field}"
        for field in ("Series", "Site", "Well", "WellColumn", "WellRow")
    ),
    "AreaShape_Orientation",
    *(
        f"AreaShape_BoundingBox{extreme}_{axis}"
        for extreme in ("Maximum", "Minimum")
        for axis in "XY"
    ),
    "AreaShape_Center_X",
    "AreaShape_Center_Y",
    "Children_Cytoplasm_Count",
    *(
        f"Location_CenterMassIntensity_{axis}_scaled_{channel}"
        for channel in ("AGP", "DNA", "ER", "mito", "RNA")
        for axis in "XYZ"
    ),
    *(
        f"Location_MaxIntensity_X_scaled_{channel}"
        for channel in ("DNA", "AGP", "mito", "ER")
    ),
    "Location_Center_X",
    "Location_Center_Y",
    "Location_MaxIntensity_X_scaled_RNA",
    *(
        f"Location_MaxIntensity_{axis}_scaled_{channel}"
        for axis in "YZ"
        for channel in ("DNA", "AGP", "mito", "ER", "RNA")
    ),
    "Neighbors_AngleBetweenNeighbors_Adjacent",
    "Neighbors_AngleBetweenNeighbors_25",
    "Neighbors_FirstClosestObjectNumber_Adjacent",
    "Neighbors_SecondClosestObjectNumber_Adjacent",
    "Number_Object_Number",
    "Parent_EdgeNuclei",
    "Parent_Nuclei",
    "Neighbors_FirstClosestObjectNumber_25",
    "Neighbors_SecondClosestObjectNumber_25",
]


def include_feat(feat_name):
    """Return True unless ``feat_name`` contains an entry of ``feats_to_exclude`` as a substring."""
    return not any(excluded in feat_name for excluded in feats_to_exclude)


# Load the per-image tables of both plates, keep the "Mean_" feature columns that pass
# include_feat(), and add plate / well / field bookkeeping columns.
dfs = []
for i, data_path in enumerate(data_paths):
    df = pd.read_csv(data_path)

    # Row, column, field and image ID are sliced out of the AGP file name; this assumes
    # names that start with an ID like "r08c17f08p01" (TODO confirm for all files).
    fnames = df["FileName_AGP"].tolist()
    row = [int(fname[1:3]) for fname in fnames]
    col = [int(fname[4:6]) for fname in fnames]
    well = [(i, r, c) for r, c in zip(row, col)]
    field = [int(fname[7:9]) for fname in fnames]
    id = [fname[:12] for fname in fnames]

    df["Plate"] = i
    df["Row"] = row
    df["Column"] = col
    df["Well"] = well
    df["Field"] = field
    df["ID"] = id

    col_names = [
        col_name
        for col_name in df.columns
        if "Mean_" in col_name and include_feat(col_name)
    ]
    # Report the count before the bookkeeping columns are appended, so the number
    # refers to features only (it used to include the six columns added below).
    print(f"{len(col_names)} different features")
    col_names += ["Plate", "Row", "Column", "Well", "Field", "ID"]
    dfs.append(df[col_names])
# Index labels restart for every plate here; a later cell resets the index.
df_means = pd.concat(dfs, axis="rows")

In [None]:
# Plate map: the first 16 rows and all columns from the third onward of the spreadsheet
# are indexed below by 0-based (row, column) of the well.
df_conditions = pd.read_excel(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/first-sample/Assay Dev 20230329/Answer ALS_Pilot2_March2024.xlsx"
)
df_conditions = df_conditions.iloc[:16, 2:]

data_dmso, data_autophagy, data_verdinexor = [], [], []
data_h2o2, data_tunicamycin, data_importazole = [], [], []
data_celltype = []

# Position in this list == (0-based plate column) % 6, i.e. the treatment of that column.
treatment_lists = [
    data_dmso,
    data_autophagy,
    data_verdinexor,
    data_h2o2,
    data_tunicamycin,
    data_importazole,
]

for i, row in df_means.iterrows():
    r = row["Row"] - 1
    c = row["Column"] - 1
    p = row["Plate"]

    condition = df_conditions.iloc[r, c]

    # One entry per treatment for this image: 0 everywhere except the applied treatment.
    treated = c % 6
    for slot, values in enumerate(treatment_lists):
        if slot != treated:
            values.append(0)
        elif slot < 2:
            # DMSO and autophagy are recorded as a plain 0/1 flag.
            values.append(1)
        else:
            # The other treatments record the plate-map entry as a float.
            values.append(float(condition))

    # Cell line by plate region: rows split at 8, columns in blocks of six.
    top_half = r < 8
    if p == 0:
        if (top_half and c < 6) or (not top_half and c >= 18):
            cell_type = "Control 1"
        elif (top_half and 6 <= c < 12) or (not top_half and c < 6):
            cell_type = "ALS 1"
        elif (top_half and c >= 18) or (not top_half and 12 <= c < 18):
            cell_type = "Control 2"
        else:
            cell_type = "ALS 2"
        data_celltype.append(cell_type)
    elif p == 1:
        if (top_half and c < 6) or (not top_half and c >= 18):
            cell_type = "ALS 2"
        elif (top_half and 6 <= c < 12) or (not top_half and c < 6):
            cell_type = "Control 2"
        elif (6 <= r < 8 and c >= 18) or (8 <= r < 10 and 12 <= c < 18):
            cell_type = "ALS 1"
        elif (top_half and 12 <= c < 18) or (not top_half and 6 <= c < 12):
            cell_type = "Control 1"
        else:
            cell_type = "EMPTY"
        data_celltype.append(cell_type)


data = {
    "DMSO": data_dmso,
    "Autophagy": data_autophagy,
    "Verdinexor": data_verdinexor,
    "H2O2": data_h2o2,
    "Tunicamysin": data_tunicamycin,
    "Importazole": data_importazole,
    "Cell Type": data_celltype,
}
df_covariates = pd.DataFrame(data)

In [None]:
# df_means was built with pd.concat(dfs, axis="rows") without ignore_index, so its index
# labels restart at 0 for every plate. Resetting both frames to a fresh 0..n-1 index lets
# the column-wise concat in the next cell pair rows by position.
df_covariates.reset_index(drop=True, inplace=True)
df_means.reset_index(drop=True, inplace=True)

In [None]:
# Attach the covariates as extra columns; rows are paired by index label.
df = pd.concat((df_means, df_covariates), axis="columns")

## Compute distance matrix

In [None]:
# Collapse images to one row per well: median of every "Mean_" feature, first value of
# every other column. Feature columns are listed first so the column order is unchanged.
col_names = [col_name for col_name in df.columns if "Mean_" in col_name]
other_cols = [col_name for col_name in df.columns if "Mean_" not in col_name]
aggregation_functions = {
    **dict.fromkeys(col_names, "median"),
    **dict.fromkeys(other_cols, "first"),
}
df_agg = df.groupby(df["Well"]).aggregate(aggregation_functions)

# Group wells of the same cell type together; later cells rely on this row order.
df_agg = df_agg.sort_values(by=["Cell Type"])

print(len(col_names))
df_feats_agg = df_agg[col_names]

In [None]:
# Well-by-well cosine distance on the aggregated features.
feats = df_feats_agg.to_numpy()
print(f"Fraction of finite feature values: {np.sum(np.isfinite(feats))/feats.size}")
# Replace every non-finite value with 0. By default nan_to_num turns +/-inf into the
# largest finite floats, which overflows the norms used by the cosine distance; this
# now matches the handling used for the per-cell features later in the notebook.
feats = np.nan_to_num(feats, nan=0.0, posinf=0.0, neginf=0.0)
D = pairwise_distances(feats, metric="cosine")

## Plot distance matrix

In [None]:
# Distance matrix with rows/columns labelled by cell type; df_agg was sorted by cell type
# before D was computed, so the labels line up with the rows of D.
heatmap(D, inner_hier_labels=df_agg["Cell Type"])

In [None]:
# Compare distances between wells of the same cell type with distances between wells of
# different cell types.
data_dist = []
data_type = []
data_ids = []
cell_types = df_agg["Cell Type"]

# Row i of D corresponds to row i of df_agg, so look values up by position. df_agg is
# indexed by well, which makes integer lookups on the Series ambiguous; and the loop used
# to read IDs from a leftover per-image list that is not aligned with df_agg at all.
cell_type_values = cell_types.to_numpy()
well_ids = df_agg["ID"].to_numpy()

for i in range(D.shape[0]):
    type1 = cell_type_values[i]
    for j in range(i):
        type2 = cell_type_values[j]
        data_dist.append(D[i, j])
        data_ids.append((well_ids[i], well_ids[j]))
        data_type.append("Same" if type1 == type2 else "Different")

df_types = pd.DataFrame(
    data={
        "Distance": data_dist,
        "Cell Type Relationship": data_type,
        "ID Pair": data_ids,
    }
)
sns.boxplot(data=df_types, x="Distance", y="Cell Type Relationship")

# One-sided test: are same-type distances smaller than different-type distances?
x = df_types[df_types["Cell Type Relationship"] == "Same"]["Distance"].to_numpy()
y = df_types[df_types["Cell Type Relationship"] == "Different"]["Distance"].to_numpy()
res = mannwhitneyu(x, y, alternative="less")
plt.title(
    f"Inter vs. Intra Cell Type Feature Distances (Mann-Whitney p-val: {res.pvalue :.2E})"
)

### Search for features

In [None]:
# Per-feature Mann-Whitney U test between ALS wells and control wells.
data_feats = []
data_pvals = []

# The row selections do not depend on the feature, so build them once.
is_als = df_agg["Cell Type"].isin(["ALS 1", "ALS 2"])
is_control = df_agg["Cell Type"].isin(["Control 1", "Control 2"])

for col in tqdm(list(df_feats_agg.columns)):
    x = np.nan_to_num(df_agg.loc[is_als, col].to_numpy()).reshape(-1, 1)
    y = np.nan_to_num(df_agg.loc[is_control, col].to_numpy()).reshape(-1, 1)
    # Column-shaped inputs give one p-value per column, hence the [0] below.
    res = mannwhitneyu(x, y)

    data_feats.append(col)
    data_pvals.append(res.pvalue[0])

df_pvals = pd.DataFrame(data={"P-value": data_pvals, "Feature": data_feats})
# sns.histplot(df_pvals, x="P-value")

In [None]:
# Most significant features first, then write the table to disk.
df_pvals = df_pvals.sort_values("P-value")
df_pvals.to_csv(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/pvals-0.csv"
)

## UMAP

In [None]:
# Persist the distance matrix and the aggregated table so the UMAP fit can be redone
# from these files.
np.save(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/D_av.npy",
    D,
)

df_agg.to_csv(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/df_agg_av.csv"
)

# Reuse a previously fitted UMAP model if one is cached on disk. A later cell is what
# writes this file, so on a first run it does not exist yet; fit here in that case
# instead of failing with FileNotFoundError.
# NOTE: a cached model is not refitted and may be stale relative to the D saved above.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created.
try:
    with open(
        "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/mapper.pickle",
        "rb",
    ) as file:
        mapper = pickle.load(file)
except FileNotFoundError:
    mapper = umap.UMAP().fit(D)

In [None]:
# Self-contained cell: it repeats its own imports and reloads its inputs from disk, so it
# can be run without the cells above (presumably in a fresh kernel — TODO confirm).
import umap
import pandas as pd
import numpy as np
import pickle

# Reloading from CSV changes df_agg: the well index comes back as an ordinary column and
# the (plate, row, column) tuples in "Well" come back as strings.
df_agg = pd.read_csv(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/df_agg_av.csv"
)
D = np.load(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/D_av.npy",
)
# NOTE(review): D is a pairwise distance matrix, but UMAP is fitted with its default
# settings, so each row of D is treated as a feature vector; confirm whether
# metric="precomputed" was intended. No random_state is set, so the embedding can
# differ between runs.
mapper = umap.UMAP().fit(D)
# Cache the fitted model; the cell above loads this file.
with open(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/mapper.pickle",
    "wb",
) as handle:
    pickle.dump(mapper, handle)

In [None]:
# UMAP embedding of the wells, coloured by cell type.
umap.plot.points(mapper, labels=df_agg["Cell Type"])

## Extremes

In [None]:
# Order the wells along each embedding axis; the ends of each ordering are the extremes.
embedding = mapper.embedding_
vert = np.argsort(embedding[:, 1])
horiz = np.argsort(embedding[:, 0])

bottom, top = vert[0], vert[-1]
left, right = horiz[0], horiz[-1]

# Mark the four extreme wells in blue on top of the labelled embedding.
ax = umap.plot.points(mapper, labels=df_agg["Cell Type"])
for extreme in (bottom, top, left, right):
    ax.scatter(embedding[extreme, 0], embedding[extreme, 1], c="blue")
print(df_agg.iloc[top]["ID"])
plt.show()

In [None]:
# Example figure for one plate-1 image. plot() writes its own file as well; the savefig
# below additionally saves the current figure under a second name.
plot("r08c17f08p01", id_to_path_im_1, id_to_path_seg_1)
plt.savefig(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/presentation/answer-als/example/example-seg.svg"
)

## Plot extremes

In [None]:
# Plot the representative image of the five wells with the largest second embedding coordinate.
for extreme in vert[-5:]:
    record = df_agg.iloc[extreme]
    well = record["Well"]
    # When df_agg has been reloaded from CSV, the (plate, row, column) tuple is stored as
    # its string form; well[0] would then be "(" and neither branch below would run.
    if isinstance(well, str):
        well = literal_eval(well)
    id = record["ID"]
    print(record["Cell Type"])
    print(well)
    if well[0] == 0:
        plot(id, id_to_path_im_0, id_to_path_seg_0)
    elif well[0] == 1:
        plot(id, id_to_path_im_1, id_to_path_seg_1)
In [None]:
# Plot the representative image of the five wells with the smallest second embedding coordinate.
for extreme in vert[:5]:
    record = df_agg.iloc[extreme]
    well = record["Well"]
    # When df_agg has been reloaded from CSV, the (plate, row, column) tuple is stored as
    # its string form; well[0] would then be "(" and neither branch below would run.
    if isinstance(well, str):
        well = literal_eval(well)
    id = record["ID"]
    print(record["Cell Type"])
    print(well)
    if well[0] == 0:
        plot(id, id_to_path_im_0, id_to_path_seg_0)
    elif well[0] == 1:
        plot(id, id_to_path_im_1, id_to_path_seg_1)

In [None]:
# Plot the representative image of the four extreme wells of the embedding.
for extreme in [bottom, top, left, right]:
    record = df_agg.iloc[extreme]
    well = record["Well"]
    # When df_agg has been reloaded from CSV, the (plate, row, column) tuple is stored as
    # its string form; well[0] would then be "(" and neither branch below would run.
    if isinstance(well, str):
        well = literal_eval(well)
    id = record["ID"]
    if well[0] == 0:
        plot(id, id_to_path_im_0, id_to_path_seg_0)
    elif well[0] == 1:
        plot(id, id_to_path_im_1, id_to_path_seg_1)

# Individual Cells

In [None]:
# Per-soma statistics tables (Somas.csv), one per plate. The list position is used as
# the plate number when the files are loaded below.
data_soma_path = [
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/first-sample/Assay Dev 20230329/BR00142687__2024-03-29T18_18_57-Measurement 1/stats/Somas.csv",
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/first-sample/Assay Dev 20230329/BR00142688__2024-03-29T19_57_13-Measurement 1/stats/Somas.csv",
]

# data_nuc_path = [
#     "/Users/thomasathey/Documents/shavit-lab/fraenkel/first-sample/Assay Dev 20230329/BR00142687__2024-03-29T18_18_57-Measurement 1/stats/Nuclei.csv"
# ]

In [None]:
# Substrings that mark a column as bookkeeping or position information rather than a
# phenotype feature; include_feat() rejects any column whose name contains one of them.
# Repetitive families are generated from their pattern; the resulting list has the same
# 54 entries in the same order as the previous hand-written list.
feats_to_exclude = [
    *(
        f"Metadata_{field}"
        for field in ("Series", "Site", "Well", "WellColumn", "WellRow")
    ),
    "AreaShape_Orientation",
    *(
        f"AreaShape_BoundingBox{extreme}_{axis}"
        for extreme in ("Maximum", "Minimum")
        for axis in "XY"
    ),
    "AreaShape_Center_X",
    "AreaShape_Center_Y",
    "Children_Cytoplasm_Count",
    *(
        f"Location_CenterMassIntensity_{axis}_scaled_{channel}"
        for channel in ("AGP", "DNA", "ER", "mito", "RNA")
        for axis in "XYZ"
    ),
    *(
        f"Location_MaxIntensity_X_scaled_{channel}"
        for channel in ("DNA", "AGP", "mito", "ER")
    ),
    "Location_Center_X",
    "Location_Center_Y",
    "Location_MaxIntensity_X_scaled_RNA",
    *(
        f"Location_MaxIntensity_{axis}_scaled_{channel}"
        for axis in "YZ"
        for channel in ("DNA", "AGP", "mito", "ER", "RNA")
    ),
    "Neighbors_AngleBetweenNeighbors_Adjacent",
    "Neighbors_AngleBetweenNeighbors_25",
    "Neighbors_FirstClosestObjectNumber_Adjacent",
    "Neighbors_SecondClosestObjectNumber_Adjacent",
    "Number_Object_Number",
    "Parent_EdgeNuclei",
    "Parent_Nuclei",
    "Neighbors_FirstClosestObjectNumber_25",
    "Neighbors_SecondClosestObjectNumber_25",
]


def include_feat(feat_name):
    """Decide whether a per-cell column is used as a feature.

    A column is kept only if its name mentions one of the feature families
    "Texture", "AreaShape" or "Intensity" and contains no entry of
    ``feats_to_exclude`` as a substring. This redefines the ``include_feat``
    declared earlier in the notebook with the additional family requirement.
    """
    families = ("Texture", "AreaShape", "Intensity")
    if not any(family in feat_name for family in families):
        return False
    return all(excluded not in feat_name for excluded in feats_to_exclude)

## Write csv

In [None]:
# Load the per-soma tables of both plates, keep the columns accepted by include_feat(),
# and add plate / well / field / object bookkeeping columns.
dfs = []
for i, data_path in enumerate(data_soma_path):
    df = pd.read_csv(data_path)

    obj_number = df["ObjectNumber"].tolist()
    col = df["Metadata_WellColumn"].tolist()
    row = df["Metadata_WellRow"].tolist()
    field = df["Metadata_Site"].tolist()
    well = [(i, r, c) for r, c in zip(row, col)]
    # Rebuild the image ID from zero-padded row, column and field numbers.
    id = [
        f"r{str(r).zfill(2)}c{str(c).zfill(2)}f{str(f).zfill(2)}p01"
        for r, c, f in zip(row, col, field)
    ]

    df = df.assign(
        Plate=i,
        Row=row,
        Column=col,
        Well=well,
        Field=field,
        Object=obj_number,
        ID=id,
    )

    col_names = [col_name for col_name in df.columns if include_feat(col_name)]
    print(f"{len(col_names)} different features")
    col_names += ["Plate", "Row", "Column", "Well", "Field", "Object", "ID"]
    dfs.append(df[col_names])
df = pd.concat(dfs, axis="rows")

# dfs = []
# for i, data_path in enumerate(data_nuc_path):
#     df = pd.read_csv(data_path)

#     object = [item for item in list(df["ObjectNumber"])]
#     col = [item for item in list(df["Metadata_WellColumn"])]
#     row = [item for item in list(df["Metadata_WellRow"])]
#     well = [(i, r, c) for r, c in zip(row, col)]
#     field = [item for item in list(df["Metadata_Site"])]
#     id = [
#         f"r{str(r).zfill(2)}c{str(c).zfill(2)}f{str(f).zfill(2)}p01"
#         for r, c, f in zip(row, col, field)
#     ]

#     df["Plate"] = i
#     df["Row"] = row
#     df["Column"] = col
#     df["Well"] = well
#     df["Field"] = field
#     df["Object"] = object
#     df["ID"] = id

#     col_names = list(df.columns)
#     col_names = [col_name for col_name in col_names if include_feat(col_name)]
#     print(f"{len(col_names)} different features")
#     col_names += ["Plate", "Row", "Column", "Well", "Field", "Object", "ID"]
#     dfs.append(df[col_names])
# df_nucs = pd.concat(dfs, axis="rows")

In [None]:
# Cache the combined per-soma table. The index is written too, so reading the file back
# yields an extra "Unnamed: 0" column.
df.to_csv(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/plates.csv"
)

In [None]:
# Plate map: the first 16 rows and all columns from the third onward of the spreadsheet
# are indexed below by 0-based (row, column) of the well.
df_conditions = pd.read_excel(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/first-sample/Assay Dev 20230329/Answer ALS_Pilot2_March2024.xlsx"
)
df_conditions = df_conditions.iloc[:16, 2:]

data_dmso, data_autophagy, data_verdinexor = [], [], []
data_h2o2, data_tunicamycin, data_importazole = [], [], []
data_celltype = []

# Position in this list == (0-based plate column) % 6, i.e. the treatment of that column.
treatment_lists = [
    data_dmso,
    data_autophagy,
    data_verdinexor,
    data_h2o2,
    data_tunicamycin,
    data_importazole,
]

# iterrows() is a generator with no length; passing total lets the progress bar show
# a percentage and an ETA.
for i, row in tqdm(df.iterrows(), total=len(df)):
    r = row["Row"] - 1
    c = row["Column"] - 1
    p = row["Plate"]

    condition = df_conditions.iloc[r, c]

    # One entry per treatment for this cell: 0 everywhere except the applied treatment.
    treated = c % 6
    for slot, values in enumerate(treatment_lists):
        if slot != treated:
            values.append(0)
        elif slot < 2:
            # DMSO and autophagy are recorded as a plain 0/1 flag.
            values.append(1)
        else:
            # The other treatments record the plate-map entry as a float.
            values.append(float(condition))

    # Cell line by plate region: rows split at 8, columns in blocks of six.
    top_half = r < 8
    if p == 0:
        if (top_half and c < 6) or (not top_half and c >= 18):
            cell_type = "Control 1"
        elif (top_half and 6 <= c < 12) or (not top_half and c < 6):
            cell_type = "ALS 1"
        elif (top_half and c >= 18) or (not top_half and 12 <= c < 18):
            cell_type = "Control 2"
        else:
            cell_type = "ALS 2"
        data_celltype.append(cell_type)
    elif p == 1:
        if (top_half and c < 6) or (not top_half and c >= 18):
            cell_type = "ALS 2"
        elif (top_half and 6 <= c < 12) or (not top_half and c < 6):
            cell_type = "Control 2"
        elif (6 <= r < 8 and c >= 18) or (8 <= r < 10 and 12 <= c < 18):
            cell_type = "ALS 1"
        elif (top_half and 12 <= c < 18) or (not top_half and 6 <= c < 12):
            cell_type = "Control 1"
        else:
            cell_type = "EMPTY"
        data_celltype.append(cell_type)


data = {
    "DMSO": data_dmso,
    "Autophagy": data_autophagy,
    "Verdinexor": data_verdinexor,
    "H2O2": data_h2o2,
    "Tunicamysin": data_tunicamycin,
    "Importazole": data_importazole,
    "Cell Type": data_celltype,
}
df_covariates = pd.DataFrame(data)

In [None]:
# Cache the per-soma covariates (written with the index, like plates.csv).
df_covariates.to_csv(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/plates-covs.csv"
)

In [None]:
# df was built with pd.concat(dfs, axis="rows") without ignore_index, so its index labels
# restart at 0 for every plate. Resetting both frames to a fresh 0..n-1 index lets a
# column-wise concat pair rows by position. (The next cell re-reads both frames from CSV,
# which also gives them fresh 0..n-1 indices.)
df_covariates.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Reload the two cached tables. Both were written with their index, so each comes back
# with an extra "Unnamed: 0" column; those are dropped when plates-both.csv is read.
df = pd.read_csv(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/plates.csv"
)
df_covariates = pd.read_csv(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/plates-covs.csv"
)

In [None]:
# Attach the covariates as extra columns; rows are paired by index label.
df = pd.concat((df, df_covariates), axis="columns")

In [None]:
# Cache the merged per-soma features + covariates table; the "Read csv" section starts from this file.
df.to_csv(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/plates-both.csv"
)

## Read csv

In [None]:
# Load the merged per-soma table and drop the "Unnamed: ..." columns that are left over
# from indices written by to_csv.
df = pd.read_csv(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/plates-both.csv"
)
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

## Filtering exploration

In [None]:
# Ratio of the nucleus area column to the soma area column
# (presumably "Mean_Nuclei_AreaShape_Area" is the mean area of the nuclei inside each soma — TODO confirm).
df["Nucleus/Soma Area Ratio"] = df["Mean_Nuclei_AreaShape_Area"] / df["AreaShape_Area"]

In [None]:
# sns.histplot(df, x="Nucleus/Soma Area Ratio")
# sns.histplot(df, x="AreaShape_Area")

In [None]:
# Keep somas whose nucleus covers less than 90% of the soma and whose area is at least 268.
# NOTE(review): plot_fields() hides somas with area < 278, not 268 — confirm which
# threshold is intended.
df = df[(df["Nucleus/Soma Area Ratio"] < 0.9) & (df["AreaShape_Area"] >= 268)]

In [None]:
# df["Cell Type"].unique()
# Drop wells labelled "EMPTY", then keep only rows with a positive Verdinexor value.
df = df[df["Cell Type"] != "EMPTY"]
df = df[df["Verdinexor"] > 0]

In [None]:
# Number of remaining rows per cell type after filtering.
df["Cell Type"].value_counts()

#### Check that labels are consecutive

In [None]:
# For every image, check that the nucleus object numbers form a gap-free run
# min..max; print the image ID and its object numbers when they do not.
if "df_nucs" not in globals():
    # The code that builds df_nucs is commented out in the "Write csv" section, so on a
    # fresh run this check cannot be performed.
    print("df_nucs is not defined (nuclei loading is commented out); skipping check")
else:
    id_to_obj = {}
    for image_id, obj in zip(df_nucs["ID"], df_nucs["Object"]):
        id_to_obj.setdefault(image_id, []).append(obj)

    for key, val in id_to_obj.items():
        if sorted(val) != list(range(min(val), max(val) + 1)):
            print((key, val))

In [None]:
# Check that "<image ID>-<object number>" keys are unique and report the first soma
# that has no nucleus with the same key.
if "df_nucs" not in globals():
    # The code that builds df_nucs is commented out in the "Write csv" section, so on a
    # fresh run this check cannot be performed.
    print("df_nucs is not defined (nuclei loading is commented out); skipping check")
else:
    obj_id_soma = [f"{id}-{obj}" for obj, id in zip(list(df["Object"]), list(df["ID"]))]
    obj_id_nuc = [
        f"{id}-{obj}" for obj, id in zip(list(df_nucs["Object"]), list(df_nucs["ID"]))
    ]
    assert len(obj_id_soma) == len(set(obj_id_soma))
    assert len(obj_id_nuc) == len(set(obj_id_nuc))

    # Set lookup keeps the scan linear; testing membership in the list was quadratic.
    nuc_keys = set(obj_id_nuc)
    for id in obj_id_soma:
        if id not in nuc_keys:
            print(id)
            break

In [None]:
# Rebuild the remote path mappings. Each result is used as a pair: element [0] is indexed
# by image ID to get file paths, and element [1] is passed to read_ims as sftp_client.

# Plate 0 (BR00142687): raw images and segmentations.
id_to_path_im_0 = get_id_to_path(
    "/imagestore/Aneesh/Assay Dev 20230329/BR00142687__2024-03-29T18_18_57-Measurement 1/Images/",
    tag=".tif",
    remote=True,
)
id_to_path_seg_0 = get_id_to_path(
    "/imagestore/Aneesh/Assay Dev 20230329/BR00142687__2024-03-29T18_18_57-Measurement 1/segmentations/",
    tag=".tif",
    remote=True,
)

# Plate 1 (BR00142688): raw images and segmentations.
id_to_path_im_1 = get_id_to_path(
    "/imagestore/Aneesh/Assay Dev 20230329/BR00142688__2024-03-29T19_57_13-Measurement 1/Images/",
    tag=".tif",
    remote=True,
)
# NOTE(review): plate-1 segmentations are read from "segmentations/" here, but the cell at
# the top of the notebook uses "segmentations_v1/" — confirm which directory is intended.
id_to_path_seg_1 = get_id_to_path(
    "/imagestore/Aneesh/Assay Dev 20230329/BR00142688__2024-03-29T19_57_13-Measurement 1/segmentations/",
    tag=".tif",
    remote=True,
)

In [None]:
# Read the segmentation images of one plate-0 field for inspection.
paths_segs = id_to_path_seg_0[0]["r01c01f01p01"]
segs = read_ims(paths_segs, sftp_client=id_to_path_seg_0[1])

In [None]:
# Label values and pixel counts of the first segmentation image.
np.unique(segs[0], return_counts=True)

## Compute distance matrix

In [None]:
# Settings for this cell:
#   agg     - aggregate somas to one row per well (mean of every feature)
#   balance - when sampling, draw the same number of somas from every cell type
#   sample  - total number of somas to draw when not aggregating (<= 0 keeps all)
agg = True
balance = True
sample = 10000

col_names = list(df.columns)
col_names = [col_name for col_name in col_names if include_feat(col_name)]
print(len(col_names))

if agg:
    # Mean of every feature column, first value of every other column, per well.
    aggregation_functions = {k: "mean" for k in list(df.columns) if include_feat(k)}
    aggregation_functions_2 = {
        k: "first" for k in list(df.columns) if not include_feat(k)
    }
    aggregation_functions.update(aggregation_functions_2)
    df = df.groupby(df["Well"]).aggregate(aggregation_functions)

    df.sort_values(by=["Cell Type"], inplace=True)
elif sample > 0:
    if balance:
        # Split the budget over the cell types actually present (this was a fixed
        # "// 4") and never ask for more somas than a type has, which would raise.
        cell_type_names = df["Cell Type"].unique()
        per_type = sample // max(len(cell_type_names), 1)
        dfs = []
        for cell_type in cell_type_names:
            group = df[df["Cell Type"] == cell_type]
            dfs.append(group.sample(n=min(per_type, len(group))))
        df = pd.concat(dfs, axis="rows")
    else:
        df = df.sample(n=min(sample, len(df)))

df_feats = df[col_names]

print(df.shape)

In [None]:
# Z-score every feature column. Columns with zero spread become NaN; these are replaced
# with 0 in the next cell.
df_feats = (df_feats - df_feats.mean()) / df_feats.std()

In [None]:
# Cosine distance between rows of the standardised feature matrix, after replacing
# NaN and +/-inf with 0.
feats = df_feats.to_numpy()
print(f"Fraction of finite feature values: {np.sum(np.isfinite(feats))/feats.size}")
feats = np.nan_to_num(feats, nan=0.0, posinf=0.0, neginf=0.0)
D = pairwise_distances(feats, metric="cosine")

## Classifiers

In [None]:
# Split by plate: plate 0 is the training set, every other plate is the test set.
# "Well" holds the string form of a (plate, row, column) tuple, hence literal_eval.
wells = [literal_eval(well) for well in df["Well"]]
grp = np.array([int(well[0] != 0) for well in wells])
X = feats
# Label 0 = control line, 1 = anything else.
y = np.array([int("Control" not in cell_type) for cell_type in df["Cell Type"]])

in_train = grp == 0
in_test = grp == 1
X_train, y_train = X[in_train, :], y[in_train]
X_test, y_test = X[in_test, :], y[in_test]

print(f"train: {X_train.shape} test: {X_test.shape}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Each display name is paired with its estimator so the two cannot drift apart.
named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025, random_state=42)),
    ("RBF SVM", SVC(gamma=2, C=1, random_state=42)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=42)),
    (
        "Random Forest",
        RandomForestClassifier(
            max_depth=5, n_estimators=10, max_features=1, random_state=42
        ),
    ),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(algorithm="SAMME", random_state=42)),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]
names = [entry[0] for entry in named_classifiers]
classifiers = [entry[1] for entry in named_classifiers]

# X_train / X_test come from the split prepared in an earlier cell; no random
# train_test_split is performed here.

# Fit every model behind a StandardScaler and report its test-set score.
for name, estimator in named_classifiers:
    clf = make_pipeline(StandardScaler(), estimator)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(f"{name}: {score}")

## Plot distance matrix

In [None]:
# Render the heatmap helper on a 100x100 matrix of unseeded uniform random
# values; the picture differs on every run and does not involve D.
a = np.random.rand(100, 100)
heatmap(a, cmap="viridis")

In [None]:
# "Well" entries are strings that parse (via literal_eval) to a sequence whose
# first element is used elsewhere in this notebook as the plate index. Parse
# the value instead of taking the second character of the raw string, which
# only happened to work for single-digit plates. Labels stay strings.
plate = [str(literal_eval(w)[0]) for w in df["Well"]]

# Group rows/columns of the distance matrix by cell type, then by plate.
ax = heatmap(D, outer_hier_labels=df["Cell Type"], inner_hier_labels=plate)
# plt.savefig("/Users/thomasathey/Documents/shavit-lab/fraenkel/presentation/ljosa-analysis/filter_dead/dist-mat.png")

In [None]:
# Persist the distance matrix and the table it was computed from so that the
# embedding cells can be run without recomputing them.
dist_matrix_path = "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/D_av.npy"
feature_table_path = "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/df_av.csv"

np.save(dist_matrix_path, D)
df.to_csv(feature_table_path)

In [None]:
import umap
import pandas as pd
import numpy as np
import pickle


D = np.load(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/D_av.npy",
)

# D already holds pairwise distances, so UMAP has to be told the input is
# precomputed; with the default metric it would treat each row of D as a
# feature vector and measure Euclidean distance between rows instead.
# random_state pins the otherwise stochastic layout so reruns give the same
# embedding (42 matches the seed used for the classifiers).
mapper = umap.UMAP(metric="precomputed", random_state=42).fit(D)

# Cache the fitted mapper for the plotting cells.
with open(
    "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/mapper.pickle",
    "wb",
) as handle:
    pickle.dump(mapper, handle)

In [None]:
# Reload the saved table and the cached UMAP mapper.
feature_table_path = "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/df_av.csv"
mapper_path = "/Users/thomasathey/Documents/shavit-lab/fraenkel/cell_paint_seg/experiments/mapper.pickle"

df = pd.read_csv(feature_table_path)

# NOTE: unpickling runs arbitrary code; only load a file this notebook wrote.
with open(mapper_path, "rb") as file:
    mapper = pickle.load(file)

# mapper = umap.UMAP().fit(D)

In [None]:
# Column to highlight and the threshold applied to it in the labelling cell.
condition = "Verdinexor"
lvl = 0.02

# Show which distinct values this column takes.
df[condition].unique()

In [None]:
# Three label groups: rows above the threshold split by ALS / Control, and
# everything else lumped together.
als_label = f"ALS x {condition} > {lvl}"
control_label = f"Control x {condition} > {lvl}"

labels = []
for v, s in zip(df[condition], df["Cell Type"]):
    label = "Other"
    if v > lvl:
        if "ALS" in s:
            label = als_label
        elif "Control" in s:
            label = control_label
    labels.append(label)
labels = np.array(labels)

label_colors = {als_label: "red", control_label: "orange", "Other": "green"}
umap.plot.points(mapper, labels=labels, color_key=label_colors)
# umap.plot.points(mapper, labels=df["Cell Type"], color_key={"ALS 1": "red", "ALS 2": "orange", "Control 1": "green", "Control 2": "blue"})

# Embedding coordinates and a 0/1 indicator (0 = cell type mentions "Control").
coords = mapper.embedding_
group = np.array([int("Control" not in t) for t in df["Cell Type"]])

## Statistics

In [None]:
ctypes = list(df["Cell Type"])

# Flag each row as ALS or not; a pair is "within" when both flags agree.
is_als = np.array(["ALS" in ctype for ctype in ctypes])

# Every (i, j) with j > i, in the same row-major order a nested loop visits.
rows, cols = np.triu_indices(D.shape[0], k=1, m=D.shape[1])
same_group = is_als[rows] == is_als[cols]
pair_dists = D[rows, cols]

within = list(pair_dists[same_group])
between = list(pair_dists[~same_group])

mannwhitneyu(within, between)

In [None]:
# Features whose Pearson correlation with the first UMAP coordinate is both
# significant after Bonferroni correction and strong (|r| > 0.8).
feat_to_r = {}

# The correction divides by the number of features actually tested in this
# loop rather than a hard-coded count, so it stays right if the set changes.
tested_cols = [col for col in df.columns if include_feat(col)]
p_cutoff = 0.05 / max(len(tested_cols), 1)

for col in tested_cols:
    feat = df[col].to_numpy()
    # Non-finite entries are replaced by 0 before correlating.
    feat = np.nan_to_num(feat, nan=0.0, posinf=0.0, neginf=0.0)
    res = pearsonr(feat, mapper.embedding_[:, 0])
    if res.pvalue < p_cutoff and np.abs(res.statistic) > 0.8:
        feat_to_r[col] = res.statistic
feat_to_r

In [None]:
# Features whose Pearson correlation with the second UMAP coordinate is both
# significant after Bonferroni correction and strong (|r| > 0.7).
feat_to_r = {}

# The correction divides by the number of features actually tested in this
# loop rather than a hard-coded count, so it stays right if the set changes.
tested_cols = [col for col in df.columns if include_feat(col)]
p_cutoff = 0.05 / max(len(tested_cols), 1)

for col in tested_cols:
    feat = df[col].to_numpy()
    # Non-finite entries are replaced by 0 before correlating.
    feat = np.nan_to_num(feat, nan=0.0, posinf=0.0, neginf=0.0)
    res = pearsonr(feat, mapper.embedding_[:, 1])
    if res.pvalue < p_cutoff and np.abs(res.statistic) > 0.7:
        feat_to_r[col] = res.statistic
feat_to_r

## Plot extremes

In [None]:
# Row indices ordered along each embedding axis.
embedding = mapper.embedding_
horiz = np.argsort(embedding[:, 0])
vert = np.argsort(embedding[:, 1])

left, right = horiz[0], horiz[-1]
bottom, top = vert[0], vert[-1]

cell_type_colors = {
    "ALS 1": "red",
    "ALS 2": "orange",
    "Control 1": "green",
    "Control 2": "blue",
}
ax = umap.plot.points(mapper, labels=df["Cell Type"], color_key=cell_type_colors)

# Mark the three left-most and three right-most points in purple.
extremes = [*horiz[:3], *horiz[-3:]]
for extreme in extremes:
    ax.scatter(embedding[extreme, 0], embedding[extreme, 1], c="purple")

# ID of the row with the largest second coordinate.
print(df.iloc[top]["ID"])
plt.show()

In [None]:
# Print cell type, parsed well and ID for the three right-most embedding
# points. The ID is kept in `field_id` rather than `id`, so this loop neither
# shadows the builtin nor overwrites the notebook-level `id` that later cells
# index as `id[i]`.
for extreme in horiz[-3:]:
    row = df.iloc[extreme]
    well = literal_eval(row["Well"])
    field_id = row["ID"]
    print(row["Cell Type"])
    print(well)
    print(field_id)
    # if well[0] == 0:
    #     plot_fields(field_id, id_to_path_im_0, id_to_path_seg_0)
    # elif well[0] == 1:
    #     plot_fields(field_id, id_to_path_im_1, id_to_path_seg_1)

In [None]:
# Print cell type and parsed well for the three left-most embedding points and
# plot their fields from the matching plate. The ID is kept in `field_id`
# rather than `id`, so this loop neither shadows the builtin nor overwrites
# the notebook-level `id` that later cells index as `id[i]`.
for extreme in horiz[:3]:
    row = df.iloc[extreme]
    well = literal_eval(row["Well"])
    field_id = row["ID"]
    print(row["Cell Type"])
    print(well)
    # The first element of the parsed well selects which lookup to use; any
    # other value is skipped without plotting.
    if well[0] == 0:
        plot_fields(field_id, id_to_path_im_0, id_to_path_seg_0)
    elif well[0] == 1:
        plot_fields(field_id, id_to_path_im_1, id_to_path_seg_1)

In [None]:
# Display field "r08c17f08p01" using the *_1 image and segmentation lookups.
plot("r08c17f08p01", id_to_path_im_1, id_to_path_seg_1)

In [None]:
# Call plot_fields (defined elsewhere in this notebook) for ID "r09c24f01p01"
# with the *_0 image and segmentation lookups.
plot_fields("r09c24f01p01", id_to_path_im_0, id_to_path_seg_0)

In [None]:
# Call plot_feature (defined elsewhere in this notebook) for ID "r01c01f01p01"
# with the *_0 image and segmentation lookups.
plot_feature("r01c01f01p01", id_to_path_im_0, id_to_path_seg_0)

In [None]:
# Call plot_crop (defined elsewhere in this notebook) for ID "r07c24f01p01"
# with the *_0 image and segmentation lookups.
plot_crop("r07c24f01p01", id_to_path_im_0, id_to_path_seg_0)

# Plot results

In [None]:
# Cosine distances between the rows of df_means (NaNs zeroed first).
data_mean = np.nan_to_num(df_means.to_numpy())
# data_mean = [row for row, f in zip(data_mean, field) if f != 5]
D = pairwise_distances(data_mean, metric="cosine")

# Show only the top-left corner: 10 groups of 9 consecutive rows.
lim = 9 * 10
plt.imshow(D[:lim, :lim])
plt.colorbar()

# Dashed red grid lines on the boundary in front of every group of 9.
for i in np.arange(0, lim, 9):
    edge = i - 0.5
    plt.plot([0, lim], [edge, edge], "r--", linewidth=0.5)
    plt.plot([edge, edge], [0, lim], "r--", linewidth=0.5)

plt.title("Feature Cosine Distances between Images (First 90)")
plt.xticks([])
plt.yticks([])

In [None]:
# All unordered row pairs (i > j), visited row by row.
index_pairs = [(i, j) for i in range(D.shape[0]) for j in range(i)]

data_dist = [D[i, j] for i, j in index_pairs]
data_ids = [(id[i], id[j]) for i, j in index_pairs]
# Rows are taken in consecutive groups of 9; a pair is "Same" when both rows
# fall into the same group.
data_type = ["Same" if i // 9 == j // 9 else "Different" for i, j in index_pairs]

df_types = pd.DataFrame(
    data={"Distance": data_dist, "Well Relationship": data_type, "ID Pair": data_ids}
)
sns.boxplot(df_types, x="Distance", y="Well Relationship")

# One-sided test: are "Same" distances smaller than "Different" ones?
relationship = df_types["Well Relationship"]
x = df_types.loc[relationship == "Same", "Distance"].to_numpy()
y = df_types.loc[relationship == "Different", "Distance"].to_numpy()
res = mannwhitneyu(x, y, alternative="less")
plt.title(
    f"Inter vs. Intrawell Feature Distances (Mann-Whitney p-val: {res.pvalue :.2E})"
)

In [None]:
# "Same" pairs whose distance exceeds 0.5.
is_same = df_types["Well Relationship"] == "Same"
is_distant = df_types["Distance"] > 0.5
df_outliers = df_types[is_same & is_distant]

# Plot both members of each outlier pair, then print a separator.
for first_id, second_id in df_outliers["ID Pair"]:
    for member_id in (first_id, second_id):
        plot(member_id, id_to_path_im, id_to_path_seg)
        plt.show()
    print("**")

# Cell Type Distances

In [None]:
# Collect the distance of every unordered row pair (i > j) and whether the two
# rows share a cell type.
data_dist = []
data_type = []
data_ids = []
for i in range(D.shape[0]):
    type1 = cell_types[i]
    for j in range(i):
        type2 = cell_types[j]
        data_dist.append(D[i, j])
        data_ids.append((id[i], id[j]))
        if type1 == type2:
            data_type.append("Same")
        else:
            data_type.append("Different")

df_types = pd.DataFrame(
    data={
        "Distance": data_dist,
        "Cell Type Relationship": data_type,
        "ID Pair": data_ids,
    }
)
sns.boxplot(df_types, x="Distance", y="Cell Type Relationship")

# One-sided test: are same-cell-type distances smaller than the others?
x = df_types[df_types["Cell Type Relationship"] == "Same"]["Distance"].to_numpy()
y = df_types[df_types["Cell Type Relationship"] == "Different"]["Distance"].to_numpy()
res = mannwhitneyu(x, y, alternative="less")
# The title names the comparison made in this cell (cell types, not wells).
plt.title(
    f"Inter vs. Intra Cell Type Feature Distances (Mann-Whitney p-val: {res.pvalue :.2E})"
)

# Intrawell distances

In [None]:
data_dist = []
data_type = []

# D_tile[f1, f2, t] holds D[i, j] for rows i > j of the same group of 9
# consecutive rows t (presumably the 9 fields of one well -- confirm), with
# f1 / f2 the positions of i / j inside that group. Only f1 > f2 is filled.
D_tile = np.zeros((9, 9, D.shape[0] // 9))
for i in range(D.shape[0]):
    tile1, f1 = divmod(i, 9)
    # The earlier rows of the same group are exactly i - f1, ..., i - 1.
    for f2 in range(f1):
        D_tile[f1, f2, tile1] = D[i, i - f1 + f2]


# For each position, gather the distances of all 8 pairs it takes part in,
# across every group.
D_tile2 = np.zeros((9, (D.shape[0] // 9) * 8))
for field in range(D_tile2.shape[0]):
    partner_pairs = [
        (i, j)
        for i in range(1, D_tile.shape[0])
        for j in range(i)
        if field in (i, j)
    ]
    distances = []
    for i, j in partner_pairs:
        distances.extend(D_tile[i, j, :])

    D_tile2[field, :] = distances

# Long-format table: one row per distance, tagged with its position as text.
dfs = [
    pd.DataFrame({"Distance": D_tile2[field, :]}).assign(Field=str(field))
    for field in range(D_tile2.shape[0])
]
df = pd.concat(dfs)

sns.boxplot(df, x="Distance", y="Field")
plt.title("Distances from Other Fields of Same Well")

# # f, axs = plt.subplots(nrows=9, ncols=9)
# # for i, row in enumerate(axs):
# #     for j, ax in enumerate(row):
# #         sns.histplot(D_tile[i,j,:], ax=ax)
# #         ax.set_xlim(left=0, right=1)

# # f.set_size_inches(20,20)

# f, axs = plt.subplots(ncols=9)
# for i, ax in enumerate(axs):
#     sns.histplot(D_tile2[i,:], ax=ax)
#     ax.set_xlim(left=0, right=1)
# f.set_size_inches(15,5)