In [1]:
import pathlib

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tifffile as tf  # write tiff files
from cytocherrypick.calculations import find_median
from PIL import Image  # read tiff files
from tqdm import tqdm  # progress bar

In [2]:
CELL_TYPE = "SHSY5Y"

In [3]:
agg_cell_path = pathlib.Path(f"../../../data/{CELL_TYPE}_preprocessed_sc_norm.parquet")
agg_cell_df = pd.read_parquet(agg_cell_path)

In [4]:
# Get the current working directory of the repository
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")
root_dir

PosixPath('/home/lippincm/Documents/ML/Interstellar_Analysis')

In [5]:
image_out_dir_path = pathlib.Path(f"{root_dir}/8.cytopick_analysis/figures/PBMC/")


# if path does not exist, create it
image_out_dir_path.mkdir(parents=True, exist_ok=True)

In [6]:
# define directories
# where the images are on a local machine
# this is a hard coded path to the 1TB image directory

#####
# THIS PATH NEEDS TO BE CHANGED TO THE LOCAL IMAGE DIRECTORY ON YOUR MACHINE
#####

image_dir_path = pathlib.Path(
    "/media/lippincm/18T/interstellar_data/70117_20230210MM1_Gasdermin514_CP_BC430856__2023-03-22T15_42_38-Measurement1/2.IC/"
).resolve(strict=True)

In [7]:
# path
anova_path = pathlib.Path(
    f"../../../1.Exploratory_Data_Analysis/results/PBMC_combined.parquet"
)
# read in the anova results
anova_results = pd.read_parquet(anova_path)

In [8]:
# create a column that adds group1 and group2 together
anova_results["group"] = anova_results["group1"] + "_" + anova_results["group2"]
print(anova_results.shape)

# filter out rows that have p-adj_abs > 0.05
anova_results = anova_results[anova_results["p-adj_abs"] < 0.05]
print(anova_results.shape)

# change the group names to replace healthy with control
anova_results["group"] = anova_results["group"].str.replace("healthy", "control")
# make a -log10(p-adj) column
anova_results["neg-log10(p-adj_abs)"] = -np.log10(anova_results["p-adj_abs"])
# sort by neg-log10(p-adj_abs)
anova_results = anova_results.sort_values(by="neg-log10(p-adj_abs)", ascending=False)
# split the dfs into comparisons
c_p_df = anova_results[anova_results["group"] == "control_pyroptosis"]
a_c_df = anova_results[anova_results["group"] == "apoptosis_control"]
a_p_df = anova_results[anova_results["group"] == "apoptosis_pyroptosis"]
# sort by neg-log10(p-adj_abs)
c_p_df = c_p_df.sort_values(by="neg-log10(p-adj_abs)", ascending=False)
a_c_df = a_c_df.sort_values(by="neg-log10(p-adj_abs)", ascending=False)
a_p_df = a_p_df.sort_values(by="neg-log10(p-adj_abs)", ascending=False)

(3735, 11)
(2415, 11)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [9]:
# get the top 10 features for each comparison
c_p_top10 = c_p_df.iloc[:10, :]
a_c_top10 = a_c_df.iloc[:10, :]
a_p_top10 = a_p_df.iloc[:10, :]

c_p_top10["features"].to_list()
a_c_top10["features"].to_list()
a_p_top10["features"].to_list()

['Cytoplasm_RadialDistribution_ZernikePhase_CorrGasdermin_9_1',
 'Cells_RadialDistribution_ZernikePhase_CorrMito_7_1',
 'Nuclei_RadialDistribution_ZernikePhase_CorrPM_7_1',
 'Cells_Intensity_IntegratedIntensityEdge_CorrER',
 'Cytoplasm_RadialDistribution_ZernikePhase_CorrGasdermin_6_4',
 'Cytoplasm_RadialDistribution_ZernikePhase_CorrMito_8_4',
 'Nuclei_RadialDistribution_ZernikePhase_CorrPM_1_1',
 'Cells_RadialDistribution_FracAtD_CorrPM_1of4',
 'Cytoplasm_Texture_AngularSecondMoment_CorrPM_3_02_256',
 'Cytoplasm_AreaShape_Zernike_8_2']