In [1]:
import pathlib
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
import toml
from matplotlib import rcParams
from tqdm import tqdm

rcParams.update({"figure.autolayout": True})

# create a venn diagram of the features that are significant in all conditions
from matplotlib_venn import venn2, venn3, venn3_unweighted

warnings.filterwarnings("ignore")
from pycytominer.cyto_utils import infer_cp_features
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
cell_type = "PBMC"

In [3]:
# Import Data
# set data file path under pathlib path for multi-system use
file_path = pathlib.Path(f"../data/{cell_type}_preprocessed_sc_norm.parquet")
df = pd.read_parquet(file_path)

In [4]:
# toml file path
ground_truth_file = pathlib.Path(
    "../4.sc_Morphology_Neural_Network_MLP_Model/MLP_utils/ground_truth.toml"
).resolve(strict=True)
# read toml file
ground_truth = toml.load(ground_truth_file)
apopotosis_trts = ground_truth["Apoptosis"]["apoptosis_groups_list"]
pyroptosis_trts = ground_truth["Pyroptosis"]["pyroptosis_groups_list"]
healthy_trts = ground_truth["Healthy"]["healthy_groups_list"]

# make a column that has the class of each treatment


df["apoptosis"] = df.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] in apopotosis_trts,
    axis=1,
)
df["pyroptosis"] = df.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] in pyroptosis_trts,
    axis=1,
)
df["healthy"] = df.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] in healthy_trts,
    axis=1,
)

# merge apoptosis, pyroptosis, and healthy columns into one column

df["labels"] = df.apply(
    lambda row: "apoptosis"
    if row["apoptosis"]
    else "pyroptosis"
    if row["pyroptosis"]
    else "healthy",
    axis=1,
)
# drop apoptosis, pyroptosis, and healthy columns
df.drop(columns=["apoptosis", "pyroptosis", "healthy"], inplace=True)

In [5]:
df_metadata = df.filter(regex="Metadata")
df_data = df.drop(df_metadata.columns, axis=1)
df_data["Metadata_number_of_singlecells"] = df_metadata[
    "Metadata_number_of_singlecells"
]
cp_features = infer_cp_features(df)

In [6]:
# anova for each feature in the dataframe with posthoc tukey test to determine which groups are different from each other
lst = []
# for i in cp_features:
for i in tqdm(cp_features):
    formula = f"{i} ~ C(labels) + C(Metadata_number_of_singlecells)"
    model = ols(formula, df_data).fit()
    aov_table = sm.stats.anova_lm(model, typ=2)
    posthoc = pairwise_tukeyhsd(
        df_data[i],
        df_data["labels"],
        alpha=0.001,
    )
    # print(posthoc)
    lst.append([posthoc, i])

  0%|          | 0/1245 [00:00<?, ?it/s]

  0%|          | 1/1245 [01:15<26:13:54, 75.91s/it]

  0%|          | 2/1245 [02:31<26:07:28, 75.66s/it]

  0%|          | 3/1245 [03:46<26:03:21, 75.52s/it]

  0%|          | 4/1245 [05:02<26:02:37, 75.55s/it]

  0%|          | 5/1245 [06:18<26:02:22, 75.60s/it]

  0%|          | 6/1245 [07:33<26:02:53, 75.69s/it]

  1%|          | 7/1245 [08:49<25:59:06, 75.56s/it]

  1%|          | 8/1245 [10:04<25:56:38, 75.50s/it]

  1%|          | 9/1245 [11:20<25:55:32, 75.51s/it]

  1%|          | 10/1245 [12:35<25:55:55, 75.59s/it]

  1%|          | 11/1245 [13:51<25:53:33, 75.54s/it]

  1%|          | 12/1245 [15:06<25:52:47, 75.56s/it]

  1%|          | 13/1245 [16:22<25:50:22, 75.51s/it]

  1%|          | 14/1245 [17:37<25:49:54, 75.54s/it]

  1%|          | 15/1245 [18:53<25:48:47, 75.55s/it]

  1%|▏         | 16/1245 [20:08<25:45:54, 75.47s/it]

  1%|▏         | 17/1245 [21:23<25:41:43, 75.33s/it]

  1%|▏         | 18/1245 [22:39<25:40:08, 75.31s/it]

  2%|▏         | 19/1245 [23:54<25:38:42, 75.30s/it]

  2%|▏         | 20/1245 [25:09<25:37:09, 75.29s/it]

  2%|▏         | 21/1245 [26:24<25:34:07, 75.20s/it]

  2%|▏         | 22/1245 [27:39<25:32:46, 75.20s/it]

  2%|▏         | 23/1245 [28:54<25:31:09, 75.18s/it]

  2%|▏         | 24/1245 [30:10<25:30:12, 75.19s/it]

  2%|▏         | 25/1245 [31:25<25:27:59, 75.15s/it]

  2%|▏         | 26/1245 [32:40<25:27:46, 75.20s/it]

  2%|▏         | 27/1245 [33:55<25:25:50, 75.16s/it]

  2%|▏         | 28/1245 [35:10<25:24:48, 75.18s/it]

  2%|▏         | 29/1245 [36:26<25:26:48, 75.34s/it]

  2%|▏         | 30/1245 [37:42<25:27:54, 75.45s/it]

  2%|▏         | 31/1245 [38:57<25:25:04, 75.37s/it]

  3%|▎         | 32/1245 [40:12<25:24:16, 75.40s/it]

  3%|▎         | 33/1245 [41:28<25:24:00, 75.45s/it]

  3%|▎         | 34/1245 [42:43<25:21:59, 75.41s/it]

  3%|▎         | 35/1245 [43:59<25:22:40, 75.50s/it]

  3%|▎         | 36/1245 [45:15<25:22:24, 75.55s/it]

  3%|▎         | 37/1245 [46:30<25:21:08, 75.55s/it]

  3%|▎         | 38/1245 [47:45<25:17:57, 75.46s/it]

  3%|▎         | 39/1245 [49:01<25:16:55, 75.47s/it]

  3%|▎         | 40/1245 [50:16<25:14:36, 75.42s/it]

  3%|▎         | 41/1245 [51:32<25:13:19, 75.41s/it]

  3%|▎         | 42/1245 [52:47<25:09:31, 75.29s/it]

  3%|▎         | 43/1245 [54:02<25:08:30, 75.30s/it]

  4%|▎         | 44/1245 [55:17<25:06:30, 75.26s/it]

  4%|▎         | 45/1245 [56:32<25:03:49, 75.19s/it]

  4%|▎         | 46/1245 [57:48<25:03:37, 75.24s/it]

  4%|▍         | 47/1245 [59:03<25:02:20, 75.24s/it]

  4%|▍         | 48/1245 [1:00:18<25:01:49, 75.28s/it]

  4%|▍         | 49/1245 [1:01:34<25:04:36, 75.48s/it]

  4%|▍         | 50/1245 [1:02:50<25:08:42, 75.75s/it]

  4%|▍         | 51/1245 [1:04:06<25:09:06, 75.83s/it]

  4%|▍         | 52/1245 [1:05:22<25:06:32, 75.77s/it]

  4%|▍         | 53/1245 [1:06:37<25:01:35, 75.58s/it]

  4%|▍         | 54/1245 [1:07:53<25:00:07, 75.57s/it]

  4%|▍         | 55/1245 [1:09:09<25:01:38, 75.71s/it]

  4%|▍         | 56/1245 [1:10:25<25:02:29, 75.82s/it]

  5%|▍         | 57/1245 [1:11:41<25:02:09, 75.87s/it]

  5%|▍         | 58/1245 [1:12:57<25:01:44, 75.91s/it]

  5%|▍         | 59/1245 [1:14:14<25:06:56, 76.24s/it]

  5%|▍         | 60/1245 [1:15:31<25:10:57, 76.50s/it]

  5%|▍         | 61/1245 [1:16:48<25:14:49, 76.76s/it]

  5%|▍         | 62/1245 [1:18:04<25:08:41, 76.52s/it]

  5%|▌         | 63/1245 [1:19:20<25:03:52, 76.34s/it]

  5%|▌         | 64/1245 [1:20:36<24:59:38, 76.19s/it]

  5%|▌         | 65/1245 [1:21:52<24:55:08, 76.02s/it]

  5%|▌         | 66/1245 [1:23:08<24:53:06, 75.99s/it]

  5%|▌         | 67/1245 [1:24:24<24:53:12, 76.05s/it]

  5%|▌         | 68/1245 [1:25:40<24:52:11, 76.07s/it]

  6%|▌         | 69/1245 [1:26:56<24:49:02, 75.97s/it]

  6%|▌         | 70/1245 [1:28:11<24:46:22, 75.90s/it]

  6%|▌         | 71/1245 [1:29:27<24:43:44, 75.83s/it]

  6%|▌         | 72/1245 [1:30:43<24:40:20, 75.72s/it]

  6%|▌         | 73/1245 [1:31:58<24:35:39, 75.55s/it]

  6%|▌         | 74/1245 [1:33:13<24:34:15, 75.54s/it]

  6%|▌         | 75/1245 [1:34:29<24:33:38, 75.57s/it]

  6%|▌         | 76/1245 [1:35:45<24:36:55, 75.80s/it]

  6%|▌         | 77/1245 [1:37:01<24:32:46, 75.66s/it]

  6%|▋         | 78/1245 [1:38:16<24:32:38, 75.71s/it]

  6%|▋         | 79/1245 [1:39:32<24:31:51, 75.74s/it]

  6%|▋         | 80/1245 [1:40:49<24:34:29, 75.94s/it]

  7%|▋         | 81/1245 [1:42:05<24:37:06, 76.14s/it]

  7%|▋         | 82/1245 [1:43:22<24:38:05, 76.26s/it]

  7%|▋         | 83/1245 [1:44:38<24:38:13, 76.33s/it]

  7%|▋         | 84/1245 [1:45:55<24:38:41, 76.42s/it]

  7%|▋         | 85/1245 [1:47:11<24:36:46, 76.39s/it]

  7%|▋         | 86/1245 [1:48:28<24:36:40, 76.45s/it]

  7%|▋         | 87/1245 [1:49:44<24:34:35, 76.40s/it]

  7%|▋         | 88/1245 [1:51:01<24:34:14, 76.45s/it]

  7%|▋         | 89/1245 [1:52:17<24:32:44, 76.44s/it]

  7%|▋         | 90/1245 [1:53:34<24:32:17, 76.48s/it]

  7%|▋         | 91/1245 [1:54:49<24:23:21, 76.08s/it]

  7%|▋         | 92/1245 [1:56:04<24:19:11, 75.93s/it]

  7%|▋         | 93/1245 [1:57:20<24:13:44, 75.72s/it]

  8%|▊         | 94/1245 [1:58:35<24:09:18, 75.55s/it]

  8%|▊         | 95/1245 [1:59:50<24:08:31, 75.58s/it]

  8%|▊         | 96/1245 [2:01:06<24:07:00, 75.56s/it]

  8%|▊         | 97/1245 [2:02:21<24:03:52, 75.46s/it]

  8%|▊         | 98/1245 [2:03:37<24:02:49, 75.47s/it]

  8%|▊         | 99/1245 [2:04:52<24:00:46, 75.43s/it]

  8%|▊         | 100/1245 [2:06:08<24:00:19, 75.48s/it]

  8%|▊         | 101/1245 [2:07:23<23:59:04, 75.48s/it]

  8%|▊         | 102/1245 [2:08:38<23:58:03, 75.49s/it]

  8%|▊         | 103/1245 [2:09:55<24:02:47, 75.80s/it]

  8%|▊         | 104/1245 [2:11:12<24:06:48, 76.08s/it]

  8%|▊         | 105/1245 [2:12:28<24:08:16, 76.23s/it]

  9%|▊         | 106/1245 [2:13:43<24:00:51, 75.90s/it]

  9%|▊         | 107/1245 [2:14:59<23:57:03, 75.77s/it]

  9%|▊         | 108/1245 [2:16:15<23:59:46, 75.98s/it]

  9%|▉         | 109/1245 [2:17:32<24:01:04, 76.11s/it]

  9%|▉         | 110/1245 [2:18:48<24:01:04, 76.18s/it]

  9%|▉         | 111/1245 [2:20:04<23:59:08, 76.15s/it]

  9%|▉         | 112/1245 [2:21:20<23:58:28, 76.18s/it]

  9%|▉         | 113/1245 [2:22:37<23:57:44, 76.21s/it]

  9%|▉         | 114/1245 [2:23:53<23:58:20, 76.30s/it]

  9%|▉         | 115/1245 [2:25:09<23:56:05, 76.25s/it]

  9%|▉         | 116/1245 [2:26:26<23:56:14, 76.33s/it]

  9%|▉         | 117/1245 [2:27:42<23:55:33, 76.36s/it]

  9%|▉         | 118/1245 [2:28:59<23:54:06, 76.35s/it]

 10%|▉         | 119/1245 [2:30:15<23:51:14, 76.27s/it]

 10%|▉         | 120/1245 [2:31:31<23:51:48, 76.36s/it]

 10%|▉         | 121/1245 [2:32:48<23:54:41, 76.59s/it]

 10%|▉         | 122/1245 [2:34:04<23:49:50, 76.39s/it]

 10%|▉         | 123/1245 [2:35:20<23:46:13, 76.27s/it]

 10%|▉         | 124/1245 [2:36:36<23:44:10, 76.23s/it]

 10%|█         | 125/1245 [2:37:53<23:44:51, 76.33s/it]

 10%|█         | 126/1245 [2:39:10<23:44:34, 76.38s/it]

 10%|█         | 127/1245 [2:40:26<23:43:52, 76.42s/it]

 10%|█         | 128/1245 [2:41:43<23:45:14, 76.56s/it]

 10%|█         | 129/1245 [2:43:00<23:45:54, 76.66s/it]

 10%|█         | 130/1245 [2:44:17<23:44:53, 76.68s/it]

 11%|█         | 131/1245 [2:45:33<23:41:06, 76.54s/it]

 11%|█         | 132/1245 [2:46:49<23:40:15, 76.56s/it]

 11%|█         | 133/1245 [2:48:06<23:36:47, 76.45s/it]

 11%|█         | 134/1245 [2:49:22<23:35:12, 76.43s/it]

 11%|█         | 135/1245 [2:50:39<23:34:54, 76.48s/it]

 11%|█         | 136/1245 [2:51:55<23:30:54, 76.33s/it]

 11%|█         | 137/1245 [2:53:11<23:29:05, 76.31s/it]

 11%|█         | 138/1245 [2:54:27<23:27:03, 76.26s/it]

 11%|█         | 139/1245 [2:55:43<23:25:27, 76.25s/it]

 11%|█         | 140/1245 [2:56:59<23:23:23, 76.20s/it]

 11%|█▏        | 141/1245 [2:58:15<23:21:41, 76.18s/it]

 11%|█▏        | 142/1245 [2:59:32<23:20:40, 76.19s/it]

 11%|█▏        | 143/1245 [3:00:48<23:19:53, 76.22s/it]

 12%|█▏        | 144/1245 [3:02:04<23:18:34, 76.22s/it]

 12%|█▏        | 145/1245 [3:03:21<23:20:34, 76.39s/it]

 12%|█▏        | 146/1245 [3:04:37<23:15:02, 76.16s/it]

 12%|█▏        | 147/1245 [3:05:51<23:05:26, 75.71s/it]

 12%|█▏        | 148/1245 [3:07:06<22:57:36, 75.35s/it]

 12%|█▏        | 149/1245 [3:08:20<22:52:49, 75.15s/it]

 12%|█▏        | 150/1245 [3:09:35<22:48:39, 74.99s/it]

 12%|█▏        | 151/1245 [3:10:49<22:41:49, 74.69s/it]

 12%|█▏        | 152/1245 [3:12:04<22:39:50, 74.65s/it]

 12%|█▏        | 153/1245 [3:13:18<22:37:58, 74.61s/it]

 12%|█▏        | 154/1245 [3:14:33<22:36:48, 74.62s/it]

 12%|█▏        | 155/1245 [3:15:47<22:34:39, 74.57s/it]

 13%|█▎        | 156/1245 [3:17:01<22:31:57, 74.49s/it]

 13%|█▎        | 157/1245 [3:18:17<22:37:53, 74.88s/it]

 13%|█▎        | 158/1245 [3:19:33<22:40:11, 75.08s/it]

 13%|█▎        | 159/1245 [3:20:48<22:41:29, 75.22s/it]

 13%|█▎        | 160/1245 [3:22:03<22:39:21, 75.17s/it]

 13%|█▎        | 161/1245 [3:23:18<22:33:02, 74.89s/it]

 13%|█▎        | 162/1245 [3:24:32<22:28:52, 74.73s/it]

 13%|█▎        | 163/1245 [3:25:46<22:24:49, 74.57s/it]

 13%|█▎        | 164/1245 [3:27:01<22:22:26, 74.51s/it]

 13%|█▎        | 165/1245 [3:28:16<22:24:26, 74.69s/it]

 13%|█▎        | 166/1245 [3:29:31<22:25:49, 74.84s/it]

 13%|█▎        | 167/1245 [3:30:47<22:31:23, 75.22s/it]

 13%|█▎        | 168/1245 [3:32:02<22:29:51, 75.20s/it]

 14%|█▎        | 169/1245 [3:33:17<22:28:13, 75.18s/it]

 14%|█▎        | 170/1245 [3:34:32<22:26:01, 75.13s/it]

 14%|█▎        | 171/1245 [3:35:47<22:24:58, 75.14s/it]

 14%|█▍        | 172/1245 [3:37:03<22:23:25, 75.12s/it]

 14%|█▍        | 173/1245 [3:38:18<22:24:25, 75.25s/it]

 14%|█▍        | 174/1245 [3:39:33<22:23:04, 75.24s/it]

 14%|█▍        | 175/1245 [3:40:49<22:22:36, 75.29s/it]

 14%|█▍        | 176/1245 [3:42:04<22:19:48, 75.20s/it]

 14%|█▍        | 177/1245 [3:43:19<22:18:32, 75.20s/it]

 14%|█▍        | 178/1245 [3:44:34<22:18:33, 75.27s/it]

 14%|█▍        | 179/1245 [3:45:51<22:23:40, 75.63s/it]

 14%|█▍        | 180/1245 [3:47:07<22:27:10, 75.90s/it]

 15%|█▍        | 181/1245 [3:48:22<22:20:28, 75.59s/it]

 15%|█▍        | 182/1245 [3:49:37<22:14:30, 75.32s/it]

 15%|█▍        | 183/1245 [3:50:53<22:15:07, 75.43s/it]

 15%|█▍        | 184/1245 [3:52:08<22:13:04, 75.39s/it]

 15%|█▍        | 185/1245 [3:53:23<22:12:01, 75.40s/it]

 15%|█▍        | 186/1245 [3:54:38<22:09:25, 75.32s/it]

 15%|█▌        | 187/1245 [3:55:54<22:09:15, 75.38s/it]

 15%|█▌        | 188/1245 [3:57:09<22:06:22, 75.29s/it]

 15%|█▌        | 189/1245 [3:58:24<22:05:53, 75.34s/it]

 15%|█▌        | 190/1245 [3:59:40<22:04:35, 75.33s/it]

 15%|█▌        | 191/1245 [4:00:55<22:03:29, 75.34s/it]

 15%|█▌        | 192/1245 [4:02:11<22:03:18, 75.40s/it]

 16%|█▌        | 193/1245 [4:03:26<22:02:22, 75.42s/it]

 16%|█▌        | 194/1245 [4:04:42<22:01:21, 75.43s/it]

 16%|█▌        | 195/1245 [4:05:57<21:59:59, 75.43s/it]

 16%|█▌        | 196/1245 [4:07:12<21:58:02, 75.39s/it]

 16%|█▌        | 197/1245 [4:08:27<21:55:02, 75.29s/it]

 16%|█▌        | 198/1245 [4:09:43<21:56:29, 75.44s/it]

 16%|█▌        | 199/1245 [4:10:58<21:53:30, 75.34s/it]

 16%|█▌        | 200/1245 [4:12:14<21:54:45, 75.49s/it]

 16%|█▌        | 201/1245 [4:13:30<21:54:27, 75.54s/it]

 16%|█▌        | 202/1245 [4:14:46<21:57:49, 75.81s/it]

 16%|█▋        | 203/1245 [4:16:02<21:57:08, 75.84s/it]

 16%|█▋        | 204/1245 [4:17:18<21:56:16, 75.87s/it]

 16%|█▋        | 205/1245 [4:18:34<21:55:08, 75.87s/it]

 17%|█▋        | 206/1245 [4:19:49<21:49:51, 75.64s/it]

 17%|█▋        | 207/1245 [4:21:04<21:46:36, 75.53s/it]

 17%|█▋        | 208/1245 [4:22:20<21:44:31, 75.48s/it]

 17%|█▋        | 209/1245 [4:23:35<21:39:56, 75.29s/it]

 17%|█▋        | 210/1245 [4:24:50<21:37:06, 75.19s/it]

 17%|█▋        | 211/1245 [4:26:05<21:38:42, 75.36s/it]

 17%|█▋        | 212/1245 [4:27:21<21:38:27, 75.42s/it]

 17%|█▋        | 213/1245 [4:28:35<21:32:35, 75.15s/it]

 17%|█▋        | 214/1245 [4:29:50<21:29:48, 75.06s/it]

 17%|█▋        | 215/1245 [4:31:05<21:27:47, 75.02s/it]

 17%|█▋        | 216/1245 [4:32:20<21:25:16, 74.94s/it]

 17%|█▋        | 217/1245 [4:33:35<21:23:06, 74.89s/it]

 18%|█▊        | 218/1245 [4:34:49<21:19:56, 74.78s/it]

 18%|█▊        | 219/1245 [4:36:04<21:20:38, 74.89s/it]

 18%|█▊        | 220/1245 [4:37:19<21:19:54, 74.92s/it]

 18%|█▊        | 221/1245 [4:38:34<21:18:23, 74.91s/it]

 18%|█▊        | 222/1245 [4:39:49<21:17:51, 74.95s/it]

 18%|█▊        | 223/1245 [4:41:05<21:21:11, 75.22s/it]

 18%|█▊        | 224/1245 [4:42:20<21:19:53, 75.21s/it]

 18%|█▊        | 225/1245 [4:43:36<21:20:11, 75.31s/it]

In [None]:
tukey_df = pd.DataFrame()
for i in lst:
    j = pd.DataFrame(i[0]._results_table.data[1:])
    j["features"] = np.repeat(i[1], len(j))
    tukey_df = pd.concat([tukey_df, j], axis=0)

    np.repeat(i[1], len(j))

tukey_df.columns = [
    "group1",
    "group2",
    "meandiff",
    "lower",
    "upper",
    "p-adj",
    "reject",
    "features",
]
# drop the other organelle
# make new column with the absolute value of the p-adj
tukey_df["p-adj_abs"] = abs(tukey_df["p-adj"])
# make new column that states if the relationship is positive or negative
tukey_df["pos_neg"] = np.where(tukey_df["p-adj"] > 0, "positive", "negative")
# order the features by p-adj value

In [None]:
tukey_df.head()

In [None]:
# save the dataframe as a parquet file
anova_results_path = pathlib.Path(
    f"./results/{cell_type}_anova_results_all_treatments.parquet"
)
tukey_df.to_parquet(anova_results_path)