In [1]:
import pathlib

import pandas as pd
import statsmodels.stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multitest import multipletests

In [2]:
# Input: feature-selected whole-image endpoint profiles.
# strict=True makes the notebook fail here if the file is missing.
endpoint_path = pathlib.Path(
    "../../data/CP_feature_select/endpoint_whole_image/feature_selected_whole_image.parquet"
).resolve(strict=True)

# Outputs written by the later cells of this notebook.
intensity_feature_path = pathlib.Path(
    "../data/0.ground_truth/annexinv_intensity_features_df.parquet"
).resolve()
tukey_results_path = pathlib.Path(
    "../data/0.ground_truth/tukey_results.parquet"
).resolve()

# Make sure the destination directory of every output file exists.
for output_path in (intensity_feature_path, tukey_results_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# Load the endpoint profiles and preview the first rows.
endpoint_df = pd.read_parquet(endpoint_path)
endpoint_df.head()

Unnamed: 0,Metadata_plate,Metadata_compound,Metadata_dose,Metadata_control,Metadata_Channel,Metadata_FOV,Metadata_FileLocation,Metadata_Time,Metadata_Well,Metadata_Z_slice,...,Texture_DifferenceEntropy_AnnexinV_3_00_256,Texture_DifferenceEntropy_DNA_3_00_256,Texture_DifferenceVariance_AnnexinV_3_03_256,Texture_DifferenceVariance_DNA_3_00_256,Texture_InfoMeas1_AnnexinV_3_00_256,Texture_InfoMeas1_DNA_3_03_256,Texture_InfoMeas2_AnnexinV_3_03_256,Texture_InverseDifferenceMoment_AnnexinV_3_00_256,Texture_SumVariance_DNA_3_03_256,Texture_Variance_AnnexinV_3_02_256
0,1,Staurosporine,1.22,test,,4,,14,C-04,1,...,-1.110862,0.606494,0.884984,-0.791035,-2.241106,-0.819993,0.47363,1.205834,1.411462,-1.206193
1,1,Staurosporine,1.22,test,,2,,14,C-04,1,...,-2.003338,1.068383,1.500758,-1.526473,-1.202551,1.091309,-0.650447,1.964588,1.485453,-2.165498
2,1,Staurosporine,1.22,test,,3,,14,C-04,1,...,-0.601772,-1.718307,0.899249,-1.245227,-1.165456,0.598269,0.267862,0.873253,-2.096121,-0.39619
3,1,Staurosporine,1.22,test,,1,,14,C-04,1,...,-1.984114,0.603107,0.349924,-0.790168,-2.994923,1.594837,0.747283,2.022597,0.75748,-1.438568
4,1,Staurosporine,78.13,test,,4,,14,D-10,1,...,4.430621,0.418727,-0.579661,-0.731447,-2.363835,-7.952185,3.768172,-4.995891,12.379506,2.193259


In [3]:
# Keep only the dose column as metadata; it is the grouping factor for the
# statistics below. Selecting it by exact name (rather than by substring)
# guarantees no other metadata column slips in and gets treated as a feature.
metadata_columns = ["Metadata_dose"]

# Keep every feature whose name contains "Intensity".
# NOTE(review): this filter does not test for "AnnexinV", so intensity
# features from other channels (e.g. DNA) are presumably kept as well —
# confirm that this is intended.
annexinV_columns = [x for x in endpoint_df.columns if "Intensity" in x]
annexinv_df = endpoint_df[metadata_columns + annexinV_columns]

# save the intensity feature df
annexinv_df.to_parquet(intensity_feature_path)

# Preview goes last so the notebook actually renders it.
annexinv_df.head()

Interesting result here — the analysis should be faceted by channel.
I am interested in determining which dose is the most effective.

In [4]:
# For every intensity feature, test whether its values differ between doses:
#   1. omnibus one-way ANOVA (feature ~ dose, dose treated as categorical)
#   2. Tukey HSD post hoc test for every pair of doses
# The Tukey tables of all features are stacked, the Tukey-adjusted p-values
# are additionally Benjamini-Hochberg corrected across all features and
# pairs, and the result is written to `tukey_results_path`.
feature_columns = [
    column for column in annexinv_df.columns if column != "Metadata_dose"
]

list_of_anova_results = []
list_of_tukey_results = []
for column in feature_columns:
    # omnibus ANOVA; kept (instead of being discarded) so it can be inspected
    model = ols(f"{column} ~ C(Metadata_dose)", data=annexinv_df).fit()
    anova_results = anova_lm(model, typ=2).reset_index()
    anova_results["feature"] = column
    list_of_anova_results.append(anova_results)

    # post hoc test
    tukey = pairwise_tukeyhsd(
        endog=annexinv_df[column], groups=annexinv_df["Metadata_dose"], alpha=0.05
    )
    # summary() is the public accessor for the results table;
    # row 0 holds the column names, the remaining rows hold the comparisons.
    # NOTE(review): the values in this table look rounded to 4 decimals (see
    # the saved output), so the BH step below runs on rounded p-values —
    # consider using the unrounded `tukey.pvalues` instead; confirm.
    tukey_table = tukey.summary().data
    tukey_results = pd.DataFrame(data=tukey_table[1:], columns=tukey_table[0])
    tukey_results["feature"] = column
    list_of_tukey_results.append(tukey_results)

# ignore_index gives one unique row label per comparison instead of a
# 0..k index repeated for every feature.
anova_df = pd.concat(list_of_anova_results, ignore_index=True)
df = pd.concat(list_of_tukey_results, ignore_index=True)

# correct for multiple testing (element [1] of the result is the corrected p-values)
df["p-adj_bh"] = multipletests(df["p-adj"], method="fdr_bh")[1]

df.to_parquet(tukey_results_path)
df.head()

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject,feature,p-adj_bh
0,0.0,0.61,-0.5173,0.9764,-1.978,0.9435,False,Intensity_LowerQuartileIntensity_AnnexinV,1.0
1,0.0,1.22,0.0414,1.0,-1.4194,1.5022,False,Intensity_LowerQuartileIntensity_AnnexinV,1.0
2,0.0,2.44,0.3673,0.998,-1.0935,1.828,False,Intensity_LowerQuartileIntensity_AnnexinV,1.0
3,0.0,4.88,-0.1914,1.0,-1.6522,1.2694,False,Intensity_LowerQuartileIntensity_AnnexinV,1.0
4,0.0,9.77,0.2276,1.0,-1.2332,1.6884,False,Intensity_LowerQuartileIntensity_AnnexinV,1.0
