In [1]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.spatial.distance import cdist
from scipy.stats import energy_distance
from tqdm import tqdm

In [2]:
# Project root directory. The trailing separator is intentional: other cells
# build file paths by plain string concatenation onto ROOT.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook
# only runs unchanged on a machine with this exact directory layout.
ROOT = 'C:\\Users\\curea\\Documents\\bioFM for drug discovery\\dege-fm\\'

# Read the preprocessed AnnData file that the analysis below operates on.
adata = ad.read_h5ad(f"{ROOT}data\\adata_preprocessed.h5ad")


In [8]:
def _dense_float_matrix(x):
    """Return ``x`` as a dense float64 NumPy array."""
    if hasattr(x, "toarray"):
        # scipy.sparse matrices expose ``toarray``; they cannot be passed to
        # ``cdist`` directly.
        x = x.toarray()
    return np.asarray(x, dtype=float)


def _mean_pairwise_distance(a, b, chunk_size=1024):
    """Mean Euclidean distance over every (row of ``a``, row of ``b``) pair."""
    total = 0.0
    # Walk through ``a`` in slabs so the distance matrix held in memory is at
    # most ``chunk_size x len(b)`` instead of ``len(a) x len(b)``.
    for start in range(0, a.shape[0], chunk_size):
        total += cdist(a[start:start + chunk_size], b).sum()
    return total / (a.shape[0] * b.shape[0])


def calculate_e_distance(adata, save_file=None, control_name="Vehicle"):
    """Energy distance between each treated group and its matching control.

    Cells are grouped by every (``product_name``, ``cell_type``, ``dose``)
    combination found in ``adata.obs``. Each non-empty treated group is compared
    with the control cells (``product_name == control_name``) of the same cell
    type using the multivariate energy distance

        sqrt(2 * E|X - Y| - E|X - X'| - E|Y - Y'|)

    where ``|.|`` is the Euclidean norm between rows of ``.X`` and the
    expectations are plain means over all row pairs (self-pairs included).
    For data with a single feature this is the same quantity that
    ``scipy.stats.energy_distance`` returns; that SciPy function is restricted
    to 1-D samples, which is why it is not called on the 2-D ``.X`` here.

    Parameters
    ----------
    adata
        AnnData-like object. Must provide ``obs`` with the columns
        ``product_name``, ``cell_type`` and ``dose``, boolean-mask indexing
        (``adata[mask]``) and a dense or scipy-sparse ``.X``.
    save_file : str or path-like, optional
        If given, the result table is also written there as CSV (no index).
    control_name : str, default "Vehicle"
        Value of ``product_name`` that marks control cells.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty treated group with the columns ``compound``,
        ``dose``, ``cell_type``, ``e_dist`` and ``sample_size`` (number of
        treated cells in the group).

    Raises
    ------
    RuntimeError
        If a treated group exists for a cell type that has no control cells.

    Notes
    -----
    The control matrix of every cell type and each treated group are converted
    to dense float64 arrays, so memory use grows with
    ``n_cells x n_features`` of those subsets.
    """
    obs = adata.obs
    compound_col = obs['product_name']
    is_control = compound_col == control_name

    # Loop-invariant masks: build them once instead of once per combination.
    cell_type_masks = [(ct, obs['cell_type'] == ct) for ct in obs['cell_type'].unique()]
    dose_masks = [(dose, obs['dose'] == dose) for dose in obs['dose'].unique()]

    # Per cell type: dense control matrix plus its within-group mean distance,
    # which is the same for every compound/dose compared against it.
    controls = {}
    for cell_type, ct_mask in cell_type_masks:
        control_mask = ct_mask & is_control
        if not control_mask.any():
            continue
        control_x = _dense_float_matrix(adata[control_mask].X)
        controls[cell_type] = (control_x, _mean_pairwise_distance(control_x, control_x))

    records = []
    n_empty = 0

    for compound in tqdm(list(compound_col.unique())):
        if compound == control_name:
            continue
        compound_mask = compound_col == compound

        for cell_type, ct_mask in cell_type_masks:
            compound_in_cell_type = compound_mask & ct_mask
            if not compound_in_cell_type.any():
                # No cells at all for this pair, so every dose is empty too.
                n_empty += len(dose_masks)
                continue

            for dose, dose_mask in dose_masks:
                subset_mask = compound_in_cell_type & dose_mask
                if not subset_mask.any():
                    n_empty += 1
                    continue

                if cell_type not in controls:
                    raise RuntimeError(
                        f"No {control_name!r} control cells found for cell type {cell_type!r}"
                    )
                control_x, control_within = controls[cell_type]
                treated_x = _dense_float_matrix(adata[subset_mask].X)

                cross = _mean_pairwise_distance(treated_x, control_x)
                treated_within = _mean_pairwise_distance(treated_x, treated_x)
                squared = 2.0 * cross - treated_within - control_within
                # Rounding can push a theoretically non-negative value slightly
                # below zero; clamp before taking the square root.
                e_dist = float(np.sqrt(max(squared, 0.0)))

                records.append({
                    "compound": compound,
                    "dose": dose,
                    "cell_type": cell_type,
                    "e_dist": e_dist,
                    'sample_size': treated_x.shape[0],
                })

    if n_empty:
        # One summary line instead of one message per empty combination, so
        # the progress bar output stays readable.
        print(f"Skipped {n_empty} empty compound/cell type/dose combinations")

    # Explicit columns keep the schema stable even when no group was found.
    results = pd.DataFrame(
        records, columns=["compound", "dose", "cell_type", "e_dist", "sample_size"]
    )
    if save_file is not None:
        results.to_csv(save_file, index=False)
    return results

In [None]:
# Run the energy-distance computation on the loaded data and also write the
# resulting table to a CSV file under ROOT.
res = calculate_e_distance(adata, save_file=ROOT + "results\\test.csv")

  0%|          | 0/186 [00:00<?, ?it/s]