In [1]:
import anndata
import umap
import xgboost
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [2]:
genes = ["CTNNB1", "JUND", "CD320", "CTNNB1", "IFI6"]
levels = ["0", "0.5", "1.0"]
data_dir = "/data/scgpt_perturbation_colon_epithel2/perturbations"
file_pattern = data_dir + "/perturbation_experiment_{gene}_level_{level}"
base_fname = file_pattern.format(**{"gene": "JUND", "level": "1.0"})
adata = anndata.read_h5ad(f"{base_fname}.h5ad")


In [3]:
! ls {data_dir}

perturbation_experiment_CD320_level_0.5.embeddings.npy
perturbation_experiment_CD320_level_0.5.h5ad
perturbation_experiment_CD320_level_0.embeddings.npy
perturbation_experiment_CD320_level_0.h5ad
perturbation_experiment_CD320_level_1.0.embeddings.npy
perturbation_experiment_CD320_level_1.0.h5ad
perturbation_experiment_CD320_level_5.0.embeddings.npy
perturbation_experiment_CD320_level_5.0.h5ad
perturbation_experiment_CTNNB1_level_0.5.embeddings.npy
perturbation_experiment_CTNNB1_level_0.5.h5ad
perturbation_experiment_CTNNB1_level_0.embeddings.npy
perturbation_experiment_CTNNB1_level_0.h5ad
perturbation_experiment_IFI6_level_0.5.embeddings.npy
perturbation_experiment_IFI6_level_0.5.h5ad
perturbation_experiment_IFI6_level_0.embeddings.npy
perturbation_experiment_IFI6_level_0.h5ad
perturbation_experiment_JUND_level_0.5.embeddings.npy
perturbation_experiment_JUND_level_0.5.h5ad
perturbation_experiment_JUND_level_0.embeddings.npy
perturbation_experiment_JUND_level_0.h5ad
perturbation_experim

In [4]:
base = adata.obs[
    [
        "donor_id",
        "age group",
        "cell_type",
        "assay",
        "disease",
        "organism",
        "sex",
        "tissue",
        "development_stage",
    ]
].copy()

base.sample(5).T

cell_id,N175041_N1-GCGCGATAGTTGCAGG,N128400_E-TCCCAGTCAGGTTACT,N51_Epi_B-GACAGAGGTACGACCC,N110204_L-GGTTGTACAGCTGTAT,N1108147_L-AATTTCCAGCACCTGC
donor_id,175041,128400,N51,110204,1108147
age group,60-64,40-44,,25-29,70-74
cell_type,enterocyte,enterocyte,stem cell,enterocyte,enterocyte
assay,10x 3' v2,10x 3' v3,10x 3' v2,10x 3' v3,10x 3' v3
disease,Crohn disease,Crohn disease,normal,Crohn disease,Crohn disease
organism,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens
sex,male,female,male,male,male
tissue,left colon,colonic epithelium,caecum,lamina propria of mucosa of colon,lamina propria of mucosa of colon
development_stage,seventh decade human stage,fifth decade human stage,human adult stage,third decade human stage,eighth decade human stage


In [5]:
# train umap...
embedding_base = np.load(f"{base_fname}.embeddings.npy")
base = base.copy()
if not "umap_x" in base.columns:
    umap_base = umap.UMAP().fit(embedding_base)
    base["umap_x"], base["umap_y"] = umap_base.transform(embedding_base).T

In [6]:
for gene in tqdm(genes):
    for level in ["0"]:
        if not f"umap_x_{gene}_KO" in base.columns:
            emb_fname = file_pattern.format(**{"gene": gene, "level": level}) + ".embeddings.npy"
            if Path(emb_fname).exists():
                base[[f"umap_x_{gene}_KO", f"umap_y_{gene}_KO"]] = umap_base.transform(np.load(emb_fname))
base.head()

100%|██████████| 5/5 [01:59<00:00, 23.93s/it]


Unnamed: 0_level_0,donor_id,age group,cell_type,assay,disease,organism,sex,tissue,development_stage,umap_x,umap_y,umap_x_CTNNB1_KO,umap_y_CTNNB1_KO,umap_x_JUND_KO,umap_y_JUND_KO,umap_x_CD320_KO,umap_y_CD320_KO,umap_x_IFI6_KO,umap_y_IFI6_KO
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
N105446_L-ATTGTTCCAAACGTGG,105446,25-29,paneth cell,10x 3' v3,Crohn disease,Homo sapiens,male,lamina propria of mucosa of colon,third decade human stage,8.771056,6.354915,8.857376,6.336743,8.851371,6.327375,8.771056,6.354915,8.776049,6.388428
N105446_L-TCGACGGGTGAGACCA,105446,25-29,paneth cell,10x 3' v3,Crohn disease,Homo sapiens,male,lamina propria of mucosa of colon,third decade human stage,8.553398,5.880143,8.541348,5.852661,8.454823,5.896939,8.553398,5.880143,8.543283,5.872672
N105446_L-AGTAACCGTTAAGGGC,105446,25-29,paneth cell,10x 3' v3,Crohn disease,Homo sapiens,male,lamina propria of mucosa of colon,third decade human stage,7.695316,5.954654,7.679074,5.951934,7.699548,5.959259,7.695316,5.954654,7.872936,5.994734
N105446_L-GCAGGCTTCGCTAAAC,105446,25-29,goblet cell,10x 3' v3,Crohn disease,Homo sapiens,male,lamina propria of mucosa of colon,third decade human stage,12.286396,6.362712,12.31464,6.404858,12.300744,6.481446,12.286396,6.362712,12.281522,6.412534
N105446_L-ATCTTCATCTGAGAGG,105446,25-29,goblet cell,10x 3' v3,Crohn disease,Homo sapiens,male,lamina propria of mucosa of colon,third decade human stage,12.271574,6.429347,12.22049,6.525215,12.25596,6.455842,12.271574,6.429347,12.268441,6.439162


In [7]:
def add_distance_using_eval(df, gene):
    expression = f"sqrt((umap_x_{gene}_KO - umap_x) ** 2 + (umap_y_{gene}_KO - umap_y) ** 2)"
    df[f'distance_{gene}_KO'] = df.eval(expression)

for gene in genes:
    add_distance_using_eval(base, gene)



In [8]:
base.to_csv("/scratch/ColonEpithelial_GenePerturbations_umaps_Results_2024-05-01.csv.zip", compression="zip")

In [13]:
! ls -lh /scratch/

total 6.3M
-rw-r--r-- 1 root root 6.3M May  1 12:19 ColonEpithelial_GenePerturbations_umaps_Results_2024-05-01.csv.zip


In [16]:
df = pd.read_csv("/scratch/ColonEpithelial_GenePerturbations_umaps_Results_2024-05-01.csv.zip",low_memory=False)
df.sample(10).T

Unnamed: 0,51644,23333,69234,34686,70524,45823,54716,11201,47737,94487
cell_id,N15_Epi_A-ACCTTTGATTGCTT,H197396_N1-CAGCTAACAGATCCAT,N51_Epi_A-GTAGGCCGTCAAAGAT,I175041_N1-CAAGAAAGTCAGAAGC,N51_Epi_B-TGTTCCGAGCTAGTTC,N10_Epi_B-AGGGCCACGTGTAC,N15_Epi_B-ATAGCCGACCAAGT,N128624_E-TCCACGTTCGAACGGA,N11_Epi_A-TGCACGCTGAGGTG,N130084_E-TGATTCTAGAATCGAT
donor_id,N15,197396,N51,175041,N51,N10,N15,128624,N11,130084
age group,,70-74,,60-64,,,,40-44,,45-49
cell_type,enterocyte,stem cell,enterocyte,enterocyte,goblet cell,epithelial cell,enterocyte,epithelial cell,goblet cell,goblet cell
assay,10x 3' v1,10x 3' v2,10x 3' v2,10x 3' v2,10x 3' v2,10x 3' v1,10x 3' v1,10x 3' v3,10x 3' v1,10x 3' v3
disease,normal,normal,normal,Crohn disease,normal,normal,normal,Crohn disease,normal,Crohn disease
organism,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens
sex,male,male,male,male,male,female,male,female,male,female
tissue,right colon,colon,caecum,sigmoid colon,caecum,right colon,right colon,colonic epithelium,right colon,colonic epithelium
development_stage,human adult stage,eighth decade human stage,human adult stage,seventh decade human stage,human adult stage,human adult stage,human adult stage,fifth decade human stage,human adult stage,fifth decade human stage
