In [1]:
! pip install mlflow -q

[0m

# Import required libraries, data and results

In [1]:
import numpy as np
import pandas as pd
import pickle
import anndata
import xgboost
from pathlib import Path

base_dataset_path = "/data/perturbation_bining_colon_epothelial_take_7/base_dataset.h5ad"
control_data_path = "/data/perturbation_bining_colon_epothelial_take_7/perturbation_experiment_JUND_level_1.0.embeddings.npz"
knockout_data_path = "/data/perturbation_bining_colon_epothelial_take_7/perturbation_experiment_JUND_level_0.embeddings.npz"

# load the datasets
if not "adata" in locals():
    adata = anndata.read_h5ad(base_dataset_path)
    control_embedding = np.load(control_data_path)["emb"]
    ko_embedding = np.load(knockout_data_path)["emb"]

In [2]:
print(f"The reaults from the scGPT capsule: embeddings CELL x Vector {ko_embedding.shape} matrix")
ko_embedding

The reaults from the scGPT capsule: embeddings CELL x Vector (97788, 512) matrix


array([[ 0.40102956,  1.1505572 , -0.846087  , ..., -0.22407077,
        -0.617475  ,  0.15734558],
       [ 0.41551918,  1.0006112 , -0.5593975 , ..., -0.22076431,
        -0.6000564 ,  0.09469572],
       [ 0.40629452,  0.94172704, -0.50550914, ..., -0.25236434,
        -0.48028022, -0.013081  ],
       ...,
       [ 1.0662296 ,  1.412075  ,  0.11929194, ..., -0.5681815 ,
        -0.5062771 , -0.19895682],
       [ 0.62741625,  1.1626298 , -0.16092153, ..., -0.6266747 ,
         0.01310916, -0.31598997],
       [ 0.9090164 ,  1.4771746 , -0.1576058 , ..., -0.70694625,
        -0.3748046 , -0.49906144]], dtype=float32)

In [3]:
print(f"Single-cell transcriptomics data shape: cells: {adata.shape[0]:,}, genes: {adata.shape[1]:,}")
print(f"control ('WT')  embedding shape:        cells: {control_embedding.shape[0]:,} features: {control_embedding.shape[1]}")
print(f"KnockOut ('KO') embedding shape:        cells: {ko_embedding.shape[0]:,} features: {ko_embedding.shape[1]}")

Single-cell transcriptomics data shape: cells: 97,788, genes: 27,289
control ('WT')  embedding shape:        cells: 97,788 features: 512
KnockOut ('KO') embedding shape:        cells: 97,788 features: 512


In [4]:
# explore the datasets
print("The main metdata of the cell transcriptomics dataset")
adata.obs[["cell_type", "development_stage", "tissue", "disease"]].sample(frac=1)

The main metdata of the cell transcriptomics dataset


Unnamed: 0_level_0,cell_type,development_stage,tissue,disease
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N175041_N1-TAGAGCTCAGGAATGC,goblet cell,seventh decade human stage,left colon,Crohn disease
N178961_L-AGGTTGTCACAGTGAG,enterocyte,seventh decade human stage,lamina propria of mucosa of colon,Crohn disease
H197396_N2-GTACTCCCACGGTAAG,goblet cell,eighth decade human stage,colon,normal
N128624_E-GAAGGGTGTATCGCTA,enterocyte,fifth decade human stage,colonic epithelium,Crohn disease
H197396_N3-GATCGATGTTGCGTTA,enterocyte,eighth decade human stage,colon,normal
...,...,...,...,...
N15_Epi_B-TAGGGACTGAGGTG,enterocyte,human adult stage,right colon,normal
N130084_E-TCATTTGTCAACCCGG,enterocyte,fifth decade human stage,colonic epithelium,Crohn disease
I114902_N-CACACCTCAAGTTGTC,goblet cell,fourth decade human stage,colon,Crohn disease
H197396_N3-AGGGATGCAACAACCT,stem cell,eighth decade human stage,colon,normal


# Train a model to seperate the Crohn's and Normal cells

In [5]:
# train a classifier on the control_embedding_data
trained_model_path = "xgboost_model.pkl"

if Path(trained_model_path).exists():
    # load the trained model
    with open(trained_model_path, "rb") as f:
        model = pickle.load(f)

else:
    # convert the disease data to numerical:
    disease_categories = pd.Categorical(adata.obs["disease"]).codes
    model = xgboost.XGBClassifier() # initialize the model
    model.fit(control_embedding, disease_categories) # train the model
    
    # save the model
    with open(trained_model_path, "wb") as f:
        pickle.dump(model, f)

model

# Use the model and the results from scGPT to estimate the effect of KO on the Crohn's state

In [6]:
# get the cells' metadata
result = adata.obs[["tissue","cell_type", "disease"]].copy()
result.sample(10)

Unnamed: 0_level_0,tissue,cell_type,disease
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
H197396_N1-CGTCACTGTTAAGAAC,colon,enterocyte,normal
N15_Epi_A-GTAACGTGGTATGC,right colon,stem cell,normal
N10_Epi_B-AGGCAACTATGTGC,right colon,epithelial cell,normal
H197396_N4-CAGAGAGGTGGGTATG,colon,enterocyte,normal
N114902_N-CAAGATCGTTCGTGAT,colon,goblet cell,Crohn disease
N13_Epi_A-CGTGTAGACGCCTT,right colon,goblet cell,normal
I175041_N2-CACACTCCAGCTGCTG,sigmoid colon,enterocyte,Crohn disease
N51_Epi_B-TCGGGACAGAGCTTCT,caecum,goblet cell,normal
I175041_N1-CTAACTTGTATCTGCA,sigmoid colon,enterocyte,Crohn disease
N18_Epi_A-ATTGCTTGATCAGC,transverse colon,enterocyte,normal


In [7]:
# get the probability of chron's disease for each cell, based on the model:
result["control_prediction"] = model.predict_proba(control_embedding)[:, 1]

# visualize:
(
    result.sample(10, random_state=23)
    .style.format("{:.3f}", subset=["control_prediction"])
    .background_gradient(subset=["control_prediction"], cmap="coolwarm")
)


Unnamed: 0_level_0,tissue,cell_type,disease,control_prediction
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
I175041_N1-ACCAGTACACTAGTAC,sigmoid colon,enterocyte,Crohn disease,0.046
H197396_N1-TTGGCAAAGATATGGT,colon,enterocyte,normal,0.66
N20_Epi_B-AGATTCCTCCTTTA,colon,enterocyte,normal,0.919
N128400_L-ACTACGATCGTTCTAT,lamina propria of mucosa of colon,enterocyte,Crohn disease,0.122
N178961_L-CTAACTTCACCCTAGG,lamina propria of mucosa of colon,enterocyte,Crohn disease,0.041
H180844_N2-TGACAACTCACCCTCA,colon,epithelial cell,normal,0.022
N128624_E-TATTCCACAAGTAGTA,colonic epithelium,enterocyte,Crohn disease,0.121
N130084_E-GGAGGTACAGGTTCCG,colonic epithelium,enterocyte,Crohn disease,0.288
N51_Epi_A-CGGTTAAGTTCATGGT,caecum,enterocyte,normal,0.095
N175041_N2-CCTAGCTGTACCGGCT,left colon,enterocyte,Crohn disease,0.24


In [8]:
result["ko_prediction"] = model.predict_proba(ko_embedding)[:, 1]

# visualize:
(
    result.sample(10, random_state=23)
    .style.format("{:.3f}", subset=["control_prediction",'ko_prediction'])
    .background_gradient(subset=["control_prediction",'ko_prediction'], cmap="coolwarm")
)


Unnamed: 0_level_0,tissue,cell_type,disease,control_prediction,ko_prediction
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
I175041_N1-ACCAGTACACTAGTAC,sigmoid colon,enterocyte,Crohn disease,0.046,0.099
H197396_N1-TTGGCAAAGATATGGT,colon,enterocyte,normal,0.66,0.899
N20_Epi_B-AGATTCCTCCTTTA,colon,enterocyte,normal,0.919,0.949
N128400_L-ACTACGATCGTTCTAT,lamina propria of mucosa of colon,enterocyte,Crohn disease,0.122,0.478
N178961_L-CTAACTTCACCCTAGG,lamina propria of mucosa of colon,enterocyte,Crohn disease,0.041,0.025
H180844_N2-TGACAACTCACCCTCA,colon,epithelial cell,normal,0.022,0.003
N128624_E-TATTCCACAAGTAGTA,colonic epithelium,enterocyte,Crohn disease,0.121,0.018
N130084_E-GGAGGTACAGGTTCCG,colonic epithelium,enterocyte,Crohn disease,0.288,0.171
N51_Epi_A-CGGTTAAGTTCATGGT,caecum,enterocyte,normal,0.095,0.167
N175041_N2-CCTAGCTGTACCGGCT,left colon,enterocyte,Crohn disease,0.24,0.908


## calculate the 'delta'

In [9]:
result["delta"] = result["control_prediction"] - result["ko_prediction"]

(
    result.sample(10, random_state=23)
    .style.format("{:.3f}", subset=["control_prediction", "ko_prediction", "delta"])
    .background_gradient(
        subset=["control_prediction", "ko_prediction", "delta"], cmap="coolwarm"
    )
)


Unnamed: 0_level_0,tissue,cell_type,disease,control_prediction,ko_prediction,delta
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
I175041_N1-ACCAGTACACTAGTAC,sigmoid colon,enterocyte,Crohn disease,0.046,0.099,-0.053
H197396_N1-TTGGCAAAGATATGGT,colon,enterocyte,normal,0.66,0.899,-0.239
N20_Epi_B-AGATTCCTCCTTTA,colon,enterocyte,normal,0.919,0.949,-0.03
N128400_L-ACTACGATCGTTCTAT,lamina propria of mucosa of colon,enterocyte,Crohn disease,0.122,0.478,-0.356
N178961_L-CTAACTTCACCCTAGG,lamina propria of mucosa of colon,enterocyte,Crohn disease,0.041,0.025,0.016
H180844_N2-TGACAACTCACCCTCA,colon,epithelial cell,normal,0.022,0.003,0.02
N128624_E-TATTCCACAAGTAGTA,colonic epithelium,enterocyte,Crohn disease,0.121,0.018,0.104
N130084_E-GGAGGTACAGGTTCCG,colonic epithelium,enterocyte,Crohn disease,0.288,0.171,0.118
N51_Epi_A-CGGTTAAGTTCATGGT,caecum,enterocyte,normal,0.095,0.167,-0.073
N175041_N2-CCTAGCTGTACCGGCT,left colon,enterocyte,Crohn disease,0.24,0.908,-0.667


In [57]:
# visualize the cells with JUND expression
# find the cells with JUND expression:
genes = adata.var.reset_index()[["feature_name"]].copy()
jund_index = genes[genes["feature_name"] == "JUND"].index[0]
jund_expression = adata.X[:, jund_index].toarray().flatten()
# get the index of the cells with JUND expression:
jund_cells = jund_expression > 0
jund_cells_index = np.where(jund_cells)[0]


44056

In [None]:
# cross validation of a model, xgboost on the embeddings, we wantto predict the disease:
# cross validation:
from sklearn.model_selection import cross_val_score

x = control_embedding[colon_index]
y = adata.obs["disease"].values[colon_index]
y = pd.Categorical(y).codes
model = xgboost.XGBClassifier()
scores = cross_val_score(model, x, y, cv=5, scoring="accuracy", n_jobs=5, verbose=1)

print(f"Accuracy: {scores.mean()}")

In [None]:
# cross validation score of xgboost classifier trained to classify 'disease' state using the control embedding

from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

x = control_embedding[colon_index]
y = adata.obs["disease"].iloc[colon_index]

# y should be a categorical codes:
y = pd.Categorical(y).codes

scores = cross_val_score(XGBClassifier(), x, y, cv=5, scoring="accuracy")