In [30]:
# ! pip install mlflow -q

# Import required libraries, data and results

In [28]:
import numpy as np
import pandas as pd
import pickle
import anndata
import xgboost
from pathlib import Path

# Input files: the base single-cell dataset plus the scGPT embeddings of the
# JUND perturbation experiment (level 1.0 is used as control, level 0 as knockout).
DATA_DIR = "/data/perturbation_bining_colon_epothelial_take_7"
base_dataset_path = f"{DATA_DIR}/base_dataset.h5ad"
control_data_path = f"{DATA_DIR}/perturbation_experiment_JUND_level_1.0.embeddings.npz"
knockout_data_path = f"{DATA_DIR}/perturbation_experiment_JUND_level_0.embeddings.npz"

# load the datasets
# Each object is guarded on its own: re-running the cell skips whatever is already
# in memory, but a kernel that holds `adata` without the embeddings (e.g. after an
# interrupted run) still gets the missing arrays loaded instead of failing later.
if "adata" not in locals():
    adata = anndata.read_h5ad(base_dataset_path)

if "control_embedding" not in locals():
    # the context manager closes the .npz archive once the array has been read
    with np.load(control_data_path) as control_archive:
        control_embedding = control_archive["emb"]

if "ko_embedding" not in locals():
    with np.load(knockout_data_path) as knockout_archive:
        ko_embedding = knockout_archive["emb"]

In [None]:
# Report the shape of the knockout embedding matrix, then display the array
# itself as the cell output.
print(f"The results from the scGPT capsule: embeddings CELL x Vector {ko_embedding.shape} matrix")
ko_embedding

In [42]:
# Summarise the dimensions of the expression matrix and of both embedding
# matrices in a single print call; sep="\n" puts each report on its own line.
print(
    f"Single-cell transcriptomics data shape: cells: {adata.shape[0]:,}, genes: {adata.shape[1]:,}",
    f"control ('WT')  embedding shape:        cells: {control_embedding.shape[0]:,} features: {control_embedding.shape[1]}",
    f"KnockOut ('KO') embedding shape:        cells: {ko_embedding.shape[0]:,} features: {ko_embedding.shape[1]}",
    sep="\n",
)

Single-cell transcriptomics data shape: cells: 97,788, genes: 27,289
control ('WT')  embedding shape:        cells: 97,788 features: 512
KnockOut ('KO') embedding shape:        cells: 97,788 features: 512


In [46]:
# explore the datasets: display the main per-cell annotations in shuffled order.
# random_state pins the shuffle so the preview is identical on every run
# (23 is the seed the later preview cells use).
print("The main metadata of the cell transcriptomics dataset")
adata.obs[["cell_type", "development_stage", "tissue", "disease"]].sample(frac=1, random_state=23)

The main metdata of the cell transcriptomics dataset


Unnamed: 0_level_0,cell_type,development_stage,tissue,disease
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N110204_E-ATATCCTTCGCTCTAC,goblet cell,third decade human stage,colonic epithelium,Crohn disease
N130084_E-CCTCAACTCTTGGTGA,goblet cell,fifth decade human stage,colonic epithelium,Crohn disease
N18_Epi_A-ACCACGCTTGAGCT,enterocyte,human adult stage,transverse colon,normal
N15_Epi_B-CAGAGGGAGTTGTG,enterocyte,human adult stage,right colon,normal
N164969_E-AGCTCAAAGTAACCTC,enterocyte,sixth decade human stage,colonic epithelium,Crohn disease
...,...,...,...,...
N51_Epi_B-TCGGTAAGTCCAGTGC,enterocyte,human adult stage,caecum,normal
N114902_N-GCCTCTAGTGACGGTA,stem cell,fourth decade human stage,colon,Crohn disease
N10_Epi_B-CTCAGGCTATCACG,stem cell,human adult stage,right colon,normal
H180844_N1-GTAGGCCTCTGTACGA,enterocyte,seventh decade human stage,colon,normal


# Train a model to separate the Crohn's and Normal cells

In [None]:
# Obtain the disease classifier for the control embeddings: fit and cache it on
# the first run, reuse the cached file on later runs.
trained_model_path = "xgboost_model.pkl"

if not Path(trained_model_path).exists():
    # No cached model yet. Encode the disease annotation as integer category
    # codes (one code per category, in the order pandas keeps the categories).
    disease_categories = pd.Categorical(adata.obs["disease"]).codes

    # fit an XGBoost classifier with default settings on the control embeddings
    model = xgboost.XGBClassifier()
    model.fit(control_embedding, disease_categories)

    # cache the fitted model so the next run can skip training
    Path(trained_model_path).write_bytes(pickle.dumps(model))

else:
    # Reuse the cached model.
    # NOTE(review): unpickling can execute arbitrary code - only load a model file
    # this notebook wrote itself. The cache is also not tied to the current data;
    # delete the file to force retraining after the inputs change.
    model = pickle.loads(Path(trained_model_path).read_bytes())

model

# Use the model and the results from scGPT to estimate the effect of KO on the Crohn's state

In [None]:
# get the cells' metadata as an independent copy, so that columns added to
# `result` later do not touch adata.obs
result = adata.obs[["tissue", "cell_type", "disease"]].copy()

# fixed seed: the preview is reproducible and shows the same rows as the later
# previews, which also sample 10 rows with random_state=23
result.sample(10, random_state=23)

In [None]:
# get the probability of chron's disease for each cell, based on the model:
result["control_prediction"] = model.predict_proba(control_embedding)[:, 1]

# visualize:
(
    result.sample(10, random_state=23)
    .style.format("{:.3f}", subset=["control_prediction"])
    .background_gradient(subset=["control_prediction"], cmap="coolwarm")
)


In [None]:
result["ko_prediction"] = model.predict_proba(ko_embedding)[:, 1]

# visualize:
(
    result.sample(10, random_state=23)
    .style.format("{:.3f}", subset=["control_prediction",'ko_prediction'])
    .background_gradient(subset=["control_prediction",'ko_prediction'], cmap="coolwarm")
)


In [None]:
# Per-cell change in the model score: control minus knockout, so a positive
# delta means the knockout lowered the predicted score for that cell.
result["delta"] = result["control_prediction"].sub(result["ko_prediction"])

# preview the same 10 seeded rows, with all three score columns formatted to
# three decimals and colour-coded
result.sample(n=10, random_state=23).style.format(
    "{:.3f}", subset=["control_prediction", "ko_prediction", "delta"]
).background_gradient(
    cmap="coolwarm", subset=["control_prediction", "ko_prediction", "delta"]
)
