Generate different sciplex datasets for training/testing scFiLM.

2000, 3500, or 5000 highly variable genes (HVGs) kept

all cells vs. balanced vs. subsample

pathway activation, genes, embeddings

COATI + RDKit compound embeddings

In [77]:
import anndata as ad
import scanpy as sc
import numpy as np

In [78]:
# Load the sci-Plex AnnData object; its summary below lists a 'X_uce' entry in obsm.
SCIPLEX_UCE_H5AD = "../../data/sciplex/sciplex3_uce_adata.h5ad"
adata_raw = ad.read_h5ad(SCIPLEX_UCE_H5AD)

In [79]:
# Show the AnnData summary (dimensions plus obs / var / obsm keys) of the freshly loaded object.
adata_raw

AnnData object with n_obs × n_vars = 581766 × 17376
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'n_genes'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'n_cells'
    obsm: 'X_uce'

Create different representations for all cells

In [80]:
# Basic QC, applied in place to adata_raw:
#   1. drop cells with fewer than MIN_GENES_PER_CELL detected genes,
#   2. then drop genes detected in fewer than MIN_CELLS_PER_GENE of the remaining cells.
MIN_GENES_PER_CELL = 100
MIN_CELLS_PER_GENE = 3

sc.pp.filter_cells(adata_raw, min_genes=MIN_GENES_PER_CELL)
sc.pp.filter_genes(adata_raw, min_cells=MIN_CELLS_PER_GENE)

In [81]:
def get_preprocessed_expr(adata_raw, nhvg):
    """Return a dense, log-normalized expression matrix restricted to the top HVGs.

    All processing happens on a copy, so ``adata_raw`` itself is not modified.
    The copy is passed through ``sc.pp.normalize_total`` and ``sc.pp.log1p``,
    genes are flagged with ``sc.pp.highly_variable_genes(n_top_genes=nhvg)``,
    and only the flagged columns are kept.

    Parameters
    ----------
    adata_raw
        AnnData object to preprocess. ``.X`` may be a sparse matrix or a dense
        array. NOTE(review): presumably holds raw counts -- confirm upstream.
    nhvg : int
        Number of highly variable genes requested from scanpy.

    Returns
    -------
    numpy.ndarray
        Dense matrix with one row per cell and one column per selected gene.
        Gene identities are not returned; columns follow the var order of the
        HVG-subset object.
    """
    adata = adata_raw.copy()
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=nhvg)

    hvg_expr = adata[:, adata.var['highly_variable']].X
    # Sparse matrices expose .toarray(); a dense .X does not, and calling it
    # unconditionally would raise AttributeError.
    if hasattr(hvg_expr, 'toarray'):
        return hvg_expr.toarray()
    return np.asarray(hvg_expr)

In [82]:
# Build one dense HVG expression matrix per gene-set size (evaluated in this order).
HVG_SIZES = (2000, 3500, 5000)
X_2000_hvg, X_3500_hvg, X_5000_hvg = (
    get_preprocessed_expr(adata_raw, n_hvg) for n_hvg in HVG_SIZES
)

In [83]:
# Attach each HVG matrix to the main object as a per-cell representation in obsm.
for obsm_key, hvg_matrix in (
    ('X_2000_hvg', X_2000_hvg),
    ('X_3500_hvg', X_3500_hvg),
    ('X_5000_hvg', X_5000_hvg),
):
    adata_raw.obsm[obsm_key] = hvg_matrix

In [84]:
# Re-display the summary to confirm the three HVG matrices are now listed under obsm.
adata_raw

AnnData object with n_obs × n_vars = 581565 × 17376
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'n_genes'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'n_cells'
    obsm: 'X_uce', 'X_2000_hvg', 'X_3500_hvg', 'X_5000_hvg'

Create different SM embeddings based on different models

In [85]:
# Previously preprocessed sci-Plex object; used below as the source of per-compound
# annotations (its 'smiles', 'pubchem_ID' and 'sm_embedding' obs columns).
SCIPLEX_PREPROCESSED_H5AD = "../../data/sciplex/sciplex_preprocessed.h5ad"
adata_old = ad.read_h5ad(SCIPLEX_PREPROCESSED_H5AD)

In [86]:
# Inspect the stored small-molecule embedding column (one entry per cell; the output shows NaN for some rows).
adata_old.obs['sm_embedding']

0         [-3.0742437839508057, 1.615997314453125, -3.41...
1         [-3.0815889835357666, 1.041774034500122, -3.38...
2                                                       NaN
3         [-3.334993839263916, 1.0232042074203491, -3.68...
5         [-3.1765522956848145, 1.1800906658172607, -3.5...
                                ...                        
581772    [-3.210461378097534, 0.06894481182098389, -3.5...
581773    [-3.4492268562316895, 1.4253485202789307, -3.7...
581774    [-3.4714298248291016, 1.2979487180709839, -3.8...
581775    [-3.3392510414123535, 0.9335826635360718, -3.6...
581776    [-3.2849373817443848, 0.9245714545249939, -3.5...
Name: sm_embedding, Length: 571906, dtype: category
Categories (183, object): ['[-2.67497181892395, 0.7255674004554749, -2.85..., '[-2.791207790374756, 0.5547888875007629, -3.0..., '[-2.829848289489746, 1.5994620323181152, -3.0..., '[-2.857618808746338, 1.0246357917785645, -3.0..., ..., '[-3.5625855922698975, 2.1696584224700928, -4...., '

In [87]:
# Product-name -> annotation lookups taken from the previously preprocessed object.
# Rows are visited in order, so when a product name repeats the value from its
# last row is the one that is kept.
old_obs = adata_old.obs
old_product_names = old_obs['product_name']


def _lookup_by_product(column):
    """Map every product name to the value of `column` (last occurrence wins)."""
    return {name: value for name, value in zip(old_product_names, old_obs[column])}


drugname_smiles_map = _lookup_by_product('smiles')
drugname_pubchemid_map = _lookup_by_product('pubchem_ID')
drugname_coati_map = _lookup_by_product('sm_embedding')

RDKIT Model

In [88]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [89]:
# Morgan feature fingerprints (radius 2, feature invariants, 1024 bits) keyed by
# SMILES string; each fingerprint is stored as a float32 vector of 0/1 values.
smiles_rdkit_map = dict()

for drug_name, smiles_str in drugname_smiles_map.items():
    # Missing SMILES arrive as non-string placeholders (e.g. float NaN); there is
    # no structure to featurize, so skip them.
    if not isinstance(smiles_str, str):
        continue
    # Several products can share one SMILES; compute each fingerprint only once.
    if smiles_str in smiles_rdkit_map:
        continue

    mol = Chem.MolFromSmiles(smiles_str)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending compound instead of an opaque error inside the fingerprint call.
        raise ValueError(
            f"RDKit could not parse SMILES {smiles_str!r} for product {drug_name!r}"
        )

    fcfp4_bits = AllChem.GetMorganFingerprintAsBitVect(
        mol, 2, useFeatures=True, nBits=1024
    ).ToBitString()
    smiles_rdkit_map[smiles_str] = np.array(list(fcfp4_bits), dtype=np.float32)



Adding the sm embeddings in final adata

In [90]:
# Summary of the object before the compound annotations are attached.
adata_raw

AnnData object with n_obs × n_vars = 581565 × 17376
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'n_genes'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'n_cells'
    obsm: 'X_uce', 'X_2000_hvg', 'X_3500_hvg', 'X_5000_hvg'

In [91]:
# Keep only cells whose compound also appears in the preprocessed reference object,
# so every remaining product name has an entry in the lookup maps built from it.
known_products = list(adata_old.obs['product_name'].unique())
# Boolean indexing yields a *view*; .copy() materializes it so the obs columns
# added afterwards are written to a real object instead of triggering anndata's
# implicit view-to-copy conversion (and its warning) on first assignment.
adata_raw = adata_raw[adata_raw.obs['product_name'].isin(known_products)].copy()

In [92]:
# Per-cell compound annotations, aligned with the row order of adata_raw.obs.
# Cells labelled "Vehicle" get None in every list; all other cells are looked up
# by product name (the fingerprint is looked up via the product's SMILES).
cell_products = list(adata_raw.obs['product_name'])

smiles = [
    None if name == "Vehicle" else drugname_smiles_map[name]
    for name in cell_products
]
pubchem_id = [
    None if name == "Vehicle" else drugname_pubchemid_map[name]
    for name in cell_products
]
coati_emb = [
    None if name == "Vehicle" else drugname_coati_map[name]
    for name in cell_products
]
morgan_emb = [
    None if name == "Vehicle" else smiles_rdkit_map[drugname_smiles_map[name]]
    for name in cell_products
]

In [93]:
adata_raw.obs['SMILES'] = smiles
adata_raw.obs['pubchem_id'] = pubchem_id
adata_raw.obs['sm_coati_emb'] = coati_emb


def _fingerprint_to_str(fingerprint):
    """Serialize one fingerprint vector to a single-line, untruncated string.

    Plain ``str()`` on a NumPy array longer than the print threshold (1000 by
    default) abbreviates it to ``'[0. 0. 0. ... 0. 0. 0.]'``, which silently
    discards almost every bit of a 1024-bit fingerprint. Raising ``threshold``
    above the array size keeps every element; the large ``max_line_width``
    prevents embedded newlines. ``None`` is rendered as ``'None'``, exactly as
    ``str`` would.
    """
    if fingerprint is None:
        return str(fingerprint)
    arr = np.asarray(fingerprint)
    return np.array2string(
        arr,
        threshold=arr.size + 1,
        max_line_width=np.iinfo(np.int32).max,
    )


# Many cells reference the very same fingerprint array, so format each distinct
# object once. The id() keys stay valid because morgan_emb keeps the arrays alive.
_fingerprint_str_cache = {}
morgan_emb_str = []
for fingerprint in morgan_emb:
    cache_key = id(fingerprint)
    if cache_key not in _fingerprint_str_cache:
        _fingerprint_str_cache[cache_key] = _fingerprint_to_str(fingerprint)
    morgan_emb_str.append(_fingerprint_str_cache[cache_key])

# Stored as strings so the column holds plain text rather than array objects.
adata_raw.obs['sm_morgan_emb'] = morgan_emb_str

  adata_raw.obs['SMILES'] = smiles


In [94]:
# Persist the annotated object (HVG matrices in obsm, compound annotations in obs).
SCIPLEX_FINAL_H5AD = "../../data/sciplex/sciplex_final.h5ad"
adata_raw.write_h5ad(SCIPLEX_FINAL_H5AD)

In [95]:
# Summary of the written object; the four compound columns should now appear under obs.
adata_raw

AnnData object with n_obs × n_vars = 571696 × 17376
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'n_genes', 'SMILES', 'pubchem_id', 'sm_coati_emb', 'sm_morgan_emb'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'n_cells'
    obsm: 'X_uce', 'X_2000_hvg', 'X_3500_hvg', 'X_5000_hvg'

In [99]:
# Spot-check the stored 5000-HVG matrix (NumPy abbreviates the repr of large arrays).
adata_raw.obsm['X_5000_hvg']

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 1.2840154, 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]], dtype=float32)