In [36]:
import anndata as ad
import pandas as pd
import scanpy as sc
from coati.models.io.coati import load_e3gnn_smiles_clip_e2e
from coati.generative.coati_purifications import embed_smiles
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover
from tqdm import tqdm
import torch
import numpy as np

In [2]:
# Project root used as a plain string prefix for every data path in this notebook.
# NOTE(review): this is an absolute, machine-specific Windows path — adjust it
# before running the notebook anywhere else.
ROOT = 'C:\\Users\\curea\\Documents\\bioFM for drug discovery\\dege-fm\\'

# Read the input AnnData object from the project's data folder.
uce_h5ad_path = ROOT + "data\\zhao_2021_uce_adata.h5ad"
adata = ad.read_h5ad(uce_h5ad_path)

In [7]:
# Distinct perturbation labels present in the dataset; these are the names
# that need a SMILES entry further down.
adata.obs['perturbation'].unique().tolist()

['panobinostat',
 'control',
 'etoposide',
 'Ana-12',
 'RO4929097',
 'Tazemetostat',
 'Ispenisib']

In [33]:
# Manually retrieved SMILES, keyed by perturbation name.
# Later cells index this dict with the values of adata.obs['perturbation'], so
# the keys must keep the dataset's exact spelling. "control" has no compound
# and therefore maps to None.
names_to_smiles = {
    "panobinostat": "Cc1[nH]c2ccccc2c1CCNCc1ccc(/C=C/C(=O)NO)cc1",
    "control": None,
    "etoposide": "COc1cc([C@@H]2c3cc4c(cc3[C@@H](O[C@@H]3O[C@@H]5CO[C@@H](C)O[C@H]5[C@H](O)[C@H]3O)[C@H]3COC(=O)[C@H]23)OCO4)cc(OC)c1O",
    "Ana-12": "C1CCNC(=O)C(C1)NC(=O)C2=CC=CC=C2NC(=O)C3=CC4=CC=CC=C4S3",
    "RO4929097": "CC(C)(C(=O)NCC(F)(F)C(F)(F)F)C(=O)N[C@@H]1C(=O)Nc2ccccc2-c2ccccc21",
    "Tazemetostat": "CCN(c1cc(-c2ccc(CN3CCOCC3)cc2)cc(C(=O)NCc2c(C)cc(C)[nH]c2=O)c1C)C1CCOCC1",
    "Ispenisib": "Cc1ccc(C(=O)N(CCCN)[C@@H](c2nc3cc(Cl)ccc3c(=O)n2Cc2ccccc2)C(C)C)cc1",
}

In [34]:
# Load the pretrained small-molecule encoder and embed every compound listed in
# names_to_smiles. The result, names_to_emb, maps each perturbation name to its
# embedding as a plain Python list, or to None when there is no compound.

# Use the first GPU when one is available; otherwise fall back to the CPU so the
# cell does not fail outright on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
encoder, tokenizer = load_e3gnn_smiles_clip_e2e(
    freeze=True,
    device=device,
    # model parameters to load.
    doc_url="s3://terray-public/models/barlow_closed.pkl",
)

# The salt remover does not depend on the molecule, so build it once.
remover = SaltRemover()

names_to_emb = dict()

for sm_name, sm in names_to_smiles.items():
    print(sm_name)

    # Entries without a structure (the "control" condition) get no embedding.
    if sm is None:
        names_to_emb[sm_name] = None
        continue

    mol = Chem.MolFromSmiles(sm)
    if mol is None:
        # RDKit reports an unparsable SMILES by returning None; stop here with
        # the offending entry instead of failing later inside StripMol.
        raise ValueError(f"Could not parse SMILES for {sm_name!r}: {sm!r}")

    stripped = remover.StripMol(mol)

    # Stereochemistry is cleared on the stripped molecule itself (the call's
    # return value is not used), so the SMILES written below carries no stereo.
    Chem.RemoveStereochemistry(stripped)
    smiles = Chem.MolToSmiles(stripped)
    smiles = Chem.CanonSmiles(smiles)
    vector = embed_smiles(smiles, encoder, tokenizer)
    # Move to host memory and store as nested Python lists.
    names_to_emb[sm_name] = vector.cpu().tolist()

Loading model from s3://terray-public/models/barlow_closed.pkl
Loading tokenizer may_closedparen from s3://terray-public/models/barlow_closed.pkl
number of parameters: 12.64M
number of parameters Total: 2.44M xformer: 17.92M Total: 20.36M 
vocab_name not found in tokenizer_vocabs, trying to load from file
Freezing encoder
20561664 params frozen!
panobinostat
control
etoposide
Ana-12
RO4929097
Tazemetostat
Ispenisib


In [42]:
# Attach each cell's compound SMILES and embedding to the cell metadata.
perturbations = adata.obs['perturbation'].tolist()

# Fail early and name every perturbation that has no SMILES/embedding entry,
# rather than surfacing only the first bad key from inside a comprehension.
unmapped = sorted(
    {str(p) for p in perturbations
     if p not in names_to_smiles or p not in names_to_emb}
)
if unmapped:
    raise KeyError(f"perturbations without a SMILES/embedding entry: {unmapped}")

# 'sm_emb' stores the str() of the embedding list; a missing embedding (None)
# is therefore stored as the string 'None'. Serialise once per compound instead
# of once per cell: the text is identical for all cells sharing a perturbation,
# and this avoids building an intermediate column of Python lists.
emb_text_by_name = {name: str(emb) for name, emb in names_to_emb.items()}

adata.obs['smiles'] = [names_to_smiles[p] for p in perturbations]
adata.obs['sm_emb'] = [emb_text_by_name[p] for p in perturbations]

In [43]:
# Display the per-cell metadata table to check the newly added 'smiles' and
# 'sm_emb' columns.
adata.obs

Unnamed: 0_level_0,sample,GEO,Sample,tissue,age,sex,location,diagnosis,library,dose_value,...,organism,perturbation_type,ncounts,ngenes,percent_mito,percent_ribo,nperts,chembl-ID,smiles,sm_emb
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TACGTCGCGACC-PW032-711,PW032,GSM4483757,PW032-711,glioma surgical biopsy,61,m,left frontal,"Glioblastoma, WHO Grade IV",PW032-711,0.2,...,human,drug,3249.0,1585,1.015697,5.755617,1,CHEMBL483254,Cc1[nH]c2ccccc2c1CCNCc1ccc(/C=C/C(=O)NO)cc1,"[-3.274730682373047, 0.9405636191368103, -3.47..."
GTTTCCTACTTG-PW032-711,PW032,GSM4483757,PW032-711,glioma surgical biopsy,61,m,left frontal,"Glioblastoma, WHO Grade IV",PW032-711,0.2,...,human,drug,3834.0,1929,1.225874,9.441836,1,CHEMBL483254,Cc1[nH]c2ccccc2c1CCNCc1ccc(/C=C/C(=O)NO)cc1,"[-3.274730682373047, 0.9405636191368103, -3.47..."
TTTGATTGGATC-PW032-711,PW032,GSM4483757,PW032-711,glioma surgical biopsy,61,m,left frontal,"Glioblastoma, WHO Grade IV",PW032-711,0.2,...,human,drug,7001.0,3219,1.028425,11.426939,1,CHEMBL483254,Cc1[nH]c2ccccc2c1CCNCc1ccc(/C=C/C(=O)NO)cc1,"[-3.274730682373047, 0.9405636191368103, -3.47..."
GTCATTTTCGGA-PW032-711,PW032,GSM4483757,PW032-711,glioma surgical biopsy,61,m,left frontal,"Glioblastoma, WHO Grade IV",PW032-711,0.2,...,human,drug,5259.0,2300,0.019015,5.400266,1,CHEMBL483254,Cc1[nH]c2ccccc2c1CCNCc1ccc(/C=C/C(=O)NO)cc1,"[-3.274730682373047, 0.9405636191368103, -3.47..."
TGCGAGCGTTGG-PW032-711,PW032,GSM4483757,PW032-711,glioma surgical biopsy,61,m,left frontal,"Glioblastoma, WHO Grade IV",PW032-711,0.2,...,human,drug,6114.0,2786,0.163559,11.939810,1,CHEMBL483254,Cc1[nH]c2ccccc2c1CCNCc1ccc(/C=C/C(=O)NO)cc1,"[-3.274730682373047, 0.9405636191368103, -3.47..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCTCCTCGCACT-PW032-705,PW032,GSM4483755,PW032-705,glioma surgical biopsy,61,m,left frontal,"Glioblastoma, WHO Grade IV",PW032-705,,...,human,drug,140.0,121,1.428571,7.857143,1,,,
TACGGAATGCGT-PW032-705,PW032,GSM4483755,PW032-705,glioma surgical biopsy,61,m,left frontal,"Glioblastoma, WHO Grade IV",PW032-705,,...,human,drug,101.0,92,0.000000,3.960396,1,,,
ACCGCAAGTACC-PW032-705,PW032,GSM4483755,PW032-705,glioma surgical biopsy,61,m,left frontal,"Glioblastoma, WHO Grade IV",PW032-705,,...,human,drug,118.0,114,0.000000,12.711864,1,,,
CGTGTTTTGCGA-PW032-705,PW032,GSM4483755,PW032-705,glioma surgical biopsy,61,m,left frontal,"Glioblastoma, WHO Grade IV",PW032-705,,...,human,drug,103.0,97,0.000000,5.825243,1,,,


In [44]:
# Save the annotated AnnData object next to the input file in the data folder.
preprocessed_h5ad_path = ROOT + "data\\zhao_preprocessed.h5ad"
adata.write_h5ad(preprocessed_h5ad_path)