In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import sys
import pandas as pd
import numpy as np

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import sys
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
import scanpy as sc
from tqdm import tqdm

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys

import torch
# Run on the GPU when PyTorch reports that CUDA is available; otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Location of the drug metadata table on the cluster file system.
base_dir = "/cluster/work/bewi/data/tahoe100/metadata"
drug_metadata_path = os.path.join(base_dir, "drug_metadata.parquet")

# Load the table and collect the sorted, de-duplicated drug names.
drug_df = pd.read_parquet(drug_metadata_path)
unique_drugs = np.unique(drug_df["drug"])
drug_names = list(unique_drugs)

# NumPy abbreviates arrays longer than its print threshold; raise the threshold
# so that array-valued cells are written out in full when exporting to csv.
np.set_printoptions(threshold=10_000)

## Generate drug embeddings and save to file

### ChemBERTa

In [9]:
# Hugging Face checkpoint used to embed the molecules.
MODEL_ID = "DeepChem/ChemBERTa-100M-MLM"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Base model, used here to obtain molecule representations: load it, move it
# to `device`, then switch it to evaluation mode.
model = AutoModel.from_pretrained(MODEL_ID)
model = model.to(device)
model = model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-100M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# --- Tokenize all SMILES strings as one padded batch ------------------------
smiles = drug_df["canonical_smiles"].astype(str).tolist()

tokenizer_kwargs = dict(
    padding=True,         # pad every sequence to the longest one in the batch
    truncation=True,      # cut sequences that exceed max_length
    max_length=256,       # upper bound on tokens per molecule
    return_tensors="pt",  # return PyTorch tensors
)
batch = tokenizer(smiles, **tokenizer_kwargs).to(device)

# --- Forward pass (no gradients needed for feature extraction) --------------
with torch.no_grad():
    outputs = model(**batch)
hidden_states = outputs.last_hidden_state

# --- Mean-pool token states, counting only non-padding positions ------------
# The attention mask gets a trailing axis so it broadcasts over the hidden dim.
attn = batch["attention_mask"].unsqueeze(-1).float()
masked_sum = (hidden_states * attn).sum(dim=1)
token_counts = attn.sum(dim=1)
emb = (masked_sum / token_counts).cpu().numpy()

In [27]:
# Store one embedding vector (one row of `emb`) per drug in the metadata table.
drug_df["chemberta"] = list(emb)

In [30]:
# Persist the updated table. NOTE: this overwrites the parquet file the table was loaded from.
drug_df.to_parquet(drug_metadata_path)

### Morgan Fingerprints

In [18]:
# Reload the saved metadata table from disk before adding the Morgan fingerprints.
drug_df = pd.read_parquet(drug_metadata_path)

In [3]:
# Manually replace the SMILES stored for index label 374.
# Use a single .loc assignment instead of chained indexing (df[col][row] = ...):
# chained assignment may write to a temporary copy and is a no-op under pandas
# Copy-on-Write, and this notebook silences the warning that would reveal it.
drug_df.loc[374, "canonical_smiles"] = "CC1=C(C2=CC3=NC(=CC4=C(C(=C(N4)C=C5[C@@]6([C@@H](C(=CC=C6C(=N5)C=C1N2)C(=O)OC)C(=O)OC)C)C)CCC(=O)OC)C(=C3C)CCC(=O)O)C=C"

In [4]:
# Display the row at position 374 to inspect its stored values.
drug_df.iloc[374]

drug                                                        Verteporfin
targets                                                            YAP1
moa-broad                                          inhibitor/antagonist
moa-fine                                                        unclear
human-approved                                                      yes
clinical-trials                                                     yes
gpt-notes-approval    Used in photodynamic therapy for macular degen...
canonical_smiles      CC1=C(C2=CC3=NC(=CC4=C(C(=C(N4)C=C5[C@@]6([C@@...
pubchem_cid                                                         NaN
chemberta             [-0.32670945, -0.028810669, -0.2650887, -0.721...
morgan_fp             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
morgan_512_fp         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
morgan_1024_fp        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
MACCS_fp              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [5]:
def bitvect_to_array(fp):
    """Convert an RDKit bit vector into a 1-D integer NumPy array.

    Args:
        fp: Fingerprint object exposing ``GetNumBits()`` and accepted by
            ``DataStructs.ConvertToNumpyArray``.

    Returns:
        np.ndarray: Array of length ``fp.GetNumBits()`` with dtype ``int``,
        filled in by ``DataStructs.ConvertToNumpyArray``.
    """
    n_bits = fp.GetNumBits()
    bits = np.zeros(n_bits, dtype=int)
    # RDKit fills the pre-allocated buffer in place.
    DataStructs.ConvertToNumpyArray(fp, bits)
    return bits
    
def generate_morgan_fp(drug_df, size):
    """Add a radius-2 Morgan fingerprint column to ``drug_df`` in place.

    Every entry of ``drug_df["canonical_smiles"]`` is parsed with RDKit and
    fingerprinted with a Morgan generator of ``fpSize=size``. The result is
    stored as one integer array per row in a new column named
    ``morgan_{size}_fp``.

    Args:
        drug_df: DataFrame with a ``canonical_smiles`` column; it is modified
            in place.
        size: Number of bits in each fingerprint (also used in the column name).

    Returns:
        None. The fingerprints are written to ``drug_df``.

    Raises:
        ValueError: If RDKit cannot parse one or more SMILES strings. No column
            is added in that case.
    """
    smiles_values = drug_df["canonical_smiles"].values
    mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_values]

    # MolFromSmiles signals a parse failure by returning None instead of raising;
    # report the offending rows here rather than failing later with an opaque
    # error from the fingerprint generator.
    unparsed = [
        (idx, smi)
        for idx, smi, mol in zip(drug_df.index, smiles_values, mols)
        if mol is None
    ]
    if unparsed:
        raise ValueError(
            f"RDKit could not parse the SMILES of {len(unparsed)} row(s); "
            f"first offenders as (index, SMILES): {unparsed[:5]}"
        )

    fpgen = AllChem.GetMorganGenerator(radius=2, fpSize=size)
    drug_df[f"morgan_{size}_fp"] = [
        bitvect_to_array(fpgen.GetFingerprint(mol)) for mol in mols
    ]

In [6]:
# Add the 512-bit and then the 1024-bit Morgan fingerprint column to drug_df.
for fp_size in (512, 1024):
    generate_morgan_fp(drug_df, fp_size)

In [9]:
# Persist the table with the new fingerprint columns (overwrites the parquet file in place).
drug_df.to_parquet(drug_metadata_path)

### MACCS Fingerprints

In [10]:
# Reload the saved metadata table from disk before adding the MACCS keys.
drug_df = pd.read_parquet(drug_metadata_path)

In [52]:
# NOTE(review): this repeats the `bitvect_to_array` helper already defined in the
# Morgan fingerprint section; whichever definition runs last is the one in effect.
def bitvect_to_array(fp):
    """Convert an RDKit bit vector into a 1-D integer NumPy array.

    Args:
        fp: Fingerprint object exposing ``GetNumBits()`` and accepted by
            ``DataStructs.ConvertToNumpyArray``.

    Returns:
        np.ndarray: Array of length ``fp.GetNumBits()`` with dtype ``int``.
    """
    n_bits = fp.GetNumBits()
    bits = np.zeros(n_bits, dtype=int)
    # RDKit fills the pre-allocated buffer in place.
    DataStructs.ConvertToNumpyArray(fp, bits)
    return bits

# Parse every SMILES string once.
smiles_values = drug_df["canonical_smiles"].values
mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_values]

# MolFromSmiles signals a parse failure by returning None instead of raising;
# stop here and name the offending rows rather than failing inside GenMACCSKeys.
unparsed = [
    (idx, smi)
    for idx, smi, mol in zip(drug_df.index, smiles_values, mols)
    if mol is None
]
if unparsed:
    raise ValueError(
        f"RDKit could not parse the SMILES of {len(unparsed)} row(s); "
        f"first offenders as (index, SMILES): {unparsed[:5]}"
    )

# One MACCS key bit vector per molecule, stored as an integer array per row.
maccs_fps = [MACCSkeys.GenMACCSKeys(x) for x in mols]

drug_df["MACCS_fp"] = [bitvect_to_array(fp) for fp in maccs_fps]

In [53]:
# Persist the table with the MACCS column (overwrites the parquet file in place).
drug_df.to_parquet(drug_metadata_path)

### How to access

In [43]:
# Load the stored table to demonstrate how the embeddings and fingerprints are read back.
drug_df = pd.read_parquet(drug_metadata_path)

In [44]:
# Preview the first row to see which columns the table contains.
drug_df.head(1)

Unnamed: 0,drug,targets,moa-broad,moa-fine,human-approved,clinical-trials,gpt-notes-approval,canonical_smiles,pubchem_cid,chemberta,morgan_fp,maccs_fp,morgan_512_fp,morgan_1024_fp
0,Talc,,unclear,unclear,yes,yes,Talc used in pharma and cosmetics; safety unde...,[OH-].[OH-].[O-][Si]12O[Si]3(O[Si](O1)(O[Si](O...,165411828.0,"[1.2226007, 0.5856873, -0.9940177, -0.05389121...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [49]:
# Read one stored fingerprint back and check its type and length.
# The MACCS keys are written under the column name "MACCS_fp" by the MACCS cell
# of this notebook; the older lowercase "maccs_fp" column is dropped further
# down, so looking it up raises KeyError on a freshly generated table.
fp = drug_df["MACCS_fp"].iloc[1]
print(type(fp), len(fp))

<class 'numpy.ndarray'> 167


In [122]:
# Build one embedding tensor per entry of the "drug" column and stack them.
pert_names = np.array(drug_df["drug"].values)

pert_embeddings = []
for pert in pert_names:
    # Take the ChemBERTa vector of the first row whose drug name matches.
    row_mask = drug_df["drug"] == pert
    first_match = drug_df.loc[row_mask, "chemberta"].values[0]
    pert_embeddings.append(torch.tensor(first_match))

perturbations = torch.stack(pert_embeddings)

In [125]:
# Shape of the stacked tensor: one row per entry of pert_names.
perturbations.shape

torch.Size([380, 768])

In [51]:
# Remove the legacy lowercase "maccs_fp" column if it is still present.
# errors="ignore" keeps this cell re-runnable: without it, drop() raises
# KeyError once the column has already been removed from the table.
drug_df = drug_df.drop(columns=["maccs_fp"], errors="ignore")

## Generate ZeroShot .csv file

In [4]:
# Re-declare the metadata location and reload the table
# (same values as the configuration cell near the top of the notebook).
base_dir = "/cluster/work/bewi/data/tahoe100/metadata"
drug_metadata_path = os.path.join(base_dir, "drug_metadata.parquet")
drug_df = pd.read_parquet(drug_metadata_path)

In [5]:
export_path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data/drug_fingerprints_Mol_selfies.csv"

# Array-valued cells are written to csv through their string form, which NumPy
# abbreviates with "..." once an array is longer than the print threshold.
# Apply the raised threshold locally so the export does not depend on
# np.set_printoptions having been run in another cell; never lower a threshold
# that is already higher.
csv_print_threshold = max(np.get_printoptions()["threshold"], 10_000)
with np.printoptions(threshold=csv_print_threshold):
    # Same table written twice: next to the parquet file and to the ZeroShot data folder.
    drug_df.to_csv(os.path.join(base_dir, "drug_metadata.csv"), index=False)
    drug_df.to_csv(export_path, index=False)

## Diane's metadata

In [57]:
# Load the fingerprint csv stored in the zeroshot_amr tutorial data folder.
# NOTE(review): `df` is reused for several unrelated tables further down this notebook.
path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/tutorial/data/drug_fingerprints_Mol_selfies.csv"
df = pd.read_csv(path)

In [59]:
# Display the loaded table.
df

Unnamed: 0,drug,MACCS_fp,morgan_512_fp,morgan_1024_fp,pubchem_fp,molformer_github,molformer_huggingFace,selfies_label,selfies_flattened_one_hot
0,5-Fluorocytosine,0000000000000000000000000000000000000110001100...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,1101110001100000000110001100100001000000000000...,"-0.0951872841,-0.636924803,-0.331326634,0.5368...","0.5046858787536621,0.4210881292819977,-0.08740...",27 19 30 8 27 17 30 23 27 19 30 2 27 17 30 2...,0000000000000000000000000001000000000000000000...
1,Amikacin,0000000000000000000000000000000000000000000000...,0100000000000100000001000000001001010000000000...,0100000000000100000000000000001000010000000000...,1101110001111100000111101110111100000000000000...,"0.794443607,0.012108786,0.0602180995,0.8221735...","0.19468684494495392,-0.5759977698326111,-0.192...",27 19 30 8 27 27 19 30 18 18 22 30 2 27 27 1...,0000000000000000000000000001000000000000000000...
2,Amoxicillin,0000000010010000000100000000000000001000000000...,0100010000000000001000000000000001001000000000...,0100010000000000001000000000000001001000000000...,1101110001111000000111101100111000000000000100...,"-0.277433664,0.456426084,0.418058097,0.2918549...","0.7805112600326538,0.3971932530403137,0.067437...",27 19 30 27 19 30 8 2 27 27 19 30 18 18 22 3...,0000000000000000000000000001000000000000000000...
3,Amphotericin B,0000000000000000000000000000000000000000000000...,0100000000000000000001000010000001011011100000...,0100000000000000000000000010000001011000100000...,1101110001111100000111111000111110000000000000...,"0.440224081,-0.506301224,0.470279306,0.1629443...","0.9433532953262329,-0.13454332947731018,1.1354...",27 19 30 27 27 19 30 18 22 30 8 7 27 19 30 2...,0000000000000000000000000001000000000000000000...
4,Ampicillin,0000000010010000000100000000000000001000000000...,0100010000000000001000000000000001001000000001...,0100010000000000001000000000000001001000000000...,1101110001111000000111101100111000000000000100...,"-0.651477933,0.708402514,0.419002682,0.8218743...","0.8034102916717529,0.3502717614173889,-0.03101...",27 19 30 27 19 30 8 2 27 27 19 30 18 18 22 3...,0000000000000000000000000001000000000000000000...
...,...,...,...,...,...,...,...,...,...
60,Ticarcillin,0000000010010000000100000000000000001000000000...,0100010000000010001000000000000001001000100000...,0100010000000010001000000000000001001000000000...,1101110001111000000111001100111000000000000110...,"-0.416813314,0.33261463,0.812118948,0.62490588...","0.7444556355476379,0.4137423634529114,0.243047...",27 19 30 27 19 30 8 2 27 27 19 30 18 18 22 3...,0000000000000000000000000001000000000000000000...
61,Tigecycline,0000000000000000000000000010000000000000000000...,0000000100000100000000000000010001001100011001...,0000000000000000000000000000000001001100010000...,1101110001111100000111101110111100000000000000...,"0.265602767,-0.041300226,0.903219342,0.1615294...","0.6586130261421204,0.11598372459411621,0.12656...",27 19 30 27 19 30 2 27 19 30 3 2 27 19 30 ...,0000000000000000000000000001000000000000000000...
62,Tobramycin,0000000000000000000000000000000000000000000000...,0000000000010000000001000000000000010000000000...,0000000000010000000000000000000000010000000000...,1101110001111100000111101110111100000000000000...,"0.935077727,0.235020116,0.335049093,0.92905676...","0.3344675600528717,-0.3647572696208954,0.02367...",27 19 30 8 27 27 19 30 18 18 22 30 2 27 27 1...,0000000000000000000000000001000000000000000000...
63,Vancomycin,0000000000000000000000000000000000000000000000...,0101000000111000000010000000000001001001000010...,0101000000110000000000000000000001001000000010...,1101110001111100000111111111111110000000000000...,"0.655232847,0.684636295,0.521011412,0.81529390...","0.8847649097442627,0.35866332054138184,0.79055...",27 19 30 27 27 19 30 18 22 30 8 27 27 19 30 1...,0000000000000000000000000001000000000000000000...


In [33]:
# Check how a fingerprint read from the csv is represented in Python.
type(df["MACCS_fp"].values[1])

str

### GDSC drug-smiles mapping

Cetuximab must be removed because it has no corresponding SMILES string.

In [2]:
# Load the GDSC drug-to-SMILES mapping.
df_smiles = pd.read_csv("/cluster/work/bewi/members/rquiles/data/gdsc_smiles.csv")

In [4]:
# Rename the columns in place.
# NOTE(review): assumes the file has exactly two columns, drug name then SMILES — confirm.
df_smiles.columns = ["drug", "smiles"]

In [23]:
# Keep every row except the one(s) whose drug name is exactly "Cetuximab".
is_cetuximab = df_smiles["drug"] == "Cetuximab"
df = df_smiles[~is_cetuximab]

In [25]:
# Write the filtered mapping back. NOTE: this overwrites the csv file it was read from.
df.to_csv("/cluster/work/bewi/members/rquiles/data/gdsc_smiles.csv", index=False)

## ZeroShot Drug Fingerprints

In [34]:
# Load the ZeroShot drug fingerprint csv to inspect its contents.
df = pd.read_csv("/cluster/work/bewi/members/rquiles/zeroshot_amr/data/drug_fingerprints_Mol_selfies.csv")

In [35]:
# Display the loaded fingerprint table.
df

Unnamed: 0,drug,morgan_512_fp,morgan_1024_fp,MACCS_fp
0,Fulvestrant,0001000100000001000001000000100001001000000000...,0001000000000001000000000000100001001000000000...,0000000000000000000000000000000000000000001000...
1,Paclitaxel,0100000001010000000000000000000001001000000001...,0100000000010000000000000000000001001000000000...,0000000010010000000000000010000000000000000000...
2,Bortezomib,0100000000000000000000000000010001000000101000...,0100000000000000000000000000000001000000001000...,0000000000000000001000000000000000000000000110...
3,Rucaparib,0000000000001000000000000000000001000100000000...,0000000000000000000000000000000001000000000000...,0000000000000000000100000000000000000000001000...
4,Vismodegib,0000000000000000000000000000000001000000000000...,0000000000000000000000000000000001000000000000...,0000000000000000000000000000000000000000000000...
...,...,...,...,...
60,Docetaxel,0100000000010000100000000000000001001010000001...,0100000000010000100000000000000001001000000000...,0000000010010000000000010010000000000000000000...
61,Venetoclax,0000000100010000000000000000011001001000100001...,0000000100000000000000000000010001001000000001...,0000000000000000000000001000000011000010000000...
62,Bexarotene,0000000000000000010000000000000001001000001000...,0000000000000000010000000000000001001000001000...,0000000000000000000000000000000000100000000000...
63,Trametinib,0000000000000001001000000100000001000000000000...,0000000000000001000000000100000001000000000000...,0000000000000000000000100001000000000110001000...
