## ChemBERTa tests

In the following I will test the application of embeddings generated by ChemBERTa to the FCR method.

In [4]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
import scanpy as sc
from tqdm import tqdm

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
MODEL_ID = "DeepChem/ChemBERTa-100M-MLM"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Use case: want to continue pretraining the model:
#model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-100M-MLM").to(device).eval()

# Use case: get embeddings or representations of molecules
model = AutoModel.from_pretrained(MODEL_ID).to(device).eval()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-100M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
drug_metadata_path = "/cluster/work/bewi/members/rquiles/experiments/datasets/drug_metadata.parquet"
df = pd.read_parquet(drug_metadata_path)

drug_names = list(np.unique(df["drug"]))
df

Unnamed: 0,drug,targets,moa-broad,moa-fine,human-approved,clinical-trials,gpt-notes-approval,canonical_smiles,pubchem_cid
0,Talc,,unclear,unclear,yes,yes,Talc used in pharma and cosmetics; safety unde...,[OH-].[OH-].[O-][Si]12O[Si]3(O[Si](O1)(O[Si](O...,165411828.0
1,Bortezomib,PSMB5,inhibitor/antagonist,Proteasome inhibitor,yes,yes,Approved for multiple myeloma and mantle cell ...,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,387447.0
2,Ixazomib,PSMB5,inhibitor/antagonist,Proteasome inhibitor,yes,yes,Approved for multiple myeloma treatment.,B(C(CC(C)C)NC(=O)CNC(=O)C1=C(C=CC(=C1)Cl)Cl)(O)O,25183872.0
3,Ixazomib citrate,"PSMB1, PSMB2, PSMB5",inhibitor/antagonist,Proteasome inhibitor,yes,yes,Approved for multiple myeloma treatment as par...,B1(OC(=O)C(O1)(CC(=O)O)CC(=O)O)C(CC(C)C)NC(=O)...,56844015.0
4,Lactate (calcium),,unclear,unclear,yes,yes,"Used in medical settings, but not specifically...",C.CC(C(=O)[O-])O.[Ca+2],168311648.0
...,...,...,...,...,...,...,...,...,...
375,Quinidine (15% dihydroquinidine),KCNH2,inhibitor/antagonist,unclear,yes,yes,Approved for arrhythmias as part of quinine al...,COC1=CC2=C(C=CN=C2C=C1)[C@@H]([C@H]3C[C@@H]4CC...,441074.0
376,Canagliflozin (hemihydrate),SLC5A2,inhibitor/antagonist,Glucose transporter inhibitor,yes,yes,Approved for type 2 diabetes.,CC1=C(C=C(C=C1)[C@H]2[C@@H]([C@H]([C@@H]([C@H]...,24997615.0
377,Osimertinib (mesylate),EGFR,inhibitor/antagonist,EGFR/ERBB inhibitor,yes,yes,Approved for non-small cell lung cancer treatm...,CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=...,78357807.0
378,γ-Oryzanol,,inhibitor/antagonist,DNA methyltransferase inhibitor,no,yes,Used in supplements; limited human data.,C[C@H](CCC=C(C)C)[C@H]1CC[C@@]2([C@@]1(CC[C@]3...,5282164.0


In [3]:
# Add new drug to the df
new_row = pd.DataFrame({
    "drug": ["DMSO_TF"],
    "targets": None,
    "moa-broad": None,
    "moa-fine": None,
    "human-approved": None,
    "clinical-trials": None,
    "gpt-notes-approval": None,
    "canonical_smiles": ["CS(=O)C"],
    "pubchem_cid": None
})

df = pd.concat([df, new_row], ignore_index=True)

# Save the dataframe back with the added molecule
df.to_parquet(drug_metadata_path, index=False)

In [23]:
# TOKENIZE THE INPUT SMILES CODES
smiles = df["canonical_smiles"].astype(str).tolist()

batch = tokenizer(
    smiles,
    padding=True,       # pad all sequences to same length
    truncation=True,    # cut off if too long
    max_length=256,     # max number of tokens
    return_tensors="pt" # return PyTorch tensors
).to(device)

In [36]:
# GET MOLECULE EMBEDDINGS
with torch.no_grad():
    outputs = model(**batch)
    hidden_states = outputs.last_hidden_state

attn = batch["attention_mask"].unsqueeze(-1).float()
emb = (hidden_states * attn).sum(1) / attn.sum(1)

## Test for a real dataset

In [5]:
dataset_path = "/cluster/work/bewi/members/rquiles/experiments/datasets/3_cells_2_drugs.h5ad"
dataset = sc.read_h5ad(dataset_path)

In [7]:
pert_names = np.array(dataset.obs["Agg_Treatment"].values)
pert_smiles = np.array([df[df["drug"] == name]["canonical_smiles"].astype(str).iloc[0] for name in pert_names]).tolist()

In [8]:
for name in pert_names:
    if df[df["drug"] == name]["canonical_smiles"].astype(str).empty:
        print(name)
        break

In [9]:
# PIPELINE

emb_list = []   # store batch embeddings here
batch_size = 1024 # tune this depending on GPU memory

# Iterate through pert_smiles in batches
for i in tqdm(range(0, len(pert_smiles), batch_size)):
    batch_smiles = pert_smiles[i:i + batch_size]

    # Tokenize batch
    batch = tokenizer(
        batch_smiles,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    ).to(device)

    # Forward pass
    with torch.no_grad():
        outputs = model(**batch)
        hidden_states = outputs.last_hidden_state

        # Mask-aware mean pooling
        attn = batch["attention_mask"].unsqueeze(-1).float()
        emb = (hidden_states * attn).sum(1) / attn.sum(1)

    # Move embeddings to CPU and store
    emb_list.append(emb.cpu())

# Concatenate all batches into one big tensor
emb = torch.cat(emb_list, dim=0)

print("Final embedding shape:", emb.shape)


  0%|          | 0/43 [00:00<?, ?it/s]


NameError: name 'tokenizer' is not defined