# Protein Target Embeddings with ProtT5 LLM

#### We utilized the ProtT5 LLM to extract sequence-based features from protein sequences as follows:

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import re
import torch
from transformers import T5Tokenizer, T5EncoderModel
from tqdm import tqdm

class ProteinEmbeddingsExtractor:
    def __init__(self, device=None):
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model_name = "Rostlab/prot_t5_xl_half_uniref50-enc"
        self.tokenizer = T5Tokenizer.from_pretrained(self.model_name, do_lower_case=False, legacy=True)
        self.model = T5EncoderModel.from_pretrained(self.model_name).to(self.device).eval()

    #generates embeddings for protein sequences
    def get_embeddings(self, seq):
        sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", seq)))]
        ids = self.tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")
        input_ids = torch.tensor(ids['input_ids']).to(self.device)
        attention_mask = torch.tensor(ids['attention_mask']).to(self.device)

        with torch.no_grad():
            embedding_repr = self.model(input_ids=input_ids, attention_mask=attention_mask)

        emb_0 = embedding_repr.last_hidden_state[0]
        return emb_0.mean(dim=0).detach().cpu().numpy()

    def process_and_save(self, data, output_dir, dataset_name):
        output_dir = Path(output_dir).resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        #gets unique protein sequences from 'Protein' column of the given dataframe.
        unique_sequences = data['Protein'].unique()

        sequence_embeddings = []
        for seq in tqdm(unique_sequences, desc=f"Processing sequences in {dataset_name}"):
            embedding = self.get_embeddings(seq)
            sequence_embeddings.append(embedding)

        embeddings_array = np.array(sequence_embeddings)
        np.save(output_dir / f"{dataset_name}_target_sequences.npy", unique_sequences)
        np.save(output_dir / f"{dataset_name}_sequence_embeddings.npy", embeddings_array)

In [2]:
# Set the base directory
base_dir = Path('/content/drive/MyDrive/Top_DTI')

In [None]:
data_path = data_path = base_dir  / 'datasets'

task_paths = {
    "biosnap_random": data_path / "biosnap/random",
    "human_random": data_path / "human/random",
    "human_cold": data_path / "human/cold"}


protein_extractor = ProteinEmbeddingsExtractor()
all_datasets = {}

# Process each dataset
for dataset_name, dataset_path in task_paths.items():
    data_dir = Path(dataset_path)


    train_file = data_dir / 'train.csv'
    val_file = data_dir / 'val.csv'
    test_file = data_dir / 'test.csv'

    if train_file.exists() and val_file.exists() and test_file.exists():

        train_data = pd.read_csv(train_file)
        val_data = pd.read_csv(val_file)
        test_data = pd.read_csv(test_file)


        full_data = pd.concat([train_data, val_data, test_data], ignore_index=True)

        all_datasets[dataset_name] = full_data


        #output directory to save embeddings
        out_dir = base_dir  / 'embeddings' /'llm'/ dataset_name / 'target'
        out_dir.mkdir(parents=True, exist_ok=True)

        protein_extractor.process_and_save(full_data, out_dir, dataset_name=dataset_name)

        print(f"{dataset_name} dataset loaded successfully. Total rows: {len(full_data)}")
    else:
        print(f"Skipping {dataset_name}: train.csv, val.csv, or test.csv not found.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.42G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.42G [00:00<?, ?B/s]


Processing sequences in biosnap_random:   0%|          | 0/2181 [00:00<?, ?it/s][A
Processing sequences in biosnap_random:   0%|          | 1/2181 [00:02<1:23:16,  2.29s/it][A
Processing sequences in biosnap_random:   0%|          | 2/2181 [00:02<37:06,  1.02s/it]  [A
Processing sequences in biosnap_random:   0%|          | 3/2181 [00:02<24:54,  1.46it/s][A
Processing sequences in biosnap_random:   0%|          | 5/2181 [00:03<18:22,  1.97it/s][A
Processing sequences in biosnap_random:   0%|          | 7/2181 [00:03<11:18,  3.20it/s][A
Processing sequences in biosnap_random:   0%|          | 9/2181 [00:03<07:41,  4.70it/s][A
Processing sequences in biosnap_random:   1%|          | 11/2181 [00:03<06:14,  5.80it/s][A
Processing sequences in biosnap_random:   1%|          | 13/2181 [00:04<06:13,  5.81it/s][A
Processing sequences in biosnap_random:   1%|          | 14/2181 [00:04<05:51,  6.17it/s][A
Processing sequences in biosnap_random:   1%|          | 15/2181 [00:04<08:29,  

biosnap_random dataset loaded successfully. Total rows: 27464


Processing sequences in human_random: 100%|██████████| 2001/2001 [04:29<00:00,  7.43it/s]


human_random dataset loaded successfully. Total rows: 5997


Processing sequences in human_cold: 100%|██████████| 1503/1503 [03:22<00:00,  7.41it/s]

human_cold dataset loaded successfully. Total rows: 3919





In [3]:
# Load the generated target embeddings for the BioSNAP random dataset
biosnap_llm_embeddings_path = base_dir / f"embeddings/llm/biosnap_random"
sequences_names = np.load(biosnap_llm_embeddings_path / f"target/biosnap_random_target_sequences.npy", allow_pickle=True)
gene_embeddings = np.load(biosnap_llm_embeddings_path / f"target/biosnap_random_sequence_embeddings.npy", allow_pickle=True)
biosnap_protein_llm = pd.DataFrame({'sequences': sequences_names, 'protein_llm_embeddings': gene_embeddings.tolist() })
biosnap_protein_llm.head()

Unnamed: 0,sequences,protein_llm_embeddings
0,MGDHAWSFLKDFLAGGVAAAVSKTAVAPIERVKLLLQVQHASKQIS...,"[0.040794070810079575, 0.1398317515850067, -0...."
1,MVLDLDLFRVDKGGDPALIRETQEKRFKDPGLVDQLVKADSEWRRC...,"[0.07856228947639465, 0.09228259325027466, 0.0..."
2,MGNLKSVAQEPGPPCGLGLGLGLGLCGKQGPATPAPEPSRAPASLL...,"[0.030257243663072586, 0.09058675915002823, 0...."
3,MGNAAAAKKGSEQESVKEFLAKAKEDFLKKWESPAQNTAHLDQFER...,"[0.07570360600948334, 0.11278703063726425, 0.0..."
4,MVNENTRMYIPEENHQGSNYGSPRPAHANMNANAAAGLAPEHIPTP...,"[0.07552585750818253, 0.09334281831979752, 0.0..."


# Drug Embeddings with MoLFormer LLM

### We used the MoLFormer LLM to produce drug representations from chemical SMILES strings.

Clone https://github.com/IBM/molformer and change directory to molformer folder

MolFormer requires the installation of 'apex.' However, we uninstalled 'apex' after creating drug embeddings due to a compatibility issue with PyTorch.

After cloning https://github.com/NVIDIA/apex

pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./

pip install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./




In [5]:
os.chdir(base_dir /'apex')

In [6]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.3.6-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.3.6-cp312-cp312-manylinux_2_28_x86_64.whl (36.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.1/36.1 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.6


In [7]:
!pip install pytorch-fast-transformers

Collecting pytorch-fast-transformers
  Downloading pytorch-fast-transformers-0.4.0.tar.gz (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.6/93.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytorch-fast-transformers
  Building wheel for pytorch-fast-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for pytorch-fast-transformers: filename=pytorch_fast_transformers-0.4.0-cp312-cp312-linux_x86_64.whl size=22110296 sha256=1ca02fb1ff8e7dc454b901f71a3b1bee2d0bec9211022718e79d08faf033227f
  Stored in directory: /root/.cache/pip/wheels/07/2f/9a/938a96d4260ad919e26b8659b20efe18f1d2af4996e9010dbe
Successfully built pytorch-fast-transformers
Installing collected packages: pytorch-fast-transformers
Successfully installed pytorch-fast-transformers-0.4.0


In [8]:
!pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./

Using pip 24.1.2 from /usr/local/lib/python3.12/dist-packages/pip (python 3.12)
[33mDEPRECATION: --build-option and --global-option are deprecated. pip 24.2 will enforce this behaviour change. A possible replacement is to use --config-settings. Discussion can be found at https://github.com/pypa/pip/issues/11859[0m[33m
[0mProcessing /content/drive/MyDrive/Top_DTI/apex
  Running command Preparing metadata (pyproject.toml)
  W0911 15:54:41.397000 6168 torch/utils/cpp_extension.py:118] No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'

   If your intention is to cross-compile, this is not an error.
  By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2) (until CUDA 12.8),
  Volta (compute capability 7.0), Turing (compute capability 7.5),
  and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0, 8.6), and,
  if the CUDA version is >= 12.8, Blackwell (compute capability 10.0, 12.0).
  If you wish to cross-compile for a single specific 

In [9]:

!pip install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./

Using pip 24.1.2 from /usr/local/lib/python3.12/dist-packages/pip (python 3.12)
Processing /content/drive/MyDrive/Top_DTI/apex
  Running command Preparing metadata (pyproject.toml)
  W0911 15:57:08.327000 6805 torch/utils/cpp_extension.py:118] No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'

   If your intention is to cross-compile, this is not an error.
  By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2) (until CUDA 12.8),
  Volta (compute capability 7.0), Turing (compute capability 7.5),
  and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0, 8.6), and,
  if the CUDA version is >= 12.8, Blackwell (compute capability 10.0, 12.0).
  If you wish to cross-compile for a single specific architecture,
  export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.



  torch.__version__  = 2.8.0+cu126


  running dist_info
  creating /tmp/pip-modern-metadata-hjsjanlk/apex.egg-info
  writing /tmp/pip-modern-metadata-hjs

In [5]:
os.chdir(base_dir /'molformer')

In [11]:
!pip install args

Collecting args
  Downloading args-0.1.0.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: args
  Building wheel for args (setup.py) ... [?25l[?25hdone
  Created wheel for args: filename=args-0.1.0-py3-none-any.whl size=3318 sha256=78bc26dd4a5f32331cca3cc260824bef8f9438525183bfa08f2c91f0b86d2ec5
  Stored in directory: /root/.cache/pip/wheels/cd/3d/45/a8945af45d38bcd64779802591930d5f01c73650c98b6e120c
Successfully built args
Installing collected packages: args
Successfully installed args-0.1.0


In [7]:
import builtins
if not hasattr(builtins, "basestring"):
    basestring = str
    builtins.basestring = basestring

In [23]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.5-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading pytorch_lightning-2.5.5-py3-none-any.whl (832 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m832.4/832.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.15.2 pytorch-lightning-2.5.5 torchmetrics-1.8.2


In [8]:
from argparse import Namespace
import yaml
from training.tokenizer.tokenizer import MolTranBertTokenizer
from fast_transformers.masking import LengthMask as LM
from training.train_pubchem_light import LightningModule
from rdkit import Chem
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [9]:
class MoleculeEmbeddingsExtractor:
    def __init__(self, model_path, checkpoint_path):

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Loading MolFormer model from: {model_path}")

        # Load configuration
        with open(Path(model_path) / 'data/Pretrained MoLFormer/hparams.yaml', 'r') as f:
            self.config = Namespace(**yaml.safe_load(f))

        # Load tokenizer and model
        self.tokenizer = MolTranBertTokenizer(Path(model_path) / 'bert_vocab.txt')

        self.model = LightningModule(self.config, self.tokenizer.vocab).load_from_checkpoint(
            Path(checkpoint_path), config=self.config, vocab=self.tokenizer.vocab
        ).to(self.device).eval()

    def batch_split(self, data, batch_size=64):
        for i in range(0, len(data), batch_size):
            yield data[i:i + batch_size]

    def embed(self, smiles, batch_size=64):
        """
        Embed SMILES strings into molecule embeddings.
        """
        self.model.eval()
        embeddings = []
        for batch in self.batch_split(smiles, batch_size=batch_size):
            batch_enc = self.tokenizer.batch_encode_plus(batch, padding='longest', add_special_tokens=True)
            idx = torch.tensor(batch_enc['input_ids']).to(self.device)
            mask = torch.tensor(batch_enc['attention_mask']).to(self.device)
            with torch.no_grad():
                token_embeddings = self.model.blocks(self.model.tok_emb(idx), length_mask=LM(mask.sum(-1)))

            # Average pooling over tokens
            input_mask_expanded = mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, dim=1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            embedding = sum_embeddings / sum_mask
            embeddings.append(embedding.cpu())
        return torch.cat(embeddings)

    def canonicalize(self, s):
        # Ensure s is not None and is a valid SMILES string
        if s is not None and Chem.MolFromSmiles(s):
            return Chem.MolToSmiles(Chem.MolFromSmiles(s), canonical=True, isomericSmiles=False)
        else:
            return None  # Return None if the SMILES string is invalid or None

    def process_and_save(self, data, output_dir, dataset_name):

        if 'SMILES' not in data.columns:
            raise ValueError("Dataset does not contain a 'SMILES' column.")

        output_dir = Path(output_dir).resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        # Get unique SMILES
        unique_smiles = data['SMILES'].unique()

        # Apply the canonicalization function
        canonicalized_smiles = [self.canonicalize(s) for s in unique_smiles]

        print(f"Extracting embeddings for {len(unique_smiles)} unique SMILES in {dataset_name}")

        # Filter out invalid canonical SMILES (None)
        valid_indices = [i for i, s in enumerate(canonicalized_smiles) if s is not None]
        valid_smiles = [unique_smiles[i] for i in valid_indices]
        valid_canonical_smiles = [canonicalized_smiles[i] for i in valid_indices]

        # Extract embeddings
        if len(valid_canonical_smiles) == 0:
            print("No valid canonical SMILES found. Skipping embedding extraction.")
            return

        embeddings = self.embed(valid_canonical_smiles).numpy()

        # Double-check: Filter again if embeddings are not generated
        if len(valid_canonical_smiles) != len(embeddings):
            raise ValueError("Mismatch in valid canonical SMILES and embeddings length.")

        # Save the filtered SMILES, canonical SMILES, and embeddings
        np.save(output_dir / f"{dataset_name}_smiles.npy", valid_smiles)
        np.save(output_dir / f"{dataset_name}_canonical_smiles.npy", valid_canonical_smiles)
        np.save(output_dir / f"{dataset_name}_molecule_embeddings.npy", embeddings)

        print(f"Saved {len(embeddings)} embeddings.")

In [33]:
!pip install pytorch-lightning==1.9.5

Collecting pytorch-lightning==1.9.5
  Downloading pytorch_lightning-1.9.5-py3-none-any.whl.metadata (23 kB)
Downloading pytorch_lightning-1.9.5-py3-none-any.whl (829 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m829.5/829.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-lightning
  Attempting uninstall: pytorch-lightning
    Found existing installation: pytorch-lightning 2.5.5
    Uninstalling pytorch-lightning-2.5.5:
      Successfully uninstalled pytorch-lightning-2.5.5
Successfully installed pytorch-lightning-1.9.5


In [13]:
# Define model, tokenizer, and checkpoint paths
lib_path = Path(base_dir/"molformer")
checkpoint_path = lib_path / "data/Pretrained MoLFormer/checkpoints/N-Step-Checkpoint_3_30000.ckpt"

molecule_extractor = MoleculeEmbeddingsExtractor(
    model_path=lib_path,
    checkpoint_path=checkpoint_path)

for dataset_name, dataset_path in task_paths.items():
    print(f"\nProcessing dataset: {dataset_name}")
    dataset_dir = Path(dataset_path)

    train_file = dataset_dir / 'train.csv'
    val_file = dataset_dir / 'val.csv'
    test_file = dataset_dir / 'test.csv'

    if train_file.exists() and val_file.exists() and test_file.exists():

        train_data = pd.read_csv(train_file)
        val_data = pd.read_csv(val_file)
        test_data = pd.read_csv(test_file)

        full_data = pd.concat([train_data, val_data, test_data], ignore_index=True)

        # Check for 'SMILES' column and process embeddings
        if 'SMILES' in full_data.columns:
            output_dir = Path(base_dir /f"embeddings/tda/{dataset_name}/drug")
            output_dir.mkdir(parents=True, exist_ok=True)
            molecule_extractor.process_and_save(full_data, output_dir, dataset_name=dataset_name)
        else:
            print(f"No 'SMILES' column found in the dataset: {dataset_name}")
    else:
        print(f"Skipping {dataset_name}: Missing one or more of train.csv, val.csv, or test.csv.")

Loading MolFormer model from: /content/drive/MyDrive/Top_DTI/molformer
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding


INFO:lightning_fabric.utilities.seed:Global seed set to 12345


Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL numpy.core.multiarray._reconstruct was not an allowed global by default. Please use `torch.serialization.add_safe_globals([numpy.core.multiarray._reconstruct])` or the `torch.serialization.safe_globals([numpy.core.multiarray._reconstruct])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [None]:
# Load the generated drug embeddings for the BioSNAP random dataset
biosnap_random_drug_embeddings_path = base_dir / f"embeddings/tda/biosnap_random"
smile_names = np.load(biosnap_random_drug_embeddings_path / f"drug/biosnap_random_smiles.npy", allow_pickle=True)
can_smile_names = np.load(biosnap_random_drug_embeddings_path / f"drug/biosnap_random_canonical_smiles.npy", allow_pickle=True)
drug_embeddings = np.load(biosnap_random_drug_embeddings_path / f"drug/biosnap_random_molecule_embeddings.npy", allow_pickle=True)
biosnap_random_drugs_llm = pd.DataFrame({ 'smiles': smile_names, 'can_smiles': can_smile_names, 'drug_llm_embeddings': drug_embeddings.tolist()})
biosnap_random_drugs_llm.head()

Unnamed: 0,smiles,can_smiles,drug_llm_embeddings
0,OP(O)(=O)C(Cl)(Cl)P(O)(O)=O,O=P(O)(O)C(Cl)(Cl)P(=O)(O)O,"[0.7787514925003052, 0.34758803248405457, -0.1..."
1,NC1=NC(=O)N(C=N1)[C@H]1C[C@H](O)[C@@H](CO)O1,Nc1ncn(C2CC(O)C(CO)O2)c(=O)n1,"[0.21483802795410156, 0.17366445064544678, -0...."
2,OCCCCCCCCNCO,OCCCCCCCCNCO,"[0.6010259389877319, -0.36157843470573425, -0...."
3,C[C@H](OP(O)(O)=O)[C@@H](N)C(O)=O,CC(OP(=O)(O)O)C(N)C(=O)O,"[0.4546224772930145, 0.39328116178512573, 0.02..."
4,CCO,CCO,"[0.9750620126724243, -0.1789940893650055, 0.24..."
