# Protein Target Embeddings with ProtT5 LLM

#### We utilized the <span style="color:yellow;">ProtT5 </span> LLM to extract sequence-based features from protein sequences as follows:

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import re
import torch
from transformers import T5Tokenizer, T5EncoderModel
from tqdm import tqdm

class ProteinEmbeddingsExtractor:
    """Compute one fixed-size embedding per protein sequence with the ProtT5 encoder.

    The tokenizer and the encoder are loaded from the Hugging Face checkpoint
    named in ``self.model_name``; the encoder is moved to ``self.device`` and
    put in eval mode.
    """

    def __init__(self, device=None):
        """Load the ProtT5 tokenizer and encoder.

        Args:
            device: Device to run the encoder on. When falsy, CUDA is used if
                it is available and the CPU otherwise.
        """
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model_name = "Rostlab/prot_t5_xl_half_uniref50-enc"
        self.tokenizer = T5Tokenizer.from_pretrained(self.model_name, do_lower_case=False, legacy=True)
        self.model = T5EncoderModel.from_pretrained(self.model_name).to(self.device).eval()

    def get_embeddings(self, seq):
        """Return the mean-pooled encoder output for a single protein sequence.

        Args:
            seq: Amino-acid sequence as a string of one-letter codes.

        Returns:
            1-D ``numpy.ndarray`` obtained by averaging ``last_hidden_state``
            over every token position of the sequence. Because the tokenizer
            is called with ``add_special_tokens=True``, any special tokens it
            adds are part of that average.
        """
        # Map the residues U, Z, O and B to X and separate residues by spaces
        # before tokenizing.
        sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", seq)))]
        ids = self.tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")
        input_ids = torch.tensor(ids['input_ids']).to(self.device)
        attention_mask = torch.tensor(ids['attention_mask']).to(self.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            embedding_repr = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Only one sequence is encoded per call, so take the first batch entry.
        emb_0 = embedding_repr.last_hidden_state[0]
        return emb_0.mean(dim=0).detach().cpu().numpy()

    def process_and_save(self, data, output_dir, dataset_name):
        """Embed every unique protein of ``data`` and save the results as ``.npy`` files.

        Two files are written to ``output_dir`` (created if missing):
        ``{dataset_name}_target_sequences.npy`` with the unique sequences and
        ``{dataset_name}_sequence_embeddings.npy`` with their embeddings in
        the same order.

        Args:
            data: DataFrame with a 'Protein' column holding the sequences.
            output_dir: Directory in which the two files are stored.
            dataset_name: Prefix used for the file names and the progress bar.

        Raises:
            ValueError: If ``data`` has no 'Protein' column.
        """
        # Fail early with the same kind of error the molecule extractor uses
        # for a missing 'SMILES' column, before any directory is created.
        if 'Protein' not in data.columns:
            raise ValueError("Dataset does not contain a 'Protein' column.")

        output_dir = Path(output_dir).resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        # Unique protein sequences; missing values are dropped because
        # get_embeddings can only handle strings.
        unique_sequences = data['Protein'].dropna().unique()

        sequence_embeddings = [
            self.get_embeddings(seq)
            for seq in tqdm(unique_sequences, desc=f"Processing sequences in {dataset_name}")
        ]

        embeddings_array = np.array(sequence_embeddings)
        np.save(output_dir / f"{dataset_name}_target_sequences.npy", unique_sequences)
        np.save(output_dir / f"{dataset_name}_sequence_embeddings.npy", embeddings_array)

In [2]:
# Set the base directory: the root folder that holds `data/`, `embeddings/` and the cloned `molformer/` repository.
# Replace the placeholder below with your own path before running the notebook.
base_dir = Path('Path_to_your_directory')

In [3]:
# Root folder that holds the benchmark splits.
data_path = base_dir / 'data'

# Task name -> folder containing that task's train/val/test CSV files.
task_paths = dict(
    biosnap_random=data_path / "biosnap/random",
    human_random=data_path / "human/random",
    human_cold=data_path / "human/cold",
)

protein_extractor = ProteinEmbeddingsExtractor()
all_datasets = {}

# Embed the proteins of every task whose three split files are all present.
for dataset_name, dataset_path in task_paths.items():
    data_dir = Path(dataset_path)
    train_file, val_file, test_file = (
        data_dir / split for split in ('train.csv', 'val.csv', 'test.csv')
    )

    if not (train_file.exists() and val_file.exists() and test_file.exists()):
        print(f"Skipping {dataset_name}: train.csv, val.csv, or test.csv not found.")
        continue

    train_data, val_data, test_data = map(pd.read_csv, (train_file, val_file, test_file))

    # Pool the three splits into a single frame and keep it for later cells.
    full_data = pd.concat([train_data, val_data, test_data], ignore_index=True)
    all_datasets[dataset_name] = full_data

    # Output directory to save embeddings.
    out_dir = base_dir / 'embeddings' / 'llm' / dataset_name / 'target'
    out_dir.mkdir(parents=True, exist_ok=True)

    protein_extractor.process_and_save(full_data, out_dir, dataset_name=dataset_name)

    print(f"{dataset_name} dataset loaded successfully. Total rows: {len(full_data)}")


In [22]:
# Read back the sequences and embeddings saved for the BioSNAP random dataset
# and show them side by side in one DataFrame.
biosnap_llm_embeddings_path = base_dir / "embeddings/llm/biosnap_random"
sequences_names = np.load(
    biosnap_llm_embeddings_path / "target/biosnap_random_target_sequences.npy",
    allow_pickle=True,
)
gene_embeddings = np.load(
    biosnap_llm_embeddings_path / "target/biosnap_random_sequence_embeddings.npy",
    allow_pickle=True,
)
biosnap_protein_llm = pd.DataFrame(
    dict(sequences=sequences_names, protein_llm_embeddings=gene_embeddings.tolist())
)
biosnap_protein_llm.head()

Unnamed: 0,sequences,protein_llm_embeddings
0,MGDHAWSFLKDFLAGGVAAAVSKTAVAPIERVKLLLQVQHASKQIS...,"[0.040794070810079575, 0.1398317515850067, -0...."
1,MVLDLDLFRVDKGGDPALIRETQEKRFKDPGLVDQLVKADSEWRRC...,"[0.07856228947639465, 0.09228259325027466, 0.0..."
2,MGNLKSVAQEPGPPCGLGLGLGLGLCGKQGPATPAPEPSRAPASLL...,"[0.03025723434984684, 0.09058676660060883, 0.0..."
3,MGNAAAAKKGSEQESVKEFLAKAKEDFLKKWESPAQNTAHLDQFER...,"[0.07570360600948334, 0.11278703063726425, 0.0..."
4,MVNENTRMYIPEENHQGSNYGSPRPAHANMNANAAAGLAPEHIPTP...,"[0.07552587240934372, 0.09334281086921692, 0.0..."


# Drug Embeddings with MoLFormer LLM

### We used the <span style="color:yellow;"> MoLFormer </span> LLM to produce drug representations from chemical SMILES strings.

Clone https://github.com/IBM/molformer into the base directory and change into the resulting `molformer` folder.

MoLFormer requires NVIDIA `apex` to be installed. We uninstalled `apex` again after creating the drug embeddings because of a compatibility issue with PyTorch.

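After cloning https://github.com/NVIDIA/apex, run the following from inside the cloned `apex` folder (the first command builds the C++ and CUDA extensions; the second is the plain install without them):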

pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./

pip install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./




In [11]:
# Switch the working directory to the cloned MoLFormer repository under base_dir.
# NOTE(review): presumably needed so that the repo-local imports in the next cell
# (chem_tokenizer, train_pubchem_light) can be resolved — confirm.
os.chdir(base_dir /'molformer')

In [17]:
from argparse import Namespace
import yaml
from fast_transformers.masking import LengthMask as LM
from chem_tokenizer.tokenizer import MolTranBertTokenizer
from train_pubchem_light import LightningModule
from rdkit import Chem

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


In [22]:
class MoleculeEmbeddingsExtractor:
    """Produce fixed-size molecule embeddings from SMILES strings with a pretrained MoLFormer checkpoint."""

    def __init__(self, model_path, checkpoint_path):
        """Load the hyper-parameters, the tokenizer and the checkpoint.

        Args:
            model_path: Root of the cloned MoLFormer repository. It must
                contain ``data/Pretrained MoLFormer/hparams.yaml`` and
                ``bert_vocab.txt``.
            checkpoint_path: Path to the ``.ckpt`` file to restore.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Loading MolFormer model from: {model_path}")

        model_path = Path(model_path)

        # Load configuration
        with open(model_path / 'data/Pretrained MoLFormer/hparams.yaml', 'r') as f:
            self.config = Namespace(**yaml.safe_load(f))

        # Load tokenizer and model
        self.tokenizer = MolTranBertTokenizer(model_path / 'bert_vocab.txt')

        # load_from_checkpoint is a classmethod that builds the module itself
        # from the given keyword arguments, so it is called on the class:
        # constructing a throwaway instance first would build the network twice.
        self.model = LightningModule.load_from_checkpoint(
            Path(checkpoint_path), config=self.config, vocab=self.tokenizer.vocab
        ).to(self.device).eval()

    def batch_split(self, data, batch_size=64):
        """Yield consecutive slices of ``data`` holding at most ``batch_size`` items."""
        for i in range(0, len(data), batch_size):
            yield data[i:i + batch_size]

    def embed(self, smiles, batch_size=64):
        """
        Embed SMILES strings into molecule embeddings.

        Args:
            smiles: Non-empty sequence of SMILES strings (``torch.cat`` fails
                on an empty list of batches).
            batch_size: Number of SMILES encoded per forward pass.

        Returns:
            CPU tensor with one row per input SMILES, obtained by averaging
            the token embeddings over the positions whose attention mask is 1.
        """
        self.model.eval()
        embeddings = []
        for batch in self.batch_split(smiles, batch_size=batch_size):
            batch_enc = self.tokenizer.batch_encode_plus(batch, padding='longest', add_special_tokens=True)
            idx = torch.tensor(batch_enc['input_ids']).to(self.device)
            mask = torch.tensor(batch_enc['attention_mask']).to(self.device)
            with torch.no_grad():
                token_embeddings = self.model.blocks(self.model.tok_emb(idx), length_mask=LM(mask.sum(-1)))

            # Average pooling over tokens: padded positions are zeroed by the
            # mask and the divisor is clamped to avoid a division by zero.
            input_mask_expanded = mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, dim=1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            embedding = sum_embeddings / sum_mask
            embeddings.append(embedding.cpu())
        return torch.cat(embeddings)

    def canonicalize(self, s):
        """Return the canonical, non-isomeric SMILES of ``s``.

        Returns ``None`` when ``s`` is not a string (e.g. ``None`` or a NaN
        coming from a DataFrame) or when RDKit cannot parse it.
        """
        # Non-string values can never be a SMILES; do not hand them to RDKit.
        if not isinstance(s, str):
            return None
        # Parse once and reuse the molecule for the conversion.
        mol = Chem.MolFromSmiles(s)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True, isomericSmiles=False)

    def process_and_save(self, data, output_dir, dataset_name):
        """Embed every unique, valid SMILES of ``data`` and save the results as ``.npy`` files.

        Three files are written to ``output_dir`` (created if missing), all in
        the same order: ``{dataset_name}_smiles.npy`` (original strings),
        ``{dataset_name}_canonical_smiles.npy`` and
        ``{dataset_name}_molecule_embeddings.npy``. Nothing is written when no
        SMILES is valid.

        Args:
            data: DataFrame with a 'SMILES' column.
            output_dir: Directory in which the files are stored.
            dataset_name: Prefix used for the file names and log messages.

        Raises:
            ValueError: If the 'SMILES' column is missing, or if the number of
                embeddings differs from the number of valid SMILES.
        """
        if 'SMILES' not in data.columns:
            raise ValueError("Dataset does not contain a 'SMILES' column.")

        output_dir = Path(output_dir).resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        # Get unique SMILES
        unique_smiles = data['SMILES'].unique()

        # Apply the canonicalization function
        canonicalized_smiles = [self.canonicalize(s) for s in unique_smiles]

        print(f"Extracting embeddings for {len(unique_smiles)} unique SMILES in {dataset_name}")

        # Filter out invalid canonical SMILES (None), keeping the original and
        # the canonical strings aligned by index.
        valid_indices = [i for i, s in enumerate(canonicalized_smiles) if s is not None]
        valid_smiles = [unique_smiles[i] for i in valid_indices]
        valid_canonical_smiles = [canonicalized_smiles[i] for i in valid_indices]

        # Extract embeddings
        if len(valid_canonical_smiles) == 0:
            print("No valid canonical SMILES found. Skipping embedding extraction.")
            return

        embeddings = self.embed(valid_canonical_smiles).numpy()

        # Sanity check: there must be exactly one embedding per valid SMILES.
        if len(valid_canonical_smiles) != len(embeddings):
            raise ValueError("Mismatch in valid canonical SMILES and embeddings length.")

        # Save the filtered SMILES, canonical SMILES, and embeddings
        np.save(output_dir / f"{dataset_name}_smiles.npy", valid_smiles)
        np.save(output_dir / f"{dataset_name}_canonical_smiles.npy", valid_canonical_smiles)
        np.save(output_dir / f"{dataset_name}_molecule_embeddings.npy", embeddings)

        print(f"Saved {len(embeddings)} embeddings.")
    



In [23]:
# Locations of the cloned MoLFormer repository and of the pretrained checkpoint.
lib_path = base_dir / "molformer"
checkpoint_path = lib_path / "data/Pretrained MoLFormer/checkpoints/N-Step-Checkpoint_3_30000.ckpt"

molecule_extractor = MoleculeEmbeddingsExtractor(model_path=lib_path, checkpoint_path=checkpoint_path)

# Embed the drugs of every task whose three split files are all present.
for dataset_name, dataset_path in task_paths.items():
    print(f"\nProcessing dataset: {dataset_name}")
    dataset_dir = Path(dataset_path)
    train_file, val_file, test_file = (
        dataset_dir / split for split in ('train.csv', 'val.csv', 'test.csv')
    )

    if not (train_file.exists() and val_file.exists() and test_file.exists()):
        print(f"Skipping {dataset_name}: Missing one or more of train.csv, val.csv, or test.csv.")
        continue

    train_data, val_data, test_data = map(pd.read_csv, (train_file, val_file, test_file))
    full_data = pd.concat([train_data, val_data, test_data], ignore_index=True)

    # Without a 'SMILES' column there is nothing to embed for this task.
    if 'SMILES' not in full_data.columns:
        print(f"No 'SMILES' column found in the dataset: {dataset_name}")
        continue

    output_dir = base_dir / f"embeddings/tda/{dataset_name}/drug"
    output_dir.mkdir(parents=True, exist_ok=True)
    molecule_extractor.process_and_save(full_data, output_dir, dataset_name=dataset_name)


Loading MolFormer model from: /bozdagpool/UNT/mt0994/DTI/molformer
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding


Global seed set to 12345


Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding


Global seed set to 12345


Using Rotation Embedding
Using Rotation Embedding

Processing dataset: biosnap_random


[23:46:21] Unusual charge on atom 0 number of radical electrons set to zero
[23:46:21] Unusual charge on atom 0 number of radical electrons set to zero


Extracting embeddings for 4505 unique SMILES in biosnap_random
Saved 4505 embeddings.

Processing dataset: human_random




Extracting embeddings for 2726 unique SMILES in human_random
Saved 2726 embeddings.

Processing dataset: human_cold




Extracting embeddings for 1813 unique SMILES in human_cold
Saved 1813 embeddings.


In [25]:
# Read back the SMILES, canonical SMILES and embeddings saved for the BioSNAP
# random dataset and show them side by side in one DataFrame.
biosnap_random_drug_embeddings_path = base_dir / "embeddings/tda/biosnap_random"
smile_names = np.load(
    biosnap_random_drug_embeddings_path / "drug/biosnap_random_smiles.npy",
    allow_pickle=True,
)
can_smile_names = np.load(
    biosnap_random_drug_embeddings_path / "drug/biosnap_random_canonical_smiles.npy",
    allow_pickle=True,
)
drug_embeddings = np.load(
    biosnap_random_drug_embeddings_path / "drug/biosnap_random_molecule_embeddings.npy",
    allow_pickle=True,
)
biosnap_random_drugs_llm = pd.DataFrame(
    dict(
        smiles=smile_names,
        can_smiles=can_smile_names,
        drug_llm_embeddings=drug_embeddings.tolist(),
    )
)
biosnap_random_drugs_llm.head()

Unnamed: 0,smiles,can_smiles,drug_llm_embeddings
0,OP(O)(=O)C(Cl)(Cl)P(O)(O)=O,O=P(O)(O)C(Cl)(Cl)P(=O)(O)O,"[0.7787514925003052, 0.34758803248405457, -0.1..."
1,NC1=NC(=O)N(C=N1)[C@H]1C[C@H](O)[C@@H](CO)O1,Nc1ncn(C2CC(O)C(CO)O2)c(=O)n1,"[0.21483802795410156, 0.17366445064544678, -0...."
2,OCCCCCCCCNCO,OCCCCCCCCNCO,"[0.6010259389877319, -0.36157843470573425, -0...."
3,C[C@H](OP(O)(O)=O)[C@@H](N)C(O)=O,CC(OP(=O)(O)O)C(N)C(=O)O,"[0.4546224772930145, 0.39328116178512573, 0.02..."
4,CCO,CCO,"[0.9750620126724243, -0.1789940893650055, 0.24..."
