In [9]:
import numpy as np

# NOTE(review): absolute, user-specific path; consider a configurable DATA_DIR.
# np.load on an .npz archive returns a lazily-reading NpzFile that keeps the
# underlying file open. The context manager closes it as soon as both arrays
# have been read into memory, instead of leaking the handle for the kernel's
# lifetime. After this block `data` is closed; use the extracted arrays.
with np.load("/home/naikv12/Hint2/clinical-trial-outcome-prediction-main/data/target_protein/phase_I_train_balm.npz") as data:
    print(data.files)  # ['balm_vectors', 'pkd_scores']

    # Pull both arrays out of the archive while it is still open
    balm_vecs = data['balm_vectors']
    pkd_scores = data['pkd_scores']

print(balm_vecs.shape)     # (1044, 128)
print(pkd_scores[:5])      # first 5 pKd scores
print(balm_vecs[0])        # vector for trial 0

['balm_vectors', 'pkd_scores']
(1044, 128)
[5.20387571 8.78923055 9.87197404 6.18613396 5.27380819]
[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696
  1.57921282  0.76743473 -0.46947439  0.54256004 -0.46341769 -0.46572975
  0.24196227 -1.91328024 -1.72491783 -0.56228753 -1.01283112  0.31424733
 -0.90802408 -1.4123037   1.46564877 -0.2257763   0.0675282  -1.42474819
 -0.54438272  0.11092259 -1.15099358  0.37569802 -0.60063869 -0.29169375
 -0.60170661  1.85227818 -0.01349722 -1.05771093  0.82254491 -1.22084365
  0.2088636  -1.95967012 -1.32818605  0.19686124  0.73846658  0.17136828
 -0.11564828 -0.3011037  -1.47852199 -0.71984421 -0.46063877  1.05712223
  0.34361829 -1.76304016  0.32408397 -0.38508228 -0.676922    0.61167629
  1.03099952  0.93128012 -0.83921752 -0.30921238  0.33126343  0.97554513
 -0.47917424 -0.18565898 -1.10633497 -1.19620662  0.81252582  1.35624003
 -0.07201012  1.0035329   0.36163603 -0.64511975  0.36139561  1.53803657
 -0.03582604  1.56464366

In [5]:

import pandas as pd

# Read the drug/target table from CSV into a DataFrame and print it.
# (Described by the author as the BALM-Benchmark dataset, e.g. BindingDB_filtered.)
dataset = pd.read_csv(
    filepath_or_buffer="/home/naikv12/Hint2/clinical-trial-outcome-prediction-main/data/target_protein/data.csv"
)
print(dataset)

       Index     Drug_ID                                               Drug  \
0          0    444607.0          Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1   
1          1      4316.0         COc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1   
2          2      4293.0              NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1   
3          3      1611.0       NS(=O)(=O)c1cc2c(s1)S(=O)(=O)N(Cc1cccs1)CC2O   
4          4      1612.0     COc1ccc(N2CC(O)c3cc(S(N)(=O)=O)sc3S2(=O)=O)cc1   
...      ...         ...                                                ...   
24695  47936      3019.0                       CC1=Nc2ccc(Cl)cc2S(=O)(=O)N1   
24696  47937  76311095.0  O=C(NCc1ccc(S(=O)(=O)N2CCN(C3COC3)CC2)cc1)c1cc...   
24697  47938  76311094.0           O=C(NCCCCS(=O)(=O)c1ccccc1)c1ccc2nccn2c1   
24698  47939    113557.0                        CCCCCCCOC1OC(CO)C(O)C(O)C1O   
24699  47940    113557.0                        CCCCCCCOC1OC(CO)C(O)C(O)C1O   

      Target_ID                                    

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
import selfies as sf

# Checkpoint name of the ChemFIE-DTP model; only the tokenizer is loaded here,
# the model weights themselves are loaded further down from the same name.
model_name = "gbyuvd/drugtargetpred-chemselfies"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Encoder wrapped with a single-output linear layer
class RegressionHeadModel(nn.Module):
    """Wrap an encoder and map its first-token embedding to one scalar per row.

    Args:
        base_model: Module exposing ``config.hidden_size`` which, when called,
            returns a sequence whose element 0 is the last hidden state.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        # hidden_size -> 1: one scalar per input row (interpreted as pKd by the caller)
        self.regression_head = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Run the encoder and apply the linear head to the first token.

        Args:
            input_ids: Token ids, passed positionally to the base model.
            attention_mask: Forwarded to the base model as a keyword argument.
            token_type_ids: Optional; forwarded unchanged (``None`` included).

        Returns:
            Output of the linear head applied to position 0 of the last
            hidden state (the CLS-token representation).
        """
        encoder_out = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 0 of the encoder output is the last hidden state; keep token 0 only
        first_token = encoder_out[0][:, 0]
        return self.regression_head(first_token)

# Initialize the model with a regression head
base_model = AutoModel.from_pretrained(model_name)

# The regression head is a freshly constructed nn.Linear: no visible cell in
# this notebook trains it or loads weights for it, so whatever it outputs is
# NOT a calibrated pKd. Seeding right before construction at least makes the
# random head, and therefore the printed number, reproducible across restarts.
torch.manual_seed(0)
model_with_regression = RegressionHeadModel(base_model)

# Move the model to the selected device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_with_regression.to(device)

# Inference only: put the wrapper and all of its submodules in evaluation mode
model_with_regression.eval()

# SMILES -> space-separated SELFIES tokens, via the selfies library
def smiles_to_selfies(smiles):
    """Encode a SMILES string as a space-separated SELFIES token sentence.

    The SMILES is encoded with ``sf.encoder`` and split into tokens with
    ``sf.split_selfies``. A ``'.'`` token is fused with the token that follows
    it (``'.'`` + next token); a trailing ``'.'`` is kept on its own.

    Args:
        smiles: SMILES string to convert.

    Returns:
        The tokens joined by single spaces, or ``None`` when the encoder raises
        ``sf.EncoderError`` (the error is printed).
    """
    try:
        tokens = list(sf.split_selfies(sf.encoder(smiles)))
    except sf.EncoderError as e:
        print(f"Encoder Error: {e}")
        return None

    merged = []
    remaining = iter(tokens)
    for token in remaining:
        if token == '.':
            # Consume the following token (if any) and glue it to the dot
            follower = next(remaining, None)
            merged.append(token if follower is None else f".{follower}")
        else:
            merged.append(token)

    return ' '.join(merged)

# Example SMILES and target sequence
smiles = "CS[C@@H]1O[C@H](CO)[C@H](O)[C@H](O)[C@H]1O"
target_seq = "MADNFSLHDALSGSGNPNPQGWPGAWGNQPAGAGGYPGASYPGAYPGQAPPGAYPGQAPPGAYPGAPGAYPGAPAPGVYPGPPSGPGAYPSSGQPSATGAYPATGPYGAPAGPLIVPYNLPLPGGVVPRMLITILGTVKPNANRIALDFQRGNDVAFHFNPRFNENNRRVIVCNTKLDNNWGREERQSVFPFESGKPFKIQVLVEPDHFKVAVNDAHLLQYNHRVKKLNEISKLGISGDIDLTSASYTMI"

# Convert the SMILES string to SELFIES representation.
# smiles_to_selfies returns None when the encoder rejects the input; fail here
# with a clear message instead of handing None to the tokenizer.
selfies_representation = smiles_to_selfies(smiles)
if selfies_representation is None:
    raise ValueError(f"Could not convert SMILES to SELFIES: {smiles!r}")

# Tokenize the inputs as a (SELFIES, target sequence) pair.
# NOTE(review): truncation=True clips over-long inputs without warning;
# presumably relevant for long protein sequences. Confirm the model's max length.
inputs = tokenizer(selfies_representation, target_seq, return_tensors="pt", padding=True, truncation=True)

# Move the inputs to the device (GPU or CPU)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Get predicted binding affinity (pKd)
with torch.no_grad():  # Disable gradients since we're only predicting
    predicted_pKd = model_with_regression(**inputs)

# Output the predicted binding affinity (.item() requires a single-element tensor)
print(f"Predicted Binding Affinity (pKd): {predicted_pKd.item():.4f}")

Predicted Binding Affinity (pKd): -0.1926


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
import selfies as sf

# Checkpoint name of the ChemFIE-DTP model; this statement loads only the
# tokenizer. The model weights are loaded later in the cell.
model_name = "gbyuvd/drugtargetpred-chemselfies"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Encoder + linear regression head.
# NOTE(review): a class with this name is also defined in an earlier cell of
# this notebook; this later definition shadows it. Consider keeping one copy.
class RegressionHeadModel(nn.Module):
    """Encoder followed by a linear layer that emits one scalar per input row.

    Args:
        base_model: Module exposing ``config.hidden_size`` which, when called,
            returns a sequence whose element 0 is the last hidden state.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        # Projects a hidden_size-wide vector down to a single value
        self.regression_head = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the linear head's output for the first token of each row.

        ``input_ids`` is passed positionally; ``attention_mask`` and
        ``token_type_ids`` (possibly ``None``) are forwarded as keywords.
        """
        encoder_out = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # encoder_out[0] is the last hidden state; index 0 on axis 1 is the CLS token
        cls_embedding = encoder_out[0][:, 0]
        return self.regression_head(cls_embedding)

# Initialize the model with a regression head
base_model = AutoModel.from_pretrained(model_name)

# The regression head is a newly created nn.Linear that no visible cell trains
# or loads weights for, so its output is not a calibrated pKd. A fixed seed set
# immediately before construction makes the random head reproducible, so
# re-running the cell prints the same number.
torch.manual_seed(0)
model_with_regression = RegressionHeadModel(base_model)

# Move the model to the selected device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_with_regression.to(device)

# Inference only: put the wrapper and all of its submodules in evaluation mode
model_with_regression.eval()

# SMILES -> space-separated SELFIES tokens, via the selfies library.
# NOTE(review): a function with this name is also defined in an earlier cell of
# this notebook; this later definition shadows it. Consider keeping one copy.
def smiles_to_selfies(smiles):
    """Turn a SMILES string into a space-separated sentence of SELFIES tokens.

    Uses ``sf.encoder`` then ``sf.split_selfies``. Each ``'.'`` token is merged
    with the token after it; a ``'.'`` in last position is emitted unchanged.

    Args:
        smiles: SMILES string to convert.

    Returns:
        Space-joined token string, or ``None`` if ``sf.EncoderError`` is raised
        (in which case the error is printed).
    """
    try:
        tokens = list(sf.split_selfies(sf.encoder(smiles)))
    except sf.EncoderError as e:
        print(f"Encoder Error: {e}")
        return None

    pieces = []
    stream = iter(tokens)
    for current in stream:
        if current != '.':
            pieces.append(current)
            continue
        # A dot swallows the next token, when there is one
        following = next(stream, None)
        pieces.append(current if following is None else f".{following}")

    return ' '.join(pieces)

# Example SMILES and target sequence
smiles = "NS(=O)(=O)c1ccc(S(=O)(=O)NCc2cccs2)s1"
target_seq = "MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK"

# Convert the SMILES string to SELFIES representation.
# smiles_to_selfies returns None when the encoder rejects the input; stop here
# with a clear message rather than passing None on to the tokenizer.
selfies_representation = smiles_to_selfies(smiles)
if selfies_representation is None:
    raise ValueError(f"Could not convert SMILES to SELFIES: {smiles!r}")

# Tokenize the inputs as a (SELFIES, target sequence) pair.
# NOTE(review): truncation=True clips over-long inputs without warning;
# presumably relevant for long protein sequences. Confirm the model's max length.
inputs = tokenizer(selfies_representation, target_seq, return_tensors="pt", padding=True, truncation=True)

# Move the inputs to the device (GPU or CPU)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Get predicted binding affinity (pKd)
with torch.no_grad():  # Disable gradients since we're only predicting
    predicted_pKd = model_with_regression(**inputs)

# Output the predicted binding affinity (.item() requires a single-element tensor)
print(f"Predicted Binding Affinity (pKd): {predicted_pKd.item():.4f}")

Predicted Binding Affinity (pKd): 0.6084


In [24]:
from huggingface_hub import login
from transformers import pipeline

import os

# SECURITY: this cell previously contained a hardcoded Hugging Face access
# token. A token that has been saved in a notebook must be treated as leaked
# and revoked in the Hugging Face account settings. The token is now read from
# the HF_TOKEN environment variable; when it is unset, login() receives None
# and falls back to its interactive prompt.
api_token = os.environ.get("HF_TOKEN")

# Log in with the token
login(token=api_token)

# Load the model (e.g., DeepChem/ChemBERTa-77M-MTR)
model_name = "DeepChem/ChemBERTa-77M-MTR"
# NOTE(review): the recorded output for this cell warns that the classifier
# weights are newly initialized, i.e. the label/score printed below comes from
# an untrained head and is not a meaningful prediction until it is fine-tuned.
model = pipeline("text-classification", model=model_name, tokenizer=model_name)

# Example input (SMILES + target sequence)
smiles = "Cc1ccc(CNS(=O)(=O)c2ccc(S(N)(=O)=O)s2)cc1"  # Example SMILES string
target_seq = "MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK"  # Example target protein sequence

# Get the prediction: SMILES and protein sequence are sent as ONE space-joined string.
# NOTE(review): presumably this tokenizer was built for SMILES only; confirm
# that appending an amino-acid sequence is meaningful for this checkpoint.
predictions = model(f"{smiles} {target_seq}")
print(predictions)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device set to use cpu


[{'label': 'LABEL_63', 'score': 0.006647514645010233}]


In [29]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint whose classification config is inspected below
model_name = "DeepChem/ChemBERTa-77M-MTR"

# Sequence-classification model and tokenizer, both from the same checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name
)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Print the index -> label-name mapping stored on the model config
print(model.config.id2label)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3', 4: 'LABEL_4', 5: 'LABEL_5', 6: 'LABEL_6', 7: 'LABEL_7', 8: 'LABEL_8', 9: 'LABEL_9', 10: 'LABEL_10', 11: 'LABEL_11', 12: 'LABEL_12', 13: 'LABEL_13', 14: 'LABEL_14', 15: 'LABEL_15', 16: 'LABEL_16', 17: 'LABEL_17', 18: 'LABEL_18', 19: 'LABEL_19', 20: 'LABEL_20', 21: 'LABEL_21', 22: 'LABEL_22', 23: 'LABEL_23', 24: 'LABEL_24', 25: 'LABEL_25', 26: 'LABEL_26', 27: 'LABEL_27', 28: 'LABEL_28', 29: 'LABEL_29', 30: 'LABEL_30', 31: 'LABEL_31', 32: 'LABEL_32', 33: 'LABEL_33', 34: 'LABEL_34', 35: 'LABEL_35', 36: 'LABEL_36', 37: 'LABEL_37', 38: 'LABEL_38', 39: 'LABEL_39', 40: 'LABEL_40', 41: 'LABEL_41', 42: 'LABEL_42', 43: 'LABEL_43', 44: 'LABEL_44', 45: 'LABEL_45', 46: 'LABEL_46', 47: 'LABEL_47', 48: 'LABEL_48', 49: 'LABEL_49', 50: 'LABEL_50', 51: 'LABEL_51', 52: 'LABEL_52', 53: 'LABEL_53', 54: 'LABEL_54', 55: 'LABEL_55', 56: 'LABEL_56', 57: 'LABEL_57', 58: 'LABEL_58', 59: 'LABEL_59', 60: 'LABEL_60', 61: 'LABEL_61', 62: 'LABEL_62', 63: 'LABEL_

In [6]:
import numpy as np

# Alphabet of residue letters; a letter's position here is its one-hot column
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

def one_hot_encode(sequence):
    """One-hot encode a sequence of residue letters against ``amino_acids``.

    Args:
        sequence: Iterable of single-character residue codes (typically a str).

    Returns:
        Integer ``numpy`` array of shape ``(len(sequence), len(amino_acids))``.
        Row ``i`` has a 1 in the column of ``sequence[i]`` within
        ``amino_acids``. Symbols not in the alphabet (including lowercase
        letters) yield an all-zero row. An empty sequence yields shape
        ``(0, len(amino_acids))`` so downstream code always sees a 2-D array.
    """
    residues = list(sequence)
    column_of = {residue: col for col, residue in enumerate(amino_acids)}

    # Preallocate, then set at most one cell per row
    encoding = np.zeros((len(residues), len(amino_acids)), dtype=int)
    for row, aa in enumerate(residues):
        col = column_of.get(aa)
        if col is not None:
            encoding[row, col] = 1
    return encoding

# Example protein sequence
protein_sequence = "MNEKGT"
encoded_sequence = one_hot_encode(protein_sequence)
print(encoded_sequence)

[[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
