In [1]:
# import packages
from torch.utils.data import DataLoader
from readData import readData
import shutil
from utils import *
import torch
from torch import nn
from model import LABind
from func_help import setALlSeed,get_std_opt
from tqdm import tqdm
import pickle as pkl
from sklearn.model_selection import KFold
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# config
# Compute device used by the cells below. Switch to the commented line to run on CPU.
#DEVICE = torch.device('cpu')
DEVICE = torch.device('cuda:0')
root_path = getRootPath()
dataset = 'LigBind' # DS1:LigBind, DS2:GPSite, DS3:Unseen. Edit this value to switch datasets.

# Load the ligand dictionary through a context manager so the file handle is
# closed deterministically (the previous `pkl.load(open(...))` leaked it).
# NOTE: pickle can execute arbitrary code on load; only use a ligand.pkl from a trusted source.
with open(f'{root_path}/tools/ligand.pkl', 'rb') as _ligand_file:
    _ligand_dict = pkl.load(_ligand_file)

nn_config = {
    # dataset: label files and project directory for the selected dataset
    'train_file': f'{root_path}/{dataset}/label/training.fa',
    'test_file': f'{root_path}/{dataset}/label/test.fa',
    'valid_file': f'{root_path}/{dataset}/label/picking.fa',
    'proj_dir': f'{root_path}/{dataset}/',
    'lig_dict': _ligand_dict,
    'pdb_class':'source', # source or omegafold or esmfold
    # Precomputed max/min arrays loaded from <root>/tools.
    # presumably used for min-max feature normalisation; confirm against readData/model code
    'dssp_max_repr': np.load(f'{root_path}/tools/dssp_max_repr.npy'),
    'dssp_min_repr': np.load(f'{root_path}/tools/dssp_min_repr.npy'),
    'ankh_max_repr': np.load(f'{root_path}/tools/ankh_max_repr.npy'),
    'ankh_min_repr': np.load(f'{root_path}/tools/ankh_min_repr.npy'),
    'ion_max_repr': np.load(f'{root_path}/tools/ion_max_repr.npy'),
    'ion_min_repr': np.load(f'{root_path}/tools/ion_min_repr.npy'),
    # model parameters
    
    'rfeat_dim':1556,
    'ligand_dim':768, 
    'hidden_dim':256, 
    'heads':4, 
    'augment_eps':0.05, 
    'rbf_num':8, 
    'top_k':30, 
    'attn_drop':0.1, 
    'dropout':0.1, 
    'num_layers':4, 
    'lr':0.0004, 
    
    # training parameters 
    # Adjust these to your hardware.
    # Whole proteins are mapped at once, so training consumes a large amount of GPU memory.
    'batch_size':15,
    'max_patience':10,
    'device_ids':[0,1], # 2*A100-40G
}
# Locations of the pretrained models; edit these paths for your environment.
pretrain_path = {
    'esmfold_path': '../tools/esmfold_v1', # esmfold path
    'ankh_path': '../tools/ankh-large/', # ankh path
    'molformer_path': '../tools/MoLFormer-XL-both-10pct/', # molformer path
    # NOTE(review): points at the 'Unseen' model directory regardless of `dataset` above; confirm this is intended.
    'model_path':f'{root_path}/model/Unseen/', # based on Unseen
    'esm2_3B_path': '../tools/esm2_t36_3B_UR50D/', # esm2 3B path
}

In [7]:
# Extract ESM-2 (3B) embeddings for every FASTA record of the selected dataset
# and cache them as <root>/<dataset>/esm3B/<record.id>.npy (one array per record).
# The cell is resumable: records whose .npy file already exists are skipped.
import os  # explicit: previously only available through `from utils import *`

import numpy as np  # explicit: previously only available through `from utils import *`
from transformers import AutoTokenizer, AutoModel
from Bio import SeqIO


tokenizer = AutoTokenizer.from_pretrained(pretrain_path['esm2_3B_path'])

model = AutoModel.from_pretrained(pretrain_path['esm2_3B_path'], torch_dtype=torch.float32)
model.to(DEVICE)
model.eval()

out_path = f"{root_path}/{dataset}/esm3B/"
makeDir(out_path)

fasta_path = f"{root_path}/{dataset}/fasta/"

# sorted() only makes the processing order (and the progress log) deterministic;
# each record is embedded independently, so the saved arrays do not depend on it.
for file_class in sorted(os.listdir(fasta_path)):

    class_dir = f"{fasta_path}/{file_class}"
    if not os.path.isdir(class_dir):
        # Stray files in the fasta root (e.g. OS metadata files) would make
        # os.listdir() below raise; skip them instead.
        continue

    for fasta_base_file in sorted(os.listdir(class_dir)):

        fasta_file = f"{class_dir}/{fasta_base_file}"
        if not os.path.isfile(fasta_file):
            # Nested directories cannot be parsed as FASTA; skip them.
            continue

        sequences = SeqIO.parse(fasta_file, "fasta")

        for record in tqdm(sequences, desc=f"{file_class}/{fasta_base_file}"):

            emb_file = out_path+f'{record.id}.npy'
            if os.path.exists(emb_file):
                continue

            ids = tokenizer(str(record.seq), return_tensors="pt")
            input_ids = ids['input_ids'].to(DEVICE)
            attention_mask = ids['attention_mask'].to(DEVICE)

            with torch.no_grad():
                embedding_repr = model(input_ids=input_ids,attention_mask=attention_mask)

            hidden = embedding_repr.last_hidden_state[0]   # shape: [L+2, 2560]
            emb = hidden[1:-1].cpu().numpy()              # remove BOS/EOS

            # Assumes the tokenizer emits one token per residue (TODO confirm for
            # non-standard residue symbols). Report, but do not abort, on a mismatch
            # so a misaligned embedding does not go unnoticed downstream.
            if emb.shape[0] != len(record.seq):
                tqdm.write(
                    f"[warn] {record.id}: embedding rows ({emb.shape[0]}) != "
                    f"sequence length ({len(record.seq)})"
                )

            # Write to a temporary file and rename it into place. An interrupted run
            # can then never leave a truncated '<id>.npy' behind that the existence
            # check above would treat as finished on the next run.
            tmp_file = emb_file + '.tmp'
            with open(tmp_file, 'wb') as tmp_handle:
                np.save(tmp_handle, emb)
            os.replace(tmp_file, emb_file)


# Release the 3B model so its host/GPU memory is available to later cells.
del model
gc.collect()
torch.cuda.empty_cache()

#Final file count of 32332

#Final file count of 32332


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s]
Some weights of EsmModel were not initialized from the model checkpoint at ../tools/esm2_t36_3B_UR50D/ and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1701it [00:00, 191839.50it/s]
83it [00:00, 148518.44it/s]
399it [00:00, 185514.61it/s]
127it [00:00, 170238.61it/s]
513it [00:00, 186988.61it/s]
64it [00:00, 170869.16it/s]
1421it [00:00, 211373.76it/s]
203it [00:00, 184895.49it/s]
69it [00:00, 167480.89it/s]
370it [00:00, 191237.52it/s]
235it [00:00, 183208.45it/s]
1770it [00:00, 217742.13it/s]
19it [00:00, 142561.32it/s]
103it [00:00, 162900.95it/s]
336it [00:00, 189089.78it/s]
237it [00:00, 182830.61it/s]
129it [00:00, 184286.52it/s]
108it [00:00, 177432.37it/s]
393it [00:00, 192318.45it/s]
665it [00:00, 194234.83it/s]
34it [00:00, 150428.62it/s]
158it [00:00, 166969.02it/