In [None]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig, DataCollatorWithPadding, AutoModel
import os

In [None]:
# LoRA hyper-parameters for the adapter attached to the pretrained encoder.
# NOTE(review): `target_modules` must name layers that actually exist in the
# wrapped model; check them against the `print(model)` output - TODO confirm.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=32,  # Rank
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["query", "value"],
    bias="none",
)

In [None]:
# Load the DNABERT-2 model and its tokenizer from the Hugging Face hub.
root_dir = "/home/sxr280/BERTLocRNA/"
model_path = "zhihan1996/DNABERT-2-117M"


def path_join(*path):
    """Join the given path components and return the absolute, normalized path."""
    return os.path.abspath(os.path.join(*path))


# NOTE(review): the ".." makes this resolve to a sibling of `root_dir`, not a
# folder inside it - confirm that is the intended location.
cache_dir = path_join(root_dir, "..", "saved_model", "DNABERT2")
# Weights and tokenizer share one cache directory so both end up under `cache_dir`.
model = AutoModel.from_pretrained(model_path, cache_dir=cache_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir, trust_remote_code=True)



In [None]:
# Print the loaded model's representation (presumably to look up the submodule
# names that can be used as LoRA `target_modules`).
print(model)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Build a short report on how many of ``model``'s parameters are trainable.

    Despite the name, nothing is printed: the report is returned as a string.

    Args:
        model: Object whose ``named_parameters()`` yields ``(name, param)``
            pairs; each ``param`` must offer ``numel()`` and ``requires_grad``.

    Returns:
        str: Three lines giving the trainable count, the total count and the
        trainable percentage formatted with two decimals.
    """
    # Collect (size, is_trainable) once, then derive both totals from it.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, trainable in sizes if trainable)
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"
    )


In [None]:
# Attach the LoRA adapter to the loaded model, then print the trainable-parameter report.
peft_model = get_peft_model(model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
import sys
# Add two ancestor directories (relative to the current working directory) to
# the import search path so the `utils` package below can be resolved.
sys.path.append("../../")
sys.path.append("../../../")
from utils.embedding_generator import NucleotideTransformerEmbedder

In [None]:
from torch import nn

class Lora:
    """Helper/mixin bundling LoRA utilities: truncation, PEFT wrapping, parameter report.

    Instance attributes (set in ``__init__``):
        max_tokens: Maximum length kept by ``Etrunc``; ``None`` disables truncation.
        lora_config: Mapping read by ``wrapper`` for the keys ``"r"``,
            ``"lora_alpha"`` and ``"target_modules"``.
    """

    def __init__(self, max_tokens = None, lora_config = None):
        self.max_tokens = max_tokens
        self.lora_config = lora_config

    def Etrunc(self, seq):
        """Shorten ``seq`` to at most ``max_tokens`` items, keeping both ends.

        The first ``max_tokens // 2`` items and the last
        ``max_tokens - max_tokens // 2`` items are concatenated, so the result
        has exactly ``max_tokens`` items even when ``max_tokens`` is odd.

        Args:
            seq: Any sliceable sequence supporting ``len`` and ``+`` (str, list, ...).

        Returns:
            ``seq`` unchanged when it already fits or when ``max_tokens`` is
            ``None``; otherwise the head+tail concatenation.
        """
        limit = self.max_tokens
        if limit is None or len(seq) <= limit:
            return seq
        # Integer division: slice bounds must be ints (``/`` yields a float).
        head = limit // 2
        tail = limit - head
        # Index from the front for the tail so that tail == 0 yields an empty
        # slice (``seq[-0:]`` would return the whole sequence).
        return seq[:head] + seq[len(seq) - tail:]

    def wrapper(self, model):
        """Wrap ``model`` with a LoRA adapter built from ``self.lora_config``.

        Rank, alpha and target modules come from ``self.lora_config``; dropout
        (0.05), bias ("none") and the task type (SEQ_CLS) are fixed here.

        Returns:
            The object returned by ``get_peft_model``.
        """
        lora_config = LoraConfig(
                r=self.lora_config["r"], # Rank
                lora_alpha=self.lora_config["lora_alpha"],
                target_modules=self.lora_config["target_modules"],
                lora_dropout=0.05,
                bias="none",
                task_type=TaskType.SEQ_CLS
            )
        peft_model = get_peft_model(model, lora_config)
        return peft_model

    @staticmethod
    def print_number_of_trainable_model_parameters(model):
        """Return a three-line report of trainable vs. total parameter counts.

        Despite the name, nothing is printed; the caller decides what to do
        with the returned string.

        Args:
            model: Object whose ``named_parameters()`` yields ``(name, param)``
                pairs with ``numel()`` and ``requires_grad``.

        Returns:
            str: Trainable count, total count and trainable percentage
            (two decimals). A model without parameters reports ``0.00%``.
        """
        trainable_model_params = 0
        all_model_params = 0
        for _, param in model.named_parameters():
            all_model_params += param.numel()
            if param.requires_grad:
                trainable_model_params += param.numel()
        # Guard the ratio so a parameter-free model does not raise ZeroDivisionError.
        percentage = 100 * trainable_model_params / all_model_params if all_model_params else 0.0
        return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {percentage:.2f}%"





class LoraNT(NucleotideTransformerEmbedder, Lora):
    """Embedder wrapped with LoRA plus a linear + sigmoid multi-label head.

    NOTE(review): this class relies on the base embedder providing
    ``self.max_tokens``, ``self.hidden_dim``, ``self.model_config``,
    ``self.model``, ``self.tokenizer`` and ``get_embed``. None of them are
    visible here - confirm against ``utils.embedding_generator``.
    """

    def __init__(self, model_config, lora_config):
        """Initialize both bases, build the prediction head and wrap the model.

        Args:
            model_config: Mapping with the keys ``"task"``, ``"model_path"``
                and ``"hidden_dim"`` forwarded to the embedder base class.
            lora_config: Mapping forwarded to ``Lora`` (see ``Lora.wrapper``).
        """
        # ``super(Cls)`` with a single argument is an unbound proxy whose
        # ``__init__`` is not the base-class initializer, so each base is
        # initialized explicitly instead.
        NucleotideTransformerEmbedder.__init__(self,
                                               task = model_config["task"],
                                               model_path = model_config["model_path"],
                                               hidden_dim = model_config["hidden_dim"])
        # The embedder is expected to have set ``self.max_tokens`` by now.
        Lora.__init__(self, max_tokens = self.max_tokens, lora_config = lora_config)

        # Final layers mapping the flattened embedding to multi-label predictions.
        pooling_size = self.model_config.pooling_size
        self.maxpool = nn.MaxPool1d(pooling_size, stride = pooling_size)
        # nn.Linear needs an integer feature count, hence floor division.
        flat_dim = self.hidden_dim * self.max_tokens // pooling_size
        self.last_layer = nn.Linear(flat_dim, self.model_config.nb_classes)
        self.sigmoid = nn.Sigmoid()

        # Wrap the pretrained model with LoRA, then report how many parameters
        # remain trainable (the helper returns a string, so print it).
        self.model = self.wrapper(self.model)
        print(Lora.print_number_of_trainable_model_parameters(self.model))

    def tokenization(self, x):
        """Truncate and tokenize ``x``.

        Args:
            x: A single sequence, or a list/tuple of sequences (each element is
                then truncated on its own rather than shortening the batch).

        Returns:
            tuple: ``(input_ids, masks)`` - the tokenizer's ``"input_ids"`` and
            ``"attention_mask"`` entries converted with ``.int()``.
        """
        if isinstance(x, (list, tuple)):
            x = [self.Etrunc(seq) for seq in x]
        else:
            x = self.Etrunc(x)
        tokens = self.tokenizer(
                    x,
                    truncation = True,
                    padding = "max_length",
                    return_tensors="pt"
                )
        input_ids = tokens["input_ids"].int()
        masks = tokens["attention_mask"].int()
        return input_ids, masks

    def forward(self, x):
        """Return sigmoid outputs of the linear head applied to the flattened embedding.

        NOTE(review): ``self.maxpool`` is never applied here although
        ``last_layer`` is sized as if the embedding had been pooled by
        ``pooling_size``. Confirm whether ``get_embed`` already pools,
        otherwise the flattened size will not match ``last_layer``.
        """
        # Preprocessing: ``tokenization`` returns the (ids, mask) pair.
        input_ids, masks = self.tokenization(x)
        PLM_out = self.get_embed(input_ids, masks)
        batch_size = PLM_out.size(0)
        # Flatten everything except the batch dimension.
        PLM_out = PLM_out.reshape(batch_size, -1)
        logits = self.last_layer(PLM_out)
        pred = self.sigmoid(logits)
        return pred


In [None]:
from datasets import load_dataset, DatasetDict
# Cache location; only referenced by the commented-out argument below.
hf_cache = "/tmp/erda/BERTLocRNA/cache"
# Path handed to `load_dataset`.
save_path = os.path.join(
    "/", "tmp", "erda", "BERTLocRNA", "embeddings",
    "_".join(["RNAlocalization_Lora", "DNABERT2"]) + "embedding",
)
a = load_dataset(save_path)  # , cache_dir = hf_cache)

In [1]:
import os
import fm
import torch
from peft import LoraConfig, get_peft_model, TaskType, IA3Config
def path_join(*path):
    """Join the given path components and return the absolute, normalized path."""
    return os.path.abspath(os.path.join(*path))


# Load the RNA-FM checkpoint from the local model folder.
model_path = "/home/sxr280/BERTLocRNA/saved_model/RNAFM"
model, alphabet = fm.pretrained.rna_fm_t12(path_join(model_path, "RNA-FM_pretrained.pth"))
batch_converter = alphabet.get_batch_converter()
model.eval()  # switch the model to evaluation mode

# Example input: (label, sequence) pairs.
data = [
    ("RNA1", "GGGUGCGAUCAUACCAGCACUAAUGCCCUCCUGGGAAGUCCUCGUGUUGCACCCCU"),
    ("RNA2", "GGGUGUCGCUCAGUUGGUAGAGUGCUUGCCUGGCAUGCAAGAAACCUUGGUUCAAUCCCCAGCACUGCA"),
    ("RNA3", "CGAUUCNCGUUCCC--CCGCCUCCA"),
]
batch_labels, batch_strs, batch_tokens = batch_converter(data)

# Extract embeddings (on CPU). This is pure inference, so gradient tracking is
# disabled to avoid building an autograd graph for the whole forward pass.
with torch.no_grad():
    results = model(batch_tokens, repr_layers=[12])
token_embeddings = results["representations"][12]


In [2]:
# LoRA adapter on the attention projections of the model loaded above.
lora_config = LoraConfig(
    target_modules=["k_proj", "v_proj", "q_proj"],
    r=8,  # Rank
    lora_alpha=16,
    lora_dropout=0.05,
    bias="all",
)
peft_model = get_peft_model(model, lora_config)
# Forward pass through the wrapped model, requesting the layer-12 representations.
results = peft_model(batch_tokens, repr_layers=[12])

In [8]:
# Report trainable vs. total parameter counts of the wrapped model (see recorded output below).
peft_model.print_trainable_parameters()

trainable params: 486,426 || all params: 99,890,186 || trainable%: 0.4869607510791901


In [5]:
# Layer-12 representations taken from the most recent `results`; the recorded
# output below shows shape (3, 71, 640) - presumably (sequences, tokens, embedding dim).
token_embeddings = results["representations"][12]
token_embeddings.shape

torch.Size([3, 71, 640])

In [None]:
# IA3 adapter configuration. No `task_type` is set, matching the LoRA cell
# above: the wrapped model is called as `peft_model(batch_tokens, repr_layers=...)`,
# which needs the generic wrapper that forwards arguments unchanged rather than
# the sequence-classification wrapper with its Hugging Face-style keyword arguments.
# NOTE(review): confirm that "f1"/"f2" are real layer names in this model (the
# feed-forward layers may be named differently); names that match nothing are
# not adapted.
# NOTE(review): `model` was already passed to `get_peft_model` for LoRA above;
# consider reloading a fresh base model before attaching IA3 - TODO confirm.
IA3_config = IA3Config(
    target_modules=["k_proj", "v_proj", "f1", "f2"],
    feedforward_modules=["f1", "f2"],
)
peft_model = get_peft_model(model, IA3_config)

In [None]:
# Forward pass through the current `peft_model`, requesting the layer-12
# representations. The call is issued once; repeating it in the same cell only
# recomputed the same result.
results = peft_model(batch_tokens, repr_layers=[12])

In [None]:
# NOTE(review): same call as in the previous cell; likely a leftover re-run - consider removing.
results = peft_model(batch_tokens, repr_layers=[12])