
Python 3.8, Ubuntu and CUDA 12 are required. A clean Anaconda/Conda environment is essential. Refer to setup instructions and QDNABERT2_working_env.yaml file.


Linear NN as final BERT layer to make comparison fairer

In [None]:
#!conda install -c "nvidia/label/cuda-12.2.2" cuda-toolkit

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
! pip install triton==2.0.0.dev20221202
! pip install torch==1.13.1
! pip install einops==0.6.1
! pip install peft==0.4.0
! pip install huggingface-hub==0.16.4
! pip install scikit-learn
! pip install matplotlib
! pip install progressbar

In [None]:
! pip install tensorboard==2.13.0
! pip install tensorboard-data-server==0.7.1

In [2]:
### SET DIRECTORY
import os
os.chdir("/home/beri/anaconda3/envs/QDNABERT2env") # my directory
print(os.getcwd())

/home/beri/anaconda3/envs/QDNABERT2env


In [None]:
!git lfs install
!git clone https://huggingface.co/zhihan1996/DNABERT-2-117M

In [3]:
### LOAD PYTHON MODULES
# Load basic modules
import os
import sys
import time
from os import path

# Load data and machine learning modules
import torch
import triton
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import TensorDataset, DataLoader

# Print triton version
print(triton.__version__)

2.0.0


In [4]:
### PRINT GPU DEVICE
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 3070


In [5]:
### LOAD DNABERT MODULE
# In the github I already uploaded this: https://github.com/Zhihan1996/DNABERT_2
# Note: I modified the file DNABERT_2/finetune/train.py to solve some bugs.

sys.path.append("/home/beri/anaconda3/envs/QDNABERT2env/finetune/") 
from train import *

In [6]:
### PARAMETERS
model_args=ModelArguments()
data_args=DataArguments()
training_args=TrainingArguments

# better to save the pretrained model "DNABERT-2-117M" somewhere locally
model_args.model_name_or_path="/home/beri/anaconda3/envs/QDNABERT2env/DNABERT-2-117M/"

batchsize=16 # reduce it to decrease CUDA memory

training_args.deepspeed_plugin=None
#training_args.log_level="info"
training_args.run_name="DNABERT2_aug"
training_args.model_max_length=20
training_args.per_device_train_batch_size=batchsize
training_args.per_device_eval_batch_size=batchsize
training_args.gradient_accumulation_steps=5 # increase it to reduce CUDA memory 
training_args.learning_rate=3e-5
training_args.num_train_epochs=4
training_args.fp16=False
training_args.save_steps=0 #400
training_args.evaluation_strategy="steps"
training_args.eval_steps=500 # avoid testing on validation while training too frequently (takes a lot of memory)
training_args.warmup_steps=50
training_args.logging_steps=100000
training_args.find_unused_parameters=False

# Other arguments to add since it was bugging
training_args.device=torch.device('cuda:0')
training_args.report_to=["tensorboard"]
training_args.world_size=1
training_args.per_device_train_batch_size=8
training_args.train_batch_size=batchsize
training_args.eval_batch_size=batchsize
training_args.test_batch_size=batchsize
training_args.batch_size=batchsize
training_args.num_training_steps=200
training_args.n_gpu=1
training_args.distributed_state=None
training_args.local_rank=-1


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [12]:
# this is the linear classifier layer

import torch.nn as nn

class customLayer(nn.Module):
    def __init__(self):
        
        super(customLayer, self).__init__()
        
        self.linear1 = nn.Linear(768,2)
        self.linear1.weight = torch.nn.Parameter(torch.zeros(768,2))
        self.linear1.bias = torch.nn.Parameter(torch.ones(2))

    def forward(self, input_array):
        h = self.linear1(input_array)
        return self.linear1(h)


In [13]:
# MAKE CUSTOM MODEL TO MODIFY DNABERT2

from transformers.modeling_outputs import TokenClassifierOutput

class CustomModel(nn.Module):
  def __init__(self,num_labels): 
    super(CustomModel,self).__init__() 
    self.num_labels = num_labels 

    #Load Model with given checkpoint and extract its body
    self.tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
    self.model = AutoModel.from_pretrained(model_args.model_name_or_path, 
                                           trust_remote_code=True, output_hidden_states=True).cuda()
    self.dropout = nn.Dropout(0.1) 

    
    # QUANTUM LAYER 
    self.classifier = customLayer() 

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    #Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    #Add custom layers
    sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state
    print(sequence_output.shape)
    
    # By default, no pooling is done, only the first word is taken (sequence_output[:,0,:]). 
    # The authors of BERT paper found it sufficient to use only the output from the 1st token 
    # for few tasks such as classification
    logits = self.classifier(sequence_output[:,0,:].view(-1,768)) # calculate losses
    
    # POOLING FOR QUANTUM TEAM
    #sequence_output_max=torch.max(sequence_output, dim=1) # here global max pooling
    #logits = self.classifier(sequence_output_max)
    
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    
    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs[0])

modeltest=CustomModel(2).cuda()
print(modeltest)

total_params = sum(p.numel() for p in modeltest.parameters())
print(str(total_params)+" parameters")


Some weights of BertModel were not initialized from the model checkpoint at /home/beri/anaconda3/envs/QDNABERT2env/DNABERT-2-117M/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4096, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertUnpadAttention(
            (self): BertUnpadSelfAttention(
              (dropout): Dropout(p=0.0, inplace=False)
              (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (mlp): BertGatedLinearUnitMLP(
            (gated_layers): Linear(in_features=768, out_features=61

In [14]:
# please check the below is correct - BERT parameters will be frozen and only quantum layer will be trained
# although the model is working there could still be errors here in terms of incorrect class inheritance, 
# so please check this code here very carefully
# you are also free to rewrite the code into a cleaner and perhaps more effective way

model = CustomModel(nn.Module)

for param in model.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True

for name, param in model.named_parameters():
    print(name)

# Use CUDA or CPU according to the "device" object.
model = model.to(device)

Some weights of BertModel were not initialized from the model checkpoint at /home/beri/anaconda3/envs/QDNABERT2env/DNABERT-2-117M/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.embeddings.word_embeddings.weight
model.embeddings.token_type_embeddings.weight
model.embeddings.LayerNorm.weight
model.embeddings.LayerNorm.bias
model.encoder.layer.0.attention.self.Wqkv.weight
model.encoder.layer.0.attention.self.Wqkv.bias
model.encoder.layer.0.attention.output.dense.weight
model.encoder.layer.0.attention.output.dense.bias
model.encoder.layer.0.attention.output.LayerNorm.weight
model.encoder.layer.0.attention.output.LayerNorm.bias
model.encoder.layer.0.mlp.gated_layers.weight
model.encoder.layer.0.mlp.wo.weight
model.encoder.layer.0.mlp.wo.bias
model.encoder.layer.0.mlp.layernorm.weight
model.encoder.layer.0.mlp.layernorm.bias
model.encoder.layer.1.attention.self.Wqkv.weight
model.encoder.layer.1.attention.self.Wqkv.bias
model.encoder.layer.1.attention.output.dense.weight
model.encoder.layer.1.attention.output.dense.bias
model.encoder.layer.1.attention.output.LayerNorm.weight
model.encoder.layer.1.attention.output.LayerNorm.bias
model.encoder.layer.1.mlp.gated_

In [15]:
# PICK GUE DATA CLASS
GUE_class="tf" # pick among: "EMP" "mouse" "prom" "splice" "tf" "virus"

if GUE_class=="EMP":
    GUE_subclasses = ["H3","H3K14ac","H3K36me3","H3K4me1","H3K4me2","H3K4me3","H3K79me3","H3K9ac","H4","H4ac"]

if GUE_class=="mouse":
    GUE_subclasses = ["0", "1", "2", "3", "4"]

if GUE_class=="prom":
    GUE_subclasses = ["prom_300_all","prom_300_notata","prom_300_tata",
                      "prom_core_all","prom_core_notata","prom_core_tata"]

if GUE_class=="splice":
    GUE_subclasses = ["reconstructed"]

if GUE_class=="tf":
    GUE_subclasses = ["0", "1", "2", "3", "4"]

if GUE_class=="virus":
    GUE_subclasses = ["covid"]


In [16]:
from sklearn import metrics

In [17]:
### FINE TUNE DNABERT2


for GUE_subclass in GUE_subclasses:
    
    #GUE_subclass="H3K4me1"

    data_args.data_path="/home/beri/anaconda3/envs/QDNABERT2env/GUE/"+GUE_class+"/"+GUE_subclass
    training_args.output_dir="results/DNABERT2/"+GUE_class+"/"+GUE_subclass

    # load tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
        trust_remote_code=True,
    )

    if "InstaDeepAI" in model_args.model_name_or_path:
        tokenizer.eos_token = tokenizer.pad_token

    # define datasets and data collator
    train_dataset = SupervisedDataset(tokenizer=tokenizer, 
                                      data_path=os.path.join(data_args.data_path, "train.csv"), 
                                      kmer=data_args.kmer)
    val_dataset = SupervisedDataset(tokenizer=tokenizer, 
                                     data_path=os.path.join(data_args.data_path, "dev.csv"), 
                                     kmer=data_args.kmer)
    test_dataset = SupervisedDataset(tokenizer=tokenizer, 
                                     data_path=os.path.join(data_args.data_path, "test.csv"), 
                                     kmer=data_args.kmer)
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

    
    # load model
    # model used by defaults (no model customization)
    if False:
        model=transformers.AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
            num_labels=train_dataset.num_labels,
            trust_remote_code=True,
            output_hidden_states=False,
        )
    # customised model
    if True:
        model=CustomModel(num_labels=2).cuda()

    # configure LoRA
    if model_args.use_lora:
        lora_config = LoraConfig(
            r=model_args.lora_r,
            lora_alpha=model_args.lora_alpha,
            target_modules=list(model_args.lora_target_modules.split(",")),
            lora_dropout=model_args.lora_dropout,
            bias="none",
            task_type="SEQ_CLS",
            inference_mode=False,
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    # define trainer
    trainer = transformers.Trainer(model=model,
                                   tokenizer=tokenizer,
                                   args=training_args,
                                   compute_metrics=compute_metrics,
                                   train_dataset=train_dataset,
                                   eval_dataset=val_dataset,
                                   data_collator=data_collator)
    trainer.local_rank=training_args.local_rank
    trainer.train()

    if training_args.save_model:
        trainer.save_state()
        safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
        
    
    ### TEST ACCURACY ON INDEPENDENT TEST DATA

    # get the evaluation results from trainer
    if training_args.eval_and_save_results:
        results_path = os.path.join(training_args.output_dir, "metrics")
        results = trainer.evaluate(eval_dataset=test_dataset)
        os.makedirs(results_path, exist_ok=True)
        with open(os.path.join(results_path, "test_results.json"), "w") as f:
            json.dump(results, f)
        
    ### ONLY IF YOU WANT TO SAVE THE FINE-TUNED MODEL (BY DEFAULT: NOT SAVING)
    if False:
        path_model="/home/beri/anaconda3/envs/QDNABERT2env/pytorch_model_finetuned.bin"
        torch.save(model.state_dict(), path_model)
    
    del tokenizer, train_dataset, val_dataset, test_dataset, data_collator, model, trainer
     

Some weights of BertModel were not initialized from the model checkpoint at /home/beri/anaconda3/envs/QDNABERT2env/DNABERT-2-117M/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Currently training with a batch size of: 16
***** Running training *****
  Num examples = 32,378
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Training with DataParallel so batch size has been adjusted to: 16
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 5
  Total optimization steps = 1,616
  Number of trainable parameters = 117,070,082


torch.Size([16, 20, 768])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x768 and 2x768)