## DNABERT2 Fine-tuning to detect the location/region of the promoter

### Modules import

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig
from sklearn.model_selection import train_test_split
import os
from transformers import AutoTokenizer, AutoModel, BertConfig, AutoModelForMaskedLM, AutoModelForTokenClassification
import pandas as pd

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### model loading

In [2]:
# Checkpoint identifier passed to every from_pretrained() call below.
model_name = 'zhihan1996/DNABERT-2-117M'

In [3]:
# Load the tokenizer and the encoder for the checkpoint. trust_remote_code is
# needed because the checkpoint ships its own modeling files.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
config = BertConfig.from_pretrained(model_name, output_hidden_states=True)

# Load the bare encoder with AutoModel rather than a token-classification
# wrapper: the classifier defined further down reads the encoder's hidden
# states (outputs[0] / last_hidden_state) and adds its own linear head.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, config=config)
dnabert2 = model

dnabert2.to(device)

# Freeze every encoder weight so that only layers added on top are trained.
for param in dnabert2.parameters():
    param.requires_grad = False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/168k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

bert_layers.py:   0%|          | 0.00/40.7k [00:00<?, ?B/s]

flash_attn_triton.py:   0%|          | 0.00/42.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/zhihan1996/DNABERT-2-117M:
- flash_attn_triton.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


bert_padding.py:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/zhihan1996/DNABERT-2-117M:
- bert_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/zhihan1996/DNABERT-2-117M:
- bert_layers.py
- flash_attn_triton.py
- bert_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/468M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### data loading

Dataset class definition

In [4]:
class PromoterDataset(Dataset):
    """Dataset pairing one tokenized sequence with one sequence-level label.

    Args:
        sequences: Indexable collection of raw sequence strings.
        labels: Indexable collection of labels, parallel to ``sequences``.
        tokenizer: Callable accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, sequences, labels, tokenizer, max_length):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for item ``idx``."""
        seq = self.sequences[idx]
        label = self.labels[idx]

        # Tokenize the sequence to a fixed length.
        encoding = self.tokenizer(
            seq,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Convert label to tensor
        label = torch.tensor(label, dtype=torch.long)

        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever it has length 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label
        }


https://drive.usercontent.google.com/download?id=1GRtbzTe3UXYF1oW27ASNhYX3SZ16D7N2&export=download&authuser=0&confirm=t&uuid=9a91e4d5-dfac-4ed1-869c-52ff8525f085&at=AENtkXaXrQnIdKo74wE_zRA19WYK%3A1732141945972

In [5]:
# Read the train and test tables (train first, then test).
# Replace the file names with the actual data files.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [6]:
# Extract sequences and labels from the training data
train_sequences = train_data.iloc[:, 0].tolist()
train_labels = train_data.iloc[:, 1].tolist()

# Extract sequences and labels from the testing data
test_sequences = test_data.iloc[:, 0].tolist()
test_labels = test_data.iloc[:, 1].tolist()

In [7]:
# Create datasets
max_length = 512  # Adjust if need
train_dataset = PromoterDataset(train_sequences, train_labels, tokenizer, max_length)
test_dataset = PromoterDataset(test_sequences, test_labels, tokenizer, max_length)

# Create data loaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

#### Implement the DNABERT2 promoter classifier

Freeze the pretrained DNABERT2 and add a single linear layer on top as a binary classifier (1 = "it is a promoter", 0 = "it is not a promoter").

In [8]:
class DNABERT2PromoterClassifier(nn.Module):
    """Sequence-level classifier: a linear head on the encoder's first position.

    Args:
        dnabert2: Encoder exposing ``config.hidden_size``; it is called with
            ``input_ids`` and ``attention_mask`` keyword arguments.
        num_classes: Number of logits produced per sequence.
    """

    def __init__(self, dnabert2, num_classes=2):
        super().__init__()
        self.model = dnabert2
        self.hidden_size = dnabert2.config.hidden_size
        self.classifier = nn.Linear(
            in_features=self.hidden_size, out_features=num_classes
        )

    def forward(self, input_ids, attention_mask):
        """Return one row of ``num_classes`` logits per input sequence."""
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # The encoder may return a plain tuple or an output object.
        hidden = (
            encoded[0] if isinstance(encoded, tuple) else encoded.last_hidden_state
        )

        # Position 0 on the sequence axis (the [CLS] token) stands in for the
        # whole sequence.
        return self.classifier(hidden[:, 0, :])

In [9]:
# Initialize the classifier
model = DNABERT2PromoterClassifier(dnabert2)
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [11]:
# Training loop
epochs = 3  # Adjust as needed
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0
    correct_predictions = 0
    total = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Compute loss
        loss = criterion(logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Statistics
        total_loss += loss.item()
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total += labels.size(0)

    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions.double() / total
    print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Evaluation on test set
    model.eval()
    correct_predictions = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total += labels.size(0)
    test_accuracy = correct_predictions.double() / total
    print(f'Test Accuracy: {test_accuracy:.4f}')

Epoch 1/3
Epoch 1/3, Loss: 0.6184, Accuracy: 0.7390
Test Accuracy: 0.7542
Epoch 2/3
Epoch 2/3, Loss: 0.5430, Accuracy: 0.7939
Test Accuracy: 0.7775
Epoch 3/3
Epoch 3/3, Loss: 0.4987, Accuracy: 0.8090
Test Accuracy: 0.7870


## DNABERT2 + Fine-tuned to locate the promoter (Ignore all parts above, only use this section)

### Package import

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertConfig, AutoModel
import random
from sklearn.model_selection import train_test_split

### file/model read

In [None]:
# pandas is imported in this cell so that this section runs on its own; its
# package-import cell does not bring `pd` into scope.
import pandas as pd

# Read both columns as text. The label column is later consumed element by
# element (list(label), int(ch)), so it must not be parsed as a number, which
# would drop leading zeros.
df = pd.read_csv(
    '/content/combined_sequences_and_one_hot_1 (1).csv',
    dtype={'Sequence': str, 'One-Hot Encoding': str},
)
print(df.head())

In [None]:
# Choose the compute device, then load the tokenizer and encoder onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'zhihan1996/DNABERT-2-117M'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
config = BertConfig.from_pretrained(model_name)
dnabert2 = AutoModel.from_pretrained(
    model_name, config=config, trust_remote_code=True
)
dnabert2.to(device)

# Freeze the encoder: none of its weights receive gradients.
for encoder_weight in dnabert2.parameters():
    encoder_weight.requires_grad = False

### model/dataset class definitions

#### model class

##### purpose

To predict whether each nucleotide in a DNA sequence is part of a promoter region (label 1) or not (label 0).

##### components

Pre-trained DNABERT2 Model: Provides contextual embeddings for DNA sequences.   
Multi-head Attention Layer: Focuses on relevant positions in the sequence.   
Classifier Layer: Makes token-level predictions.  

In [None]:
class PromoterDetectionModel(nn.Module):
    """Per-position tagger: encoder -> multi-head self-attention -> linear head.

    Args:
        bert_model: Encoder exposing ``config.hidden_size``; it is called with
            ``input_ids`` and ``attention_mask`` keyword arguments and returns
            either a tuple whose first item is the hidden states or an object
            with ``last_hidden_state``.
    """

    def __init__(self, bert_model):
        super().__init__()
        # Use the encoder that was passed in; relying on a module-level
        # variable would silently ignore the argument.
        self.bert = bert_model
        hidden_size = bert_model.config.hidden_size
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=8)
        self.classifier = nn.Linear(hidden_size, 2)  # two logits (0 / 1) per position

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch_size, seq_length, 2).

        ``attention_mask`` is expected to hold 1 for real tokens and 0 for
        padding; positions with 0 are excluded as attention keys. Every row is
        assumed to contain at least one real token.
        """
        # fetch the output from pre-trained DNABERT2
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        if isinstance(outputs, tuple):
            sequence_output = outputs[0]
        else:
            sequence_output = outputs.last_hidden_state  # (batch_size, seq_length, hidden_size)

        # nn.MultiheadAttention defaults to sequence-first input
        sequence_output = sequence_output.permute(1, 0, 2)  # (seq_length, batch_size, hidden_size)

        # True marks padded positions, which must not be attended to.
        key_padding_mask = None if attention_mask is None else attention_mask == 0

        # self-attention over the encoder output
        attn_output, _ = self.attention(
            sequence_output, sequence_output, sequence_output,
            key_padding_mask=key_padding_mask,
        )

        # back to batch-first
        attn_output = attn_output.permute(1, 0, 2)  # (batch_size, seq_length, hidden_size)

        # classifier
        logits = self.classifier(attn_output)  # (batch_size, seq_length, 2)
        return logits

dataset class (wrapped by the DataLoader)

In [None]:
class PromoterDataset(Dataset):
    """Dataset yielding a tokenized sequence plus a fixed-length label vector.

    Args:
        sequences: Indexable collection of raw sequence strings.
        labels: Indexable collection parallel to ``sequences``; each entry is
            an iterable whose elements can be passed to ``int()``.
        tokenizer: Callable accepting ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length that both the tokens and the labels are padded or
            truncated to.

    NOTE(review): label position i is paired with token position i further
    down the pipeline; confirm that the tokenizer emits one token per label.
    """

    def __init__(self, sequences, labels, tokenizer, max_length=512):
        self.sequences, self.labels = sequences, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        raw_sequence = self.sequences[idx]
        raw_label = self.labels[idx]

        # Tokenize to exactly max_length positions.
        encoded = self.tokenizer(
            raw_sequence,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Parse the label elements, then right-pad with zeros and cut to
        # max_length in one step (a negative pad count yields no padding).
        target = [int(element) for element in raw_label]
        target = (target + [0] * (self.max_length - len(target)))[:self.max_length]

        return {
            'input_ids': encoded['input_ids'].squeeze(0),  # (max_length)
            'attention_mask': encoded['attention_mask'].squeeze(0),  # (max_length)
            'labels': torch.tensor(target, dtype=torch.long),  # (max_length)
        }

model initialization

In [None]:
# Wrap the encoder in the tagging model, then move the whole model to the device.
model = PromoterDetectionModel(bert_model=dnabert2)
model.to(device=device)

data loader initialization

In [None]:
# Pull the two columns; every label entry is expanded into a list of its
# elements (single characters when the entry is a string).
sequences = df['Sequence'].tolist()
labels = [list(label_entry) for label_entry in df['One-Hot Encoding'].tolist()]

# Hold out 20% of the rows for validation, with a fixed seed.
train_sequences, val_sequences, train_labels, val_labels = train_test_split(
    sequences, labels, random_state=42, test_size=0.2)

# Both datasets rely on the dataset class's default max_length.
train_dataset = PromoterDataset(
    sequences=train_sequences, labels=train_labels, tokenizer=tokenizer)
val_dataset = PromoterDataset(
    sequences=val_sequences, labels=val_labels, tokenizer=tokenizer)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

loss function and optimizer

In [None]:
# Cross-entropy over the two per-position classes; Adam is given only the
# parameters that still require gradients.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    [weight for weight in model.parameters() if weight.requires_grad],
    lr=1e-3,
)


### Model fine-tuning (only the new layers on top of the pre-trained DNABERT2 are trained; DNABERT2 itself is frozen)

In [None]:
# Train for a fixed number of epochs, printing the mean training loss and the
# validation accuracy after each one, then save the final weights.
# NOTE(review): positions added as padding by the dataset (label 0) count
# toward both the loss and the accuracy; confirm this is intended.
epochs = 300

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )
        optimizer.zero_grad()

        # Flatten so each position is one classification example:
        # logits -> (batch_size * seq_length, 2), labels -> (batch_size * seq_length)
        logits = model(input_ids, attention_mask).view(-1, 2)
        labels_flat = labels.view(-1)

        loss = criterion(logits, labels_flat)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (
                batch[field].to(device)
                for field in ('input_ids', 'attention_mask', 'labels')
            )

            logits = model(input_ids, attention_mask)
            predictions = logits.argmax(dim=-1)

            total_correct += int((predictions == labels).sum())
            total_count += labels.numel()

    accuracy = total_correct / total_count
    print(f'Validation Accuracy: {accuracy:.4f}')

# Persist the weights once, after the last epoch.
torch.save(model.state_dict(), 'promoter_detection_model.pth')