In [None]:
import torch

# Report whether CUDA can be used; the device name is only queried
# (for device index 0) when a GPU is actually present.
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available."
)

In [None]:
import os
import re
import torch
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
from torch.optim import Adam

In [None]:
import pandas as pd
import re
import random
from datasets import Dataset

# Read the pretraining table from the working directory; the cells below
# use its 'SEQ' column.
data = pd.read_csv(filepath_or_buffer="Pretrain.csv")

def filter_seq(seq):
    """Keep only the characters A, T, G and C of ``seq``.

    The value is converted with ``str()`` first, so non-string inputs are
    accepted. Every other character is dropped, including lowercase bases.
    """
    allowed_bases = 'ATGC'
    return ''.join(base for base in str(seq) if base in allowed_bases)

# Clean every sequence element-wise, overwriting the 'SEQ' column with the
# A/T/G/C-only strings.
data['SEQ'] = data['SEQ'].map(filter_seq)

def chunk_sequences(sequences, min_length=50, max_length=512, seed=None):
    """Split sequences into pieces whose lengths lie in [min_length, max_length].

    Sequences shorter than ``min_length`` are discarded. Sequences that
    already fit the range are kept whole. Longer sequences are cut from left
    to right into consecutive, non-overlapping chunks of random length; a
    trailing remainder shorter than ``min_length`` is dropped.

    Args:
        sequences: Iterable of strings (anything supporting ``len`` and slicing).
        min_length: Smallest chunk length to emit; must be at least 1.
        max_length: Largest chunk length to emit; must be >= ``min_length``.
        seed: Optional seed. When given, chunk boundaries come from a private
            ``random.Random(seed)`` and are reproducible. When ``None``, the
            module-level ``random`` functions are used, so a prior
            ``random.seed(...)`` call is still honoured.

    Returns:
        List of chunks, in input order.

    Raises:
        ValueError: If ``min_length < 1`` or ``max_length < min_length``.
    """
    # A zero-length chunk would never advance `start`, so the splitting loop
    # below could not terminate; reject such bounds up front.
    if min_length < 1 or max_length < min_length:
        raise ValueError(
            "expected 1 <= min_length <= max_length, got "
            f"min_length={min_length}, max_length={max_length}"
        )

    rng = random if seed is None else random.Random(seed)
    chunked_sequences = []

    for seq in sequences:
        seq_length = len(seq)

        if seq_length < min_length:
            continue  # too short to be a usable example

        if seq_length <= max_length:
            chunked_sequences.append(seq)
            continue

        start = 0
        # Keep cutting while what is left can still form a valid chunk.
        while seq_length - start >= min_length:
            chunk_size = rng.randint(
                min_length,
                min(max_length, seq_length - start)
            )
            chunked_sequences.append(seq[start:start + chunk_size])
            start += chunk_size

    return chunked_sequences

# Chunk boundaries are drawn with the `random` module; seed it so the
# resulting dataset is the same on every Restart & Run All.
random.seed(42)

chunked_seqs = chunk_sequences(data['SEQ'].tolist())
chunked_df = pd.DataFrame({'SEQ': chunked_seqs})

def create_kmers(sequence, k=3):
    """Return the overlapping k-mers of ``sequence`` joined by single spaces.

    A window of width ``k`` slides one character at a time, so a sequence of
    length ``n`` yields ``n - k + 1`` k-mers.

    Args:
        sequence: String to split.
        k: Window width; must be at least 1. Defaults to 3.

    Returns:
        The space-separated k-mers, or ``""`` when ``sequence`` is shorter
        than ``k``.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    window_starts = range(len(sequence) - k + 1)
    return ' '.join(sequence[i:i + k] for i in window_starts)

# Add a column with each chunk's overlapping 3-mers as one space-separated string.
chunked_df['kmers'] = chunked_df['SEQ'].apply(create_kmers, k=3)

# Hand both text columns to Hugging Face `datasets`, without the pandas index.
dataset = Dataset.from_pandas(chunked_df.loc[:, ['SEQ', 'kmers']], preserve_index=False)

print(f"Final dataset size: {len(dataset)} rows")


In [None]:
# Tokenizer for DNABERT
# The checkpoint's k-mer size has to match the k used to build the `kmers`
# column (k=3). NOTE(review): the previous id "zhihan1996/DNA_bert_n" looked
# like an unfilled `n` placeholder -- confirm that DNA_bert_3 is the intended
# checkpoint.
MODEL_NAME = "zhihan1996/DNA_bert_3"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of k-mer strings for masked-LM training.

    Args:
        examples: Batch mapping from ``Dataset.map(batched=True)``; its
            'kmers' entry holds space-separated k-mer strings.

    Returns:
        The tokenizer output, padded and truncated to 512 tokens.
    """
    # DNABERT is trained on whitespace-separated k-mer tokens, so the
    # 'kmers' column is the model input. The raw 'SEQ' string has no
    # whitespace and would be treated as a single word.
    return tokenizer(examples['kmers'], padding="max_length", truncation=True, max_length=512)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data collator: masks 15% of the tokens for the MLM objective
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Load DNABERT model (same checkpoint as the tokenizer so the vocabularies agree)
model = BertForMaskedLM.from_pretrained(MODEL_NAME)

In [None]:
# Run configuration for the masked-LM training job
training_args = TrainingArguments(**{
    # Where checkpoints go; an existing directory is overwritten
    "output_dir": "./TransVi_pretrained_n",
    "overwrite_output_dir": True,
    # Schedule
    "num_train_epochs": 20,
    "per_device_train_batch_size": 8,
    # Checkpoint every 10k steps and keep at most two of them
    "save_steps": 10_000,
    "save_total_limit": 2,
    # Reporting
    "prediction_loss_only": True,
    "logging_dir": './logs',
})

In [None]:
# Custom Trainer with Adam Optimizer
class CustomTrainer(Trainer):
    """``Trainer`` subclass that optimizes with ``torch.optim.Adam``.

    Only optimizer construction is overridden; everything else is inherited.
    """

    def create_optimizer(self):
        """Create the Adam optimizer (lr=2e-5) if none is set, and return it.

        Returns:
            The optimizer stored on ``self.optimizer``.
        """
        # Keep an optimizer that is already set (for example one passed
        # through the `optimizers` constructor argument) rather than
        # replacing it.
        if self.optimizer is None:
            self.optimizer = Adam(self.model.parameters(), lr=2e-5)
        # The base-class method returns the optimizer, so do the same for
        # callers that use the return value.
        return self.optimizer

# Wire the model, the tokenized data and the MLM collator into the
# Adam-based trainer
trainer = CustomTrainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Start training; as the cell's last expression, the result is displayed
trainer.train()

In [None]:
# Write the trained weights and the tokenizer files to the same directory
# so the folder can be reloaded with `from_pretrained`
trainer.save_model(output_dir="./TransVi_pretrained_n")
tokenizer.save_pretrained(save_directory="./TransVi_pretrained_n")