In [1]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available.")

GPU is available: NVIDIA GeForce RTX 3050


In [3]:
import os
import re
import torch
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
from torch.optim import Adam




In [5]:
# Load the dataset
data = pd.read_csv("Pretrain.csv")

# Function to filter sequences
def filter_seq(seq):
    return re.sub(r'[^ATGC]', '', str(seq))

# Function to create k-mers from sequences
def create_kmers(sequence, k=5):
    kmers = [sequence[i:i+k] for i in range(len(sequence) - k + 1)]
    return ' '.join(kmers)

# Apply the filter function to the SEQ column
data['SEQ'] = data['SEQ'].apply(filter_seq)

# Apply the k-mer function to the filtered sequences
data['kmers'] = data['SEQ'].apply(lambda x: create_kmers(x, k=5))

# Select the first 100,000 rows
subset_data = data[:100000]

# Convert the filtered sequences and k-mers to a Dataset
dataset = Dataset.from_pandas(subset_data[['SEQ', 'kmers']])

# Check the dataset size
print(f"Dataset size: {len(dataset)} rows")

Dataset size: 100000 rows


In [7]:
# Tokenizer for DNABERT
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_5")  # Replace with DNABERT tokenizer if different
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['SEQ'], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Load DNABERT model
model = BertForMaskedLM.from_pretrained("zhihan1996/DNA_bert_5")  # Replace with DNABERT model if different

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at zhihan1996/DNA_bert_5 were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./virus-pretrained_5",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
)

In [11]:
# Custom Trainer with Adam Optimizer
class CustomTrainer(Trainer):
    def create_optimizer(self):
        # Use Adam optimizer instead of AdamW
        self.optimizer = Adam(self.model.parameters(), lr=2e-5)  # Define learning rate and other parameters as necessary

# Initialize the custom trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

# Train the model with Adam optimizer
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmeet2priyasi[0m ([33mmeet2priyasi-WBSU [0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.0
1000,0.0
1500,0.0
2000,0.0
2500,0.0
3000,0.0
3500,0.0
4000,0.0
4500,0.0
5000,0.0


TrainOutput(global_step=250000, training_loss=0.0, metrics={'train_runtime': 188598.0325, 'train_samples_per_second': 10.605, 'train_steps_per_second': 1.326, 'total_flos': 5.26228420608e+17, 'train_loss': 0.0, 'epoch': 20.0})

In [12]:
# Save the model
trainer.save_model("./virus-pretrainedmodel_5")
tokenizer.save_pretrained("./virus-pretrainedmodel_5")

('./virus-pretrainedmodel_5\\tokenizer_config.json',
 './virus-pretrainedmodel_5\\special_tokens_map.json',
 './virus-pretrainedmodel_5\\vocab.txt',
 './virus-pretrainedmodel_5\\added_tokens.json',
 './virus-pretrainedmodel_5\\tokenizer.json')