In [1]:
from Bio import SeqIO

# Function to check if a record contains valid sequence data
def load_sequences_from_genbank(file_path):
    sequences = []
    skipped_ids = []  # To keep track of skipped records due to missing sequences
    problematic_records = []  # To track records causing errors
    
    for record in SeqIO.parse(file_path, "genbank"):
        try:
            # Check if the sequence is None or empty
            if not record.seq or len(record.seq) == 0:
                skipped_ids.append(record.id)  # Store the skipped record ID
                print(f"Skipping record with missing or invalid sequence: {record.id}")
            else:
                sequences.append(str(record.seq))  # Store valid sequences as strings

        except Exception as e:
            # Track records that throw errors during sequence handling
            problematic_records.append(record.id)
            print(f"Error processing record {record.id}: {str(e)}")

    return sequences, skipped_ids, problematic_records

# Path to your uploaded .gb file
genbank_file_path = '2Mar2025_phages_downloaded_from_genbank.gb'

# Load the sequences
sequences, skipped_ids, problematic_records = load_sequences_from_genbank(genbank_file_path)

# Print number of valid sequences, skipped records, and problematic records
print(f"Loaded {len(sequences)} valid sequences.")
print(f"Skipped records: {len(skipped_ids)}")
print(f"Problematic record IDs (with errors): {problematic_records[:10]}")  # Show first 10 problematic records if any

Error processing record NZ_CP038625.1: Sequence content is undefined
Error processing record NZ_CP023680.1: Sequence content is undefined
Error processing record NC_022901.1: Sequence content is undefined
Error processing record NZ_CP019275.1: Sequence content is undefined
Error processing record NZ_CP014526.1: Sequence content is undefined
Error processing record NZ_CP023686.1: Sequence content is undefined
Error processing record KK213166.1: Sequence content is undefined
Error processing record FPDV00000000.1: Sequence content is undefined
Error processing record FPDN00000000.1: Sequence content is undefined
Error processing record FPDR00000000.1: Sequence content is undefined
Error processing record FPDT00000000.1: Sequence content is undefined
Error processing record FPDJ00000000.1: Sequence content is undefined
Error processing record FPDK00000000.1: Sequence content is undefined
Error processing record FPDU00000000.1: Sequence content is undefined
Error processing record FPDP0000

In [2]:
# Tokenize the sequences (nucleotide-level tokenization)
def tokenize_sequences(sequences):
    tokenized_sequences = []
    for seq in sequences:
        tokenized_sequences.append([nt for nt in seq])  # Tokenization by nucleotide
    return tokenized_sequences

# Tokenize the valid sequences
tokenized_sequences = tokenize_sequences(sequences)
print(f"First 5 tokenized sequences: {tokenized_sequences[:1]}")

First 5 tokenized sequences: [['G', 'G', 'G', 'G', 'A', 'T', 'A', 'C', 'G', 'T', 'G', 'C', 'C', 'C', 'C', 'T', 'C', 'C', 'A', 'C', 'C', 'G', 'C', 'C', 'A', 'C', 'C', 'C', 'G', 'C', 'A', 'C', 'C', 'C', 'C', 'C', 'T', 'A', 'C', 'C', 'A', 'A', 'A', 'A', 'T', 'T', 'A', 'T', 'T', 'T', 'C', 'C', 'G', 'T', 'C', 'T', 'G', 'T', 'C', 'A', 'A', 'T', 'A', 'G', 'C', 'C', 'C', 'C', 'C', 'G', 'C', 'A', 'T', 'C', 'C', 'G', 'A', 'T', 'A', 'G', 'G', 'C', 'C', 'C', 'G', 'A', 'A', 'C', 'T', 'A', 'T', 'C', 'A', 'C', 'A', 'A', 'A', 'C', 'G', 'G', 'A', 'A', 'A', 'A', 'G', 'C', 'G', 'A', 'T', 'A', 'G', 'C', 'C', 'C', 'A', 'A', 'A', 'A', 'C', 'A', 'C', 'T', 'A', 'A', 'G', 'C', 'C', 'C', 'C', 'T', 'T', 'T', 'T', 'C', 'A', 'T', 'C', 'A', 'T', 'T', 'T', 'C', 'A', 'T', 'T', 'T', 'G', 'T', 'A', 'A', 'G', 'C', 'G', 'C', 'G', 'T', 'T', 'T', 'A', 'A', 'A', 'A', 'T', 'C', 'A', 'T', 'G', 'G', 'T', 'A', 'A', 'A', 'A', 'T', 'A', 'A', 'C', 'C', 'A', 'C', 'T', 'A', 'C', 'C', 'C', 'C', 'A', 'A', 'A', 'T', 'T', 'G', 'C', 'G',

In [3]:
import torch
import numpy as np

In [6]:
nucleotides = ['**', 'A', 'T', 'C', 'G', '#'] # vocabulary
def token2nucleotide(s):
    return nucleotides[s]

PRIME_LENGTH = 4 # give the model a random DNA primer to start
num_seq = 2 # number of runs
context_length = 10000 # maximal length for the generated sequence (upper limit for the model is 131K)

# model can be downloaded from https://huggingface.co/lingxusb/megaDNA_updated/resolve/main/megaDNA_phage_145M.pt
model_path = "notebook/megaDNA_phage_145M.pt" # model name
device = 'cuda' # change this to 'cuda' if you use GPU

In [7]:
 # Load the pre-trained model
model = torch.load(model_path, map_location=torch.device(device), weights_only=False)


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import torch
from transformers import AdamW, GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Define the custom tokenizer for nucleotide sequences
nucleotides = ['**', 'A', 'T', 'C', 'G', '#']  # Define custom vocabulary for tokenization
class NucleotideTokenizer:
    def _init_(self, vocab):
        self.vocab = vocab
        self.token_to_id = {nt: idx for idx, nt in enumerate(vocab)}  # Map nucleotides to ids
        self.id_to_token = {idx: nt for idx, nt in enumerate(vocab)}  # Map ids to nucleotides

    def encode(self, sequence):
        return [self.token_to_id[nt] for nt in sequence]  # Convert sequence to token ids

    def decode(self, token_ids):
        return ''.join([self.id_to_token[token_id] for token_id in token_ids])  # Convert token ids back to sequence

# Custom Dataset for training the model
class GenomicDataset(Dataset):
    def _init_(self, sequences, tokenizer, max_length=512):
        self.sequences = sequences
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def _len_(self):
        return len(self.sequences)
    
    def _getitem_(self, idx):
        sequence = self.sequences[idx]
        # Tokenize the sequence and return as input_ids
        encoded = self.tokenizer.encode(sequence)
        encoded = encoded[:self.max_length] + [0] * (self.max_length - len(encoded))  # Padding/truncation
        return torch.tensor(encoded)

# Initialize the tokenizer and model
tokenizer = NucleotideTokenizer(nucleotides)
model = torch.load("notebook/megaDNA_phage_145M.pt", map_location=torch.device('cuda'))
model.eval()



2025-04-28 16:46:43.877008: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

TypeError: NucleotideTokenizer() takes no arguments

In [None]:
# # Prepare your tokenized sequences (replace with actual tokenized sequences)
# sequences = ["ATGCGTACGTAGC", "TTGCGATGCGTA"]  # Example sequences
# tokenized_sequences = [tokenizer.encode(seq) for seq in sequences]

# Create DataLoader
train_loader = DataLoader(GenomicDataset(tokenized_sequences, tokenizer), batch_size=8, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Fine-tuning loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs = batch.to(device)
        labels = inputs.clone().detach()
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

# Generate sequences after fine-tuning
def generate_sequence(model, seed_sequence, max_length=512):
    model.eval()
    input_ids = tokenizer.encode(seed_sequence)  # Tokenize the seed sequence
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)  # Add batch dimension
    generated_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.95, top_k=50, top_p=0.95)
    generated_sequence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_sequence

seed_sequence = "ATGCGTACGTAGC"  # Example seed
generated_seq = generate_sequence(model, seed_sequence)
print(f"Generated Sequence: {generated_seq}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_megaDNA")
tokenizer.save_pretrained("./fine_tuned_megaDNA")

In [None]:
# # Prepare your tokenized sequences (replace with actual tokenized sequences)
# sequences = ["ATGCGTACGTAGC", "TTGCGATGCGTA"]  # Example sequences
# tokenized_sequences = [tokenizer.encode(seq) for seq in sequences]

# Create DataLoader
train_loader = DataLoader(GenomicDataset(tokenized_sequences, tokenizer), batch_size=8, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Fine-tuning loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs = batch.to(device)
        labels = inputs.clone().detach()
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

# Generate sequences after fine-tuning
def generate_sequence(model, seed_sequence, max_length=512):
    model.eval()
    input_ids = tokenizer.encode(seed_sequence)  # Tokenize the seed sequence
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)  # Add batch dimension
    generated_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.95, top_k=50, top_p=0.95)
    generated_sequence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_sequence

seed_sequence = "ATGCGTACGTAGC"  # Example seed
generated_seq = generate_sequence(model, seed_sequence)
print(f"Generated Sequence: {generated_seq}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_megaDNA")
tokenizer.save_pretrained("./fine_tuned_megaDNA")

In [None]:
# # Prepare your tokenized sequences (replace with actual tokenized sequences)
# sequences = ["ATGCGTACGTAGC", "TTGCGATGCGTA"]  # Example sequences
# tokenized_sequences = [tokenizer.encode(seq) for seq in sequences]

# Create DataLoader
train_loader = DataLoader(GenomicDataset(tokenized_sequences, tokenizer), batch_size=8, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Fine-tuning loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs = batch.to(device)
        labels = inputs.clone().detach()
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

# Generate sequences after fine-tuning
def generate_sequence(model, seed_sequence, max_length=512):
    model.eval()
    input_ids = tokenizer.encode(seed_sequence)  # Tokenize the seed sequence
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)  # Add batch dimension
    generated_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.95, top_k=50, top_p=0.95)
    generated_sequence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_sequence

seed_sequence = "ATGCGTACGTAGC"  # Example seed
generated_seq = generate_sequence(model, seed_sequence)
print(f"Generated Sequence: {generated_seq}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_megaDNA")
tokenizer.save_pretrained("./fine_tuned_megaDNA")

In [None]:
# # Prepare your tokenized sequences (replace with actual tokenized sequences)
# sequences = ["ATGCGTACGTAGC", "TTGCGATGCGTA"]  # Example sequences
# tokenized_sequences = [tokenizer.encode(seq) for seq in sequences]

# Create DataLoader
train_loader = DataLoader(GenomicDataset(tokenized_sequences, tokenizer), batch_size=8, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Fine-tuning loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs = batch.to(device)
        labels = inputs.clone().detach()
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

# Generate sequences after fine-tuning
def generate_sequence(model, seed_sequence, max_length=512):
    model.eval()
    input_ids = tokenizer.encode(seed_sequence)  # Tokenize the seed sequence
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)  # Add batch dimension
    generated_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.95, top_k=50, top_p=0.95)
    generated_sequence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_sequence

seed_sequence = "ATGCGTACGTAGC"  # Example seed
generated_seq = generate_sequence(model, seed_sequence)
print(f"Generated Sequence: {generated_seq}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_megaDNA")
tokenizer.save_pretrained("./fine_tuned_megaDNA")

In [None]:
# # Prepare your tokenized sequences (replace with actual tokenized sequences)
# sequences = ["ATGCGTACGTAGC", "TTGCGATGCGTA"]  # Example sequences
# tokenized_sequences = [tokenizer.encode(seq) for seq in sequences]

# Create DataLoader
train_loader = DataLoader(GenomicDataset(tokenized_sequences, tokenizer), batch_size=8, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Fine-tuning loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs = batch.to(device)
        labels = inputs.clone().detach()
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

# Generate sequences after fine-tuning
def generate_sequence(model, seed_sequence, max_length=512):
    model.eval()
    input_ids = tokenizer.encode(seed_sequence)  # Tokenize the seed sequence
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)  # Add batch dimension
    generated_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.95, top_k=50, top_p=0.95)
    generated_sequence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_sequence

seed_sequence = "ATGCGTACGTAGC"  # Example seed
generated_seq = generate_sequence(model, seed_sequence)
print(f"Generated Sequence: {generated_seq}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_megaDNA")
tokenizer.save_pretrained("./fine_tuned_megaDNA")

In [None]:
# # Prepare your tokenized sequences (replace with actual tokenized sequences)
# sequences = ["ATGCGTACGTAGC", "TTGCGATGCGTA"]  # Example sequences
# tokenized_sequences = [tokenizer.encode(seq) for seq in sequences]

# Create DataLoader
train_loader = DataLoader(GenomicDataset(tokenized_sequences, tokenizer), batch_size=8, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Fine-tuning loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs = batch.to(device)
        labels = inputs.clone().detach()
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

# Generate sequences after fine-tuning
def generate_sequence(model, seed_sequence, max_length=512):
    model.eval()
    input_ids = tokenizer.encode(seed_sequence)  # Tokenize the seed sequence
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)  # Add batch dimension
    generated_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.95, top_k=50, top_p=0.95)
    generated_sequence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_sequence

seed_sequence = "ATGCGTACGTAGC"  # Example seed
generated_seq = generate_sequence(model, seed_sequence)
print(f"Generated Sequence: {generated_seq}")

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_megaDNA")
tokenizer.save_pretrained("./fine_tuned_megaDNA")