In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import csv

def calculate_cag_repeats(sequence):
    """Calculates the number of CAG repeats in a DNA sequence"""

    repeats = re.findall(r'(CAG)', sequence)
    repeat_count = len(repeats)
    return repeat_count

def label_sequence(repeat_count):
    """Label the sequence based on the number of CAG repeats"""
    if repeat_count <= 35:
        return 'Normal'
    elif 36 <= repeat_count <= 39:
        return 'Intermediate'
    elif 40 <= repeat_count <= 55:
        return 'Reduced_Penetrance'
    else:
        return 'Full_Mutation'


with open('/content/drive/MyDrive/our_project/dataset/huntington_sequences_5000.txt', 'r') as file:
    sequences = [line.strip() for line in file]


formatted_data = []
for sequence in sequences:
    repeat_count = calculate_cag_repeats(sequence)
    label = label_sequence(repeat_count)
    formatted_data.append([sequence, repeat_count, label])


with open('/content/drive/MyDrive/our_project/dataset/labeled_DATA.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Sequence", "CAG_Repeats", "Label"])  # Write the header
    writer.writerows(formatted_data)  # Write the data

print("Data formatted and saved to new2_Huntington.csv")

Data formatted and saved to new2_Huntington.csv


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

class DNASequenceDataset(Dataset):
    def __init__(self, sequences, labels, tokenizer, max_length=512):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]

        # Convert sequence to k-mers (k=6 for DNABERT)
        kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
        kmer_sequence = " ".join(kmers)

        encoding = self.tokenizer(
            kmer_sequence,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def load_dna_sequences(file_path):
    df = pd.read_csv(file_path)
    sequences = df.iloc[:, 0].values  # First column contains sequences
    labels = df.iloc[:, -1].values    # Last column contains labels
    return sequences, labels

def train_dnabert_model(file_path, model_save_dir, num_epochs=10):
    # Create model save directory if it doesn't exist
    os.makedirs(model_save_dir, exist_ok=True)

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and prepare data
    sequences, labels = load_dna_sequences(file_path)

    # Encode labels
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        sequences,
        encoded_labels,
        test_size=0.2,
        random_state=42
    )

    # Load DNABERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('zhihan1996/DNA_bert_6')
    model = BertForSequenceClassification.from_pretrained(
        'zhihan1996/DNA_bert_6',
        num_labels=len(label_encoder.classes_)
    ).to(device)

    # Create datasets
    train_dataset = DNASequenceDataset(X_train, y_train, tokenizer)
    test_dataset = DNASequenceDataset(X_test, y_test, tokenizer)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Initialize optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    # Training loop
    best_accuracy = 0
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                _, predicted = torch.max(outputs.logits, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        print(f'Epoch {epoch+1}: Average Loss = {total_loss/len(train_loader):.4f}, Accuracy = {accuracy:.2f}%')

        if accuracy > best_accuracy:
            best_accuracy = accuracy

            torch.save(model.state_dict(), os.path.join(model_save_dir, 'dnabert_model.pt'))
            torch.save(label_encoder, os.path.join(model_save_dir, 'label_encoder.pkl'))
            tokenizer.save_pretrained(model_save_dir)

    return model, tokenizer, label_encoder

def load_model(model_save_dir, device):

    label_encoder = torch.load(os.path.join(model_save_dir, 'label_encoder.pkl'))


    tokenizer = BertTokenizer.from_pretrained(model_save_dir)


    model = BertForSequenceClassification.from_pretrained(
        model_save_dir,
        num_labels=len(label_encoder.classes_)
    ).to(device)

    return model, tokenizer, label_encoder

def predict_mutation(model, tokenizer, label_encoder, sequence, device):
    # Convert sequence to k-mers
    kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
    kmer_sequence = " ".join(kmers)

    # Tokenize
    encoding = tokenizer(
        kmer_sequence,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Get prediction
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1)

    # Convert prediction to label
    predicted_label = label_encoder.inverse_transform(predicted_class.cpu().numpy())[0]
    probabilities = probabilities.cpu().numpy()[0]

    return {
        'predicted_label': predicted_label,
        'probabilities': dict(zip(label_encoder.classes_, probabilities))
    }

In [None]:
 # Prediction example:
sequence = ""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')ATGCGCGTATCAGGCCAAGTTCATGCCCGGGGCAGAATTAACAGCAGCAGCAGCAGCAGCAGCAGCAAATTAAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAATTAAAGCAGCAGCAGCAGCAGCAGCAGCAGCAGAATTAACAGCAGCAGCAGCAAATTAAGCAGCAGCAGCAGCAGCAGAATTAAAATTAAAATTAAGTCTAAATTCTTTGACAAAGCGACTTTGTACATTTTTACTAGACGTAATGCGTGACCATTATTTATTATAGGCAACGCTTCACTGAAAGTCTAAAGGTTAACGGGTCTCGAGTTATCTTGTGTGCTGTATCCGGGCATACGGGGCCTAGTCCTTACATGGCGATGAAG
result = predict_mutation(model, tokenizer, label_encoder, sequence, device)

print("\nPrediction Result:")
print(f"Predicted Label: {result['predicted_label']}")
print("\nClass Probabilities:")
for label, prob in result['probabilities'].items():
        print(f"{label}: {prob:.4f}")


Prediction Result:
Predicted Label: Intermediate

Class Probabilities:
Intermediate: 0.9988
Normal: 0.0004
Reduced_Penetrance: 0.0008


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import json

class DNASequenceDataset(Dataset):
    def __init__(self, sequences, labels, tokenizer, max_length=512):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]

        # Convert sequence to k-mers (k=6 for DNABERT)
        kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
        kmer_sequence = " ".join(kmers)

        encoding = self.tokenizer(
            kmer_sequence,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def load_dna_sequences(file_path):
    df = pd.read_csv(file_path)
    sequences = df.iloc[:, 0].values  # First column contains sequences
    labels = df.iloc[:, -1].values    # Last column contains labels
    return sequences, labels

def train_dnabert_model(file_path, model_save_path, num_epochs=10):
      # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and prepare data
    sequences, labels = load_dna_sequences(file_path)

    # Encode labels
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        sequences,
        encoded_labels,
        test_size=0.2,
        random_state=42
    )

    # Load DNABERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('zhihan1996/DNA_bert_6')
    model = BertForSequenceClassification.from_pretrained(
        'zhihan1996/DNA_bert_6',
        num_labels=len(label_encoder.classes_)
    ).to(device)

    # Create datasets
    train_dataset = DNASequenceDataset(X_train, y_train, tokenizer)
    test_dataset = DNASequenceDataset(X_test, y_test, tokenizer)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Initialize optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    # Training loop
    best_accuracy = 0
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                _, predicted = torch.max(outputs.logits, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        print(f'Epoch {epoch+1}: Average Loss = {total_loss/len(train_loader):.4f}, Accuracy = {accuracy:.2f}%')

        if accuracy > best_accuracy:
            best_accuracy = accuracy
           # Create directories if they don't exist

    return model, tokenizer, label_encoder
# Save model and encoder

def predict_mutation(model, tokenizer, label_encoder, sequence, device):
    # Convert sequence to k-mers
    kmers = [sequence[i:i+6] for i in range(len(sequence)-5)]
    kmer_sequence = " ".join(kmers)

    # Tokenize
    encoding = tokenizer(
        kmer_sequence,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Get prediction
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1)

    # Convert prediction to label
    predicted_label = label_encoder.inverse_transform(predicted_class.cpu().numpy())[0]
    probabilities = probabilities.cpu().numpy()[0]

    return {
        'predicted_label': predicted_label,
        'probabilities': dict(zip(label_encoder.classes_, probabilities))
    }

# Usage example:
if __name__ == "__main__":
    file_path = '/content/drive/MyDrive/our_project/dataset/labeled_DATA.csv'
    model_save_path = '/content/drive/MyDrive/our_project'
    encoder_save_path = '/content/drive/MyDrive/our_project/encoder'

    # Train model
    model, tokenizer, label_encoder = train_dnabert_model(file_path, model_save_path)
    os.makedirs('/content/drive/MyDrive/our_project/model', exist_ok=True)
    torch.save(model,'/content/drive/MyDrive/our_project/model/update2.pt')
    torch.save(label_encoder, '/content/drive/MyDrive/our_project/model/labelencoder2.pt')
    torch.save(tokenizer, '/content/drive/MyDrive/our_project/model/tokenizer2.pt')
    num_labels = len(label_encoder.classes_)
    from google.colab import files
    files.download('/content/drive/MyDrive/our_project/model/update2.pt')
    files.download('/content/drive/MyDrive/our_project/model/labelencoder2.pt')
    files.download('/content/drive/MyDrive/our_project/model/tokenizer2.pt')
    with open('/content/drive/MyDrive/our_project/model/config.json', 'w') as f:
            json.dump({'num_labels': num_labels}, f)
        # Download the files

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNA_bert_6 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 250/250 [05:56<00:00,  1.42s/it]


Epoch 1: Average Loss = 0.3965, Accuracy = 93.30%


Epoch 2/10: 100%|██████████| 250/250 [06:00<00:00,  1.44s/it]


Epoch 2: Average Loss = 0.1991, Accuracy = 93.90%


Epoch 3/10: 100%|██████████| 250/250 [06:00<00:00,  1.44s/it]


Epoch 3: Average Loss = 0.1684, Accuracy = 88.20%


Epoch 4/10: 100%|██████████| 250/250 [05:59<00:00,  1.44s/it]


Epoch 4: Average Loss = 0.1495, Accuracy = 95.00%


Epoch 5/10: 100%|██████████| 250/250 [06:00<00:00,  1.44s/it]


Epoch 5: Average Loss = 0.1228, Accuracy = 94.50%


Epoch 6/10: 100%|██████████| 250/250 [06:00<00:00,  1.44s/it]


Epoch 6: Average Loss = 0.1117, Accuracy = 94.60%


Epoch 7/10: 100%|██████████| 250/250 [06:00<00:00,  1.44s/it]


Epoch 7: Average Loss = 0.0785, Accuracy = 94.00%


Epoch 8/10: 100%|██████████| 250/250 [06:00<00:00,  1.44s/it]


Epoch 8: Average Loss = 0.0612, Accuracy = 94.70%


Epoch 9/10: 100%|██████████| 250/250 [06:00<00:00,  1.44s/it]


Epoch 9: Average Loss = 0.0441, Accuracy = 94.80%


Epoch 10/10: 100%|██████████| 250/250 [06:00<00:00,  1.44s/it]


Epoch 10: Average Loss = 0.0381, Accuracy = 95.70%


FileNotFoundError: Cannot find file: /content/drive/MyDrive/our_project/model/dnabert_state_dict.pt

In [None]:
!pip install Flask flask-ngrok



Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
import torch
import gradio as gr
from transformers import BertTokenizer
import os

def load_trained_model(model_path='/content/drive/MyDrive/our_project/dnabert_model.pt', encoder_path='/content/drive/MyDrive/our_project/label_encoder.pkl'):
    """Load the trained DNA-BERT model and label encoder"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the model
    model = BertForSequenceClassification.from_pretrained(
        'zhihan1996/DNA_bert_6',
        num_labels=2  # Update this based on your number of classes
    )
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    # Load the label encoder
    label_encoder = torch.load(encoder_path)

    # Load the tokenizer
    tokenizer = BertTokenizer.from_pretrained('zhihan1996/DNA_bert_6')

    return model, tokenizer, label_encoder, device

def create_gradio_interface():
    """Create and launch the Gradio interface"""

    # First install gradio if not already installed
    !pip install -q gradio

    # Load the model and components
    try:
        model, tokenizer, label_encoder, device = load_trained_model()
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    def predict_sequence(dna_sequence):
        """Make prediction for a single DNA sequence"""
        try:
            # Convert sequence to k-mers
            kmers = [dna_sequence[i:i+6] for i in range(len(dna_sequence)-5)]
            kmer_sequence = " ".join(kmers)

            # Tokenize
            encoding = tokenizer(
                kmer_sequence,
                add_special_tokens=True,
                max_length=512,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            # Move to device
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            # Get prediction
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
                predicted_class = torch.argmax(probabilities, dim=1)

            # Convert prediction to label
            predicted_label = label_encoder.inverse_transform(predicted_class.cpu().numpy())[0]
            probs = probabilities.cpu().numpy()[0]

            # Format the output
            result = f"Predicted Class: {predicted_label}\n\nProbabilities:\n"
            for label, prob in zip(label_encoder.classes_, probs):
                result += f"{label}: {prob:.4f}\n"

            return result

        except Exception as e:
            return f"Error processing sequence: {str(e)}"

    # Create the interface
    interface = gr.Interface(
        fn=predict_sequence,
        inputs=gr.Textbox(
            lines=3,
            placeholder="Enter DNA sequence here...",
            label="DNA Sequence"
        ),
        outputs=gr.Textbox(label="Prediction Results"),
        title="Huntington's Disease Mutation Predictor",
        description="Enter a DNA sequence to predict the likelihood of Huntington's disease mutation.",
        examples=[
            ["ATGGCGACCCTGGAAAAGCTGATGAAGGCCTTCGAGTCCCTCAAGTCCTTC"],  # Add some example sequences
        ]
    )

    return interface

def deploy_model():
    """Main deployment function"""
    print("Starting DNA-BERT model deployment...")

    # Create and launch the interface
    interface = create_gradio_interface()
    if interface:
        interface.launch(share=True)
        print("\nModel deployed successfully! Use the URL above to access the interface.")
    else:
        print("\nDeployment failed. Please check the error messages above.")

In [None]:
   deploy_model()

Starting DNA-BERT model deployment...
Error loading model: name 'BertForSequenceClassification' is not defined

Deployment failed. Please check the error messages above.


2024-12-06 17:33:12.441 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-12-06 17:33:12.467 Session state does not function when running a script without `streamlit run`


[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K
added 22 packages in 4s
[1G[0K⠦[1G[0K
[1G[0K⠦[1G[0K3 packages are looking for funding
[1G[0K⠦[1G[0K  run `npm fund` for details


ERROR:pyngrok.process.ngrok:t=2024-12-06T17:33:26+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2024-12-06T17:33:26+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2024-12-06T17:33:26+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your aut

PyngrokNgrokError: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.