In [10]:
import tensorflow as tf
from tensorflow.keras import models, layers, regularizers
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [11]:
def preprocess_sequence_for_cnn(sequence_string, target_length=11520):
    """
    One-hot encode a single DNA sequence string for the CNN model, padding or
    truncating it to a fixed length.

    Bases are matched case-insensitively and mapped to channels in the order
    A, C, G, T. 'N', any other character, and the padding positions appended
    to short sequences are all encoded as an all-zero vector.

    Args:
        sequence_string (str): The DNA sequence string (e.g., "AAGTTG...").
        target_length (int): The desired length of the sequence after
            padding/truncation. Must be non-negative.

    Returns:
        np.ndarray: float32 array of shape (1, target_length, 1, 4), i.e.
            (batch, position, width, channel). The shape is 4-D even when
            target_length is 0.

    Raises:
        ValueError: If target_length is negative.
    """
    if target_length < 0:
        # A negative length would otherwise act as a slice offset and silently
        # drop bases from the end of the sequence.
        raise ValueError(f"target_length must be non-negative, got {target_length}")

    # Channel index of each recognised base; everything else stays all zeros.
    channel_of_base = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # Start from zeros so padding, 'N' and unknown characters need no explicit
    # handling: only recognised bases set a 1.
    encoded_sequence_array = np.zeros((target_length, 4), dtype=np.float32)
    for position, base in enumerate(sequence_string[:target_length]):
        channel = channel_of_base.get(base.upper())
        if channel is not None:
            encoded_sequence_array[position, channel] = 1.0

    # Add the batch axis (front) and the width axis (just before the channels).
    return encoded_sequence_array.reshape(1, target_length, 1, 4)

In [21]:
# Build the lookup used to turn a model output index back into a virus name.
# LabelEncoder keeps the fitted names sorted in classes_, so classes_[i] is the
# name reported for output index i.
# NOTE(review): the names (and their sorted order) must correspond to the labels
# the model was trained on — confirm against the training code.
label_encoder = LabelEncoder().fit([
    'Dengue1',
    'Dengue2',
    'Dengue3',
    'Dengue4',
    'Japanese Encephalitis Virus',
    'West Nile Virus',
    'Zika Virus',
])

In [3]:
# FOR COLAB NOTEBOOKS

# Mount Google Drive at /content/drive so files stored there can be opened by path.
# This import only exists inside Google Colab; skip this cell when running elsewhere.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
# Load the saved Keras model from Google Drive.
# The path lives under /content/drive, so Drive must already be mounted there.
from tensorflow import keras
model = keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/VectorizeDNA/models/model5_flavi.keras')

In [23]:
# Example usage: classify one sequence end-to-end (encode -> predict -> decode).
# The sequence below is shorter than the target length, so it exercises the padding path;
# a sequence longer than the target length would exercise truncation instead.
new_dna_sequence = "CTGTATAAAGGTGCTGTGCCCATACACCAGCACTATGATGGAAACCATGGAGCGACTGCAACGTAGGCATGGGGGAGGATTAGTCAGAGTGCCATTGTGTCGCAACTCCACACATGAGATGTACTGGGTCTCTGGGGCAAAGAGCAACATCATAAAAAGTGTGTCCACCACAAGTCAGCTCCTCCTGGGACGCATGGATGGCCCCAGGAGGCCAGTGAAATATGAGGAGGATGTGAACCTCGGCTCGGGTACACGAGCTGTGGCAAGCTGTGCTGAGGCTCCTAACATGAAAATCATCGGCAGGCGCATTGAGAGAATCCGCAATGAACATGCAGAAACATGGTTTCTTGATGAAAACCACCCATACAGGACATGGGCCTACCATGGGAGCTACGAAGCCCCCACGCAAGGATCAGCGTCTTCCCTCGTGAACGGGGTTGTTAGACTCCTGTCAAAGCCTTGGGACGTGGTGACTGGAGTTACAGGAATAGCCATGACTGACACCACACCATACGGCCAACAAAGAGTCTTCAAAGAAAAAGTGGACACCAGGGTGCCAG"

# Length every input is padded/truncated to before it is passed to the model.
# NOTE(review): presumably this must equal the sequence length the model was trained with — confirm.
target_sequence_length = 11520

preprocessed_sequence = preprocess_sequence_for_cnn(new_dna_sequence, target_length=target_sequence_length)

print(f"Original sequence string length: {len(new_dna_sequence)}")
print(f"Preprocessed sequence shape for CNN: {preprocessed_sequence.shape}")

# Run the model on the single encoded sequence.
prediction = model.predict(preprocessed_sequence)
print(f"Prediction output shape: {prediction.shape}")
# argmax without an axis searches the flattened array; with a single input row
# the flat position is the class index.
predicted_class_index = np.argmax(prediction)
# Look the index up in the label encoder's class list to get the virus name.
predicted_virus_type = label_encoder.classes_[predicted_class_index]
print(f"Predicted Virus Type: {predicted_virus_type}")



Original sequence string length: 560
Preprocessed sequence shape for CNN: (1, 11520, 1, 4)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step
Prediction output shape: (1, 7)
Predicted Virus Type: Japanese Encephalitis Virus


### Gradio

In [24]:
import numpy as np
!pip install -q gradio

import gradio as gr


def predict_virus_type(sequence_string):
    """
    Classify a DNA sequence with the loaded model (callback for the Gradio UI).

    Args:
        sequence_string (str): Raw text from the input box. All whitespace
            (spaces, tabs, newlines) is removed before encoding.

    Returns:
        tuple: (predicted_virus_type, confidence_scores).
            predicted_virus_type (str) is the name of the highest-scoring
            class; confidence_scores (dict) maps every name in
            label_encoder.classes_ to its model output as a Python float.
            If the input is None, empty, or only whitespace, the model is not
            called and ("No sequence provided", {}) is returned.
    """
    # Remove every whitespace character; treat a missing value like empty text.
    cleaned_sequence_string = "".join((sequence_string or "").split())

    if not cleaned_sequence_string:
        # An empty sequence would be encoded as all zeros and still be assigned
        # a class, which would be meaningless, so skip the model entirely.
        return "No sequence provided", {}

    # Pad/truncate and one-hot encode to the fixed input length.
    target_sequence_length = 11520
    preprocessed_sequence = preprocess_sequence_for_cnn(cleaned_sequence_string, target_length=target_sequence_length)

    # predict() returns one row per input; take the row for our single sequence.
    prediction = model.predict(preprocessed_sequence)[0]

    # Decode the top-scoring index; str() turns the numpy string into a plain str.
    class_names = label_encoder.classes_
    predicted_virus_type = str(class_names[np.argmax(prediction)])

    # gr.Label takes a {label: confidence} dict; convert numpy floats to Python floats.
    confidence_scores = {
        class_name: float(prediction[i]) for i, class_name in enumerate(class_names)
    }

    return predicted_virus_type, confidence_scores


# Wire the predictor into a small web UI: one text input, two outputs.
iface = gr.Interface(
    fn=predict_virus_type,
    inputs=gr.Textbox(label="Enter DNA Sequence"),
    outputs=[
        # Name of the top-scoring class.
        gr.Textbox(label="Predicted Virus Type"),
        # Per-class scores rendered as bars.
        gr.Label(label="Prediction Confidence Scores"),
    ],
    title="DNA Sequence Virus Type Predictor",
    description="Enter a DNA sequence to predict the virus type and see confidence scores for all classes.",
)

# Start serving. debug=True keeps the cell running so errors and logs stay visible.
iface.launch(debug=True)


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://6349dada8dc6c88199.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://6349dada8dc6c88199.gradio.live




In [None]:
# prompt: Generate the codes for running this gradio app on hugging face.

import tensorflow as tf
from tensorflow.keras import models, layers, regularizers
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
import gradio as gr
import os # Import os to handle file paths

def preprocess_sequence_for_cnn(sequence_string, target_length=11520):
    """
    One-hot encode a single DNA sequence string for the CNN model, padding or
    truncating it to a fixed length.

    Bases are matched case-insensitively and mapped to channels in the order
    A, C, G, T. 'N', any other character, and the padding positions appended
    to short sequences are all encoded as an all-zero vector.

    Args:
        sequence_string (str): The DNA sequence string (e.g., "AAGTTG...").
        target_length (int): The desired length of the sequence after
            padding/truncation. Must be non-negative.

    Returns:
        np.ndarray: float32 array of shape (1, target_length, 1, 4), i.e.
            (batch, position, width, channel). The shape is 4-D even when
            target_length is 0.

    Raises:
        ValueError: If target_length is negative.
    """
    if target_length < 0:
        # A negative length would otherwise act as a slice offset and silently
        # drop bases from the end of the sequence.
        raise ValueError(f"target_length must be non-negative, got {target_length}")

    # Channel index of each recognised base; everything else stays all zeros.
    channel_of_base = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # Start from zeros so padding, 'N' and unknown characters need no explicit
    # handling: only recognised bases set a 1.
    encoded_sequence_array = np.zeros((target_length, 4), dtype=np.float32)
    for position, base in enumerate(sequence_string[:target_length]):
        channel = channel_of_base.get(base.upper())
        if channel is not None:
            encoded_sequence_array[position, channel] = 1.0

    # Add the batch axis (front) and the width axis (just before the channels).
    return encoded_sequence_array.reshape(1, target_length, 1, 4)

# Build the lookup used to turn a model output index back into a virus name.
# LabelEncoder keeps the fitted names sorted in classes_, so classes_[i] is the
# name reported for output index i.
# NOTE(review): the names (and their sorted order) must correspond to the labels
# the model was trained on — confirm against the training code.
label_encoder = LabelEncoder().fit([
    'Dengue Virus Type 1',
    'Dengue Virus Type 2',
    'Dengue Virus Type 3',
    'Dengue Virus Type 4',
    'Japanese Encephalitis Virus',
    'West Nile Virus',
    'Zika Virus',
])

# Load the model
# The path is relative, so the file is looked up in the process's working directory.
# NOTE(review): on Hugging Face Spaces this is presumably the repository root —
# confirm that model5_flavi.keras is uploaded there.
model_path = "model5_flavi.keras"
model = keras.models.load_model(model_path)

def predict_virus_type(sequence_string):
    """
    Classify a DNA sequence with the loaded model (callback for the Gradio UI).

    Args:
        sequence_string (str): Raw text from the input box. All whitespace
            (spaces, tabs, newlines) is removed before encoding.

    Returns:
        tuple: (predicted_virus_type, confidence_scores).
            predicted_virus_type (str) is the name of the highest-scoring
            class; confidence_scores (dict) maps every name in
            label_encoder.classes_ to its model output as a Python float.
            If the input is None, empty, or only whitespace, the model is not
            called and ("No sequence provided", {}) is returned.
    """
    # Remove every whitespace character; treat a missing value like empty text.
    cleaned_sequence_string = "".join((sequence_string or "").split())

    if not cleaned_sequence_string:
        # An empty sequence would be encoded as all zeros and still be assigned
        # a class, which would be meaningless, so skip the model entirely.
        return "No sequence provided", {}

    # Pad/truncate and one-hot encode to the fixed input length.
    target_sequence_length = 11520
    preprocessed_sequence = preprocess_sequence_for_cnn(cleaned_sequence_string, target_length=target_sequence_length)

    # predict() returns one row per input; take the row for our single sequence.
    prediction = model.predict(preprocessed_sequence)[0]

    # Decode the top-scoring index; str() turns the numpy string into a plain str.
    class_names = label_encoder.classes_
    predicted_virus_type = str(class_names[np.argmax(prediction)])

    # gr.Label takes a {label: confidence} dict; convert numpy floats to Python floats.
    confidence_scores = {
        class_name: float(prediction[i]) for i, class_name in enumerate(class_names)
    }

    return predicted_virus_type, confidence_scores


# Create the Gradio interface: one text input, two outputs.
iface = gr.Interface(
    fn=predict_virus_type,
    inputs=gr.Textbox(label="Enter cDNA Sequence"),
    outputs=[
        gr.Textbox(label="Predicted Virus Type"), # Name of the top-scoring class
        gr.Label(label="Prediction Confidence Scores") # Per-class scores rendered as bars
    ],
    title="Flavivirus cDNA Sequence Virus Type Predictor",
    # The class list is parenthesised; the original opened it with '*', which
    # left an unbalanced ')' and a stray markdown emphasis marker in the UI text.
    description="Enter a cDNA sequence to predict the flavivirus type (Dengue Virus Type 1-4, West Nile Virus, Japanese Encephalitis Virus and Zika Virus) and see confidence scores for all classes."
)

# Launch the interface
iface.launch() # No debug=True for Hugging Face Spaces


In [None]:
# prompt: generate requirements.txt for hugging face space implementation

# NOTE: the lines below are pip requirement specifiers, not Python statements;
# save them as requirements.txt instead of executing this cell.
# NOTE(review): the Keras progress-bar output recorded in this notebook looks like
# Keras 3 — confirm that tensorflow==2.15.0 can load model5_flavi.keras, and that
# gradio==3.39.0 matches the Gradio version the app was tested with.
tensorflow==2.15.0
scikit-learn==1.2.2
numpy==1.25.2
pandas==1.5.3
matplotlib==3.7.1
gradio==3.39.0