<a href="https://colab.research.google.com/github/ShaliniAnandaPhD/EDM-Genetic-Circuits/blob/main/Protein_STRUCTURE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Core Features Across Scripts:

Protein Sequence Input: Users can input protein sequences either manually or by uploading a file.
ESMFold Prediction: The application sends the input sequence to the ESMFold API and retrieves a predicted protein structure in PDB format.
Visualization: Using py3Dmol, the predicted structure is rendered in a 3D view, with options for different visualization styles and color schemes.
pLDDT Score: Each script reports the predicted local distance difference test (pLDDT) score, a per-residue estimate of confidence in the predicted structure on a scale of 0-100; the app displays its mean over the structure.

In [None]:
!pip install streamlit
!pip install stmol
!pip install py3D mol

Collecting streamlit
  Downloading streamlit-1.29.0-py2.py3-none-any.whl (8.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 40.6 MB/s eta 0:00:00
Collecting validators<1,>=0.2 (from streamlit)
  Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 190.6/190.6 kB 20.3 MB/s eta 0:00:00
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.8/4.8 MB 75.0 MB/s eta 0:00:00
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 82.1/82.1 kB 10.4 MB/s eta 0:0… [output truncated]

esm9.py: Basic structure prediction and visualization, along with downloading the PDB file

In [None]:
%%writefile esm9.py

import streamlit as st
import py3Dmol
import requests
import biotite.structure.io as bsio
from stmol import showmol

# Page configuration
# Sidebar header: the app title followed by a Markdown blurb that links to the
# ESMFold about page, the research article and the news article.
st.sidebar.title('ESMFold')
st.sidebar.write('[*ESMFold*](https://esmatlas.com/about) is an end-to-end single sequence protein structure predictor based on the ESM-2 language model. For more information, read the [research article](https://www.biorxiv.org/content/10.1101/2022.07.20.500902v2) and the [news article](https://www.nature.com/articles/d41586-022-03539-1) published in *Nature*.')

# Function to render molecular structure
def render_mol(pdb):
    """Show a PDB-format structure as a spinning cartoon in the Streamlit page.

    Args:
        pdb: The structure as PDB text.
    """
    cartoon_spectrum = {'cartoon': {'color': 'spectrum'}}

    viewer = py3Dmol.view()
    viewer.addModel(pdb, 'pdb')
    viewer.setStyle(cartoon_spectrum)
    viewer.setBackgroundColor('white')
    # Fit the model to the view first, then apply the extra zoom
    # (arguments 2 and 800 are passed through unchanged).
    viewer.zoomTo()
    viewer.zoom(2, 800)
    viewer.spin(True)
    showmol(viewer, height=500, width=800)

# Protein sequence input
# Example sequence pre-filled in the sidebar text area.
DEFAULT_SEQ = "MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFMDNDTRYSTFACENPNSTRVSDFRTANCSLEDPAANKARKEAELAAATAEQ"
txt = st.sidebar.text_area(
    label='Input sequence',
    value=DEFAULT_SEQ,
    height=275,
)

# ESMfold prediction function
def update(sequence):
    """Fold *sequence* with the ESMFold API and show the result in the page.

    The sequence is posted to the ESM Atlas folding endpoint; the returned PDB
    text is rendered with ``render_mol``, the mean of its B-factor column is
    reported as plDDT, and the PDB text is offered for download.

    Args:
        sequence: Amino-acid sequence as text. Whitespace (spaces, newlines)
            is removed before the request is sent.

    Returns:
        None. All output goes to the Streamlit page. If the sequence is empty
        or the request fails, a message is shown and nothing else is rendered.
    """
    import os
    import tempfile

    sequence = "".join(sequence.split())
    if not sequence:
        st.warning('Enter protein sequence data!')
        return

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    try:
        # NOTE(security): verify=False disables TLS certificate checking. It is
        # kept because it was introduced as a deliberate temporary workaround;
        # remove it once the endpoint's certificate validates.
        response = requests.post(
            'https://api.esmatlas.com/foldSequence/v1/pdb/',
            headers=headers,
            data=sequence,
            verify=False,
            timeout=300,  # requests waits forever without an explicit timeout
        )
        # Treat HTTP error statuses as failures instead of parsing the error
        # body as if it were a PDB file.
        response.raise_for_status()
        pdb_string = response.content.decode('utf-8')
    except requests.exceptions.RequestException as e:
        st.error(f"Request Error: {e}")
        return

    # Parse from a private temporary file: a fixed file name in the working
    # directory would be shared (and overwritten) by concurrent sessions.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.pdb', delete=False) as tmp:
        tmp.write(pdb_string)
        tmp_path = tmp.name
    try:
        struct = bsio.load_structure(tmp_path, extra_fields=["b_factor"])
    finally:
        os.remove(tmp_path)
    b_value = round(struct.b_factor.mean(), 4)

    # Display protein structure
    st.subheader('Visualization of predicted protein structure')
    render_mol(pdb_string)

    # plDDT value
    st.subheader('plDDT')
    st.write('plDDT is a per-residue estimate of the confidence in prediction on a scale from 0-100.')
    st.info(f'plDDT: {b_value}')

    st.download_button(
        label="Download PDB",
        data=pdb_string,
        file_name='predicted.pdb',
        mime='text/plain',
    )

# Button for prediction
# Run the prediction in the main script flow rather than in an on_click
# callback: callbacks execute before the script reruns, so anything they draw
# is placed ahead of the normal page content, and the lambda captured the
# text-area value from the previous run. This matches the later scripts.
predict = st.sidebar.button('Predict')

if predict:
    update(txt)
else:
    st.warning('Enter protein sequence data!')



Writing esm9.py


esm15.py: Adds amino acid frequency analysis and different visualization options.

In [None]:
%%writefile esm15.py

import streamlit as st
import py3Dmol
import requests
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import matplotlib.pyplot as plt
from stmol import showmol
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Page configuration
# Sidebar header: the app title followed by a Markdown blurb that links to the
# ESMFold about page, the research article and the news article.
st.sidebar.title('ESMFold')
st.sidebar.write('[*ESMFold*](https://esmatlas.com/about) is an end-to-end single sequence protein structure predictor based on the ESM-2 language model. Read the [research article](https://www.biorxiv.org/content/10.1101/2022.07.20.500902v2) and the [news article](https://www.nature.com/articles/d41586-022-03539-1) in *Nature*.')

# Function to render molecular structure
def render_mol(pdb, style, color):
    """Show a PDB-format structure in a spinning 3D viewer.

    Args:
        pdb: The structure as PDB text.
        style: 3Dmol representation name ('cartoon', 'stick' or 'sphere').
        color: Color choice from the sidebar. 'chain' and 'residue' select a
            built-in 3Dmol color scheme; any other value is passed through as
            a literal ``color`` (this includes the special value 'spectrum').
    """
    # In 3Dmol, ``color`` is a literal color; per-chain and per-residue
    # coloring are requested through ``colorscheme`` instead. Passing 'chain'
    # or 'residue' as ``color`` does not produce the intended coloring.
    colorschemes = {'chain': 'chain', 'residue': 'amino'}
    if color in colorschemes:
        color_spec = {'colorscheme': colorschemes[color]}
    else:
        color_spec = {'color': color}

    pdbview = py3Dmol.view()
    pdbview.addModel(pdb, 'pdb')
    pdbview.setStyle({style: color_spec})
    pdbview.setBackgroundColor('white')
    pdbview.zoomTo()
    pdbview.spin(True)
    showmol(pdbview, height=500, width=800)

# Function for basic protein sequence analysis
def analyze_sequence(sequence):
    """Plot how often each amino acid occurs in *sequence*.

    The previous implementation passed an alphabet and a flat list of ones to
    ``plot_sequence_logo``, which is not a valid profile for a sequence logo.
    A plain frequency bar chart is drawn instead, matching what the later
    scripts in this notebook do.

    Args:
        sequence: Amino-acid sequence as text; whitespace is ignored.
    """
    from collections import Counter

    # ProteinSequence rejects symbols outside the amino-acid alphabet, so
    # stray whitespace has to be removed first.
    protein_seq = seq.ProteinSequence("".join(sequence.split()))
    aa_freq = Counter(protein_seq)
    residues = sorted(aa_freq)

    # Amino acid frequency plot
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(residues, [aa_freq[residue] for residue in residues])
    ax.set_title("Amino Acid Frequency")
    ax.set_xlabel("Amino Acid")
    ax.set_ylabel("Frequency")
    st.pyplot(fig)
    # Release the figure; pyplot keeps every open figure alive otherwise.
    plt.close(fig)

# Protein sequence input
# Example sequence pre-filled in the sidebar text area.
DEFAULT_SEQ = "MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFMDNDTRYSTFACENPNSTRVSDFRTANCSLEDPAANKARKEAELAAATAEQ"
txt = st.sidebar.text_area(
    label='Input sequence',
    value=DEFAULT_SEQ,
    height=275,
)

# Sidebar pickers for the 3D representation and its coloring; the first
# entry of each list is the initial selection.
style = st.sidebar.selectbox(
    label="Select visualization style",
    options=["cartoon", "stick", "sphere"],
    index=0,
)
color = st.sidebar.selectbox(
    label="Select color scheme",
    options=["spectrum", "chain", "residue"],
    index=0,
)

# ESMfold prediction function
def update(sequence):
    """Fold *sequence* with the ESMFold API and show the result in the page.

    The sequence is posted to the ESM Atlas folding endpoint; the returned PDB
    text is rendered with ``render_mol`` using the sidebar's ``style`` and
    ``color`` selections, the mean of its B-factor column is reported as
    plDDT, and the PDB text is offered for download.

    Args:
        sequence: Amino-acid sequence as text. Whitespace (spaces, newlines)
            is removed before the request is sent.

    Returns:
        None. All output goes to the Streamlit page. If the sequence is empty
        or the request fails, a message is shown and nothing else is rendered.
    """
    import os
    import tempfile

    # The FASTA module has no ``load_structure``; structure files are read
    # through biotite.structure.io.
    import biotite.structure.io as bsio

    sequence = "".join(sequence.split())
    if not sequence:
        st.warning('Enter protein sequence data!')
        return

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    try:
        # NOTE(security): verify=False disables TLS certificate checking; it
        # is kept as found. Remove it once the endpoint's certificate
        # validates.
        response = requests.post(
            'https://api.esmatlas.com/foldSequence/v1/pdb/',
            headers=headers,
            data=sequence,
            verify=False,
            timeout=300,  # requests waits forever without an explicit timeout
        )
        # Treat HTTP error statuses as failures instead of parsing the error
        # body as if it were a PDB file.
        response.raise_for_status()
        pdb_string = response.content.decode('utf-8')
    except requests.exceptions.RequestException as e:
        st.error(f"Request Error: {e}")
        return

    # Parse from a private temporary file: a fixed file name in the working
    # directory would be shared (and overwritten) by concurrent sessions.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.pdb', delete=False) as tmp:
        tmp.write(pdb_string)
        tmp_path = tmp.name
    try:
        struct = bsio.load_structure(tmp_path, extra_fields=["b_factor"])
    finally:
        os.remove(tmp_path)
    b_value = round(struct.b_factor.mean(), 4)

    # Display protein structure
    st.subheader('Visualization of predicted protein structure')
    render_mol(pdb_string, style, color)

    # plDDT value
    st.subheader('plDDT')
    st.write('plDDT is a per-residue estimate of the confidence in prediction on a scale from 0-100.')
    st.info(f'plDDT: {b_value}')

    st.download_button(
        label="Download PDB",
        data=pdb_string,
        file_name='predicted.pdb',
        mime='text/plain',
    )

# Button for prediction and analysis
# Run the prediction in the main script flow rather than in an on_click
# callback: callbacks execute before the script reruns, so anything they draw
# is placed ahead of the normal page content, and the lambda captured the
# text-area value from the previous run. This matches the later scripts.
predict = st.sidebar.button('Predict')

if predict:
    update(txt)
    analyze_sequence(txt)
else:
    st.warning( 'Enter protein sequence data!')



Writing esm15.py


In [None]:
%%writefile esm9.py

import streamlit as st
import py3Dmol
import requests
import biotite.sequence as seq
import biotite.structure.io.pdb as pdb
import matplotlib.pyplot as plt
from stmol import showmol
import tempfile
import numpy as np
from collections import Counter

# Page configuration
# Sidebar header: the app title followed by a Markdown blurb that links to the
# ESMFold about page, the research article and the news article.
st.sidebar.title('ESMFold')
st.sidebar.write('[*ESMFold*](https://esmatlas.com/about) is an end-to-end single sequence protein structure predictor based on the ESM-2 language model. Read the [research article](https://www.biorxiv.org/content/10.1101/2022.07.20.500902v2) and the [news article](https://www.nature.com/articles/d41586-022-03539-1) in *Nature*.')

# Function to render molecular structure
def render_mol(pdb_str, style, color):
    """Show a PDB-format structure in a spinning 3D viewer.

    Args:
        pdb_str: The structure as PDB text.
        style: 3Dmol representation name ('cartoon', 'stick' or 'sphere').
        color: Color choice from the sidebar. 'chain' and 'residue' select a
            built-in 3Dmol color scheme; any other value ('spectrum', 'red',
            'blue', ...) is passed through as a literal ``color``.
    """
    # In 3Dmol, ``color`` is a literal color; per-chain and per-residue
    # coloring are requested through ``colorscheme`` instead. Passing 'chain'
    # or 'residue' as ``color`` does not produce the intended coloring.
    colorschemes = {'chain': 'chain', 'residue': 'amino'}
    if color in colorschemes:
        color_spec = {'colorscheme': colorschemes[color]}
    else:
        color_spec = {'color': color}

    pdbview = py3Dmol.view()
    pdbview.addModel(pdb_str, 'pdb')
    pdbview.setStyle({style: color_spec})
    pdbview.setBackgroundColor('white')
    pdbview.zoomTo()
    pdbview.spin(True)
    showmol(pdbview, height=500, width=800)

# Function for basic protein sequence analysis
def analyze_sequence(sequence):
    """Draw a bar chart of amino-acid counts for *sequence*.

    Args:
        sequence: Amino-acid sequence as text; whitespace is ignored.
    """
    # ProteinSequence rejects symbols outside the amino-acid alphabet, so
    # stray whitespace (e.g. a trailing newline from an uploaded file) has to
    # be removed first.
    protein_seq = seq.ProteinSequence("".join(sequence.split()))
    aa_freq = Counter(protein_seq)

    # Use a dedicated figure instead of pyplot's global "current figure":
    # the global one persists between script reruns, so bars from earlier
    # predictions would pile up on the same axes.
    fig, ax = plt.subplots()
    ax.bar(list(aa_freq.keys()), list(aa_freq.values()))
    ax.set_title("Amino Acid Frequency")
    ax.set_xlabel("Amino Acid")
    ax.set_ylabel("Frequency")
    st.pyplot(fig)
    # Release the figure; pyplot keeps every open figure alive otherwise.
    plt.close(fig)

# Read sequence from uploaded file
def read_sequence(file):
    """Return the sequence contained in an uploaded text file.

    Args:
        file: Object exposing ``getvalue()`` that returns the file's bytes
            (as Streamlit's uploaded-file object does), or None.

    Returns:
        The sequence as a single string with all whitespace removed, or None
        when *file* is None. FASTA header lines (starting with '>') are
        skipped and a leading UTF-8 byte-order mark is dropped, so the result
        holds residues only.
    """
    if file is None:
        return None

    # "utf-8-sig" decodes plain UTF-8 too, but also strips a BOM if present.
    text = file.getvalue().decode("utf-8-sig")
    residue_lines = [
        line for line in text.splitlines()
        if not line.lstrip().startswith(">")
    ]
    # split()/join removes every kind of whitespace, including line breaks
    # inside a wrapped sequence.
    return "".join("".join(residue_lines).split())

# Protein sequence input - Upload file or use text area
st.sidebar.subheader("Protein Sequence Input")
uploaded_file = st.sidebar.file_uploader(
    label="Upload sequence file (txt)",
    type=["txt"],
)
if uploaded_file is None:
    # No upload: fall back to the text area, pre-filled with an example.
    DEFAULT_SEQ = "MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFMDNDTRYSTFACENPNSTRVSDFRTANCSLEDPAANKARKEAELAAATAEQ"
    sequence = st.sidebar.text_area(
        label='Or, enter sequence manually',
        value=DEFAULT_SEQ,
        height=100,
    )
else:
    # An uploaded file takes precedence; the text area is not shown.
    sequence = read_sequence(uploaded_file)

# Sidebar pickers for the 3D representation and its coloring; the first
# entry of each list is the initial selection.
style = st.sidebar.selectbox(
    label="Select visualization style",
    options=["cartoon", "stick", "sphere"],
    index=0,
)
color = st.sidebar.selectbox(
    label="Select color scheme",
    options=["spectrum", "chain", "residue", "red", "blue", "green"],
    index=0,
)

# ESMfold prediction function
def update(sequence):
    """Fold *sequence* with the ESMFold API and show the result in the page.

    The sequence is posted to the ESM Atlas folding endpoint; the returned PDB
    text is rendered with ``render_mol`` using the sidebar's ``style`` and
    ``color`` selections, the mean of its B-factor column is reported as
    plDDT, and the PDB text is offered for download.

    Args:
        sequence: Amino-acid sequence as text. Whitespace (spaces, newlines)
            is removed before the request is sent.

    Returns:
        None. All output goes to the Streamlit page. If the request fails, an
        error box is shown and nothing else is rendered.
    """
    import io

    sequence = "".join(sequence.split())

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    try:
        # NOTE(security): verify=False disables TLS certificate checking; it
        # is kept as found. Remove it once the endpoint's certificate
        # validates.
        response = requests.post(
            'https://api.esmatlas.com/foldSequence/v1/pdb/',
            headers=headers,
            data=sequence,
            verify=False,
            timeout=300,  # requests waits forever without an explicit timeout
        )
        # Treat HTTP error statuses as failures instead of parsing the error
        # body as if it were a PDB file.
        response.raise_for_status()
        pdb_str = response.content.decode('utf-8')
    except requests.exceptions.RequestException as e:
        st.error(f"Request Error: {e}")
        return

    # Parse the PDB text directly from memory. ``PDBFile.read`` is a class
    # method that returns the parsed file, so its result must be kept; reading
    # from a StringIO also avoids leaving a temporary file behind.
    pdb_file = pdb.PDBFile.read(io.StringIO(pdb_str))
    # B-factors are only loaded when explicitly requested via extra_fields;
    # model=1 yields a single AtomArray rather than a stack of models.
    structure = pdb.get_structure(pdb_file, model=1, extra_fields=["b_factor"])

    # Extracting B-factor data
    b_factors = structure.b_factor
    if len(b_factors) > 0:
        b_value = round(float(np.mean(b_factors)), 4)
    else:
        b_value = 'Not available'

    # Display protein structure
    st.subheader('Visualization of predicted protein structure')
    render_mol(pdb_str, style, color)

    # plDDT value
    st.subheader('plDDT')
    st.write('plDDT is a per-residue estimate of the confidence in prediction on a scale from 0-100.')
    st.info(f'plDDT: {b_value}')

    st.download_button(
        label="Download PDB",
        data=pdb_str,
        file_name='predicted.pdb',
        mime='text/plain',
    )

# Button for prediction and analysis
# Nothing happens until the sidebar button is pressed. Once it is, either run
# the prediction followed by the sequence analysis, or ask for input when no
# sequence is available.
predict_clicked = st.sidebar.button('Predict')
if predict_clicked and not sequence:
    st.warning('Please enter a protein sequence or upload a file.')
elif predict_clicked:
    update(sequence)
    analyze_sequence(sequence)


esm60.py: Further expands on the analysis by including a hydrophobicity plot and validation for the protein sequence.

In [None]:
%%writefile esm60.py

import streamlit as st
import py3Dmol
import requests
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
import biotite.structure.io.pdb as pdb
import matplotlib.pyplot as plt
from stmol import showmol
import tempfile
import numpy as np
from collections import Counter

# Page configuration
# Sidebar header: the app title followed by a Markdown blurb that links to the
# ESMFold about page.
st.sidebar.title('ESMFold')
st.sidebar.write('[*ESMFold*](https://esmatlas.com/about) is an end-to-end single sequence protein structure predictor based on the ESM-2 language model.')

# Hydrophobicity scale (Kyte & Doolittle scale)
# One-letter amino-acid code -> hydropathy value, listed from most hydrophobic
# to most hydrophilic. The keys also define which characters count as valid
# residues elsewhere in this script.
HYDROPHOBICITY_SCALE = {
    'I': 4.5,
    'V': 4.2,
    'L': 3.8,
    'F': 2.8,
    'C': 2.5,
    'M': 1.9,
    'A': 1.8,
    'G': -0.4,
    'T': -0.7,
    'S': -0.8,
    'W': -0.9,
    'Y': -1.3,
    'P': -1.6,
    'H': -3.2,
    'E': -3.5,
    'Q': -3.5,
    'D': -3.5,
    'N': -3.5,
    'K': -3.9,
    'R': -4.5,
}

# Function to validate protein sequence
def is_valid_protein_sequence(sequence):
    """Tell whether *sequence* consists solely of known residue letters.

    A character is accepted when it is a key of ``HYDROPHOBICITY_SCALE``.

    Args:
        sequence: Sequence text to check, one character per residue.

    Returns:
        True if every character is accepted (also for an empty sequence),
        False as soon as one is not.
    """
    for residue in sequence:
        if residue not in HYDROPHOBICITY_SCALE:
            return False
    return True

# Function to render molecular structure
def render_mol(pdb_str, style, color, spin):
    """Show a PDB-format structure in a 3D viewer.

    Args:
        pdb_str: The structure as PDB text.
        style: 3Dmol representation name ('cartoon', 'stick' or 'sphere').
        color: Color choice from the sidebar. 'chain' and 'residue' select a
            built-in 3Dmol color scheme; any other value ('spectrum', 'red',
            'blue', ...) is passed through as a literal ``color``.
        spin: Whether the model should rotate continuously.
    """
    # In 3Dmol, ``color`` is a literal color; per-chain and per-residue
    # coloring are requested through ``colorscheme`` instead. Passing 'chain'
    # or 'residue' as ``color`` does not produce the intended coloring.
    colorschemes = {'chain': 'chain', 'residue': 'amino'}
    if color in colorschemes:
        color_spec = {'colorscheme': colorschemes[color]}
    else:
        color_spec = {'color': color}

    pdbview = py3Dmol.view()
    pdbview.addModel(pdb_str, 'pdb')
    pdbview.setStyle({style: color_spec})
    pdbview.setBackgroundColor('white')
    pdbview.zoomTo()
    pdbview.spin(spin)
    showmol(pdbview, height=500, width=800)

# Function for basic protein sequence analysis
def analyze_sequence(sequence):
    """Draw residue-frequency and hydrophobicity plots for *sequence*.

    The left panel is a bar chart of amino-acid counts; the right panel plots
    the ``HYDROPHOBICITY_SCALE`` value of each residue along the sequence.

    Args:
        sequence: Amino-acid sequence made only of letters that are keys of
            ``HYDROPHOBICITY_SCALE``.

    Raises:
        KeyError: If *sequence* contains a character missing from
            ``HYDROPHOBICITY_SCALE``.
    """
    protein_seq = seq.ProteinSequence(sequence)
    aa_freq = Counter(protein_seq)

    # Build the plots on an explicit figure. Creating figures through the
    # pyplot state machine and handing the ``plt`` module to Streamlit leaves
    # one more open figure behind on every call.
    fig, (freq_ax, hydro_ax) = plt.subplots(1, 2, figsize=(10, 4))

    freq_ax.bar(list(aa_freq.keys()), list(aa_freq.values()))
    freq_ax.set_title("Amino Acid Frequency")
    freq_ax.set_xlabel("Amino Acid")
    freq_ax.set_ylabel("Frequency")

    # Hydrophobicity plot
    hydrophobicity_scores = [HYDROPHOBICITY_SCALE[aa] for aa in sequence]
    hydro_ax.plot(hydrophobicity_scores)
    hydro_ax.set_title("Hydrophobicity Plot")
    hydro_ax.set_xlabel("Residue")
    hydro_ax.set_ylabel("Hydrophobicity")

    # Keep the two panels' axis labels from overlapping.
    fig.tight_layout()
    st.pyplot(fig)
    # Release the figure now that it has been rendered.
    plt.close(fig)

# Read sequence from uploaded file
def read_sequence(file):
    """Return the sequence contained in an uploaded text file.

    Args:
        file: Object exposing ``getvalue()`` that returns the file's bytes
            (as Streamlit's uploaded-file object does), or None.

    Returns:
        The sequence as a single string with all whitespace removed, or None
        when *file* is None. FASTA header lines (starting with '>') are
        skipped and a leading UTF-8 byte-order mark is dropped, so the result
        holds residues only and can pass character validation.
    """
    if file is None:
        return None

    # "utf-8-sig" decodes plain UTF-8 too, but also strips a BOM if present.
    text = file.getvalue().decode("utf-8-sig")
    residue_lines = [
        line for line in text.splitlines()
        if not line.lstrip().startswith(">")
    ]
    # split()/join removes every kind of whitespace, including line breaks
    # inside a wrapped sequence.
    return "".join("".join(residue_lines).split())

# Protein sequence input - Upload file or use text area
st.sidebar.subheader("Protein Sequence Input")
uploaded_file = st.sidebar.file_uploader("Upload sequence file (txt)", type=["txt"])
if uploaded_file is not None:
    sequence = read_sequence(uploaded_file)
else:
    DEFAULT_SEQ = "MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFMDNDTRYSTFACENPNSTRVSDFRTANCSLEDPAANKARKEAELAAATAEQ"
    sequence = st.sidebar.text_area('Or, enter sequence manually', DEFAULT_SEQ, height=100)

# Normalize before validating: pasted or uploaded sequences routinely carry
# line breaks, spaces or lowercase letters, none of which make the sequence
# itself invalid, but all of which would fail the character check below.
if sequence:
    sequence = "".join(sequence.split()).upper()

# Validate the protein sequence
if sequence and not is_valid_protein_sequence(sequence):
    st.sidebar.error("Invalid protein sequence. Please ensure it contains only valid amino acid characters (ACDEFGHIKLMNPQRSTVWY).")
    # Halt this script run so no prediction is attempted on bad input.
    st.stop()

# Visualization options
style = st.sidebar.selectbox("Select visualization style", ["cartoon", "stick", "sphere"], index=0)
color = st.sidebar.selectbox("Select color scheme", ["spectrum", "chain", "residue", "red", "blue", "green"], index=0)
spin = st.sidebar.checkbox("Enable spinning of the molecule", value=True)

# ESMfold prediction function
def update(sequence):
    """Fold *sequence* with the ESMFold API and show the result in the page.

    The sequence is posted to the ESM Atlas folding endpoint; the returned PDB
    text is rendered with ``render_mol`` using the sidebar's ``style``,
    ``color`` and ``spin`` selections, the mean of its B-factor column is
    reported as plDDT, and the PDB text is offered for download.

    Args:
        sequence: Amino-acid sequence as text. Whitespace (spaces, newlines)
            is removed before the request is sent.

    Returns:
        None. All output goes to the Streamlit page. If the request fails, an
        error box is shown and nothing else is rendered.
    """
    import io

    sequence = "".join(sequence.split())

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    try:
        # NOTE(security): verify=False disables TLS certificate checking; it
        # is kept as found. Remove it once the endpoint's certificate
        # validates.
        response = requests.post(
            'https://api.esmatlas.com/foldSequence/v1/pdb/',
            headers=headers,
            data=sequence,
            verify=False,
            timeout=300,  # requests waits forever without an explicit timeout
        )
        # Treat HTTP error statuses as failures instead of parsing the error
        # body as if it were a PDB file.
        response.raise_for_status()
        pdb_str = response.content.decode('utf-8')
    except requests.exceptions.RequestException as e:
        st.error(f"Request Error: {e}")
        return

    # Parse the PDB text directly from memory. ``PDBFile.read`` is a class
    # method that returns the parsed file, so its result must be kept; reading
    # from a StringIO also avoids leaving a temporary file behind.
    pdb_file = pdb.PDBFile.read(io.StringIO(pdb_str))
    # B-factors are only loaded when explicitly requested via extra_fields;
    # model=1 yields a single AtomArray rather than a stack of models.
    structure = pdb.get_structure(pdb_file, model=1, extra_fields=["b_factor"])

    # Extracting B-factor data
    b_factors = structure.b_factor
    if len(b_factors) > 0:
        b_value = round(float(np.mean(b_factors)), 4)
    else:
        b_value = 'Not available'

    # Display protein structure
    st.subheader('Visualization of predicted protein structure')
    render_mol(pdb_str, style, color, spin)

    # plDDT value
    st.subheader('plDDT')
    st.write('plDDT is a per-residue estimate of the confidence in prediction on a scale from 0-100.')
    st.info(f'plDDT: {b_value}')

    st.download_button(
        label="Download PDB",
        data=pdb_str,
        file_name='predicted.pdb',
        mime='text/plain',
    )

# Button for prediction and analysis
# Nothing happens until the sidebar button is pressed. Once it is, either run
# the prediction followed by the sequence analysis, or ask for input when no
# sequence is available.
predict_clicked = st.sidebar.button('Predict')
if predict_clicked and not sequence:
    st.warning('Please enter a protein sequence or upload a file.')
elif predict_clicked:
    update(sequence)
    analyze_sequence(sequence)

# Help section: a short usage note shown at the bottom of the sidebar.
st.sidebar.subheader("Help & Information")
st.sidebar.info(
    "This application predicts the structure of proteins based on their amino acid sequence. Upload a file containing the sequence or enter it manually. Use the visualization options to customize the molecular display."
)


Overwriting esm60.py
