In [10]:
# Install required libraries
!pip install dash plotly pandas biopython dash-bio
import pandas as pd
from dash import Dash, dcc, html, dash_table, Input, Output
import plotly.express as px
import plotly.graph_objects as go
from google.colab import files
from collections import Counter
from Bio.SeqUtils import ProtParam, gc_content
import dash_bio as dashbio



ImportError: cannot import name 'gc_content' from 'Bio.SeqUtils' (/usr/local/lib/python3.11/dist-packages/Bio/SeqUtils/__init__.py)

In [3]:
# Load Dataset
uploaded = files.upload()
file_name = next(iter(uploaded))
df = pd.read_csv(file_name)

Saving protein sequence.csv to protein sequence.csv


In [11]:
# Calculate sequence features
def analyze_sequence(seq, mol_type):
    try:
        if mol_type == 'Protein':
            analyzer = ProtParam.ProteinAnalysis(seq)
            return {
                'molecular_weight': analyzer.molecular_weight(),
                'aromaticity': analyzer.aromaticity(),
                'instability_index': analyzer.instability_index(),
                'gravy': analyzer.gravy()
            }
        else:  # DNA/RNA
            return {
                'gc_content': GC(seq),
                'at_content': 100 - GC(seq)
            }
    except:
        return {}


In [5]:
app = Dash(__name__)

In [13]:
# Data Preparation
sample_df = df.sample(5000) if len(df) > 5000 else df.copy()



In [14]:
# Initialize Dash App
app = Dash(__name__)


In [15]:
# Define Layout
app.layout = html.Div([
    html.H1("Macromolecular Sequence Analysis Dashboard",
           style={'textAlign': 'center', 'color': '#2c3e50', 'marginBottom': '30px'}),

    # Row 1: Summary Cards
    html.Div([
        html.Div([
            html.H3("Total Sequences"),
            html.P(f"{len(df):,}")
        ], className='card', style={'background': '#3498db'}),

        html.Div([
            html.H3("Unique Structures"),
            html.P(f"{df['structureId'].nunique():,}")
        ], className='card', style={'background': '#2ecc71'}),

        html.Div([
            html.H3("Longest Sequence"),
            html.P(f"{df['seqLength'].max()} residues")
        ], className='card', style={'background': '#e74c3c'}),

        html.Div([
            html.H3("Most Common Type"),
            html.P("Protein" if df['macromoleculeType'].value_counts().index[0] == 'Protein' else "DNA/RNA")
        ], className='card', style={'background': '#f39c12'})
    ], className='card-container'),

    # Row 2: Filters
    html.Div([
        dcc.Dropdown(
            id='mol-type-filter',
            options=[{'label': mol_type, 'value': mol_type}
                    for mol_type in df['macromoleculeType'].unique()],
            value='Protein',
            style={'width': '30%', 'display': 'inline-block'}
        ),
        dcc.RangeSlider(
            id='length-slider',
            min=df['seqLength'].min(),
            max=min(df['seqLength'].max(), 1000),  # Cap at 1000 for usability
            value=[100, 300],
            marks={i: str(i) for i in range(0, 1001, 100)},
            step=10,
            tooltip={'placement': 'bottom'}
        )
    ], style={'padding': '20px'}),

    # Row 3: Visualizations
    html.Div([
        dcc.Graph(id='length-distribution'),
        dcc.Graph(id='composition-heatmap')
    ], style={'columnCount': 2}),

    # Row 4: Sequence Analysis
    html.Div([
        dcc.Dropdown(
            id='sequence-selector',
            options=[{'label': f"{row['structureId']}_{row['chainId']} ({row['seqLength']}aa)",
                     'value': i}
                    for i, row in sample_df.iterrows()],
            value=0,
            style={'width': '100%'}
        ),
        html.Div([
            html.H4("Sequence Preview:"),
            html.Pre(id='sequence-display', style={
                'overflowX': 'auto',
                'backgroundColor': '#f5f5f5',
                'padding': '10px',
                'borderRadius': '5px',
                'maxHeight': '200px'
            })
        ]),
        html.Div(id='sequence-stats')
    ], style={'border': '1px solid #ddd', 'padding': '20px', 'margin': '20px 0'}),

    # Data Table
    dash_table.DataTable(
        id='data-table',
        columns=[{'name': col, 'id': col} for col in df.columns],
        data=sample_df.head(10).to_dict('records'),
        page_size=10,
        style_table={'overflowX': 'auto', 'height': '300px'},
        style_cell={
            'textAlign': 'left',
            'padding': '8px',
            'whiteSpace': 'normal',
            'maxWidth': '200px'
        },
        filter_action='native'
    )
], style={'fontFamily': 'Arial', 'padding': '20px'})



In [16]:
# Callbacks
@app.callback(
    [Output('length-distribution', 'figure'),
     Output('composition-heatmap', 'figure')],
    [Input('mol-type-filter', 'value'),
     Input('length-slider', 'value')]
)
def update_visualizations(mol_type, length_range):
    filtered_df = df[(df['macromoleculeType'] == mol_type) &
                    (df['seqLength'] >= length_range[0]) &
                    (df['seqLength'] <= length_range[1])]

    # Length distribution plot
    length_fig = px.histogram(filtered_df, x='seqLength', nbins=50,
                            title=f'{mol_type} Sequence Length Distribution',
                            color_discrete_sequence=['#3498db'])

    # Composition heatmap (sample 100 sequences)
    sample = filtered_df.sample(min(100, len(filtered_df)))
    if mol_type == 'Protein':
        aa_counts = pd.DataFrame([Counter(seq) for seq in sample['sequence']]).mean().reset_index()
        aa_counts.columns = ['Amino Acid', 'Frequency']
        heatmap_fig = px.bar(aa_counts, x='Amino Acid', y='Frequency',
                            title='Amino Acid Composition',
                            color='Frequency',
                            color_continuous_scale='Viridis')
    else:
        nt_counts = pd.DataFrame([Counter(seq) for seq in sample['sequence']]).mean().reset_index()
        nt_counts.columns = ['Nucleotide', 'Frequency']
        heatmap_fig = px.bar(nt_counts, x='Nucleotide', y='Frequency',
                            title='Nucleotide Composition',
                            color='Frequency',
                            color_continuous_scale='Viridis')

    return length_fig, heatmap_fig



In [17]:
@app.callback(
    [Output('sequence-display', 'children'),
     Output('sequence-stats', 'children')],
    [Input('sequence-selector', 'value')]
)
def update_sequence_viewer(selected_idx):
    row = sample_df.iloc[selected_idx]

    # Calculate sequence statistics
    def analyze_sequence(seq, mol_type):
        try:
            if mol_type == 'Protein':
                analyzer = ProtParam.ProteinAnalysis(seq)
                return {
                    'molecular_weight': analyzer.molecular_weight(),
                    'aromaticity': analyzer.aromaticity(),
                    'instability_index': analyzer.instability_index(),
                    'gravy': analyzer.gravy()
                }
            else:  # DNA/RNA
                return {
                    'gc_content': GC(seq),
                    'at_content': 100 - GC(seq)
                }
        except:
            return {}

    stats = analyze_sequence(row['sequence'], row['macromoleculeType'])

    # Display first 200 characters of sequence
    seq_display = row['sequence'][:200] + ('...' if len(row['sequence']) > 200 else '')

    # Create stats display
    if row['macromoleculeType'] == 'Protein':
        stats_html = html.Div([
            html.H4(f"{row['structureId']}_{row['chainId']} Protein Analysis"),
            html.P(f"Length: {row['seqLength']} residues"),
            html.P(f"Molecular Weight: {stats.get('molecular_weight', 'N/A'):.1f} Da"),
            html.P(f"Aromaticity: {stats.get('aromaticity', 'N/A'):.2f}"),
            html.P(f"Instability Index: {stats.get('instability_index', 'N/A'):.1f}"),
            html.P(f"GRAVY: {stats.get('gravy', 'N/A'):.2f}")
        ])
    else:
        stats_html = html.Div([
            html.H4(f"{row['structureId']}_{row['chainId']} DNA/RNA Analysis"),
            html.P(f"Length: {row['seqLength']} bases"),
            html.P(f"GC Content: {stats.get('gc_content', 'N/A'):.1f}%"),
            html.P(f"AT Content: {stats.get('at_content', 'N/A'):.1f}%")
        ])

    return seq_display, stats_html



In [19]:
# Run App
app.run(mode='inline')

<IPython.core.display.Javascript object>