# Notebook to visualize data collected from various databases - DASH
 
**Developed by** :Srivalli Kolla

**Created on** : 23 October, 2024

**Last modified** : 24 October, 2024

**Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

Env : dash

Dash is a framework for building interactive web applications and dashboards. It integrates Plotly for creating dynamic visualizations that can respond to user inputs. The app layout is defined using HTML components, allowing for customizable and user-friendly interfaces. Callbacks enable real-time interactivity, updating visual elements based on user actions. 

# Import packages

In [3]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import io
import zipfile
import os
import tempfile

from dash import html, dcc, Input, Output, State
from plotly.io import write_image


1. Load file and remove spaces from column names
2. Convert all string columns to lowercase
3. Preprocess the DataFrame and initiate the Dash app

In [4]:
# Load the collected dataset table from CSV; the path is relative to the
# working directory the notebook is run from.
df = pd.read_csv('../data_visualization/heart_cellular_collectives.csv')
# Bare expression: displays the DataFrame as the cell output in a notebook.
df

Unnamed: 0,Data-modality,Protocol,Repository,Study-Title,Dataset name,Organism,Tissue-Broad,Tissue,Disease,Developmentalstage,Ethnicity,Cell count,"Sex(female,male,unknown)"
0,"sc,sn","scRNA, snRNA",CellxGene,Cells of the adult human heart,All — Cells of the adult human heart,Human,"Apex,LA,LV,RA,RV,IVseptum","apex of heart,left cardiac atrium,heart left v...",Normal,Adult,"European,Asian",486134,219695;266439;0
1,sn,snRNA,CellxGene,Spatial multi-omic map of human myocardial inf...,All-snRNA-Spatial multi-omic map of human myoc...,Human,LV,heart left ventricle,Normal,Adult,European,41663,10682;30981;0
2,sn,snRNA,CellxGene,Spatial multi-omic map of human myocardial inf...,All-snRNA-Spatial multi-omic map of human myoc...,Human,LV,heart left ventricle,Myocardial infarction,Adult,European,150132,45354;104778;0
3,sc,scRNA,CellxGene,Spatially resolved multiomics of human cardiac...,Combined single cell and single nuclei RNA-Seq...,Human,RA,right cardiac atrium,Normal,Adult,European,1,1;0;0
4,sc,scRNA,CellxGene,Construction of a human cell landscape at sing...,Construction of a human cell landscape at sing...,Human,Heart,heart,Normal,"Adult,Fetal",Han Chinese,10783,9475;1308;0
5,sn,snRNA,CellxGene,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: All cells,Human,"RV,LV,IVseptum","heart right ventricle,heart left ventricle,int...",DCM,"Adult,Fetal,Pediatric",European,482581,127956;354625
6,sn,snRNA,CellxGene,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: All cells,Human,"RV,LV,IVseptum","heart right ventricle,heart left ventricle,int...",Arrhythmogenic right ventricular cardiomyopathy,"Adult,Fetal,Pediatric",European,104496,34322;70174;0
7,sn,snRNA,CellxGene,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: All cells,Human,"RV,LV,IVseptum","heart right ventricle,heart left ventricle,int...",Normal,"Adult,Fetal,Pediatric",European,67246,16932;50314;0
8,sn,snRNA,CellxGene,Pathogenic variants damage cell composition an...,DCM/ACM heart cell atlas: All cells,Human,"RV,LV,IVseptum","heart right ventricle,heart left ventricle,int...",Non-compaction cardiomyopathy,"Adult,Fetal,Pediatric",European,11632,11632;0;0
9,sc,scRNA,CellxGene,Integrated adult and foetal heart single-cell ...,Integrated adult and foetal hearts,Human,"Apex,basal_zone","apex of heart, basal zone of heart",Normal,Fetal,unknown,30889,21700;9189;0


# Data visualization

## Pre Processing

1. Create a copy of the DataFrame to avoid modifying the original
2. Function to parse sex values into separate counts to add during merging, applied to the column
3. Function to safely convert to numeric during merging, applied to the numerical columns in our dataset
4. Carefully define the numeric and string columns and apply the defined functions
5. Aggregate (merge) numerical columns
6. Define a function to carefully handle multiple values during merging by converting them into a comma-separated list without duplicates
7. Now aggregate categorical columns, using the function from 6 to handle them properly
8. Now combine numerical and categorical aggregated data into one DataFrame
9. Ensure that the Sex column is properly represented
10. Add the desired order for the output columns, append any additional columns that are missing from it, and sort according to that order

In [5]:
def preprocess_and_aggregate(df):
    """Collapse the raw table to one row per (Study-Title, Dataset name).

    Numeric columns ('Cell count' and the female/male/unknown counts parsed
    from 'Sex(female,male,unknown)') are summed per group. Every other column
    is reduced to a sorted, de-duplicated, comma-separated string of the
    values seen in the group. The sex column is then rebuilt from the summed
    counts as 'female;male;unknown'.

    Args:
        df: Raw DataFrame. Must contain 'Study-Title', 'Dataset name',
            'Cell count' and 'Sex(female,male,unknown)'. It is not modified.

    Returns:
        A new DataFrame with the preferred columns first (those that exist),
        followed by any remaining columns in their existing order.
    """
    group_keys = ['Study-Title', 'Dataset name']
    sex_col = 'Sex(female,male,unknown)'
    count_cols = ['female', 'male', 'unknown']

    df_copy = df.copy()

    def parse_sex(x):
        """Parse 'female;male[;unknown]' into a tuple of three integers."""
        if pd.isna(x) or x == '':
            return 0, 0, 0
        parts = [part.strip() for part in str(x).split(';')]
        # Some rows omit the trailing 'unknown' field ("female;male"); treat
        # the missing field as 0 instead of discarding the whole entry.
        if len(parts) == 2:
            parts.append('0')
        if len(parts) != 3:
            return 0, 0, 0
        return tuple(int(part) if part.isdecimal() else 0 for part in parts)

    # Assign plain lists column by column so no index alignment is involved
    # (works for any index, including an empty or non-unique one).
    parsed = [parse_sex(value) for value in df_copy[sex_col]]
    for position, name in enumerate(count_cols):
        df_copy[name] = [counts[position] for counts in parsed]

    # Non-numeric cell counts become 0 rather than breaking the sum.
    df_copy['Cell count'] = pd.to_numeric(df_copy['Cell count'], errors='coerce').fillna(0)

    numerical_cols = ['Cell count'] + count_cols
    string_cols = [col for col in df_copy.columns if col not in numerical_cols + group_keys]

    num_agg = df_copy.groupby(group_keys)[numerical_cols].sum().reset_index()

    def join_unique_comma_separated(series):
        """Join the distinct comma-separated tokens of a group into one string."""
        # Strip each token so "a, b" and "b" contribute the same value "b".
        tokens = {
            token.strip()
            for value in series.dropna().astype(str)
            for token in value.split(',')
        }
        tokens.discard('')
        return ','.join(sorted(tokens))

    str_agg = df_copy.groupby(group_keys)[string_cols].agg(join_unique_comma_separated).reset_index()

    grouped_df = pd.merge(num_agg, str_agg, on=group_keys)

    # Overwrite the string-aggregated sex column with the summed counts.
    grouped_df[sex_col] = (
        grouped_df['female'].astype(int).astype(str) + ';' +
        grouped_df['male'].astype(int).astype(str) + ';' +
        grouped_df['unknown'].astype(int).astype(str)
    )

    # The per-sex columns were only needed for the summation.
    grouped_df = grouped_df.drop(columns=count_cols)

    desired_columns = [
        'Study-Title', 'Dataset name', 'Data-modality', 'Repository',
        'Organism', 'Tissue-Broad', 'Tissue', 'Disease',
        'Developmentalstage', 'Ethnicity', 'Sex(female,male,unknown)', 'Cell count'
    ]

    # Keep only the preferred columns that are actually present, so an input
    # lacking an optional column does not raise KeyError on reordering.
    leading_columns = [col for col in desired_columns if col in grouped_df.columns]
    additional_columns = [col for col in grouped_df.columns if col not in desired_columns]

    return grouped_df[leading_columns + additional_columns]

In [6]:
# Aggregate the raw table to one row per (Study-Title, Dataset name).
grouped_df = preprocess_and_aggregate(df)
# Bare expression: displays the aggregated DataFrame as the cell output.
grouped_df

Unnamed: 0,Study-Title,Dataset name,Data-modality,Repository,Organism,Tissue-Broad,Tissue,Disease,Developmentalstage,Ethnicity,"Sex(female,male,unknown)",Cell count,Protocol
0,A human cell atlas of fetal gene expression,Survey of human embryonic development,sn,CellxGene,Human,Heart,heart,Normal,Fetal,unknown,55878;45871;0,101749,sci-RNA-seq
1,Cells of the adult human heart,All — Cells of the adult human heart,"sc,sn",CellxGene,Human,"Apex,IVseptum,LA,LV,RA,RV","apex of heart,heart left ventricle,heart right...",Normal,Adult,"Asian,European",219695;266439;0,486134,"snRNA,scRNA"
2,Cellular Atlas of Human Heart Failure,PRJNA762100,"sc+sn,sn",SRA,Human,LVapex,LV apex,"DCM,Normal",Adult,"African American,White",7431;10160;0,17591,snRNA
3,Construction of a human cell landscape at sing...,Construction of a human cell landscape at sing...,sc,CellxGene,Human,Heart,heart,Normal,"Adult,Fetal",Han Chinese,9475;1308;0,10783,scRNA
4,Defining cardiac functional recovery in end-st...,PRJNA939636,sn,SRA,Human,LV,LV apex,Normal,Adult,Unknown,4204;21237;2880,22119,snRNA
5,Defining the fetal gene program at single cell...,PRJNA767653,sn,SRA,Human,LV,~100mg of left ventricular Heart muscle,"DCM,Normal","Adult,Pediatric",Unknown,1388;1342;1231,3961,snRNA
6,Expression profiling by snRNAseq of left ventr...,PRJNA1127309,sn,SRA,Human,LV,LV,"Aorticstenotic,Normal",Adult,Unknown,0;153;0,153,snRNA
7,HBM236.JPVT.769,HBM236.JPVT.769,sn,Hubmap,Human,Heart,Heart,Normal,Adult,White,0;5907;0,5907,snRNA
8,HBM296.KZXD.676,HBM296.KZXD.676,sc,Hubmap,Human,Heart,Heart,Normal,Adult,White,6924;0;0,6924,sci-RNA-seq
9,HBM342.CMHT.948,HBM342.CMHT.948,sc,Hubmap,Human,Heart,Heart,Normal,Adult,White,179;0;0,179,sci-RNA-seq


1. Function to get unique values from comma-separated columns
2. Function to select the data from a comma-separated column if it matches any of the selected values
3. Function to split comma-separated values and explode into multiple rows

In [7]:
def get_unique_comma_separated_values(df, column):
    """Return the sorted distinct tokens found in a comma-separated column.

    Missing cells are skipped; each remaining cell is split on ',' and every
    piece is stripped of surrounding whitespace before de-duplication.
    """
    distinct_tokens = {
        piece.strip()
        for cell in df[column].dropna()
        for piece in cell.split(',')
    }
    return sorted(distinct_tokens)

def any_in_string(selected_values, string_value):
    """Tell whether any selected value is a token of a comma-separated string.

    Returns False when either argument is empty/falsy. Tokens are compared
    after stripping surrounding whitespace; the match is exact.
    """
    if string_value and selected_values:
        tokens = [piece.strip() for piece in string_value.split(',')]
        for candidate in selected_values:
            if candidate in tokens:
                return True
    return False

def split_and_explode(df, col):
    """Return a copy of `df` with one row per comma-separated token of `col`.

    Each cell of `col` is split on ',' and the row is repeated once per
    token; the other columns are copied unchanged and the original index
    label is kept on every repeated row. Tokens are stripped of surrounding
    whitespace so that "a, b" yields "b" rather than " b", which keeps later
    group-bys from treating the two spellings as different categories.
    Missing cells stay missing. `df` itself is not modified.
    """
    exploded = df.assign(**{col: df[col].str.split(',')}).explode(col)
    exploded[col] = exploded[col].str.strip()
    return exploded


In [8]:
# Distinct option values offered by each filter dropdown, taken from the
# aggregated table.
unique_diseases = get_unique_comma_separated_values(grouped_df, 'Disease')
unique_tissue_broad = get_unique_comma_separated_values(grouped_df, 'Tissue-Broad')
unique_modalities = get_unique_comma_separated_values(grouped_df, 'Data-modality')
# 'Repository' is aggregated into a comma-joined string like the other string
# columns, and the callbacks match selections against comma-split tokens.
# Offer the individual tokens (not the joined strings) so that every option
# can actually match a row.
unique_repositories = get_unique_comma_separated_values(grouped_df, 'Repository')

## DASH

In [9]:
# Create the Dash application; the page is a title, four multi-select filters,
# one graph placeholder per chart, and a panel for saving figures to disk.
app = dash.Dash(__name__)

# One entry per filter section: (heading, dropdown id, selectable values, placeholder).
_filter_specs = [
    ("Disease Filters", 'disease-dropdown', unique_diseases, "Select Disease"),
    ("Tissue-Broad Filters", 'Tissue-Broad-dropdown', unique_tissue_broad, "Select Tissue-Broad"),
    ("Data Modality Filters", 'modality-dropdown', unique_modalities, "Select Data Modality"),
    ("Repository Filters", 'repository-dropdown', unique_repositories, "Select Repository"),
]

# Every dropdown starts with all of its values selected.
_filter_sections = [
    html.Div([
        html.H2(heading),
        dcc.Dropdown(
            id=dropdown_id,
            options=[{'label': choice, 'value': choice} for choice in choices],
            value=choices,
            multi=True,
            placeholder=placeholder
        ),
    ], style={'margin-bottom': '20px'})
    for heading, dropdown_id, choices, placeholder in _filter_specs
]

# Graph component ids, in the order they appear on the page.
_graph_ids = [
    'repository-piechart',
    'repository-barplot',
    'disease-barplot',
    'disease-pie-chart',
    'cell-count-barplot',
    'tissue-broad-barchart',
    'tissue-broad-piechart',
    'modality-piechart',
    'developmental-stage-barplot',
    'developmental-stage-pie-chart',
    'sex-distribution-barplot-counts',
    'sex-distribution-barplot-percentage',
    'sex-distribution-pie-chart',
    'ethnicity-barplot-counts',
    'ethnicity-barplot-percentage',
    'ethnicity-pie-chart',
    'treemap-ethnicity',
    'treemap-total',
]
_graphs = [dcc.Graph(id=graph_id) for graph_id in _graph_ids]

# Save panel: target directory, trigger button and a status line.
_save_panel = html.Div([
    html.H3("Specify Save Path"),
    dcc.Input(id='save-path-input', type='text', value='./Plots/', style={'width': '100%', 'margin-bottom': '20px'}),
    html.Button("Save All Figures", id="save-button", n_clicks=0),
    html.Div(id="save-status", style={'margin-top': '20px', 'font-weight': 'bold'})
])

app.layout = html.Div([
    html.H1("Heart Data Consortium Visualizations", style={'textAlign': 'center'}),
    *_filter_sections,
    *_graphs,
    _save_panel,
])

In [10]:
# Callback to save all graphs as PNG images to local path
@app.callback(
    Output('save-status', 'children'),
    [Input('save-button', 'n_clicks')],
    [State('save-path-input', 'value'),
     State('repository-piechart', 'figure'),
     State('repository-barplot', 'figure'),
     State('disease-barplot', 'figure'),
     State('disease-pie-chart', 'figure'),
     State('cell-count-barplot', 'figure'),
     State('modality-piechart', 'figure'),
     State('developmental-stage-barplot', 'figure'),
     State('developmental-stage-pie-chart', 'figure'),
     State('sex-distribution-barplot-counts', 'figure'),
     State('sex-distribution-barplot-percentage', 'figure'),
     State('sex-distribution-pie-chart', 'figure'),
     State('ethnicity-barplot-counts', 'figure'),
     State('ethnicity-barplot-percentage', 'figure'),
     State('ethnicity-pie-chart', 'figure'),
     State('treemap-ethnicity', 'figure'),
     State('treemap-total', 'figure'),
     # The tissue-broad graphs were previously never saved. They are appended
     # last so the file numbers of the figures above stay unchanged.
     State('tissue-broad-barchart', 'figure'),
     State('tissue-broad-piechart', 'figure')]
)
def save_all_figures(n_clicks, save_path, *figures):
    """Write every currently rendered figure to `save_path` as a PNG file.

    Args:
        n_clicks: Click count of the save button; nothing is saved while it
            is 0 or unset.
        save_path: Target directory from the text input; created if missing.
        *figures: Figure dicts of the graphs, in the order of the State list
            above. Entries that are None are skipped.

    Returns:
        A status message for the 'save-status' element, or dash.no_update
        when the button has not been clicked.
    """
    if not n_clicks:
        return dash.no_update  # Do not save until the button is clicked

    if not save_path:
        return "Please specify a save path."

    # Create the directory if it does not exist
    try:
        os.makedirs(save_path, exist_ok=True)
    except OSError as e:
        return f"Could not create save directory: {e}"

    saved_count = 0
    failed_names = []

    # Save each figure as a PNG file named after its position in the State list
    for i, fig_dict in enumerate(figures):
        if fig_dict is None:
            continue
        fig_name = f"figure_{i}.png"
        fig_path = os.path.join(save_path, fig_name)
        try:
            fig = go.Figure(fig_dict)  # Convert dict to Figure object
            # Fixed export size with the legend placed outside the plot area
            # (top-right), using a wide right margin to make room for it.
            fig.update_layout(
                autosize=False,
                width=1500,
                height=1000,
                margin=dict(l=50, r=200, t=50, b=50),
                legend=dict(
                    orientation='v',
                    yanchor='top',
                    y=1,
                    xanchor='right',
                    x=1.2
                )
            )
            # scale=3 multiplies the pixel dimensions of the exported image
            write_image(fig, file=fig_path, format='png', scale=3)
        except Exception as e:
            # Keep going so one bad figure does not block the rest, but
            # remember the failure so the status message is truthful.
            print(f"Error saving figure {i}: {e}")
            failed_names.append(fig_name)
        else:
            saved_count += 1

    if failed_names:
        return f"Saved {saved_count} figure(s); failed to save: {', '.join(failed_names)}"
    return "All figures saved successfully!"


1. Filter counts based on selection
2. Convert 'Cell count' to numeric and handle any errors
3. Aggregate the total cell counts by repository
4. Plotting pie chart and bar graph
5. Adding text in graphs

In [11]:
# Callback for Repository bar plot and pie chart
@app.callback(
    [Output('repository-piechart', 'figure'),
     Output('repository-barplot', 'figure')],
    [Input('disease-dropdown', 'value'),
     Input('Tissue-Broad-dropdown', 'value'),
     Input('modality-dropdown', 'value'),
     Input('repository-dropdown', 'value')]
)
def update_repository_charts(selected_diseases, selected_tissue, selected_modalities, selected_repositories):
    """Build the per-repository pie chart and bar plot of cell counts.

    Rows of the raw table are kept when, for every filter that has a
    selection, at least one selected value appears among the row's
    comma-separated tokens. A filter with nothing selected is skipped.

    Returns:
        (pie_fig, bar_fig); both are titled placeholders when no row matches.
    """
    filters = {
        'Disease': selected_diseases,
        'Tissue-Broad': selected_tissue,
        'Data-modality': selected_modalities,
        'Repository': selected_repositories,
    }

    df_copy = df.copy()

    # Start from an all-True mask so the result is always a boolean Series,
    # even when every filter is empty (a bare `True` is not a valid indexer).
    keep = pd.Series(True, index=df_copy.index)
    for column, selected in filters.items():
        # Fill NaN values with an empty string to avoid issues when matching
        df_copy[column] = df_copy[column].fillna('')
        if isinstance(selected, str):
            selected = [selected]
        if selected:
            keep &= df_copy[column].apply(
                lambda x, selected=selected: any_in_string(selected, x)
            ).astype(bool)

    filtered_df = df_copy[keep]

    # Check if the filtered DataFrame is empty
    if filtered_df.empty:
        message = "No data available for selected filters."
        return px.pie(title=message), px.bar(title=message)

    # Convert 'Cell count' to numeric; unparseable entries count as 0
    filtered_df = filtered_df.assign(
        **{'Cell count': pd.to_numeric(filtered_df['Cell count'], errors='coerce').fillna(0)}
    )

    # Aggregate cell counts by repository
    repo_agg_df = filtered_df.groupby('Repository')['Cell count'].sum().reset_index()
    total_cell_count = repo_agg_df['Cell count'].sum()

    # Pie chart for repository distribution
    pie_fig = px.pie(repo_agg_df, names='Repository', values='Cell count',
                     labels={'Repository': 'Repository', 'Cell count': 'Number of Cells'},
                     title=f"Distribution of Cells by Repository (Total: {total_cell_count:,})",
                     hole=0.3)

    # Bar plot for cell counts by repository
    bar_fig = px.bar(repo_agg_df, x='Repository', y='Cell count',
                     labels={'Repository': 'Repository', 'Cell count': 'Number of Cells'},
                     title=f"Number of Cells by Repository (Total: {total_cell_count:,})",
                     text='Cell count')

    # Show percentages inside the pie slices and abbreviated counts above the bars
    pie_fig.update_traces(textposition='inside', textinfo='percent')
    bar_fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')

    return pie_fig, bar_fig

In [12]:
# Callback for disease Bar plot and Pie chart
@app.callback(
    [Output('disease-pie-chart', 'figure'),
     Output('disease-barplot', 'figure')],
    [Input('disease-dropdown', 'value'),
     Input('Tissue-Broad-dropdown', 'value'),
     Input('modality-dropdown', 'value'),
     Input('repository-dropdown', 'value')]
)
def update_disease_charts(selected_diseases, selected_tissue, selected_modalities, selected_repositories):
    """Build the pie chart and bar plot of cell counts per 'Disease' value.

    Rows of the raw table are kept when, for every filter that has a
    selection, at least one selected value appears among the row's
    comma-separated tokens. A filter with nothing selected is skipped.
    Counts are grouped by the raw 'Disease' cell, so a multi-valued cell
    forms its own category.

    Returns:
        (pie_fig, bar_fig); both are titled placeholders when no row matches.
    """
    filters = {
        'Disease': selected_diseases,
        'Tissue-Broad': selected_tissue,
        'Data-modality': selected_modalities,
        'Repository': selected_repositories,
    }

    df_copy = df.copy()

    # Start from an all-True mask so the result is always a boolean Series,
    # even when every filter is empty (a bare `True` is not a valid indexer).
    keep = pd.Series(True, index=df_copy.index)
    for column, selected in filters.items():
        # Fill NaN values with an empty string for filtering
        df_copy[column] = df_copy[column].fillna('')
        if isinstance(selected, str):
            selected = [selected]
        if selected:
            keep &= df_copy[column].apply(
                lambda x, selected=selected: any_in_string(selected, x)
            ).astype(bool)

    filtered_df = df_copy[keep]

    # If the filtered DataFrame is empty, return empty figures with a message
    if filtered_df.empty:
        empty_pie = px.pie(title="No data available for selected filters.")
        empty_bar = px.bar(title="No data available for selected filters.")
        return empty_pie, empty_bar

    # Group the filtered DataFrame by 'Disease' and aggregate 'Cell count'
    disease_counts = filtered_df.groupby('Disease')['Cell count'].sum().reset_index()

    # Create pie chart for Disease distribution
    pie_fig = px.pie(disease_counts, names='Disease', values='Cell count',
                     title="Distribution of Cells by Disease")

    # Create bar chart for Disease counts
    bar_fig = px.bar(disease_counts, x='Disease', y='Cell count',
                     labels={'Disease': 'Disease', 'Cell count': 'Number of Cells'},
                     title="Cell Counts by Disease",
                     text='Cell count')

    # Show percentages inside the pie slices and abbreviated counts above the bars
    pie_fig.update_traces(textposition='inside', textinfo='percent')
    bar_fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')

    return pie_fig, bar_fig

In [13]:
# Callback for cell-count histogram
@app.callback(
    Output('cell-count-barplot', 'figure'),
    [Input('disease-dropdown', 'value'),
     Input('Tissue-Broad-dropdown', 'value'),
     Input('modality-dropdown', 'value'),
     Input('repository-dropdown', 'value')]
)
def update_cell_count_histogram(selected_diseases, selected_tissue, selected_modalities, selected_repositories):
    """Build the bar plot of total cell count per study.

    Rows of the raw table are kept when, for every filter that has a
    selection, at least one selected value appears among the row's
    comma-separated tokens. A filter with nothing selected is skipped.

    Returns:
        The bar figure, or a titled placeholder when no row matches.
    """
    def as_list(selection):
        # A cleared dropdown may deliver None; treat it as "nothing selected"
        # rather than as a selection containing None, which would match no row.
        if selection is None:
            return []
        return selection if isinstance(selection, list) else [selection]

    filters = {
        'Disease': as_list(selected_diseases),
        'Tissue-Broad': as_list(selected_tissue),
        'Data-modality': as_list(selected_modalities),
        'Repository': as_list(selected_repositories),
    }

    df_copy = df.copy()

    # Start from an all-True mask so the result is always a boolean Series,
    # even when every filter is empty (a bare `True` is not a valid indexer).
    keep = pd.Series(True, index=df_copy.index)
    for column, selected in filters.items():
        # Fill NaN values with an empty string for filtering
        df_copy[column] = df_copy[column].fillna('')
        if selected:
            keep &= df_copy[column].apply(
                lambda x, selected=selected: any_in_string(selected, x)
            ).astype(bool)

    # Explicit copy: the 'Cell count' column is overwritten below.
    filtered_df = df_copy[keep].copy()

    # If the filtered DataFrame is empty, return an empty bar plot with a message
    if filtered_df.empty:
        return px.bar(title="No data available for selected filters.")

    def split_and_sum(cell_counts):
        """Return a numeric cell count; ';'-separated strings are summed."""
        if isinstance(cell_counts, str):
            # Wrap in a Series: pd.to_numeric on a plain list returns an
            # ndarray, which has no fillna().
            parts = pd.to_numeric(pd.Series(cell_counts.split(';')), errors='coerce')
            return parts.fillna(0).sum()
        if pd.isna(cell_counts):
            return 0
        if isinstance(cell_counts, (int, float)):
            return cell_counts
        return 0

    # Calculate the cell counts for each row
    filtered_df['Cell count'] = filtered_df['Cell count'].apply(split_and_sum)

    # Aggregate cell counts by study title
    study_cell_counts = filtered_df.groupby('Study-Title')['Cell count'].sum().reset_index()

    # Create a bar plot for the aggregated cell counts
    fig = px.bar(study_cell_counts, x='Study-Title', y='Cell count',
                 labels={'Study-Title': 'Study Title', 'Cell count': 'Number of Cells'},
                 title="Histogram of Cell Counts per Study", text='Cell count')

    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')

    return fig

In [14]:
# Callback for Modality Pie Chart
@app.callback(
    Output('modality-piechart', 'figure'),
    [Input('disease-dropdown', 'value'),
     Input('Tissue-Broad-dropdown', 'value'),
     Input('modality-dropdown', 'value'),
     Input('repository-dropdown', 'value')]
)
def update_modality_piechart(selected_diseases, selected_tissue, selected_modalities, selected_repositories):
    """Build the pie chart of cell counts per data modality.

    The raw table is exploded to one row per token of each filter column and
    only rows whose tokens are all among the selected values are kept. An
    empty selection therefore matches nothing.

    NOTE(review): every exploded row carries the full 'Cell count' of its
    source row, so multi-valued rows contribute their count once per kept
    token combination - confirm this is the intended weighting.

    Returns:
        The pie figure, or a titled placeholder when no row matches.
    """
    def as_list(selection):
        # isin() rejects None and bare strings; normalise to a list first.
        if selection is None:
            return []
        return list(selection) if isinstance(selection, (list, tuple)) else [selection]

    selections = {
        'Disease': as_list(selected_diseases),
        'Tissue-Broad': as_list(selected_tissue),
        'Data-modality': as_list(selected_modalities),
        'Repository': as_list(selected_repositories),
    }

    # Split and explode relevant columns
    exploded_df = df.copy()
    for col in selections:
        exploded_df = split_and_explode(exploded_df, col)
        # Strip the tokens themselves (not just for the comparison) so that
        # the group-by below does not separate " sn" from "sn".
        exploded_df[col] = exploded_df[col].str.strip()

    # Filter the exploded DataFrame based on selected filters
    filtered_df = exploded_df[
        exploded_df['Disease'].isin(selections['Disease']) &
        exploded_df['Tissue-Broad'].isin(selections['Tissue-Broad']) &
        exploded_df['Data-modality'].isin(selections['Data-modality']) &
        exploded_df['Repository'].isin(selections['Repository'])
    ]

    # Return a message if no data is available after filtering
    if filtered_df.empty:
        return px.pie(title="No data available for the selected filters.")

    # Aggregate the filtered DataFrame
    modality_counts = filtered_df.groupby('Data-modality')['Cell count'].sum().reset_index()

    # Create a pie chart figure
    fig = px.pie(
        modality_counts,
        names='Data-modality',
        values='Cell count',
        title="Distribution of Data Modalities",
        hole=0.3,
        labels={'Data-modality': 'Data Modality', 'Cell count': 'Number of Cells'}
    )

    return fig

In [15]:
# Callback for Developmentalstage plots
@app.callback(
    [Output('developmental-stage-barplot', 'figure'),
     Output('developmental-stage-pie-chart', 'figure')],
    [Input('disease-dropdown', 'value'),
     Input('Tissue-Broad-dropdown', 'value'),
     Input('modality-dropdown', 'value'),
     Input('repository-dropdown', 'value')]
)
def update_developmental_stage_charts(selected_diseases, selected_tissue, selected_modalities, selected_repositories):
    """Build the developmental-stage bar plot and pie chart.

    The aggregated table is exploded to one row per token of each filter
    column and only rows whose tokens are all among the selected values are
    kept. The remaining rows are exploded by 'Developmentalstage' and their
    cell counts summed per study and stage.

    Returns:
        (percentage_bar_fig, pie_fig): a stacked bar plot of each stage's
        share of its study's summed count, and a pie chart of counts per
        stage. Both are titled placeholders when no row matches.
    """
    def as_list(selection):
        # isin() rejects None and bare strings; normalise to a list first.
        if selection is None:
            return []
        return list(selection) if isinstance(selection, (list, tuple)) else [selection]

    exploded_df = split_and_explode(grouped_df, 'Disease')
    exploded_df = split_and_explode(exploded_df, 'Tissue-Broad')
    exploded_df = split_and_explode(exploded_df, 'Data-modality')
    exploded_df = split_and_explode(exploded_df, 'Repository')

    # Handle NaN values by filling with an empty string before applying the filters
    filtered_df = exploded_df[
        exploded_df['Disease'].fillna('').str.strip().isin(as_list(selected_diseases)) &
        exploded_df['Tissue-Broad'].fillna('').str.strip().isin(as_list(selected_tissue)) &
        exploded_df['Data-modality'].fillna('').str.strip().isin(as_list(selected_modalities)) &
        exploded_df['Repository'].fillna('').str.strip().isin(as_list(selected_repositories))
    ]

    if filtered_df.empty:
        return (
            px.bar(title="No data available for selected filters."),
            px.pie(title="No data available for selected filters.")
        )

    expanded_stages_df = split_and_explode(filtered_df, 'Developmentalstage')
    # Strip the stage tokens so "Adult" and " Adult" fall into one group.
    expanded_stages_df['Developmentalstage'] = expanded_stages_df['Developmentalstage'].str.strip()
    expanded_stages_df['Cell count'] = pd.to_numeric(expanded_stages_df['Cell count'], errors='coerce')

    stage_counts = expanded_stages_df.groupby(['Study-Title', 'Developmentalstage'])['Cell count'].sum().reset_index()

    # Share of each stage within its study's summed count
    total_counts = stage_counts.groupby('Study-Title')['Cell count'].sum().reset_index()
    total_counts.rename(columns={'Cell count': 'Total Count'}, inplace=True)
    stage_counts = stage_counts.merge(total_counts, on='Study-Title')
    stage_counts['Percentage'] = (stage_counts['Cell count'] / stage_counts['Total Count']) * 100

    # NOTE(review): an absolute-count bar figure used to be built here but was
    # never returned; only two outputs are declared, so it has been dropped.
    percentage_bar_fig = px.bar(stage_counts, x='Study-Title', y='Percentage', color='Developmentalstage',
                               labels={'Percentage': 'Percentage (%)', 'Developmentalstage': 'Stage'},
                               title="Percentage of Cell Counts by Developmentalstage",
                               barmode='stack', text='Percentage')

    pie_fig = px.pie(stage_counts, values='Cell count', names='Developmentalstage',
                     title="Percentage of Cell Counts by Developmentalstage",
                     labels={'Cell count': 'Number of Cells', 'Developmentalstage': 'Stage'},
                     hole=0.3)

    percentage_bar_fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
    pie_fig.update_traces(textinfo='percent')

    return percentage_bar_fig, pie_fig


In [16]:
# Callback for Tissue-Broad Bar Plot and Pie Chart
@app.callback(
    [Output('tissue-broad-barchart', 'figure'),
     Output('tissue-broad-piechart', 'figure')],
    [Input('disease-dropdown', 'value'),
     Input('Tissue-Broad-dropdown', 'value'),
     Input('modality-dropdown', 'value'),
     Input('repository-dropdown', 'value')]
)
def update_tissue_broad_charts(selected_diseases, selected_tissue, selected_modalities, selected_repositories):
    """Build the bar plot and pie chart of cell counts per broad tissue.

    The raw table is exploded to one row per token of each filter column and
    only rows whose tokens are all among the selected values are kept. An
    empty selection therefore matches nothing.

    NOTE(review): every exploded row carries the full 'Cell count' of its
    source row, so multi-valued rows contribute their count once per kept
    token combination - confirm this is the intended weighting.

    Returns:
        (bar_fig, pie_fig); both are titled placeholders when no row matches.
    """
    def as_list(selection):
        # isin() rejects None and bare strings; normalise to a list first.
        if selection is None:
            return []
        return list(selection) if isinstance(selection, (list, tuple)) else [selection]

    selections = {
        'Disease': as_list(selected_diseases),
        'Tissue-Broad': as_list(selected_tissue),
        'Data-modality': as_list(selected_modalities),
        'Repository': as_list(selected_repositories),
    }

    # Split and explode relevant columns
    exploded_df = df.copy()
    for col in selections:
        exploded_df = split_and_explode(exploded_df, col)
        # Strip the tokens themselves (not just for the comparison) so that
        # the group-by below does not separate " LV" from "LV".
        exploded_df[col] = exploded_df[col].str.strip()

    # Filter the exploded DataFrame based on selected filters
    filtered_df = exploded_df[
        exploded_df['Disease'].isin(selections['Disease']) &
        exploded_df['Tissue-Broad'].isin(selections['Tissue-Broad']) &
        exploded_df['Data-modality'].isin(selections['Data-modality']) &
        exploded_df['Repository'].isin(selections['Repository'])
    ]

    # Check if the filtered DataFrame is empty
    if filtered_df.empty:
        message = "No data available for selected filters."
        return px.bar(title=message), px.pie(title=message)

    # Group the filtered data by Tissue-Broad and sum the Cell counts
    tissue_broad_counts = filtered_df.groupby('Tissue-Broad')['Cell count'].sum().reset_index()

    # Create the bar plot for Tissue-Broad
    bar_fig = px.bar(
        tissue_broad_counts,
        x='Tissue-Broad',
        y='Cell count',
        title="Tissue-Broad Distribution (Bar Plot)",
        labels={'Cell count': 'Number of Cells'},
        text='Cell count'
    )
    bar_fig.update_layout(xaxis_title='Tissue-Broad', yaxis_title='Cell Count', title_x=0.5)

    # Create the pie chart for Tissue-Broad
    pie_fig = px.pie(
        tissue_broad_counts,
        names='Tissue-Broad',
        values='Cell count',
        title="Tissue-Broad Distribution (Pie Chart)",
        labels={'Cell count': 'Number of Cells'},
        hole=0.3
    )
    pie_fig.update_traces(textinfo='percent+label')

    return bar_fig, pie_fig

In [17]:
# Callback for Sex distribution plots
@app.callback(
    [Output('sex-distribution-barplot-counts', 'figure'),
     Output('sex-distribution-barplot-percentage', 'figure'),
     Output('sex-distribution-pie-chart', 'figure')],
    [Input('disease-dropdown', 'value'),
     Input('Tissue-Broad-dropdown', 'value'),
     Input('modality-dropdown', 'value'),
     Input('repository-dropdown', 'value')]
)
def update_sex_distribution_charts(selected_diseases, selected_tissue, selected_modalities, selected_repositories):
    """Build the three sex-distribution figures for the current dropdown selection.

    Each argument is the ``value`` of one dropdown: a list of options, a single
    option, or ``None``/empty when nothing is selected. A column whose
    selection is empty is left unfiltered.

    Returns:
        A tuple ``(fig_counts, fig_percentage, pie_fig)``: stacked bars of
        female/male/unknown cell counts per study, the same values as a
        percentage of each study's total, and a donut chart of the overall
        split. When no row matches, three placeholder figures carrying a
        "No data available" title are returned instead.
    """
    # Normalise every dropdown value to a list. ``None`` becomes an empty list
    # (not ``[None]``) so a cleared dropdown means "no filter on this column".
    filters = {}
    for column, value in (
        ('Disease', selected_diseases),
        ('Tissue-Broad', selected_tissue),
        ('Data-modality', selected_modalities),
        ('Repository', selected_repositories),
    ):
        if value is None:
            filters[column] = []
        elif isinstance(value, (list, tuple)):
            filters[column] = [item for item in value if item is not None]
        else:
            filters[column] = [value]

    # Copy the DataFrame and fill NaN values for filtering
    df_copy = df.copy()
    for column in filters:
        df_copy[column] = df_copy[column].fillna('')
    df_copy['Sex(female,male,unknown)'] = df_copy['Sex(female,male,unknown)'].fillna('0;0;0')

    # Start from an all-True boolean Series so the mask stays a Series even
    # when every dropdown is empty (a bare Python ``True`` cannot index a frame).
    mask = pd.Series(True, index=df_copy.index)
    for column, wanted in filters.items():
        if wanted:
            # any_in_string is defined elsewhere in this file; presumably it
            # reports whether any selected value occurs in the cell text.
            matches = df_copy[column].apply(lambda cell, wanted=wanted: any_in_string(wanted, cell))
            mask &= matches.astype(bool)

    # Explicit copy: new columns are added below and must not write into a view.
    filtered_df = df_copy[mask].copy()

    if filtered_df.empty:
        # Return empty plots with a message when no data is available
        no_data_fig = px.bar(title="No data available for selected filters.")
        return no_data_fig, no_data_fig, px.pie(title="No data available for selected filters.")

    # Split the 'Sex(female,male,unknown)' column. reindex guarantees exactly
    # three columns even if some cells hold fewer ';'-separated fields.
    sex_split = (
        filtered_df['Sex(female,male,unknown)']
        .str.split(';', expand=True)
        .reindex(columns=range(3))
    )
    count_columns = ['Female Count', 'Male Count', 'Unknown Count']
    for position, name in enumerate(count_columns):
        # Non-numeric or missing fields count as zero cells.
        filtered_df[name] = pd.to_numeric(sex_split[position], errors='coerce').fillna(0)

    # Aggregate the data by 'Study-Title'
    sex_summary_df = filtered_df.groupby('Study-Title')[count_columns].sum().reset_index()

    # Creating a stacked bar plot for the counts
    fig_counts = px.bar(
        sex_summary_df,
        x='Study-Title',
        y=count_columns,
        labels={'value': 'Number of Cells', 'variable': 'Sex'},
        title="Cell Counts by Sex",
        barmode='stack'
    )
    fig_counts.update_traces(texttemplate='%{y:.0f}', textposition='outside')
    fig_counts.update_layout(
        yaxis_title='Cell Count',
        title_x=0.5
    )

    # Calculate percentages for the bar plot. Studies whose total is zero get
    # 0 % in every category instead of NaN from a 0/0 division.
    total = sex_summary_df[count_columns].sum(axis=1)
    sex_summary_df['Total'] = total
    nonzero_total = total.where(total != 0)
    percentage_columns = ['Female Percentage', 'Male Percentage', 'Unknown Percentage']
    for count_name, percentage_name in zip(count_columns, percentage_columns):
        sex_summary_df[percentage_name] = (sex_summary_df[count_name] / nonzero_total * 100).fillna(0)

    # Creating a stacked bar plot for percentages
    fig_percentage = px.bar(
        sex_summary_df,
        x='Study-Title',
        y=percentage_columns,
        labels={'value': 'Percentage (%)', 'variable': 'Sex'},
        title="Cell Counts Percentage by Sex",
        barmode='stack'
    )
    fig_percentage.update_traces(texttemplate='%{y:.2f}%', textposition='outside')
    fig_percentage.update_layout(
        yaxis_title='Percentage (%)',
        title_x=0.5
    )

    # Summing the counts across all studies for the pie chart
    total_sex_counts = sex_summary_df[count_columns].sum().reset_index()
    total_sex_counts.columns = ['Sex', 'Cell Count']
    total_sex_counts['Sex'] = total_sex_counts['Sex'].replace({
        'Female Count': 'Female',
        'Male Count': 'Male',
        'Unknown Count': 'Unknown'
    })

    # Creating a pie chart for the overall distribution
    pie_fig = px.pie(
        total_sex_counts,
        names='Sex',
        values='Cell Count',
        title="Overall Distribution of Cells by Sex",
        hole=0.3
    )
    pie_fig.update_traces(textinfo='percent+label')
    pie_fig.update_layout(title_x=0.5)

    return fig_counts, fig_percentage, pie_fig

In [18]:
# Callback for Ethnicity Barplot
@app.callback(
    [Output('ethnicity-barplot-counts', 'figure'),
     Output('ethnicity-barplot-percentage', 'figure'),
     Output('ethnicity-pie-chart', 'figure')],
    [Input('disease-dropdown', 'value'),
     Input('Tissue-Broad-dropdown', 'value'),
     Input('modality-dropdown', 'value'),
     Input('repository-dropdown', 'value')]
)
def update_ethnicity_charts(selected_diseases, selected_tissue, selected_modalities, selected_repositories):
    """Build the three ethnicity figures for the current dropdown selection.

    Each argument is the ``value`` of one dropdown: a list of options, a single
    option, or ``None``. A row is kept only when its (stripped) value in every
    filtered column is one of the selected options, so an empty selection
    matches nothing and yields the "No data available" placeholders.

    Returns:
        A tuple ``(fig_counts, fig_percentage, pie_fig)``: per-study cell
        counts coloured by ethnicity, the same values as a percentage of each
        study's total, and a donut chart of the overall split.
    """
    # Normalise every dropdown value to a list so the membership test below is
    # an exact match: a bare string would turn ``in`` into a substring test and
    # ``None`` would raise.
    filters = {}
    for column, value in (
        ('Disease', selected_diseases),
        ('Tissue-Broad', selected_tissue),
        ('Data-modality', selected_modalities),
        ('Repository', selected_repositories),
    ):
        if value is None:
            filters[column] = []
        elif isinstance(value, (list, tuple)):
            filters[column] = list(value)
        else:
            filters[column] = [value]

    # Exploding necessary columns to handle multiple entries
    # NOTE(review): split_and_explode is defined elsewhere; if each exploded row
    # keeps the full 'Cell count', the sums below count a dataset once per
    # combination of exploded values - confirm this is intended.
    exploded_df = grouped_df
    for column in ['Disease', 'Tissue-Broad', 'Data-modality', 'Repository', 'Ethnicity']:
        exploded_df = split_and_explode(exploded_df, column)

    # Filtering the DataFrame based on the selected filters. The vectorised
    # .str.strip().isin(...) form treats missing cells as "no match" instead of
    # failing on them.
    filtered_df = exploded_df[
        exploded_df['Disease'].str.strip().isin(filters['Disease']) &
        exploded_df['Tissue-Broad'].str.strip().isin(filters['Tissue-Broad']) &
        exploded_df['Data-modality'].str.strip().isin(filters['Data-modality']) &
        exploded_df['Repository'].str.strip().isin(filters['Repository'])
    ]

    if filtered_df.empty:
        # Return empty plots if no data is available
        no_data_fig = px.bar(title="No data available for selected filters.")
        return no_data_fig, no_data_fig, px.pie(title="No data available for selected filters.")

    # Aggregating the cell count by 'Study-Title' and 'Ethnicity'
    ethnicity_counts = filtered_df.groupby(['Study-Title', 'Ethnicity'])['Cell count'].sum().reset_index()

    # Creating a bar plot for cell counts by ethnicity
    fig_counts = px.bar(
        ethnicity_counts,
        x='Study-Title',
        y='Cell count',
        color='Ethnicity',
        labels={'Study-Title': 'Study', 'Cell count': 'Number of Cells'},
        title="Cell Counts by Ethnicity",
        text='Cell count'
    )
    fig_counts.update_traces(texttemplate='%{text:.0f}', textposition='outside')
    fig_counts.update_layout(
        yaxis_title='Number of Cells',
        title_x=0.5
    )

    # Calculating the total count for each study to compute percentages
    ethnicity_totals = ethnicity_counts.groupby('Study-Title')['Cell count'].sum().reset_index()
    ethnicity_totals.rename(columns={'Cell count': 'Total Count'}, inplace=True)
    ethnicity_counts = ethnicity_counts.merge(ethnicity_totals, on='Study-Title')
    # Studies whose total is zero get 0 % instead of NaN from a 0/0 division.
    nonzero_total = ethnicity_counts['Total Count'].where(ethnicity_counts['Total Count'] != 0)
    ethnicity_counts['Percentage'] = (ethnicity_counts['Cell count'] / nonzero_total * 100).fillna(0)

    # Creating a bar plot for percentages by ethnicity
    fig_percentage = px.bar(
        ethnicity_counts,
        x='Study-Title',
        y='Percentage',
        color='Ethnicity',
        labels={'Percentage': 'Percentage (%)', 'Ethnicity': 'Ethnicity'},
        title="Cell Counts Percentage by Ethnicity",
        text='Percentage'
    )
    fig_percentage.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
    fig_percentage.update_layout(
        yaxis_title='Percentage (%)',
        title_x=0.5
    )

    # Aggregating the overall cell counts by ethnicity for the pie chart
    total_ethnicity_counts = filtered_df.groupby('Ethnicity')['Cell count'].sum().reset_index()
    total_ethnicity_counts.rename(columns={'Cell count': 'Total Count'}, inplace=True)

    # Creating a pie chart for the overall distribution of cell counts by ethnicity
    pie_fig = px.pie(
        total_ethnicity_counts,
        names='Ethnicity',
        values='Total Count',
        labels={'Total Count': 'Number of Cells', 'Ethnicity': 'Ethnicity'},
        title="Overall Distribution of Cells by Ethnicity",
        hole=0.3
    )
    pie_fig.update_traces(textinfo='percent+label')
    pie_fig.update_layout(title_x=0.5)

    return fig_counts, fig_percentage, pie_fig

In [19]:
# Callback for treemap with ethnicity and total cell counts
@app.callback(
    [Output('treemap-ethnicity', 'figure'),
     Output('treemap-total', 'figure')],
    [Input('disease-dropdown', 'value'),
     Input('Tissue-Broad-dropdown', 'value'),
     Input('modality-dropdown', 'value'),
     Input('repository-dropdown', 'value')]
)
def update_treemaps(selected_diseases, selected_tissue, selected_modalities, selected_repositories):
    """Build the ethnicity and total-cell-count treemaps for the current selection.

    Each argument is the ``value`` of one dropdown: a list of options, a single
    option, or ``None``/empty when nothing is selected. A column whose
    selection is empty is left unfiltered.

    Returns:
        A tuple ``(fig_ethnicity, fig_total)``: a Repository > Ethnicity
        treemap and a Repository > Study-Title treemap, both sized by
        'Cell count'. When no row matches, both are empty treemaps titled
        "No data available for selected filters.".
    """
    # Normalise every dropdown value to a list. ``None`` becomes an empty list
    # (not ``[None]``) so a cleared dropdown means "no filter on this column".
    filters = {}
    for column, value in (
        ('Disease', selected_diseases),
        ('Tissue-Broad', selected_tissue),
        ('Data-modality', selected_modalities),
        ('Repository', selected_repositories),
    ):
        if value is None:
            filters[column] = []
        elif isinstance(value, (list, tuple)):
            filters[column] = [item for item in value if item is not None]
        else:
            filters[column] = [value]

    # Copy the DataFrame and fill NaN values for filtering
    df_copy = df.copy()
    for column in filters:
        df_copy[column] = df_copy[column].fillna('')
    df_copy['Ethnicity'] = df_copy['Ethnicity'].fillna('')

    # Start from an all-True boolean Series so the mask stays a Series even
    # when every dropdown is empty (a bare Python ``True`` cannot index a frame).
    mask = pd.Series(True, index=df_copy.index)
    for column, wanted in filters.items():
        if wanted:
            # any_in_string is defined elsewhere in this file; presumably it
            # reports whether any selected value occurs in the cell text.
            matches = df_copy[column].apply(lambda cell, wanted=wanted: any_in_string(wanted, cell))
            mask &= matches.astype(bool)
    filtered_df = df_copy[mask]

    if filtered_df.empty:
        # Create an empty figure with a message when no data is available
        fig_ethnicity = px.treemap(title="No data available for selected filters.")
        fig_total = px.treemap(title="No data available for selected filters.")
    else:
        # Treemap based on Ethnicity
        fig_ethnicity = px.treemap(
            filtered_df,
            path=['Repository', 'Ethnicity'],
            values='Cell count',
            title='Treemap of Heart Data Consortium by Ethnicity'
        )

        # Treemap for all cell counts (showing repository and study title hierarchy)
        fig_total = px.treemap(
            filtered_df,
            path=['Repository', 'Study-Title'],
            values='Cell count',
            title='Treemap of Heart Data Consortium by Total Cell Counts'
        )

    # Both treemaps share the same label template, text position and centred title.
    for figure in (fig_ethnicity, fig_total):
        figure.update_traces(
            texttemplate='%{label}<br>Cell count: %{value}',
            textposition='middle center'
        )
        figure.update_layout(title_x=0.5)

    return fig_ethnicity, fig_total

In [20]:
# Run the app
if __name__ == '__main__':
    # Newer Dash releases expose `app.run` and have retired `run_server`;
    # older ones only provide `run_server`. Prefer `run` and fall back so the
    # notebook starts on either. Serves the dashboard in debug mode on port 8051.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True, port=8051)