# Importing and formatting

In [5]:
# import
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output, callback
from sklearn.decomposition import PCA

# Analysis of the dataset
Here you can find the link to the paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC6083445/#S4

The following 18 individuals were used for the sequencing:

* Zebrafish Embryos, 4hpf (hours post fertilization)
* Zebrafish Embryos, 6hpf
* Zebrafish Embryos, 8hpf
* Zebrafish Embryos, 10hpf
* Zebrafish Embryos, 14hpf
* Zebrafish Embryos, 18hpf
* Zebrafish Embryos, 24hpf

*TracerSeq Embryos:*

* TracerSeq Embryo 1
* TracerSeq Embryo 2
* TracerSeq Embryo 3
* TracerSeq Embryo 4
* TracerSeq Embryo 5

*CRISPR-Targeted Embryos:*

* CRISPR-Targeted Embryo chordin A
* CRISPR-Targeted Embryo chordin B
* CRISPR-Targeted Embryo chordin C
* CRISPR-Targeted Embryo tyrosinase A
* CRISPR-Targeted Embryo tyrosinase B
* CRISPR-Targeted Embryo tyrosinase C

In [6]:
# Loading dataset
'''
to avoid loading every time the entire dataset,
we load it outside of the dash app context
'''

# Read the raw CSV once into the notebook-level `df` that the following cells inspect.
df = pd.read_csv('zfish.csv')

In [7]:
# info dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63530 entries, 0 to 63529
Columns: 1004 entries, gene0 to Type
dtypes: float64(1003), int64(1)
memory usage: 486.6 MB


In [8]:
df.head(3)

Unnamed: 0,gene0,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,...,gene994,gene995,gene996,gene997,gene998,gene999,R,G,B,Type
0,-0.028154,0.376729,-0.034165,0.88809,-0.004843,-0.027648,-0.06607,0.066663,-0.025603,-0.053993,...,-0.05516,-0.057072,-0.090247,-0.176327,-0.177548,-0.020889,96.25,51.25,12.5,0
1,-0.028154,-0.085815,-0.034165,0.946432,-0.004843,-0.027648,-0.06607,-0.395881,-0.025603,-0.053993,...,-0.05516,-0.057072,-0.090247,-0.176327,2.631356,-0.020889,96.25,51.25,12.5,0
2,-0.028154,-0.085815,-0.034165,1.04208,-0.004843,-0.027648,-0.06607,-0.395881,-0.025603,-0.053993,...,-0.05516,-0.057072,-0.090247,-0.176327,-0.427278,-0.020889,96.25,51.25,12.5,0


In [9]:
# How many types are in the dataset?
print("different types:", df['Type'].unique())
print("number of different types:", df['Type'].nunique())

different types: [0 8 2 9 1 7 4 3 6 5]
number of different types: 10


## What are types?

Based on my interpretation, the type is the fate of each cell: the kind of tissue it became.

There are 10 of them.
Their dtype is numerical; it is better to treat them as strings since they represent categorical data.

In [10]:
df["Type"] = df['Type'].astype(str)
print(df['Type'].dtype)

object


---

In [11]:

for t in df["R"].unique():
    print(t)

96.25
82.5
68.75
55.0
41.25
27.5
13.75


## What are RGBs?

Each unique combination of R, G and B values corresponds to a different developmental stage (hours post fertilization).
There are 7 of them.

## Modifying RGB values to create a column called hpf


In [12]:
df["hpf"] = df['R'].astype(str) + '-' + df['G'].astype(str) + '-' + df['B'].astype(str)


In [13]:
print(df["hpf"].unique())

['96.25-51.25-12.5' '82.5-52.5-25.0' '68.75-53.75-37.5' '55.0-55.0-50.0'
 '41.25-56.25-62.5' '27.5-57.5-75.0' '13.75-58.75-87.5']


In [None]:
# Convert hpf to numerical labels
# Every distinct value of the hpf column gets an integer code in order of
# first appearance (0 for the first one seen, 1 for the next, ...).
# pd.factorize does this in a single vectorized pass, replacing the earlier
# row-by-row .loc loop that was slow enough to have been disabled.
# Re-running the cell is harmless: codes that are already 0..k-1 in order of
# appearance map to themselves.
hpf_codes, _ = pd.factorize(df["hpf"])
df["hpf"] = hpf_codes

print(df["hpf"].unique())

[0 1 2 3 4 5 6]


In [None]:
# changing hpf to string for better visualization and because it encodes categorical data
# (the cast is a no-op if the column already holds strings, so the cell can be re-run safely)
df["hpf"] = df["hpf"].astype(str)
df.head()

      gene0     gene1     gene2     gene3     gene4     gene5    gene6  \
0 -0.028154  0.376729 -0.034165  0.888090 -0.004843 -0.027648 -0.06607   
1 -0.028154 -0.085815 -0.034165  0.946432 -0.004843 -0.027648 -0.06607   
2 -0.028154 -0.085815 -0.034165  1.042080 -0.004843 -0.027648 -0.06607   
3 -0.028154 -0.085815 -0.034165  1.245001 -0.004843 -0.027648 -0.06607   
4 -0.028154 -0.085815 -0.034165 -0.205409 -0.004843 -0.027648 -0.06607   

      gene7     gene8     gene9  ...   gene995   gene996   gene997   gene998  \
0  0.066663 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327 -0.177548   
1 -0.395881 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327  2.631356   
2 -0.395881 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327 -0.427278   
3  1.763011 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327 -0.427278   
4 -0.395881 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327 -0.427278   

    gene999      R      G     B  Type  hpf  
0 -0.020889  96.25  51.25  12

## Saving the new dataset as csv


(Don't run it again; it takes a while.)

In [None]:
# Save formatted dataset
#           df.to_csv('zfish_formatted.csv', index=False)

In [None]:
# Reload the pre-formatted dataset.
# Left to itself, read_csv infers an integer dtype for the Type and hpf label
# columns, which undoes the earlier decision to treat them as categorical
# strings. Force both to str so colours stay discrete and the dropdown values
# are strings.
df = pd.read_csv('zfish_formatted.csv', dtype={'Type': str, 'hpf': str})
df.head(20)

Unnamed: 0,gene0,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,...,gene995,gene996,gene997,gene998,gene999,R,G,B,Type,hpf
0,-0.028154,0.376729,-0.034165,0.88809,-0.004843,-0.027648,-0.06607,0.066663,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,-0.177548,-0.020889,96.25,51.25,12.5,0,0
1,-0.028154,-0.085815,-0.034165,0.946432,-0.004843,-0.027648,-0.06607,-0.395881,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,2.631356,-0.020889,96.25,51.25,12.5,0,0
2,-0.028154,-0.085815,-0.034165,1.04208,-0.004843,-0.027648,-0.06607,-0.395881,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,-0.427278,-0.020889,96.25,51.25,12.5,0,0
3,-0.028154,-0.085815,-0.034165,1.245001,-0.004843,-0.027648,-0.06607,1.763011,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,-0.427278,-0.020889,96.25,51.25,12.5,0,0
4,-0.028154,-0.085815,-0.034165,-0.205409,-0.004843,-0.027648,-0.06607,-0.395881,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,-0.427278,-0.020889,96.25,51.25,12.5,0,0
5,-0.028154,-0.085815,-0.034165,-0.205409,-0.004843,-0.027648,-0.06607,-0.395881,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,-0.427278,-0.020889,96.25,51.25,12.5,0,0
6,-0.028154,1.0392,-0.034165,0.783147,-0.004843,-0.027648,-0.06607,0.083018,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,-0.168048,-0.020889,96.25,51.25,12.5,0,0
7,-0.028154,-0.085815,-0.034165,0.696908,-0.004843,-0.027648,-0.06607,-0.395881,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,-0.427278,-0.020889,96.25,51.25,12.5,0,0
8,-0.028154,-0.085815,-0.034165,-0.205409,-0.004843,-0.027648,-0.06607,-0.395881,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,-0.427278,-0.020889,96.25,51.25,12.5,0,0
9,-0.028154,-0.085815,-0.034165,-0.205409,-0.004843,-0.027648,-0.06607,2.197854,-0.025603,-0.053993,...,-0.057072,-0.090247,-0.176327,-0.427278,-0.020889,96.25,51.25,12.5,0,0


## Generating functions for visualization

In [17]:
def generate_scatter_plot_facet(data, x_col, y_col, title):
    """Build a scatter plot of two columns with one facet column per hpf value.

    Points are coloured by the "Type" column and split into facet columns by
    the "hpf" column, so `data` must contain both. The row index of `data` is
    attached to every point as custom data.

    Parameters
    ----------
    data : DataFrame to plot.
    x_col, y_col : names of the columns used for the x and y axes.
    title : figure title.

    Returns
    -------
    The plotly figure.
    """
    axis_labels = {x_col: x_col, y_col: y_col}
    scatter_options = dict(
        x=x_col,
        y=y_col,
        color="Type",
        facet_col="hpf",
        title=title,
        labels=axis_labels,
        template='plotly_white',
        custom_data=[data.index],
    )
    fig = px.scatter(data, **scatter_options)

    # Same marker size/opacity for every marker-mode trace.
    marker_style = dict(size=8, opacity=0.7)
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
    return fig



def generate_histogram(data, column, color, title):
    """Build a 30-bin histogram of one column, optionally grouped by colour.

    Parameters
    ----------
    data : DataFrame to plot.
    column : name of the column whose values go on the x axis.
    color : name of the column used to colour/group the bars, or None for a
        single series.
    title : figure title.

    Returns
    -------
    The plotly figure.
    """
    fig = px.histogram(
        data,
        x=column,
        title=title,
        color=color,
        labels={column: column},
        template='plotly_white',
        nbins=30
    )
    # Every bar gets a thin black outline. The flat 'skyblue' fill is applied
    # only when no colour grouping was requested; forcing it on all traces
    # would paint every group the same colour and cancel the `color` argument.
    bar_style = dict(line=dict(color='black', width=1))
    if color is None:
        bar_style['color'] = 'skyblue'
    fig.update_traces(marker=bar_style)
    return fig

In [18]:
# testing them out
# Shuffle with a fixed seed so the printed preview and the figure are the same on every run.
df_sample = df.sample(frac=1, random_state=0).reset_index(drop=True)
df_sample = df_sample.sort_values(by='hpf')
print(df_sample.head())
scatter_fig = generate_scatter_plot_facet(df_sample, 'gene0', 'gene1', 'gene0 vs gene1 Scatter Plot')
scatter_fig.show()

          gene0     gene1     gene2     gene3     gene4     gene5    gene6  \
52308 -0.028154 -0.085815 -0.034165 -0.205409 -0.004843 -0.027648 -0.06607   
23053 -0.028154 -0.085815 -0.034165  1.038874 -0.004843 -0.027648 -0.06607   
23057 -0.028154 -0.085815 -0.034165 -0.205409 -0.004843 -0.027648 -0.06607   
23062 -0.028154 -0.085815 -0.034165  1.382845 -0.004843 -0.027648 -0.06607   
23070 -0.028154 -0.085815 -0.034165  1.037277 -0.004843 -0.027648 -0.06607   

          gene7     gene8     gene9  ...   gene995   gene996   gene997  \
52308  2.138063 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327   
23053 -0.395881 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327   
23057 -0.395881 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327   
23062 -0.395881 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327   
23070  1.504368 -0.025603 -0.053993  ... -0.057072 -0.090247 -0.176327   

        gene998   gene999      R      G     B  Type  hpf  
52308 -0.427278 -0.020889  

In [19]:
# Creating function to select different hpf values
def filter_data_by_hpf(data, hpf_values):
    """Return the rows of `data` whose "hpf" value is one of `hpf_values`."""
    keep_row = data["hpf"].isin(hpf_values)
    return data[keep_row]

# test filter data by hpf with more than one value
test_data = pd.DataFrame({
    "hpf": ['0', '1', '2', '0', '1', '2'],
    "gene0": np.random.rand(6),
    "gene1": np.random.rand(6)
})
test_data = filter_data_by_hpf(test_data, ['0', '2'])
# Check the filter explicitly (a bare `.head()` in the middle of a cell is
# evaluated and thrown away): only the two requested stages remain, and both
# rows of each stage are kept.
assert sorted(test_data["hpf"].unique()) == ['0', '2']
assert len(test_data) == 4

px.scatter(test_data, x='gene0', y='gene1', color='hpf').show()
### Works perfectly up to here ###

In [20]:
# Function to generate dataset from Type
def generate_dataset_by_type(data, selected_types):
    """Return the rows of `data` whose "Type" value is one of `selected_types`."""
    is_selected = data["Type"].isin(selected_types)
    return data[is_selected]

In [21]:
# Testing custom_data parameter in plotly
# The frame's index is passed as custom_data so the hover text can show which
# row each point came from.

custom_test = px.scatter(test_data, x='gene0', y='gene1', color='hpf', custom_data=[test_data.index])

custom_test.update_traces(hovertemplate='Index: %{customdata}<br>gene0: %{x}<br>gene1: %{y}<br>').show() 
# here the <br> works to create new lines in hovertemplate

# Creating a primordial dashboard

In [38]:
app = Dash()

# Sorted option lists, computed once and shared by all four dropdowns instead
# of re-running unique()/sort for every `options` and `value` argument.
# .tolist() hands Dash plain Python values rather than numpy scalars.
hpf_options = np.sort(df['hpf'].unique()).tolist()
type_options = np.sort(df["Type"].unique()).tolist()

# Two dropdowns sit side by side: one on the left half, one floated right.
left_half_style = {'width': '48%', 'display': 'inline-block'}
right_half_style = {'width': '48%', 'display': 'inline-block', 'float': 'right'}

app.layout = html.Div([
    html.H1("Zebrafish Gene Expression Dashboard"),

    # Single-choice dropdowns driving the basic scatter plot
    html.Div([
        html.H3("Select hours post fertilization (hpf)"),
        dcc.Dropdown(
            id='hpf-dropdown-scatter',
            options=hpf_options,
            value=hpf_options[0],
        ),
    ], style=left_half_style),
    html.Div([
        html.H3('Select Type of tissue'),
        dcc.Dropdown(
            id="type-dropdown-scatter",
            options=type_options,
            value=type_options[0],
        ),
    ], style=right_half_style),

    # Basic scatter gene0 - gene1
    html.Div([
        dcc.Graph(id='scatter-hpf-type-selected'),
    ]),

    # Multi-choice dropdowns driving the facet version of the scatter plot
    html.Div([
        html.H3("Select hpf (multiple choices possible)"),
        dcc.Dropdown(
            id="hpf-dropdown-scatter-facet",
            options=hpf_options,
            value=[hpf_options[0]],
            multi=True,
        ),
    ], style=left_half_style),
    html.Div([
        html.H3('Select Type of tissue'),
        dcc.Dropdown(
            id="type-dropdown-scatter-facet",
            options=type_options,
            value=[type_options[0]],
            multi=True,
        ),
    ], style=right_half_style),

    # Facet graph
    html.Div(
        dcc.Graph(id='facet-scatter')
    ),
])




# Defining callbacks
@callback(
    Output("scatter-hpf-type-selected", 'figure'),
    Input('hpf-dropdown-scatter', 'value'),
    Input('type-dropdown-scatter', 'value'),
)
def update_scatter_hpf_type_selected(hpf, type):
    """Redraw the basic gene0/gene1 scatter for one hpf value and one Type.

    Parameters
    ----------
    hpf : current value of the 'hpf-dropdown-scatter' dropdown.
    type : current value of the 'type-dropdown-scatter' dropdown
        (the name shadows the builtin inside this function only).

    Returns
    -------
    A plotly figure containing the rows of the notebook-level `df` that match
    both selections; the hover text shows each row's index.
    """
    matches_hpf = df['hpf'] == hpf
    matches_type = df['Type'] == type
    selected_rows = df[matches_hpf & matches_type]

    fig = px.scatter(
        selected_rows,
        x="gene0",
        y="gene1",
        color="Type",
        custom_data=[selected_rows.index],
    )
    fig.update_traces(hovertemplate="index= %{customdata}")
    return fig


@callback(
    Output(component_id="facet-scatter", component_property='figure'),
    Input(component_id='hpf-dropdown-scatter-facet', component_property= 'value'),
    Input(component_id='type-dropdown-scatter-facet', component_property= 'value')
)
def update_scatter_facet_hpf_type_selected(hpf, type):
    """Redraw the faceted gene0/gene1 scatter for the selected hpf values and Types.

    Parameters
    ----------
    hpf : list of values selected in the 'hpf-dropdown-scatter-facet'
        dropdown; None or an empty list means "no selection".
    type : list of values selected in the 'type-dropdown-scatter-facet'
        dropdown; None or an empty list means "no selection"
        (the name shadows the builtin inside this function only).

    Returns
    -------
    A plotly figure with one facet column per hpf value, built from the rows
    of the notebook-level `df` that match the selections. With no selection
    for a dropdown, all of its values are used.
    """
    # A cleared multi-select dropdown can report an empty list rather than
    # None, so test for "nothing selected" by truthiness. An `is None` check
    # lets [] through and filters the frame down to zero rows.
    if not hpf:
        hpf = np.sort(df['hpf'].unique()).tolist()
    if not type:
        type = np.sort(df["Type"].unique()).tolist()
    df_filtered = df[(df['hpf'].isin(hpf)) & (df['Type'].isin(type))]
    fig = px.scatter(df_filtered, x="gene0", y="gene1", color="Type", facet_col="hpf", custom_data=[df_filtered.index])
    fig.update_traces(hovertemplate="index= %{customdata}")
    return fig






# Start the Dash app in debug mode when this code runs as the main module.
if __name__ == '__main__':
    app.run(debug=True)

# Exploratory Data Analysis

In [27]:
# Number of cells for each Type label.
# (The PCA import that used to sit in this cell lives with the other imports
# at the top of the notebook.)
fig_type = px.histogram(
    df,
    x="Type",
    title="Number of cells per Type"
)
fig_type.show()


In [28]:
# One bin per distinct developmental stage.
n_hpf_values = len(df["hpf"].unique())
fig_hpf = px.histogram(
    df,
    x="hpf",
    nbins=n_hpf_values,
    title="Number of cells per developmental stage (hpf)",
)
fig_hpf.show()


In [37]:
# All columns whose name starts with "gene".
gene_cols = [c for c in df.columns if c.startswith("gene")]
# NOTE(review): cells_sample is not used by any cell visible here; the heatmap
# below averages over every row of df, not over these 50 sampled cells.
cells_sample = df.sample(50, random_state=0)
# First 50 gene columns (a slice, not a random sample).
genes_sample = gene_cols[:50]

# Restrict the heatmap to 50 genes for readability
# Aggregate mean expression by cell type
mean_by_type = df.groupby("Type")[genes_sample].mean()

fig_heat = px.imshow(
    mean_by_type,
    labels=dict(x="Genes", y="Cell Type", color="Expression"),
    x=genes_sample,
    y=mean_by_type.index.astype(str),
    title="Mean gene expression per cell type"
)
fig_heat.show()



In [31]:
# PCA on gene expression matrix
# Project every cell onto the first two principal components of the gene
# columns. random_state is fixed so the projection is the same on every run,
# even if scikit-learn selects a randomized SVD solver for a matrix this size.
X = df[gene_cols].values
pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(X)

# Keep the two label columns next to the coordinates so either can colour the plot.
pca_df = pd.DataFrame({
    "PC1": X_pca[:, 0],
    "PC2": X_pca[:, 1],
    "Type": df["Type"],
    "hpf": df["hpf"]
})

fig_pca_type = px.scatter(
    pca_df,
    x="PC1",
    y="PC2",
    color="Type",
    title="PCA of cells colored by Type"
)
fig_pca_type.show()

fig_pca_hpf = px.scatter(
    pca_df,
    x="PC1",
    y="PC2",
    color="hpf",
    title="PCA of cells colored by hpf"
)
fig_pca_hpf.show()
