# Viewing Gene Expression Distributions

A great deal of quality-control information comes from the alignment step itself.
Here we are concerned with how the data looks as an ensemble.
Many methods are particularly concerned with gene-wise expression variance.

In this notebook we demonstrate the plotting utilities provided by `gsforge` to examine such distributions.

***Set up the notebook***

In [None]:
import os
import GSForge as gsf
from pathlib import Path
import scipy
import numpy as np
import holoviews as hv

hv.extension("bokeh")

***Declare used paths***

In [None]:
# OS-independent path management.
from os import fspath, environ
from pathlib import Path

In [None]:
# Root of the demo data: taken from $GSFORGE_DEMO_DATA when set, otherwise a
# folder under the user's home directory ("~" is expanded).
OSF_PATH = Path(environ.get("GSFORGE_DEMO_DATA", "~/GSForge_demo_data/osfstorage")).expanduser()
HYDRO_NORMED_GEM_PATH = OSF_PATH / "AnnotatedGEMs" / "oryza_sativa_hydro_normed.nc"
# Stop here if the input file is missing, before any later cell tries to load it.
assert HYDRO_NORMED_GEM_PATH.exists()

Declare a path to which the created `.nc` file will be saved.

***Load an AnnotatedGEM***

In [None]:
# Build the AnnotatedGEM from the file at HYDRO_NORMED_GEM_PATH; the bare name
# on the last line makes the notebook display the object.
agem = gsf.AnnotatedGEM(HYDRO_NORMED_GEM_PATH)
agem

In [None]:
# vals = list(agem.data.Sample.isel(Sample=[1,2, 25,26, 50, 200, 220, 249]).copy(deep=True).values)
# gsf.plots.gem.SamplewiseDistributions(agem, sample_subset=vals, datashade=True, hue_key='treatment')

In [None]:
# Empirical cumulative distribution of the counts, datashaded and coloured by
# the 'treatment' annotation. Counts are passed through np.log2 first.
# NOTE(review): np.log2 yields -inf for zero counts; the later cells use
# np.log2(c + 0.25) instead -- confirm which transform is intended here.
gsf.plots.gem.EmpiricalCumulativeDistribution(agem,
#                                               sample_subset=vals,
                                              datashade=True, 
                                              hue_key='treatment',
                                              count_transform=np.log2
                                             )

In [None]:
# hv.plotting.util.process_cmap(cmap='glasbey', ncolors=475)

In [None]:
import xarray as xr

In [None]:
from holoviews.operation.stats import univariate_kde

In [None]:
from holoviews.operation.datashader import datashade, shade, dynspread, rasterize

In [None]:
import datashader as ds

In [None]:
import colorcet as cc

In [None]:
import pandas as pd

In [None]:
# `vals` was only defined in a commented-out cell, so this cell raised a
# NameError on a fresh kernel. Define the sample subset here so the cell is
# self-contained: a handful of samples picked by position along `Sample`.
vals = list(agem.data.Sample.isel(Sample=[1, 2, 25, 26, 50, 200, 220, 249]).values)

# Pull the counts for those samples together with their 'treatment' labels.
# The 0.25 pseudo-count keeps log2 finite for zero counts.
counts, labels = gsf.get_gem_data(agem, annotation_variables=['treatment'],
                                  sample_subset=vals,
                                  count_transform=lambda c: np.log2(c + 0.25))

In [None]:
# Peek at the first entry along the leading dimension of `counts` only.
for first_entry in counts:
    print(first_entry)
    break

In [None]:
# Shape of the raw count matrix (presumably samples x genes -- the rows are
# later indexed by `counts.Sample`).
counts.values.shape

The average squared deviation is normally calculated as x.sum() / N, where x = abs(a - a.mean())**2 and N = len(x). If, however, ddof is specified, the divisor N - ddof is used instead. In standard statistical practice, ddof=1 provides an unbiased estimator of the variance of the infinite population. ddof=0 provides a maximum likelihood estimate of the variance for normally distributed variables. The standard deviation computed in this function is the square root of the estimated variance, so even with ddof=1, it will not be an unbiased estimate of the standard deviation per se.

In [None]:
# Assign one 'glasbey' colour to each distinct label, then expand that mapping
# into a per-sample series of colours (displayed as the cell output).
label_series = labels.to_series()
unique_keys = label_series.unique()
colors = hv.plotting.util.process_cmap(cmap='glasbey', ncolors=len(unique_keys))

hue_colors = dict(zip(unique_keys, colors))
color_key = label_series.map(hue_colors)
color_key

In [None]:
def kde_linespace(sample_counts, bin_range, sample_size, cut=3):
    """
    Build the evenly spaced x-grid on which a kernel density estimate is sampled.

    Applies the math behind the univariate_kde as implemented by Holoviews so
    that it can be applied with `numpy.apply_along_axis`: the requested range
    is widened on both sides by ``cut`` bandwidths, where the bandwidth is
    Scott's factor times the sample standard deviation (``ddof=1``).

    Parameters
    ----------
    sample_counts : array-like
        Values from which the bandwidth is estimated.
    bin_range : tuple
        ``(lower, upper)`` bounds of the data before padding.
    sample_size : int
        Number of grid points to return.
    cut : float, optional
        How many bandwidths to extend past each end of ``bin_range``.

    Returns
    -------
    numpy.ndarray
        ``sample_size`` evenly spaced points covering the padded range.
    """
    sample_counts = np.asarray(sample_counts)
    # Fit on the values passed in. The previous version fitted on the global
    # `counts`, so the argument was ignored when estimating Scott's factor.
    kde = scipy.stats.gaussian_kde(dataset=sample_counts)
    bw = kde.scotts_factor() * sample_counts.std(ddof=1)

    kmin, kmax = bin_range[0] - bw * cut, bin_range[1] + bw * cut
    return np.linspace(kmin, kmax, sample_size)

def evaluate_kde(values, x_space):
    """
    Fit a Gaussian KDE (default bandwidth, unweighted) to ``values`` and
    return the estimated density at every point of ``x_space``.
    """
    return scipy.stats.gaussian_kde(values).evaluate(x_space)


# Overall minimum and maximum of the transformed counts.
bin_range = (counts.min().values, counts.max().values)

# KDE over the whole count matrix; not consumed by the two statements below.
kde = scipy.stats.gaussian_kde(counts)

# One shared x-grid, then one density curve per row of the count matrix.
x_space = kde_linespace(counts.values, bin_range, sample_size=50)
kde_dist_y = np.apply_along_axis(evaluate_kde, 1, counts.values, x_space=x_space)

# np.apply_along_axis(func1d=sample_kde, axis=1, arr=counts.values, bin_range=bin_range, sample_size=100)
# np.apply_along_axis(func1d=sample_kde, axis=1, arr=counts.values, bin_range=bin_range, sample_size=100)

In [None]:
# x_space = kde_linespace(counts.values, bin_range=bin_range, sample_size=50)
# np.apply_along_axis(func1d=kde_linespace, axis=1, arr=counts.values, bin_range=bin_range, sample_size=100)

In [None]:
# Tabulate the densities: one row per sample (indexed by sample name) and one
# column per grid point of `x_space`.
df = pd.DataFrame(kde_dist_y, columns=x_space, index=counts.Sample.values)
# df

In [None]:
df.shape

In [None]:
colors = hv.plotting.util.process_cmap(cmap='glasbey', ncolors=df.shape[0])
# colors

In [None]:
color_key 

In [None]:
# One density curve per sample, keyed by sample name.
lines = {
    sample_name: hv.Curve((df.columns.values, row_series.values))
    for sample_name, row_series in df.iterrows()
}

# Overlay all curves on the 'k' dimension, rasterise them with a categorical
# count over 'k' using the per-sample colours, then spread the pixels.
overlay = hv.NdOverlay(lines, kdims='k')
shaded = datashade(overlay, aggregator=ds.count_cat('k'), color_key=list(color_key.values))
linespread = dynspread(shaded)

rgb_options = hv.opts.RGB(width=600, height=600, show_grid=True, bgcolor="lightgrey",
                          xlabel='Expression', ylabel='Distribution')
linespread.opts(rgb_options)

In [None]:


def kde_support(bin_range, bw, gridsize, cut, clip):
    """
    Return the evenly spaced grid on which a kernel density estimate is evaluated.

    The range is widened by ``cut`` bandwidths on each side and then limited
    to ``clip`` wherever a clip bound is finite.

    Parameters
    ----------
    bin_range : tuple
        ``(lower, upper)`` bounds of the data.
    bw : float
        Kernel bandwidth.
    gridsize : int
        Number of grid points to return.
    cut : float
        Number of bandwidths to extend past each end of ``bin_range``.
    clip : tuple
        ``(lower, upper)`` hard limits; an infinite entry disables that limit.

    Returns
    -------
    numpy.ndarray
        ``gridsize`` evenly spaced points.
    """
    kmin, kmax = bin_range[0] - bw * cut, bin_range[1] + bw * cut
    if np.isfinite(clip[0]):
        kmin = max(kmin, clip[0])
    if np.isfinite(clip[1]):
        kmax = min(kmax, clip[1])
    return np.linspace(kmin, kmax, gridsize)


# Reference notes, kept as comments because they cannot run in this notebook:
# `self`, `data` and `selected_dim` belong to the surrounding operation this
# was excerpted from (presumably HoloViews' `univariate_kde`, which the
# `kde_linespace` docstring cites -- confirm against the HoloViews source).
# There the helper above is called `_kde_support`.
#
#     if self.p.bandwidth:
#         kde.set_bandwidth(self.p.bandwidth)
#
#     bw = kde.scotts_factor() * data.std(ddof=1)
#
#     if self.p.bin_range:
#         xs = np.linspace(bin_range[0], bin_range[1], self.p.n_samples)
#     else:
#         xs = _kde_support(bin_range, bw, self.p.n_samples, self.p.cut, selected_dim.range)
#
#     ys = kde.evaluate(xs)

***View available count arrays***

In [None]:
agem.count_array_names

Recall that all `gsforge` plotting operations allow use of the `Interface` data selection pipeline.

In this case we can select another count array and view the normalized distributions.

In [None]:
# Grouped gene covariance between the CONTROL and HEAT groups of the
# 'treatment' annotation, computed on natural-log counts with a 0.25 pseudo-count.
covariance_control_vs_heat = gsf.plots.gem.GroupedGeneCovariance(
    agem,
    group_variable='treatment',
    x_group_label='CONTROL',
    y_group_label='HEAT',
    count_transform=lambda c: np.log(c + 0.25),
).opts(size=0.75, width=300, height=300)
covariance_control_vs_heat
# Write the figure to disk as a PNG.
hv.save(covariance_control_vs_heat, 'figures/covariance_control_vs_heat.png', fmt='png')

In [None]:
# gsf.plots.gem.EmpiricalCumulativeDistribution(
#             agem,
#             hue_key='treatment',
# #             count_variable=count_var,
# #             axis_transform=('log 2', lambda ds: np.log2(ds + 0.25)),
#             datashade=True,
#         )

## Gene-wise Aggregate Distributions

The call below shows the default arguments, with the exception of `datashade=True`.

### Available Aggregates

+ frequency
+ mean
+ variance
+ standard_dev
+ fano
+ mean_rank
+ cv_squared

In [None]:
# Shared by every plot: a labelled log2 transform that masks values <= 0
# (via `.where`) before taking the logarithm.
log2_axis_transform = ('log 2', lambda ds: np.log2(ds.where(ds > 0)))
y_axis_selectors = ('variance', 'fano', 'cv_squared')

# Save one mean-vs-aggregate scatter per count array and per y-axis aggregate.
for count_var in agem.count_array_names:
    for y_axis in y_axis_selectors:
        plot = gsf.plots.gem.GenewiseAggregateScatter(
            agem,
            count_variable=count_var,
            x_axis_selector='mean',
            y_axis_selector=y_axis,
            axis_transform=log2_axis_transform,
            datashade=True,
        )
        output_file = f'figures/{count_var}_gw_agg_log2_mean_vs_log2_{y_axis}.png'
        hv.save(plot, output_file, dpi=300, toolbar=None)

In [None]:
# for count_var in agem.count_array_names:
#     for hue in [None, 'treatment', 'genotype']:

#         plot = gsf.plots.gem.EmpiricalCumulativeDistribution(
#             agem,
#             hue_key=hue,
#             count_variable=count_var,
#             axis_transform=('log 2', lambda ds: np.log2(ds.where(ds > 0))),
#             datashade=True,
#         )

#         hv.save(plot, f'figures/ECDF_{count_var}_{hue}.png', dpi=300, toolbar=None)

For some reason the adjoint png files produced have extra white space.
We can remove that.

In [None]:
from PIL import Image
from PIL import ImageOps
import numpy as np

In [None]:
# Margin, in pixels, to keep around the detected content when cropping.
padding = 5


def padded_bbox(bbox, size, pad):
    """
    Grow a bounding box by ``pad`` pixels on every side, clamped to the image.

    Parameters
    ----------
    bbox : tuple
        ``(left, upper, right, lower)`` box of the content.
    size : tuple
        ``(width, height)`` of the image the box belongs to.
    pad : int
        Number of pixels to add on each side.

    Returns
    -------
    tuple
        The padded ``(left, upper, right, lower)`` box. It never extends past
        the image, so cropping with it cannot add a border of empty pixels.
    """
    left, upper, right, lower = bbox
    width, height = size
    return (
        max(left - pad, 0),
        max(upper - pad, 0),
        min(right + pad, width),
        min(lower + pad, height),
    )


for figure in Path('figures').glob('*_gw_*.png'):
    print(figure)

    # The context manager releases the file handle once the crop is written.
    with Image.open(figure) as image:
        image.load()

        # Drop the alpha channel and invert so that white becomes 0;
        # getbbox() then returns the box around the non-white content.
        content_box = ImageOps.invert(image.convert("RGB")).getbbox()
        if content_box is None:
            # Entirely white image: nothing to crop, leave the file as it is.
            continue

        cropped = image.crop(padded_bbox(content_box, image.size, padding))
        cropped.save(figure)

## Sample-wise Distributions

These plotting functions can take a few minutes to complete.

In [None]:
%%time
# Save one sample-wise distribution plot for every count array, once without
# a hue and once per annotation used as hue.
hue_keys = (None, 'treatment', 'genotype')
for count_var in agem.count_array_names:
    for hue in hue_keys:
        plot = gsf.plots.gem.SamplewiseDistributions(
            agem,
            count_variable=count_var,
            hue_key=hue,
        )
        output_file = f'figures/{count_var}_samplewise_kde_hue_{hue}.png'
        hv.save(plot, output_file, dpi=300, toolbar=None)