## siRNA knockdown fixed ##
This notebook fits traces with fixed mRNA expression.

An amount of mRNA is added to the cells.
At a certain time $t=0$, a fixing agent (CHX) is added, and mRNA expression is stopped.
Now, an initial amount $G_{u0}$ of pre-mature protein and an initial amount $G_0$ of mature protein is in the cell.

After adding the fixing agent, the pre-mature protein can mature with maturation rate $k_m$, and both pre-mature and mature protein can degrade with degradation rate $\beta$.

This system can be described by the following differential equations:
$$\begin{align*}
\frac{\mathrm{d}G_u}{\mathrm{d}t} &= -\beta G_u - k_m G_u \\
\frac{\mathrm{d}G}{\mathrm{d}t} &= -\beta G + k_m G_u
\end{align*}$$

The solution for the amount of mature protein $G(t)$ is:
$$
G(t) = G_0 \mathrm{e}^{-\beta t} + G_{u0}\left(\mathrm{e}^{-\beta t} - \mathrm{e}^{-(\beta+k_m)t}\right)
$$
The parameters to fit are $\beta$, $k_m$ and $G_{u0}$.

## Notebook structure
The notebook has the following structure:

At first, the model functions are defined and the data is loaded. The next section contains code for fitting the two models separately. The next section contains code for fitting the two traces in one run with parameters shared among the models.

Fitting requires that the result list `R` is defined, which can be done by running the corresponding cell. When `R` has been populated by fitting, the results can be plotted. There are cells for plotting the results of the separate fit, the results of the combined fit, and the pure parameter distributions of all fits.

Additionally, there are cells for saving and loading parameters with Python’s `pickle` module.

In [None]:
# Import modules needed

# Standard library
from collections import OrderedDict
from copy import deepcopy
import inspect
import os
import pickle
import sys

# Scientific stack
import numpy as np
np.seterr(divide='print')
import pandas as pd
pd.set_option('display.max_rows', None)
import scipy as sc
#import scipy.optimize as so
#import scipy.stats as ss
import sklearn.cluster as skc
from sklearn.neighbors import KernelDensity

# Matplotlib
%matplotlib inline
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.gridspec import GridSpec
#import matplotlib.lines as mlin
#import matplotlib.patches as mptch
import matplotlib.pyplot as plt

# Notebook utilities
import IPython
import ipywidgets as wdg

In [None]:
# Define utility functions
def getTimeStamp():
    """Return the current local time as a string suitable for file names.

    The format is ``YYYY-MM-DD–HHMMSS``. Note that date and time are
    separated by an en dash (U+2013), not by an ASCII hyphen.
    """
    from datetime import datetime
    return datetime.now().strftime("%Y-%m-%d–%H%M%S")


def getOutpath(filename='', timestamp=None):
    """Return a path inside the directory “out” of the current directory.

    The directory is created if it does not exist yet.

    Arguments:
    filename -- name of a file inside the output directory; if empty
                (or `None`), the path of the directory itself is returned
    timestamp -- prefix for `filename`, joined with an underscore.
                 `None` uses the current timestamp (see `getTimeStamp`),
                 the empty string suppresses the prefix.
    """
    # Create output directory
    outpath = os.path.join(os.getcwd(), 'out')
    # `lexists` is also true for a dangling symlink called “out”;
    # in that case no directory is created.
    if not os.path.isdir(outpath) and not os.path.lexists(outpath):
        os.mkdir(outpath)

    # Without a file name, the directory itself is requested
    if not filename:
        return outpath

    # Build the (optionally timestamped) file name
    if timestamp is None:
        timestamp = getTimeStamp()
    prefix = (timestamp + '_') if timestamp else ''
    return os.path.join(outpath, prefix + filename)

In [None]:
def model(t, G0, Gu0, beta, km):
    """General fixed expression model function.

    Evaluates G(t) = G0 * exp(-beta*t) + Gu0 * (exp(-beta*t) - exp(-(beta+km)*t)).
    """
    # Pure degradation term, shared by both summands
    decay = np.exp(-beta * t)
    # Fraction of the initially pre-mature protein that is mature at time t
    matured = decay - np.exp(-(beta+km) * t)
    return G0 * decay + Gu0 * matured

def red(t, G0r, Gu0r, betr, kmr):
    """Fixed expression model for RFP data (forwards to `model`)."""
    return model(t, G0r, Gu0r, betr, kmr)

def green(t, G0g, Gu0g, betg, kmg):
    """Fixed expression model for GFP data (forwards to `model`)."""
    return model(t, G0g, Gu0g, betg, kmg)

In [None]:
# Set default parameter values
# These are the start values handed to the fit parameter sets below.
# Suffix "r" belongs to the red (RFP) model, suffix "g" to the green (GFP)
# model; the trailing "_0" marks a start value.
G0r_0 = 100     # initial amount of mature protein
Gu0r_0 = 1000   # initial amount of pre-mature protein
betr_0 = 0.04   # degradation rate
kmr_0 = 0.3     # maturation rate

G0g_0 = 2000    # initial amount of mature protein
Gu0g_0 = 2000   # initial amount of pre-mature protein
betg_0 = 0.04   # degradation rate
kmg_0 = 0.1     # maturation rate

In [None]:
class FitParameters:
    """FitParameters facilitates managing values and bounds of fit parameters.

    The parameters are taken from the signature of a model function and are
    kept in the data frame `self.df` (one row per parameter) with the columns
    'value', 'min', 'max' (floats, NaN meaning “not set”) and the boolean
    flags 'independent' and 'fixed'.
    A parameter is *free* if it is neither independent nor fixed; only free
    parameters are exposed to the optimizer.
    """
    # Column names that may be changed via `set`
    _PROPERTIES = ('value', 'min', 'max', 'independent', 'fixed')

    def __init__(self, fun, independent=(), fixed=()):
        """Set up the parameter table for the function `fun`.

        Arguments:
        fun: the model function
        independent: name or iterable of names of independent variables
        fixed: name or iterable of names of parameters excluded from fitting
        """
        # Store function
        self.fun = fun

        # Get parameters of fun
        params = inspect.signature(self.fun).parameters

        # Normalize name collections (a bare string is a single name)
        independent = self._as_names(independent)
        fixed = self._as_names(fixed)

        # Build data frame of parameters
        self.df = pd.DataFrame(columns=['value', 'min', 'max'],
                               index=list(params.keys()),
                               dtype=np.float64)

        # Set “independent” and “fixed” flag
        self.df['independent'] = [p in independent for p in self.df.index]
        self.df['fixed'] = [p in fixed for p in self.df.index]

        # Set default parameters:
        # the default from the signature if there is one, else NaN for
        # independent variables and 0 for everything else
        for p in self.df.index.values:
            default = params[p].default
            if default is not inspect.Parameter.empty:
                self.df.loc[p, 'value'] = default
            elif self.df.loc[p, 'independent']:
                self.df.loc[p, 'value'] = np.nan
            else:
                self.df.loc[p, 'value'] = 0

    @staticmethod
    def _as_names(names):
        """Returns `names` as a frozenset; a single string counts as one name"""
        if names is None:
            return frozenset()
        if isinstance(names, str):
            return frozenset((names,))
        return frozenset(names)

    def set(self, p, **props):
        """Allows user to change parameter properties.

        `p` is the parameter name; the keyword arguments may be any of
        'value', 'min', 'max', 'independent' and 'fixed'.
        Raises KeyError for an unknown parameter or property."""
        if p not in self.df.index.values:
            raise KeyError("Unknown parameter name: {}".format(p))

        for prop, val in props.items():
            if prop not in self._PROPERTIES:
                raise KeyError("Illegal parameter property: {}".format(prop))
            self.df.loc[p, prop] = val

    def eval_params(self, params=(), independent=True, **vals):
        """Returns parameters for evaluating the function.

        Arguments:
        params: optional list of values of free parameters
        independent: optional switch whether independent variables are requested
        vals: dictionary of parameter values

        If a value for a parameter is specified in both `params` and `vals`,
        the value from `vals` is used.
        Values for independent parameters must be specified in `vals`.
        If `independent == False`, the independent variable needn’t be specified
        and will not be returned.

        Raises ValueError if the length of `params` does not match the number
        of free parameters or if a requested independent variable is missing."""
        # Add additional values from `params` to vals
        if len(params) != 0:
            par_names = self.names()
            if np.size(par_names) != len(params):
                raise ValueError("Wrong number of parameters given ({})".format(len(params)))
            for pn, pv in zip(par_names, params):
                vals.setdefault(pn, pv)

        # Fill values unspecified so far from `self.df`
        for p in self.df.index.values:
            is_independent = self.df.loc[p, 'independent']
            if is_independent and not independent:
                # Independent variables are not requested: drop them
                # regardless of whether the caller passed a value
                vals.pop(p, None)
            elif p not in vals:
                if is_independent:
                    raise ValueError("Independent parameter `{}` not specified".format(p))
                vals[p] = self.df.loc[p, 'value']
        return vals

    def eval(self, params=(), **vals):
        """Evaluates the function.

        Arguments:
        params: optional list of values of free parameters
        vals: dictionary of parameter values

        If a value for a parameter is specified in both `params` and `vals`,
        the value from `vals` is used.
        Values for independent parameters must be specified in `vals`."""
        return self.fun(**self.eval_params(params, **vals))

    def freeIdx(self):
        """Returns a list of names of free parameters"""
        return [p for p in self.df.index.values
                if not (self.df.loc[p, 'independent'] or self.df.loc[p, 'fixed'])]

    def bounds(self):
        """Returns a list of bound tuples of free parameters
        for use in scipy.optimize.minimize"""
        bnds = []
        for p in self.freeIdx():
            # Get parameter bounds
            min_val = self.df.loc[p, 'min']
            max_val = self.df.loc[p, 'max']

            # A missing bound (NaN) is passed on as `None`
            if np.isnan(min_val):
                min_val = None
            if np.isnan(max_val):
                max_val = None

            # Append to bounds list
            bnds.append((min_val, max_val))
        return bnds

    def initial(self):
        """Returns a numpy.ndarray of initial values for use in scipy.optimize.minimize"""
        return self.df.loc[self.freeIdx(), 'value'].values.copy()

    def index(self, p):
        """Returns the position of a given parameter among all parameters
        (i.e. among the rows of `self.df`).

        Raises KeyError if the parameter is unknown."""
        idx = np.flatnonzero(self.df.index.values == p)
        if len(idx) == 0:
            raise KeyError("Unknown parameter name: {}".format(p))
        return idx[0]

    def names(self, onlyFree=True):
        """Returns an array of the parameter names.

        If `onlyFree == True`, only free parameters are returned.
        Else, all parameters (including independent and fixed parameters) are returned."""
        if onlyFree:
            return np.array(self.freeIdx(), dtype=np.object_)
        else:
            return self.df.index.values.copy()

    def copy(self):
        """Returns a deep copy of this instance"""
        return deepcopy(self)

In [None]:
# Separate models
# One parameter set per model function; time `t` is the independent variable.
# Every parameter is bounded below by 0 and starts at its default value.
red_p = FitParameters(red, independent='t')
for _par, _start in (('G0r', G0r_0), ('Gu0r', Gu0r_0),
                     ('betr', betr_0), ('kmr', kmr_0)):
    red_p.set(_par, min=0, value=_start)

green_p = FitParameters(green, independent='t')
for _par, _start in (('G0g', G0g_0), ('Gu0g', Gu0g_0),
                     ('betg', betg_0), ('kmg', kmg_0)):
    green_p.set(_par, min=0, value=_start)

## Jacobian
To increase the efficiency of fitting, the Jacobian matrix of the objective function is provided to the optimization routine.
If the objective function is a typical negative log-likelihood function with normal distribution of residuals
$$
 L(\theta) = \sum_{t\in T} \frac{1}{2\sigma_t^2} \big(D_t - f(t\mid\theta)\big)^2 \text{,}
$$
where $D_t$ is the measured data at time $t$ and $f(t\mid\theta)$ is the value of the model function at time $t$ with parameters $\theta$, the Jacobian is:
$$\begin{align}
\nabla L(\theta) &= \nabla \sum_{t\in T} \frac{1}{2\sigma_t^2} \big(D_t - f\left(t\,\middle|\,\theta\right)\big)^2 \\
&= \sum_{t\in T} \nabla \frac{1}{2\sigma_t^2} \big( D_t - f\left(t\,\middle|\,\theta\right) \big)^2 \\
&= \sum_{t\in T} \frac{2}{2\sigma_t^2} \big( D_t - f\left(t\,\middle|\,\theta\right) \big) \nabla\big( D_t - f\left(t\,\middle|\,\theta\right) \big) \\
&= \sum_{t\in T} \frac{1}{\sigma_t^2} \big( D_t - f\left(t\,\middle|\,\theta\right) \big)\big(\nabla D_t - \nabla  f\left(t\,\middle|\,\theta\right)\big) \\
&= -\sum_{t\in T} \frac{1}{\sigma_t^2} \big(D_t - f\left(t\,\middle|\,\theta\right)\big) \nabla f\left(t\,\middle|\,\theta\right) \\
\end{align}$$
We see that for calculating the Jacobian of the objective function we need the Jacobian of the model function.

We use the general fixed expression model $G(t)$ function from above.

The Jacobian $\nabla G\left(t \,\middle|\, G_0, G_{u0}, \beta, k_m\right)$ of the general fixed expression model function is the vector of the derivatives with respect to the various parameters:
$$\begin{align*}
\frac{\partial G}{\partial G_0} &= \mathrm{e}^{-\beta t} \\
\frac{\partial G}{\partial G_{u0}} &= \mathrm{e}^{-\beta t} - \mathrm{e}^{-(\beta+k_m)t}\\
\frac{\partial G}{\partial \beta} &= -G_0 t \mathrm{e}^{-\beta t} - G_{u0} t \left( \mathrm{e}^{-\beta t} - \mathrm{e}^{-(\beta+k_m)t} \right) = -t G(t)\\
\frac{\partial G}{\partial k_m} &= G_{u0} t \mathrm{e}^{-(\beta+k_m)t}\\
\end{align*}$$

In [None]:
def general_jacobian(t, G0, Gu0, beta, km):
    """Returns the Jacobi matrix of the general fixed expression model function
    with time along axis=0 and parameters along axis=1.

    The three columns hold the derivatives w.r.t. Gu0, beta and km (in this
    order). The derivative w.r.t. G0 is not part of the matrix."""
    # Exponential terms shared by the derivatives
    decay = np.exp(-beta * t)
    decay_mat = np.exp(-(beta + km) * t)

    columns = (
        decay - decay_mat,                               # d/dGu0
        -t * (G0 * decay + Gu0 * (decay - decay_mat)),   # d/dbeta
        Gu0 * t * decay_mat,                             # d/dkm
    )

    # Assemble the matrix column by column
    jac = np.zeros((np.size(t), 3))
    for col, deriv in enumerate(columns):
        jac[:, col] = deriv
    return jac

def red_jacobian(t, G0r, Gu0r, betr, kmr):
    """Jacobian of the red model function (forwards to `general_jacobian`)"""
    return general_jacobian(t, G0r, Gu0r, betr, kmr)

def green_jacobian(t, G0g, Gu0g, betg, kmg):
    """Jacobian of the green model function (forwards to `general_jacobian`)"""
    return general_jacobian(t, G0g, Gu0g, betg, kmg)

## Hessian
Analogously, the Hessian matrix is defined for better fit results:
$$\begin{align}
\frac{\partial^2 L}{\partial\theta_2\partial\theta_1} &= \frac{\partial^2}{\partial\theta_2\partial\theta_1} \sum_{t\in T} \frac{1}{2\sigma_t^2} \big(D_t - f\left(t\,\middle|\,\theta\right)\big)^2 \\
&= \sum_{t\in T} \frac{\partial^2}{\partial\theta_2\partial\theta_1} \frac{1}{2\sigma_t^2} \big( D_t - f\left(t\,\middle|\,\theta\right) \big)^2 \\
&= \sum_{t\in T} \frac{2}{2\sigma_t^2} \frac{\partial}{\partial\theta_2} \left[ \big( D_t - f\left(t\,\middle|\,\theta\right) \big) \frac{\partial}{\partial\theta_1} \big( D_t - f\left(t\,\middle|\,\theta\right) \big) \right] \\
&= \sum_{t\in T} \frac{1}{\sigma_t^2} \frac{\partial}{\partial\theta_2} \left[ \big( D_t - f\left(t\,\middle|\,\theta\right) \big) \left(-\frac{\partial f(t)}{\partial\theta_1}\right) \right] \\
&= \sum_{t\in T} \frac{1}{\sigma_t^2}
\left( \frac{\partial f(t)}{\partial\theta_2} \frac{\partial f(t)}{\partial\theta_1} - \big( D_t - f\left(t\,\middle|\,\theta\right) \big) \frac{\partial^2 f(t)}{\partial\theta_2\partial\theta_1} \right) \\
\end{align}$$

The second order derivatives of the general fixed expression model function are:
$$\begin{align*}
\frac{\partial^2 G}{\partial G_{u0}^2} &= 0 \\
\frac{\partial^2 G}{\partial \beta^2} &= G_0 t^2 \mathrm{e}^{-\beta t} + G_{u0} t^2 \left( \mathrm{e}^{-\beta t} - \mathrm{e}^{-(\beta+k_m)t} \right) \\
\frac{\partial^2 G}{\partial k_m^2} &= -G_{u0} t^2 \mathrm{e}^{-(\beta+k_m)t}\\
\frac{\partial^2 G}{\partial G_{u0} \partial \beta} &= t \left( \mathrm{e}^{-(\beta+k_m)t} - \mathrm{e}^{-\beta t} \right) \\
\frac{\partial^2 G}{\partial G_{u0} \partial k_m} &= t \mathrm{e}^{-(\beta+k_m)t} \\
\frac{\partial^2 G}{\partial \beta \partial k_m} &= -G_{u0} t^2 \mathrm{e}^{-(\beta+k_m)t}
\end{align*}$$

In [None]:
def general_hessian(t, G0, Gu0, beta, km):
    """Returns the second derivatives of the general fixed expression model
    function with time along axis=0 and derivatives along axis=1.

    Column order: Gu0², beta², km², (Gu0, beta), (Gu0, km), (beta, km).
    Derivatives involving G0 are not part of the result."""
    # Exponential terms and squared time shared by the derivatives
    decay = np.exp(-beta * t)
    decay_mat = np.exp(-(beta + km) * t)
    t_sq = t**2

    # Column 0 (second derivative w.r.t. Gu0) is identically 0
    # and therefore keeps its initial value
    filled = {
        1: G0 * t_sq * decay + Gu0 * t_sq * (decay - decay_mat),  # beta, beta
        2: -Gu0 * t_sq * decay_mat,                               # km, km
        3: t * (decay_mat - decay),                               # Gu0, beta
        4: t * decay_mat,                                         # Gu0, km
        5: -Gu0 * t_sq * decay_mat,                               # beta, km
    }

    hes = np.zeros((np.size(t), 6))
    for col, deriv in filled.items():
        hes[:, col] = deriv
    return hes

def red_hessian(t, G0r, Gu0r, betr, kmr):
    """Hessian of the red model function (forwards to `general_hessian`)"""
    return general_hessian(t, G0r, Gu0r, betr, kmr)

def green_hessian(t, G0g, Gu0g, betg, kmg):
    """Hessian of the green model function (forwards to `general_hessian`)"""
    return general_hessian(t, G0g, Gu0g, betg, kmg)

## Read in data and prepare result list

In [None]:
# Calculate kernel density estimation of parameter distributions
def parameter_KDE(par_tab, bw_div=15, dens_res=200, nice_ends=True):
    """
    Returns a kernel density estimation of the parameter values for plotting.

    Input parameters:
        par_tab: pandas.DataFrame with parameters as columns and realizations as rows
        bw_div: ratio data range – bandwidth; scalar or list with one entry per column
        dens_res: sample number returned; scalar or list with one entry per column
        nice_ends: (optional) if `True`, a point with density 0 is added at
            each end of the distribution unless the density is 0 there anyway

    Returns:
        dict with parameter names as keys. The values are dicts with keys "val" and "prob".
        "val" contains an array of parameter values.
        "prob" contains an array of the same length as for "val", containing the
        estimated density at these parameter values.
        For a column without any finite value, both entries are empty lists.

    Raises:
        ValueError if `bw_div` or `dens_res` is a sequence whose length
        differs from the number of columns.
    """
    n_cols = len(par_tab.columns)

    def per_column(setting, label):
        """Expands a scalar setting to one entry per column and checks the length otherwise"""
        if not hasattr(setting, '__len__'):
            return [setting] * n_cols
        if len(setting) != n_cols:
            raise ValueError("{} has length {:d}, but it must be scalar "
                             "or a list with {:d} entries (one per column).".format(
                             label, len(setting), n_cols))
        return setting

    # Test length of `bw_div` and `dens_res`
    bw_divs = per_column(bw_div, "bw_div")
    dens_ress = per_column(dens_res, "dens_res")

    # Initialize return dictionary
    par_dist = {}

    for par_name, this_bw_div, this_dens_res in zip(par_tab.columns, bw_divs, dens_ress):
        # Get parameter values
        par_vals = par_tab.loc[:, par_name].values

        # Test parameter values for validity
        finite = np.isfinite(par_vals)
        if not np.all(finite):
            print("Warning: invalid values encountered for “{}”".format(par_name))
            par_vals = par_vals[finite]
            if par_vals.size == 0:
                # No valid entries found; cancel distribution calculation
                par_dist[par_name] = {'val': [], 'prob': []}
                continue

        # The KDE expects samples as rows of a 2-dim array
        par_vals = par_vals.reshape((-1, 1))

        # Get parameter extrema and bandwidth
        par_min = np.min(par_vals)
        par_max = np.max(par_vals)
        bw = (par_max - par_min) / this_bw_div

        # Get kernel density estimation of parameter values
        kde = KernelDensity(kernel='epanechnikov', bandwidth=bw).fit(par_vals)
        par_x = np.linspace(par_min, par_max, this_dens_res).reshape((-1, 1))
        par_dens = np.exp(kde.score_samples(par_x))

        # Adjust values for nicer plotting (edges == 0)
        if nice_ends:
            if par_dens[0] != 0:
                par_dens = np.insert(par_dens, 0, 0)
                par_x = np.insert(par_x, 0, par_min)
            if par_dens[-1] != 0:
                par_dens = np.append(par_dens, 0)
                par_x = np.append(par_x, par_max)

        # Insert KDE into dict
        par_dist[par_name] = {'val': par_x.flatten(), 'prob': par_dens.flatten()}
    return par_dist

In [None]:
def plot_kde(ax, dist, label, clr_face='b', clr_edge='k', mark=None):
    """Plots the current parameter value in relation to the distribution
    in the whole dataset.

    Arguments:
    ax -- the axes to draw into
    dist -- dict with the keys "val" (parameter values, drawn along y)
            and "prob" (density, drawn along x)
    label -- title of the axes
    clr_face -- fill color of the distribution
    clr_edge -- color of the marker line
    mark -- parameter value to be marked by a horizontal line (optional)
    """
    ax.fill_betweenx(dist['val'], dist['prob'], color=clr_face)
    if mark is not None:
        ax.axhline(y=mark, color=clr_edge)

    # Only the value axis is informative: hide density ticks and
    # all spines except the left one
    ax.set_xticks([])
    ax.spines['left'].set_position('zero')
    for pos in ('bottom', 'right', 'top'):
        ax.spines[pos].set_visible(False)
    ax.set_title(label)

In [None]:
# Prepare data loading

# Define available files
# Each entry describes one measurement file. There are six sequences
# (seq6 … seq11) per condition; RFP files come first, GFP files second.
datafiles = [
    {
        "sample": "Huh7",
        "condition": condition,
        "measurement": "{}_seq{}".format(date, seq),
        "file": "data/{}_seq{}_Huh7_{}_CHX_#molecules.xlsx".format(date, seq, protein),
    }
    for condition, date, protein in (
        ("rfp", "2017-09-08", "CayRFP"),
        ("gfp", "2017-08-18", "eGFP"),
    )
    for seq in range(6, 12)
]

# By default, mark all files for loading
load_idcs = range(len(datafiles))

# Define function for loading data
def load_data_from_files():
    """Loads data from specified files into `D`.
    Requires `load_idcs` to hold a list of indices to `datafiles`.

    `D` is replaced by a new list with one dict per loaded file. Each dict
    holds the file description, the time vector 't' (first column of the
    sheet of the file's own condition) and the remaining columns of the
    four sheets as 'rfp', 'gfp', 'rfp_error' and 'gfp_error'."""
    global D
    D = []
    sheet_names = ['#RFP', '#GFP', '#RFP_error', '#GFP_error']

    for idx in load_idcs:
        entry = datafiles[idx]

        # Show message
        print("Loading file: {}".format(entry["file"]))

        # Read sheets from excel file
        X = pd.read_excel(entry['file'], dtype=np.float64, sheet_name=sheet_names)

        # The time vector is taken from the sheet of the file's condition
        time_sheet = '#GFP' if entry['condition'] == 'gfp' else '#RFP'

        # Write data into easy-to-access structure
        D.append({
            'sample': entry['sample'],
            'condition': entry['condition'],
            'measurement': entry['measurement'],
            'file': entry['file'],
            't': X[time_sheet].values[:,0].flatten(),
            'rfp': X['#RFP'].values[:,1:],
            'gfp': X['#GFP'].values[:,1:],
            'rfp_error': X['#RFP_error'].values[:,1:],
            'gfp_error': X['#GFP_error'].values[:,1:],
        })

In [None]:
def getDataLabel(i, filename=False):
    """Returns a nicely formatted name for the `i`-th element of `D`.
    Set `filename=True` for a filename-friendly output."""
    if filename:
        template = "{0[measurement]}_{0[sample]}_{0[condition]}"
    else:
        template = "{0[sample]}: {0[condition]} [{0[measurement]}]"
    return template.format(D[i])

In [None]:
# Read in data from excel sheets
# This cell only builds the selection dialog; the data is read
# when the user clicks the “Load” button.

# Prompt user for files to load
lbl = wdg.Label('Select the files to load:')
lbl.layout.width = 'initial'
# One list entry per element of `datafiles`, in the same order,
# so that the position of an entry is the index into `datafiles`
entries = []
for f in datafiles:
    entries.append("{} {}: {}".format(
        f['sample'], f['condition'], f['file']))
sel_entry = wdg.SelectMultiple(options=entries, rows=len(entries))
sel_entry.layout.width = 'initial'
bload = wdg.Button(description='Load')
bselall = wdg.Button(description='Select all')
bselnone = wdg.Button(description='Select none')

# Define callbacks
def sel_all_files(_):
    """Selects all entries of the file list"""
    sel_entry.value = entries
def sel_no_files(_):
    """Clears the selection of the file list"""
    sel_entry.value = ()
def load_button_clicked(_):
    """Stores the indices of the selected files in `load_idcs`,
    closes the dialog and loads the files"""
    global load_idcs
    load_idcs = [entries.index(r) for r in sel_entry.value]
    # `vb` is defined below; it exists by the time the button can be clicked
    vb.close()
    load_data_from_files()
bselall.on_click(sel_all_files)
bselnone.on_click(sel_no_files)
bload.on_click(load_button_clicked)

# Finally, show the widgets
vb = wdg.VBox((lbl, sel_entry, wdg.HBox((bload,bselall,bselnone))))
IPython.display.display(vb)

In [None]:
# Provide output tables

# Initialize result list (one entry per element of `D`)
R = []

# Iteratively populate the result list
for k in range(len(D)):
    cond = D[k]['condition']
    if cond == 'rfp':
        cols = red_p.names()
    elif cond == 'gfp':
        cols = green_p.names()
    else:
        raise ValueError("Unknown condition: {}".format(cond))

    nTraces = np.shape(D[k][cond])[1]
    nTimes = np.shape(D[k][cond])[0]

    # Template for trace-shaped results; NaN marks “not fitted yet”
    tpl_traces = np.full((nTimes, nTraces), np.nan)

    # The result of a file is stored under the key of its condition:
    # 'params'  – one row of parameter values per trace
    # 'fit'     – fitted curves, same shape as the measured traces
    # 'success' – per trace, whether the optimizer reported success
    R.append({
        cond: {
            'params': pd.DataFrame(index=np.arange(nTraces), columns=cols, dtype='float64'),
            'fit': np.copy(tpl_traces),
            'success': np.zeros(nTraces, dtype=np.bool_),
        }
    })

### Pickle or load fitting results
Pickling is only reasonable if the result list `R` has already been populated by fitting (see below).

In [None]:
# Pickle fit results for future sessions
# The file is written to the “out” directory. As no timestamp is passed,
# `getOutpath` prefixes the file name with the current timestamp.
outfile = getOutpath('fixed_fit_results.pickled')
with open(outfile, 'wb') as f:
    pickle.dump(R, f)

In [None]:
# Load pickled results (requires file suffix “.pickled”)
# The newest file is listed first, because the file names
# start with a sortable timestamp.
pickfiles = [f for f in os.listdir(getOutpath()) if f.lower().endswith('.pickled')]
pickfiles.sort(reverse=True)

lbl = wdg.Label('Select the file to load:')
lbl.layout.width = 'initial'
rad = wdg.RadioButtons(options=pickfiles)
but = wdg.Button(description='Load')
vb = wdg.VBox([lbl, rad, but])
IPython.display.display(vb)

def clicked_on_but(b):
    """Loads the selected pickle file into `R` and closes the dialog"""
    global R

    # Nothing to load if the output directory holds no pickle file
    if rad.value is None:
        print('No file selected')
        return

    # NOTE: unpickling executes code stored in the file;
    # only load files written by this notebook.
    # The empty timestamp prevents `getOutpath` from changing the file name.
    with open(getOutpath(rad.value, ''), 'rb') as f:
        R = pickle.load(f)
    print('Loaded: ' + rad.value)
    vb.close()
but.on_click(clicked_on_but)

In [None]:
# Write results to XLSX
# One workbook per loaded file, holding the time vector and – for the
# condition of the file – raw data, errors, fitted curves and parameters.
if len(R) != len(D):
    raise ValueError("R and D must have the same length!")

# NOTE(review): these sets are not filled in this cell; they are kept
# in case other cells rely on their existence.
samples = set()
conditions = set()

for k in range(len(D)):
    # Collect information about this file
    sample = D[k]['sample']
    condition = D[k]['condition']
    measurement = D[k]['measurement']
    time = D[k]['t']

    # Collect the condition-specific sheets as (sheet name, table) pairs
    sheets = []
    if condition in ('rfp', 'gfp'):
        prefix = condition.upper()
        sheets = [
            (prefix + "_raw", pd.DataFrame(D[k][condition])),
            (prefix + "_error", pd.DataFrame(D[k][condition + '_error'])),
            (prefix + "_fit", pd.DataFrame(R[k][condition]['fit'])),
            (prefix + "_params", R[k][condition]['params']),
        ]

    # Write data to file; the context manager saves and closes the workbook
    file = getOutpath("CHX__{}_{}_{}.xlsx".format(sample, measurement, condition))
    with pd.ExcelWriter(file, engine='xlsxwriter') as xlsx_writer:
        pd.DataFrame(time).to_excel(xlsx_writer, sheet_name="t")
        for sheet_name, table in sheets:
            table.to_excel(xlsx_writer, sheet_name=sheet_name)

## Fit and plot separate models

In [None]:
def plotSeparate(ds, tr, pdf=None, par_kde=None):
    """Plots measured data and fit result of one trace (separate fit).

    The fit itself is not performed here; the fitted curve and the
    parameter values are read from the result list `R`.

    Keyword arguments:
    ds -- the index of the dataset in `D` and `R`
    tr -- the index of the trace in the dataset to be processed
    pdf -- a PdfPages object to which the figure is written if it is not None
    par_kde -- if containing dict of values of parameter distributions, plot distributions

    Raises ValueError if the condition of the dataset is neither
    'rfp' nor 'gfp'.
    """
    # Condition-specific parameter names, labels and colors
    styles = {
        'rfp': {
            'params': ['Gu0r', 'betr', 'kmr'],
            'name': 'RFP',
            'face': '#ff000055',
            'edge': '#990000ff',
            'fit': '#ff0000',
            'data': '#990000',
        },
        'gfp': {
            'params': ['Gu0g', 'betg', 'kmg'],
            'name': 'GFP',
            'face': '#00ff0055',
            'edge': '#009900ff',
            'fit': '#00ff00',
            'data': '#009900',
        },
    }

    # Get trace information
    cond = D[ds]['condition']
    if cond not in styles:
        raise ValueError("Unknown condition: {}".format(cond))
    style = styles[cond]

    # Plot fit results
    fig = plt.figure()

    if par_kde is not None:
        # Widen the figure: left half for the fit, right half for
        # the parameter distributions
        fig.set_figwidth(1.6 * fig.get_figwidth())
        pn = style['params']
        gs = GridSpec(1, len(pn))

        # Plot parameters
        for pi, label in enumerate(pn):
            ax = fig.add_subplot(gs[0, pi])
            curr_val = R[ds][cond]['params'].loc[tr, label]
            plot_kde(ax, par_kde[label], label, style['face'], style['edge'], curr_val)

        # Adjust subplot layout
        gs.tight_layout(fig, pad=0, rect=(0.5, 0, 1, 1))

        # Create axes for fit
        gs_fit = GridSpec(1, 1)
        ax = fig.add_subplot(gs_fit[0])
        gs_fit.tight_layout(fig, pad=0, rect=(0, 0, 0.5, 1))

    else:
        ax = fig.gca()

    p_f, = ax.plot(D[ds]['t'], R[ds][cond]['fit'][:,tr], '-',
                   label='{} (fit)'.format(style['name']), color=style['fit'], linewidth=1)
    p_d, = ax.plot(D[ds]['t'], D[ds][cond][:,tr], '-',
                   label='{} (measured)'.format(style['name']), color=style['data'], linewidth=.5)

    # Format plot
    ax.set_xlabel('Time [h]')
    ax.set_ylabel('Number of molecules [10³]')
    ax.set_title('{} #{:03d}\n(separate fit)'.format(getDataLabel(ds), tr))
    ax.legend(handles=[p_d, p_f])

    # Write figure to pdf
    if pdf is not None:
        pdf.savefig(fig, bbox_inches='tight')

    # Show and close figure
    # (`plt.show` takes no figure argument; it shows the open figures)
    plt.show()
    plt.close(fig)

In [None]:
# Fit traces separately
for ds in range(len(D)):

    # Define data-dependent functions and parameters
    cond = D[ds]['condition']
    if cond == 'rfp':
        ftprm = red_p
        mdl_fcn = red
        jac_fcn = red_jacobian
        hes_fcn = red_hessian
    elif cond == 'gfp':
        ftprm = green_p
        mdl_fcn = green
        jac_fcn = green_jacobian
        hes_fcn = green_hessian
    else:
        raise ValueError("Unknown condition: {}".format(cond))

    nTraces = np.shape(D[ds][cond])[1]

    for tr in range(nTraces):
        print('Fitting „{}“ #{:03d}/{:03d} …'.format(getDataLabel(ds), tr, nTraces))

        # Prepare data; the weights are the inverse variances 1/sigma²
        time = D[ds]['t']
        data = D[ds][cond][:,tr].flatten()
        wght = 1 / D[ds][cond + '_error'][:,tr]**2

        # Prepare parameters:
        # G0 is fixed to the first data point (and stays marked as fixed in
        # the shared parameter set), so the free parameters are Gu0, beta
        # and km – the order of the columns of the Jacobian.
        ftprm.set('G0' + cond[0], fixed=True, value=data[0])
        ftprm.set('Gu0' + cond[0], value=.5*(data.max() - data.min()))

        # Objective function (closure)
        def objective_fcn(params):
            """Objective function for separate model:
            L = sum((D - f)² / (2 sigma²))"""
            cur_val = ftprm.eval(params, t=time)
            # The factor 1/2 keeps the objective consistent with
            # `gradient_fcn`, which is the gradient of exactly this sum
            return 0.5 * np.sum((data - cur_val)**2 * wght)

        # Jacobian/gradient (closure)
        def gradient_fcn(params):
            """Gradient for separate model:
            grad L = -sum((D - f) / sigma² * grad f)"""
            J = jac_fcn(**ftprm.eval_params(params, t=time))
            residuals = (data - ftprm.eval(params, t=time)).reshape((np.size(time),1))
            vrnc = wght.reshape(np.shape(residuals))
            return -np.sum(J * residuals * vrnc, axis=0).flatten()

        # Hessian/second derivative (closure)
        #def secderiv_fcn(params):
        #    """Hessian for separate model"""
        #    H = hes_fcn(**ftprm.eval_params(params, t=time))
        #    residuals = (data - ftprm.eval(params, t=time)).reshape((np.size(time), 1))
        #    jac = jac_fcn(**ftprm.eval_params(params, t=time))
        #    vrnc = wght.reshape((np.size(time), 1, 1))

        #    H *= residuals
        #    hes = np.empty((np.size(time), 3, 3))
        #    for i in range(3):
        #        hes[:,i,i] = jac[:,i]**2 - H[:,i]
        #    hes[:,0,1] = jac[:,0] * jac[:,1] - H[:,3]
        #    hes[:,0,2] = jac[:,0] * jac[:,2] - H[:,4]
        #    hes[:,1,2] = jac[:,1] * jac[:,2] - H[:,5]
        #    hes[:,1,0] = hes[:,0,1]
        #    hes[:,2,0] = hes[:,0,2]
        #    hes[:,2,1] = hes[:,1,2]
        #    hes *= vrnc

        #    return np.sum(hes, axis=0)

        # Fit the data
        result = sc.optimize.minimize(objective_fcn,
                                      ftprm.initial(),
                                      method='TNC',# one of: 'SLSQP' 'TNC' 'L-BFGS-B'
                                      bounds=ftprm.bounds(),
                                      jac=gradient_fcn,
                                      #hess=secderiv_fcn,
                                      options={'disp':True,
                                               'maxiter': 20000}
                                     )
        #result = sc.optimize.least_squares(
        #    objective_fcn,
        #    ftprm.initial(),
        #    jac=gradient_fcn,
        #    bounds=(0, np.inf),
        #    max_nfev=20000
        #)

        # Print result
        print("\tSuccess {}: {}".format(result.success, result.message))

        # Save results to R
        R[ds][cond]['params'].iloc[tr] = ftprm.eval_params(result.x, independent=False)
        best_fit = ftprm.eval(result.x, t=time)
        #best_fit = mdl_fcn(time, *result.x)
        R[ds][cond]['fit'][:,tr] = best_fit
        R[ds][cond]['success'][tr] = result.success

        # DEBUG
        #if tr >= 2:
        #    print("Breaking loop for debugging purposes")
        #    break

In [None]:
# Plot results of separate fit
# One PDF per dataset with one page per trace. The timestamp is taken once,
# so all files written by one run of this cell share the same prefix.
ts = getTimeStamp()

for ds in range(len(D)):
    cond = D[ds]['condition']

    # Parameter distributions over all traces of this dataset
    par_kde = parameter_KDE(R[ds][cond]['params'])

    pdffile = getOutpath('separate_{}.pdf'.format(getDataLabel(ds, True)), ts)
    with PdfPages(pdffile) as pdf:
        for tr in range(np.shape(D[ds][cond])[1]):
            plotSeparate(ds, tr, pdf, par_kde)

            # DEBUG
            #if tr >= 2:
            #    print("Break loop")
            #    break

## Identify ill-shaped raw traces
For these measurements, the parts before CHX addition were cut away from the traces.
While most traces now start with an ascent, some traces have minima after the first value.
As our fitting model does not account for this behaviour, such traces may be fitted badly and affect the mean parameter values.

We therefore want to identify those traces and sort them out.

A trace is treated as ill-shaped if its smallest value before the maximum is
* the third or a later value or
* the second value and
  * its difference to the first value is larger than 3% of its difference to the maximum or
  * another value before the maximum is smaller than the first value.

In [None]:
def find_bad_traces(data, threshold=0.3):
    """
    Finds ill-shaped traces.

    A trace is ill-shaped ("bad") if any value before its maximum is
    smaller than its first value. One exception is made: if the second
    value is the smallest value before the maximum, no later value
    before the maximum is smaller than or equal to the first value,
    and the drop from the first to the second value does not exceed
    `threshold` times the rise from the second value to the maximum,
    the trace is not treated as bad.

    Traces whose maximum is the first value have no values before the
    maximum and are therefore never marked as bad.

    Arguments:
        data -- array of traces (columns: traces, rows: timepoints)
        threshold -- largest tolerated ratio of the initial drop to the
                     amplitude (default: 0.3)
                     NOTE(review): earlier documentation of this filter
                     states 3%, but the code has always compared
                     against 0.3; the default keeps the coded
                     behaviour. Pass `threshold=0.03` for a 3% limit.

    Returns:
        1-dim indexing array, where the i-th element indicates if
        the i-th trace from data is bad (True) or not (False).
    """
    data = np.asarray(data)
    maxima = data.argmax(axis=0)
    bad_traces = np.zeros(maxima.shape, dtype=np.bool_)

    for i, i_max in enumerate(maxima):
        # Maximum at the first timepoint: nothing precedes it, and
        # taking argmin of the empty slice would raise
        if i_max == 0:
            continue

        # Position of the smallest value before the maximum
        i_min = data[:i_max, i].argmin()
        if i_min == 0:
            continue

        # The first value is not the minimum: bad unless excused below
        bad_traces[i] = True
        if i_min != 1:
            continue

        # Values strictly between the second value and the maximum must
        # all be larger than the first value; with the maximum at the
        # third timepoint there are no such values (empty slice)
        first = data[0, i]
        between = data[2:i_max, i]
        if between.size and not between.min() > first:
            continue

        # i_min == 1 implies data[1] < data[0] <= data[i_max], so the
        # amplitude is strictly positive
        amp = data[i_max, i] - data[1, i]
        if (first - data[1, i]) / amp <= threshold:
            bad_traces[i] = False

    return bad_traces

In [None]:
# Flag ill-shaped traces, one boolean table per fluorescence channel.
# Both tables share the (Dataset, Trace) index of the chi-square tables.
bad_traces_red = pd.DataFrame(False, index=chsq_red.index, columns=('bad',), dtype=np.bool_)
bad_traces_green = pd.DataFrame(False, index=chsq_green.index, columns=('bad',), dtype=np.bool_)

# Condition name -> table that receives the flags of that condition
bad_tables = {'rfp': bad_traces_red, 'gfp': bad_traces_green}

for i, d in enumerate(D):
    condition = d['condition']
    data = d[condition]
    bad_traces = find_bad_traces(data)

    # Datasets with any other condition are evaluated but not stored
    target = bad_tables.get(condition)
    if target is not None:
        # Selecting by dataset index addresses all traces of this dataset
        target.loc[i, 'bad'] = bad_traces

In [None]:
# Plot every trace flagged as ill-shaped together with its best fit
with PdfPages(getOutpath("fixed_bad_traces.pdf")) as pdf:
    for condition, bad_traces in (("rfp", bad_traces_red),
                                  ("gfp", bad_traces_green)):
        # The first letter of the condition doubles as the line colour
        clr = condition[0]

        # (dataset, trace) index pairs of the flagged traces
        flagged = [idx for idx in bad_traces.index if bad_traces.loc[idx, 'bad']]

        for i, j in flagged:
            t = D[i]['t']
            measured = D[i][condition][:,j]
            fitted = R[i][condition]['fit'][:,j]

            f, ax = plt.subplots(1, 1)
            ax.plot(t, measured, '-', color=clr, lw=1, label="measured")
            ax.plot(t, fitted, '-k', lw=0.5, label="best fit")
            ax.legend()
            ax.set_xlabel("Time [h]")
            ax.set_ylabel("Fluorescence [a.u.]")
            ax.set_title("Trace {:03d} in {}".format(j, getDataLabel(i)))

            f.tight_layout(pad=0)
            plt.show(f)
            pdf.savefig(f)
            plt.close(f)

## Clustering-based data filtering
The $\chi^2$ values are examined. It is found that the $\chi^2$ values are nicely distributed and that no satisfactory correlation between $\chi^2$ and outliers of parameter values exists.

Hence, the idea of outlier filtering based on $\chi^2$ is discarded.
Instead, clustering-based outlier filtering is set up.

The traces are clustered with respect to $\beta$ and $k_\mathrm{m}$ using DBSCAN.
All traces that do not belong to the largest cluster are assumed to be outliers.
The center-of-mass of the largest cluster is written into an XLSX file for further use.

In [None]:
def chisquare(i_data, i_trace, to_dict=False, to_dataframe=False):
    """
    Computes the sum of squared residuals of the best fit for one trace.

    The fitted parameters of the trace are read from `R`, the model
    matching the dataset's condition (`red` for 'rfp', `green` for
    'gfp') is evaluated on the dataset's time vector, and the squared
    differences to the measured trace from `D` are summed up.

    Arguments:
        i_data -- index of the dataset in D or R
        i_trace -- index of the trace in the dataset
        to_dict -- if True, return a dict with the key 'chisq' and one
                   key per fit parameter (default: False)
        to_dataframe -- if True, return that dict as a one-row
                        DataFrame with index 0; takes precedence over
                        `to_dict` (default: False)

    Returns:
        chi-square value as scalar, or dict/DataFrame as described

    Raises:
        ValueError if the condition is neither 'rfp' nor 'gfp'
    """
    dataset = D[i_data]
    condition = dataset['condition']
    measured = dataset[condition][:,i_trace]
    t = dataset['t']
    params = R[i_data][condition]['params'].iloc[i_trace,:].to_dict()

    # Select the model function that belongs to the condition
    if condition == 'rfp':
        model = red
    elif condition == 'gfp':
        model = green
    else:
        raise ValueError("Unknown condition: {}".format(condition))

    residuals = measured - model(t=t, **params)
    xq = np.sum(residuals**2)

    if not (to_dict or to_dataframe):
        return xq

    xq_dict = {'chisq': xq, **params}
    return pd.DataFrame(xq_dict, index=(0,)) if to_dataframe else xq_dict

In [None]:
# Create tables of chi-square values for RFP and GFP.
# The empty (Dataset, Trace) index is built with `from_arrays`, because the
# `labels` keyword of the MultiIndex constructor is not accepted by newer
# pandas versions; `np.float64` replaces the alias `np.float_`, which
# NumPy 2 no longer provides.
muli = pd.MultiIndex.from_arrays([[], []], names=['Dataset', 'Trace'])
chsq_red = pd.DataFrame([], index=muli, columns=('chisq', 'G0r', 'Gu0r', 'betr', 'kmr'),
                        dtype=np.float64)
chsq_green = pd.DataFrame([], index=muli, columns=('chisq', 'G0g', 'Gu0g', 'betg', 'kmg'),
                          dtype=np.float64)

for i_data, d in enumerate(D):

    # Pick the table that belongs to the condition of this dataset
    condition = d['condition']
    if condition == 'rfp':
        xq_tab = chsq_red
    elif condition == 'gfp':
        xq_tab = chsq_green
    else:
        raise ValueError("Unknown condition: {}".format(condition))

    # Append one row (chi-square and fit parameters) per trace
    for i_trace in range(d[condition].shape[1]):
        xq_tab.loc[(i_data, i_trace),:] = chisquare(i_data, i_trace,
                                                    to_dataframe=True).iloc[0,:]

# Plot chi-square histograms (decadic logarithm of the chi-square values)
f, (ax1, ax2) = plt.subplots(1, 2)
for hist_ax, hist_tab, hist_clr, hist_title in (
        (ax1, chsq_red, 'red', r"$\chi^2$ for RFP"),
        (ax2, chsq_green, 'green', r"$\chi^2$ for GFP")):
    hist_ax.hist(np.log10(hist_tab.loc[:,'chisq'].values), bins=50, color=hist_clr)
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel(r"$\log(\chi^2)$ [a.u.]")
    hist_ax.set_ylabel("Occurrences")

f.tight_layout(pad=0)
plt.show(f)
with PdfPages(getOutpath("fixed_chisquare_histograms.pdf")) as pdf:
    pdf.savefig(f)
plt.close(f)

In [None]:
# Display the table of chi-square values and fit parameters of the RFP traces
chsq_red

In [None]:
# Display the table of chi-square values and fit parameters of the GFP traces
chsq_green

In [None]:
# Scatter the fitted rates against chi-square: one column per channel,
# maturation rate in the upper row and degradation rate in the lower row
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True)

for tab, ax_km, ax_beta, km_col, beta_col, fmt, title in (
        (chsq_red, ax1, ax3, 'kmr', 'betr', '.r', "RFP"),
        (chsq_green, ax2, ax4, 'kmg', 'betg', '.g', "GFP")):
    chisq = tab.loc[:,'chisq'].values

    # Upper panel: km
    ax_km.plot(chisq, tab.loc[:,km_col].values, fmt, ms=2)
    ax_km.set_ylabel(r"$k_\mathrm{m}$")
    ax_km.set_xscale('log')
    ax_km.set_yscale('log')
    ax_km.set_title(title)

    # Lower panel: beta (the x-axis is shared with the upper panel)
    ax_beta.plot(chisq, tab.loc[:,beta_col].values, fmt, ms=2)
    ax_beta.set_xlabel(r"$\chi^2$")
    ax_beta.set_ylabel(r"$\beta$")
    ax_beta.set_yscale('log')

f.tight_layout(pad=0, w_pad=2)
plt.show(f)
with PdfPages(getOutpath("fixed_chisquare_correlations.pdf")) as pdf:
    pdf.savefig(f)
plt.close(f)

In [None]:
# Scatter maturation rate against degradation rate, one panel per channel
f, (ax1, ax2) = plt.subplots(1, 2)

for ax, tab, beta_col, km_col, fmt, title in (
        (ax1, chsq_red, 'betr', 'kmr', 'r.', "RFP"),
        (ax2, chsq_green, 'betg', 'kmg', 'g.', "GFP")):
    ax.plot(tab.loc[:,beta_col], tab.loc[:,km_col], fmt, ms=2)
    ax.set_xlabel(r"$\beta$ [h${}^{-1}$]")
    ax.set_ylabel(r"$k_\mathrm{m}$ [h${}^{-1}$]")
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_title(title)

f.tight_layout(pad=0, w_pad=2)
plt.show(f)
with PdfPages(getOutpath("fixed_correlations.pdf")) as pdf:
    pdf.savefig(f)
plt.close(f)

In [None]:
def cluster_params(X, eps=0.15, min_samples=3, discard=None):
    """
    Clusters data X with DBSCAN and selects the largest cluster.

    The clustering is performed on the decadic logarithm of X. Rows
    whose logarithm is not finite in every column (e.g. non-positive
    values) and rows marked in `discard` are excluded from clustering
    and are never part of the returned selection.

    Input:
        X -- n_samples x n_dimensions array of data to be clustered
        eps -- maximum distance parameter for DBSCAN
        min_samples -- minimum neighborhood size parameter for DBSCAN
        discard -- 1-dim logical indexing array of length n_samples with
                   True for cells to be discarded and False for cells to be clustered

    Returns:
        1-dim logical indexing array indicating rows in X that belong to largest cluster.
        If no row is left for clustering or DBSCAN labels every row as
        noise, all elements are False. Clusters of equal size are
        resolved in favour of the smallest cluster label.
    """
    # Prepare data (work in log-space)
    Xlog = np.log10(np.asarray(X, dtype=float))
    mask = np.all(np.isfinite(Xlog), axis=1)
    if discard is not None:
        mask &= ~np.asarray(discard, dtype=bool)

    # DBSCAN cannot be fitted to zero samples; nothing can be accepted then
    if not mask.any():
        return mask

    # Perform DBSCAN
    dbscan = skc.DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(Xlog[mask,:])
    labels = dbscan.labels_

    # Find largest cluster (label -1 marks noise and is not a cluster)
    clustered = labels[labels != -1]
    if clustered.size == 0:
        mask[:] = False
        return mask
    lbls, counts = np.unique(clustered, return_counts=True)
    max_lbl = lbls[counts.argmax()]

    # Translate the cluster membership back to the rows of X
    mask[mask] = labels == max_lbl

    return mask

def center_of_mass(X):
    """
    Returns the center of mass of the data X in log-space.

    X is the data array with n_samples rows and n_dimensions columns.
    The columns are averaged after taking the decadic logarithm and the
    result is transformed back, so each element of the returned 1-dim
    array (n_dimensions elements) is the geometric mean of the
    corresponding column in X.
    """
    log_center = np.mean(np.log10(X), axis=0)
    return 10 ** log_center

def std_dev(X):
    """
    Returns the column-wise standard deviation of the data X.

    X is the data array with n_samples rows and n_dimensions columns.
    A 1-dim array with n_dimensions elements is returned, where each
    element is the standard deviation of the corresponding column in X.

    The deviation is taken of the values themselves (not of their
    logarithms) and is normalized by the number of samples.
    """
    # A log-space variant would be np.power(10, np.log10(X).std(axis=0))
    spread = np.std(X, axis=0, ddof=0)
    return spread

In [None]:
# Cluster the traces in the (beta, km) plane and plot the classification

# Rate parameters and bad-trace flags per channel
Xr = chsq_red.loc[:,('betr','kmr')].values
Xg = chsq_green.loc[:,('betg','kmg')].values
bad_r = bad_traces_red['bad'].values
bad_g = bad_traces_green['bad'].values

# Members of the largest cluster; bad traces are excluded from clustering
cluster_r = cluster_params(Xr, eps=0.09, discard=bad_r)
cluster_g = cluster_params(Xg, eps=0.2, discard=bad_g)

# Outliers are all traces that are neither accepted nor flagged as bad
outliers_r = ~(cluster_r | bad_r)
outliers_g = ~(cluster_g | bad_g)

# Log-space centers of the accepted clusters
com_r = center_of_mass(Xr[cluster_r,:])
com_g = center_of_mass(Xg[cluster_g,:])

# Plot clustering result
legend_opts = {'fontsize': 'small', 'borderpad': .3, 'labelspacing': .3,
               'handletextpad': .3, 'handlelength': 1}
f, (ax1, ax2) = plt.subplots(1, 2)

for ax, X, in_cluster, is_outlier, is_bad, com, fmt, title in (
        (ax1, Xr, cluster_r, outliers_r, bad_r, com_r, '.r', "RFP"),
        (ax2, Xg, cluster_g, outliers_g, bad_g, com_g, '.g', "GFP")):
    ax.plot(X[in_cluster,0], X[in_cluster,1], fmt, ms=2,
            label="accepted ({})".format(in_cluster.sum()))
    ax.plot(X[is_outlier,0], X[is_outlier,1], 'ok', mfc='none', ms=2, mew=0.5,
            label="outlier ({})".format(is_outlier.sum()))
    ax.plot(X[is_bad,0], X[is_bad,1], 'sk', mfc='none', ms=2, mew=0.5,
            label="bad trace ({})".format(is_bad.sum()))
    ax.plot(com[0], com[1], 'xk', label="mean")
    ax.legend(**legend_opts)
    ax.set_xlabel(r"$\beta$ [h${}^{-1}$]")
    ax.set_ylabel(r"$k_\mathrm{m}$ [h${}^{-1}$]")
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_title(title)

f.tight_layout(pad=0, w_pad=2)
plt.show(f)
with PdfPages(getOutpath("fixed_correlations_cluster.pdf")) as pdf:
    pdf.savefig(f)
plt.close(f)

In [None]:
# Plot every trace classified as outlier together with its best fit
with PdfPages(getOutpath("fixed_outlier_traces.pdf")) as pdf:
    for condition, outliers_idx, xq_tab in (("rfp", outliers_r, chsq_red),
                                            ("gfp", outliers_g, chsq_green)):
        # The first letter of the condition doubles as the line colour
        clr = condition[0]

        # The table index yields the (dataset, trace) pair of each outlier
        for i, j in xq_tab.index[outliers_idx]:
            t = D[i]['t']
            measured = D[i][condition][:,j]
            fitted = R[i][condition]['fit'][:,j]

            f, ax = plt.subplots(1, 1)
            ax.plot(t, measured, '-', color=clr, lw=1, label="measured")
            ax.plot(t, fitted, '-k', lw=0.5, label="best fit")
            ax.legend()
            ax.set_xlabel("Time [h]")
            ax.set_ylabel("Fluorescence [a.u.]")
            ax.set_title("Trace {:03d} in {}".format(j, getDataLabel(i)))

            f.tight_layout(pad=0)
            plt.show(f)
            pdf.savefig(f)
            plt.close(f)

In [None]:
# Write values to XLSX files

# Build result tables.
# The parameter names are listed explicitly so that the row order is
# guaranteed to match the column order used for `std_dev` below, even if
# the chi-square tables have gained additional columns in the meantime.
cols = ('mean', 'std_dev', 'stat_total', 'stat_used')
red_params = ['G0r', 'Gu0r', 'betr', 'kmr']
green_params = ['G0g', 'Gu0g', 'betg', 'kmg']
red_mean_tab = pd.DataFrame(index=red_params, columns=cols, dtype=np.float64)
green_mean_tab = pd.DataFrame(index=green_params, columns=cols, dtype=np.float64)

# Populate result tables: rates are averaged in log-space, amplitudes
# linearly; only traces of the accepted cluster contribute
red_mean_tab.loc[['betr','kmr'],'mean'] = center_of_mass(chsq_red.loc[:,['betr','kmr']].values[cluster_r,:])
red_mean_tab.loc[['G0r','Gu0r'],'mean'] = np.mean(chsq_red.loc[:,['G0r','Gu0r']].values[cluster_r,:], axis=0)
red_mean_tab.loc[:,'std_dev'] = std_dev(chsq_red.loc[:,red_params].values[cluster_r,:])
red_mean_tab.loc[:,'stat_total'] = cluster_r.size
red_mean_tab.loc[:,'stat_used'] = cluster_r.sum()

green_mean_tab.loc[['betg','kmg'],'mean'] = center_of_mass(chsq_green.loc[:,['betg','kmg']].values[cluster_g,:])
green_mean_tab.loc[['G0g','Gu0g'],'mean'] = np.mean(chsq_green.loc[:,['G0g','Gu0g']].values[cluster_g,:], axis=0)
green_mean_tab.loc[:,'std_dev'] = std_dev(chsq_green.loc[:,green_params].values[cluster_g,:])
green_mean_tab.loc[:,'stat_total'] = cluster_g.size
green_mean_tab.loc[:,'stat_used'] = cluster_g.sum()

# Save distribution parameters to spreadsheet.
# The writer is used as a context manager, which writes and closes the
# file on exit (newer pandas versions no longer offer `ExcelWriter.save`).
# NOTE(review): the playground cell that saves `red_dist_tab` and
# `green_dist_tab` uses the same file name; whichever cell runs last
# overwrites the other's output.
xlsx_file = getOutpath("fixed_distribution_moments.xlsx")
with pd.ExcelWriter(xlsx_file, engine='xlsxwriter') as xlsx_writer:
    red_mean_tab.to_excel(xlsx_writer, sheet_name="red", na_rep="NaN")
    green_mean_tab.to_excel(xlsx_writer, sheet_name="green", na_rep="NaN")

In [None]:
# Display the moments of the accepted RFP cluster
red_mean_tab

In [None]:
# Display the moments of the accepted GFP cluster
green_mean_tab

In [None]:
# Control: rates versus chi-square, split into accepted traces, outliers
# and bad traces (columns: channels; rows: km above, beta below)
legend_opts = {'fontsize': 'small', 'borderpad': .3, 'labelspacing': .3,
               'handletextpad': .3, 'handlelength': .5}
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True)

# Marker options shared by the open symbols of outliers and bad traces
open_marker = {'mew': 0.5, 'mfc': 'none'}

for tab, in_cluster, is_outlier, is_bad, fmt, title, panels in (
        (chsq_red, cluster_r, outliers_r, bad_r, '.r', "RFP",
         ((ax1, 'kmr', True), (ax3, 'betr', False))),
        (chsq_green, cluster_g, outliers_g, bad_g, '.g', "GFP",
         ((ax2, 'kmg', True), (ax4, 'betg', False)))):
    chisq = tab['chisq'].values

    for ax, col, is_km_panel in panels:
        values = tab[col].values

        # One marker series per trace class, labelled with its size
        ax.plot(chisq[in_cluster], values[in_cluster], fmt,
                ms=2, label="accepted ({})".format(in_cluster.sum()))
        ax.plot(chisq[is_outlier], values[is_outlier], 'ok',
                ms=2, label="outlier ({})".format(is_outlier.sum()), **open_marker)
        ax.plot(chisq[is_bad], values[is_bad], 'sk',
                ms=2, label="bad trace ({})".format(is_bad.sum()), **open_marker)

        if is_km_panel:
            ax.set_ylabel(r"$k_\mathrm{m}$")
            ax.set_xscale('log')
            ax.set_yscale('log')
            ax.set_title(title)
        else:
            ax.set_xlabel(r"$\chi^2$")
            ax.set_ylabel(r"$\beta$")
            ax.set_yscale('log')
        ax.legend(**legend_opts)

f.tight_layout(pad=0, w_pad=2)
plt.show(f)
with PdfPages(getOutpath("fixed_chisquare_correlations_cluster.pdf")) as pdf:
    pdf.savefig(f)
plt.close(f)


In [None]:
# Control: stacked chi-square histograms, accepted traces (filled) below
# all remaining traces (outlined)
f, (ax1, ax2) = plt.subplots(1, 2)

red_vals = np.log10(chsq_red['chisq'].values)
green_vals = np.log10(chsq_green['chisq'].values)

for ax, log_vals, in_cluster, clr, title in (
        (ax1, red_vals, cluster_r, 'r', r"$\chi^2$ for RFP"),
        (ax2, green_vals, cluster_g, 'g', r"$\chi^2$ for GFP")):
    stacked = (log_vals[in_cluster], log_vals[~in_cluster])
    patches = ax.hist(stacked, bins=50, stacked=True)[2]
    plt.setp(patches[0], facecolor=clr, edgecolor=clr, lw=.5)
    plt.setp(patches[1], facecolor='none', edgecolor='k', lw=.5)
    ax.set_title(title)
    ax.set_xlabel(r"$\log(\chi^2)$ [a.u.]")
    ax.set_ylabel("Occurrences")

f.tight_layout(pad=0)
plt.show(f)
with PdfPages(getOutpath("fixed_chisquare_histograms_cluster.pdf")) as pdf:
    pdf.savefig(f)
plt.close(f)

In [None]:
# Write complete single-cell results to XLSX file

# Append “bad trace” and “outlier” to parameter table.
# Copies are used so that the chi-square tables themselves do not gain
# extra columns; other cells rely on their original column layout.
all_red = chsq_red.copy()
all_red['bad'] = bad_traces_red['bad']
all_red['outlier'] = outliers_r

all_green = chsq_green.copy()
all_green['bad'] = bad_traces_green['bad']
all_green['outlier'] = outliers_g

# Create dataset information table (one row per dataset in D)
ds_info_tab = pd.DataFrame(index=[], columns=['Dataset', 'sample', 'condition', 'measurement'])
for i, d in enumerate(D):
    ds_info_tab.loc[i,'Dataset'] = i
    for k in ('sample', 'condition', 'measurement'):
        ds_info_tab.loc[i,k] = d[k]

# Save complete single-cell results to spreadsheet.
# The writer is used as a context manager, which writes and closes the
# file on exit (newer pandas versions no longer offer `ExcelWriter.save`).
xlsx_file = getOutpath("fixed_parameters_filter.xlsx")
with pd.ExcelWriter(xlsx_file, engine='xlsxwriter') as xlsx_writer:
    all_red.to_excel(xlsx_writer, sheet_name="red", na_rep="NaN")
    all_green.to_excel(xlsx_writer, sheet_name="green", na_rep="NaN")
    ds_info_tab.to_excel(xlsx_writer, sheet_name="info")

## Distributions
Based on the fits, the underlying parameter distributions are acquired.

## Playground
This section contains code that was/is used for developing ideas.

In [None]:
def rug_plot(ax, data, y_min=0, y_max=0.1, **kwargs):
    """
    Draws a rug plot of the given data into the given axes.

    Input:
        ax -- axes object in which to plot the rug plot
        data -- numpy array of horizontal positions of the rug lines
        y_min -- lower end of the rug lines, passed to `axvline` as
                 `ymin` (default: 0)
        y_max -- upper end of the rug lines, passed to `axvline` as
                 `ymax` (default: 0.1)
        kwargs -- (optional) keyword arguments to be passed to `axvline`

    Returns:
        list of return values of `axvline`

    One vertical line is drawn per element of a flattened copy of
    `data`, in the order of that copy.
    """
    lines = []
    for position in data.flatten():
        lines.append(ax.axvline(position, ymin=y_min, ymax=y_max, **kwargs))
    return lines

In [None]:
# Ask how to set the x-axis limits
# Global flag that the distribution plotting cells check before widening the
# x-axis limits beyond the retained density range; the checkbox toggles it.
restrict_xlim_to_data = False
def toggle_xlim_restr(b):
    """Observer callback: store the checkbox's new value in the global flag."""
    global restrict_xlim_to_data
    # `b` is the change notification; its 'new' entry holds the updated value
    restrict_xlim_to_data = b['new']
chbt = wdg.Checkbox(description="Restrict x-limits to clean data")
# Only changes of the checkbox value trigger the callback
chbt.observe(toggle_xlim_restr, names="value")
IPython.display.display(chbt)

In [None]:
# Merge all red datasets
# Collect the RFP parameter table of every result that has an 'rfp' entry and
# remember the dataset index, which becomes the outer key of the merged table.
mrg_cnd = []
mrg_idx = []
for i, r in enumerate(R):
    if 'rfp' not in r:
        continue

    mrg_cnd.append(r['rfp']['params'])
    mrg_idx.append(i)
    #print("{}: {}".format(i, str(r['rfp']['params'].shape)))

big_red = pd.concat(mrg_cnd, keys=mrg_idx)

# Plot combined red parameter distributions
# NOTE(review): `parameter_KDE` is defined elsewhere; the tuples presumably
# hold one bandwidth divisor / density resolution per parameter -- confirm.
pn_red = ('G0r', 'Gu0r', 'betr', 'kmr')
kde = parameter_KDE(big_red,
                    bw_div=(15, 15, 15, 500),
                    dens_res=(200, 200, 200, 5000),
                    nice_ends=False)

# Prepare table of red parameter distribution moments
red_dist_tab = pd.DataFrame(index=pn_red,
                            columns=('mean', 'dirty_mean', 'std_dev',
                                     'stat_total', 'stat_thresh'),
                            dtype=np.float_)

# One panel per entry of `kde`, stacked vertically
f, axa = plt.subplots(len(kde), 1, figsize=(8,12))

for i, p in enumerate(pn_red):
    # NOTE(review): `clr_face` is only used by the commented-out call below
    clr_face = '#ff000055'
    #clr_edge = '#990000ff'
    #plot_kde(axa[i], kde[p], p, clr_face)

    # Load data
    data = big_red.loc[:,p].values

    # Keep only the part of the density that reaches at least 3% of its peak
    prob = kde[p]['prob']
    idx = prob >= .03 * prob.max()
    prob = prob[idx]
    val = kde[p]['val'][idx]

    # "Clean" observations: non-zero and not above the last retained density value
    max_val = val[-1]
    idx_fit = np.logical_and(data <= max_val, data != 0)

    # Moments of the clean observations; `dirty_mean` uses all observations
    mn = np.mean(data[idx_fit])
    dirty_mean = np.mean(data)
    std = np.std(data[idx_fit])
    # Distance between the density peak and the retained point whose density is
    # closest to half the peak; only referenced by the commented-out print below
    scale = np.abs(val[prob.argmax()] - val[np.abs(prob - prob.max() * .5).argmin()])
    red_dist_tab.loc[p,:] = (mn, dirty_mean, std, data.size, idx_fit.sum())
    #print("{:4s}: mean={:9.4f}, std={:9.4f}, scale={:9.4f}, dirty_mean={:9.4}".format(
    #    p, mn, std, scale, dirty_mean))

    # Only the first rug line gets a label, so the legend shows a single entry
    rug_plot(axa[i], data, color='#bbbbbb', lw=1)[0].set_label("Single observations")
    # The label has no placeholder, so `.format(p)` leaves it unchanged
    axa[i].plot(val, prob, '-k', label="Kernel density".format(p))
    axa[i].axvline(mn, color='b', label="Mean (clean)")
    axa[i].axvline(dirty_mean, color='r', ls='--', label="Mean (dirty)")

    # Format axes
    # Upper y-limit: 110% of the density peak, but not above the current limit
    y_intervall = np.array((0, min(prob.max() * 1.1, axa[i].get_ylim()[1])))
    axa[i].set_ylim(y_intervall)

    # x-range: retained density range; unless restricted by the checkbox flag,
    # extended to the largest observation not exceeding 1.5 times that range
    x_intervall = np.array((0, val.max()))
    if not restrict_xlim_to_data:
        x_intervall[1] = max(x_intervall[1], data[data <= 1.5 * val.max()].max())
    # Add a margin of 2.5% of the range on both sides
    x_intervall += np.array((-1, 1)) * .025 * (x_intervall[1] - x_intervall[0])
    axa[i].set_xlim(x_intervall)

    # Parameter name as boxed text, horizontally centered near the top
    axa[i].text(x_intervall.sum() / 2, y_intervall[1] * .95, p,
           verticalalignment='top', horizontalalignment='center', size='x-large',
           bbox={'facecolor': 'red', 'edgecolor': 'none', 'alpha': .6})

    axa[i].legend()

axa[-1].set_xlabel("Parameter value [a.u.]")
axa[len(axa) // 2].set_ylabel("Frequency of occurrences")
f.tight_layout(pad=0)
plt.show(f)

with PdfPages(getOutpath("fixed_parameters_red.pdf")) as pdf:
    pdf.savefig(f)

plt.close(f)

In [None]:
# Do the same as above for all green parameters
# Merge all green datasets
# Collect the GFP parameter table of every result that has a 'gfp' entry and
# remember the dataset index, which becomes the outer key of the merged table.
mrg_cnd = []
mrg_idx = []
for i, r in enumerate(R):
    if 'gfp' not in r:
        continue

    mrg_cnd.append(r['gfp']['params'])
    mrg_idx.append(i)
    #print("{}: {}".format(i, str(r['gfp']['params'].shape)))

big_green = pd.concat(mrg_cnd, keys=mrg_idx)

# Plot combined green parameter distributions
# NOTE(review): `parameter_KDE` is defined elsewhere; the tuples presumably
# hold one bandwidth divisor / density resolution per parameter -- confirm.
pn_green = ('G0g', 'Gu0g', 'betg', 'kmg')
kde = parameter_KDE(big_green,
                    bw_div=(20, 5000, 30, 15),
                    dens_res=(250, 50000, 400, 200),
                    nice_ends=False)

# Prepare table of green parameter distribution moments
green_dist_tab = pd.DataFrame(index=pn_green,
                              columns=('mean', 'dirty_mean', 'std_dev',
                                       'stat_total', 'stat_thresh'),
                              dtype=np.float_)

# One panel per entry of `kde`, stacked vertically
f, axa = plt.subplots(len(kde), 1, figsize=(8,12))

for i, p in enumerate(pn_green):
    # NOTE(review): `clr_face` is assigned but not used in this cell
    clr_face = '#00ff0055'

    # Load data
    data = big_green.loc[:,p].values

    # Keep only the part of the density that reaches at least 3% of its peak
    prob = kde[p]['prob']
    idx = prob >= .03 * prob.max()
    prob = prob[idx]
    val = kde[p]['val'][idx]

    # "Clean" observations: non-zero and not above the last retained density value
    max_val = val[-1]
    idx_fit = np.logical_and(data <= max_val, data != 0)

    # Moments of the clean observations; `dirty_mean` uses all observations
    mn = np.mean(data[idx_fit])
    dirty_mean = np.mean(data)
    std = np.std(data[idx_fit])
    # Distance between the density peak and the retained point whose density is
    # closest to half the peak; only referenced by the commented-out print below
    scale = np.abs(val[prob.argmax()] - val[np.abs(prob - prob.max() * .5).argmin()])
    green_dist_tab.loc[p,:] = (mn, dirty_mean, std, data.size, idx_fit.sum())
    #print("{:4s}: mean={:9.4f}, std={:9.4f}, scale={:9.4f}, dirty_mean={:9.4}".format(
    #    p, mn, std, scale, dirty_mean))

    # Plot dataset density
    # Only the first rug line gets a label, so the legend shows a single entry
    rug_plot(axa[i], data, color='#bbbbbb', lw=1)[0].set_label("Single observations")
    # The label has no placeholder, so `.format(p)` leaves it unchanged
    axa[i].plot(val, prob, '-k', label="Kernel density".format(p))
    axa[i].axvline(mn, color='b', label="Mean (clean)")
    axa[i].axvline(dirty_mean, color='r', ls='--', label="Mean (dirty)")

    # Format axes
    # Upper y-limit: 110% of the density peak, but not above the current limit
    y_intervall = np.array((0, min(prob.max() * 1.1, axa[i].get_ylim()[1])))
    axa[i].set_ylim(y_intervall)

    # x-range: retained density range; unless restricted by the checkbox flag,
    # extended to the largest observation not exceeding 1.5 times that range
    x_intervall = np.array((0, val.max()))
    if not restrict_xlim_to_data:
        x_intervall[1] = max(x_intervall[1], data[data <= 1.5 * val.max()].max())
    # Add a margin of 2.5% of the range on both sides
    x_intervall += np.array((-1, 1)) * .025 * (x_intervall[1] - x_intervall[0])
    axa[i].set_xlim(x_intervall)

    # Parameter name as boxed text, horizontally centered near the top
    axa[i].text(x_intervall.sum() / 2, y_intervall[1] * .95, p,
           verticalalignment='top', horizontalalignment='center', size='x-large',
           bbox={'facecolor': 'green', 'edgecolor': 'none', 'alpha': .6})

    axa[i].legend()

axa[-1].set_xlabel("Parameter value [a.u.]")
axa[len(axa) // 2].set_ylabel("Frequency")
f.tight_layout(pad=0)
plt.show(f)

with PdfPages(getOutpath("fixed_parameters_green.pdf")) as pdf:
    pdf.savefig(f)

plt.close(f)

In [None]:
# Display the distribution moments of the merged RFP parameters
red_dist_tab

In [None]:
# Display the distribution moments of the merged GFP parameters
green_dist_tab

In [None]:
# Save distribution parameters to spreadsheet.
# The writer is used as a context manager, which writes and closes the
# file on exit (newer pandas versions no longer offer `ExcelWriter.save`).
# NOTE(review): the cell that saves the cluster-based tables
# (`red_mean_tab`, `green_mean_tab`) uses the same file name; whichever
# cell runs last overwrites the other's output.
xlsx_file = getOutpath("fixed_distribution_moments.xlsx")
with pd.ExcelWriter(xlsx_file, engine='xlsxwriter') as xlsx_writer:
    red_dist_tab.to_excel(xlsx_writer, sheet_name="red")
    green_dist_tab.to_excel(xlsx_writer, sheet_name="green")

In [None]:
# Print the (rows, columns) shape of the merged red and green parameter tables
for merged_tab in (big_red, big_green):
    print(merged_tab.shape)

In [None]:
# Plot violin distributions of the data sets (both separate and combined)
# `pn_both` lists parameters shared by both channels in a combined fit;
# the other tuples list the channel-specific parameters to be plotted.
pn_both = ()
pn_red = ('Gu0r', 'betr', 'kmr')
pn_green = ('Gu0g', 'betg', 'kmg')


with PdfPages(getOutpath('parameter_distributions.pdf')) as pdf:
    for ds in range(len(D)):
        # Check for separate and single fit: "separate" means that both
        # channels were fitted, "single" that exactly one of them was
        has_rfp = 'rfp' in R[ds]
        has_gfp = 'gfp' in R[ds]
        hasSeparate = has_rfp and has_gfp
        hasSingle = has_rfp != has_gfp

        fit_types = []
        if has_rfp:
            fit_types.append('rfp')
        if has_gfp:
            fit_types.append('gfp')

        # Check for combined fit
        hasCombined = 'combined' in R[ds]
        if hasCombined:
            fit_types.append('combined')

        # Calculate parameter distributions
        par_kde = {ft: parameter_KDE(R[ds][ft]['params']) for ft in fit_types}

        # Plot parameter distributions (one figure per available fit type)
        for typeName, hasType in zip(('separate', 'single', 'combined'),
                                     (hasSeparate, hasSingle, hasCombined)):
            if not hasType:
                continue

            if typeName == 'single':
                # A single fit needs only one row of panels
                if 'rfp' in fit_types:
                    len_pn_type = len(pn_red)
                elif 'gfp' in fit_types:
                    len_pn_type = len(pn_green)
                else:
                    raise ValueError("Unknown fit types: {}".format(fit_types))

                grid = (1, len(pn_both) + len_pn_type)
            else:
                # Green parameters in the upper row, red ones in the lower row
                grid = (2, len(pn_both) + max(len(pn_red), len(pn_green)))

            fig = plt.figure()
            gs = GridSpec(grid[0], grid[1])

            if typeName == 'combined':
                # Combined fit; define specific settings
                pn_green_temp = pn_green
                pn_red_temp = pn_red
                offset_both = len(pn_both)
                kde_label_green = 'combined'
                kde_label_red = 'combined'

                # Plot combined parameters: each occupies one of the first
                # `offset_both` columns and spans both rows
                for pi, label in enumerate(pn_both):
                    ax = plt.subplot(gs.new_subplotspec((0, pi), rowspan=2))
                    data = par_kde['combined'][label]
                    clr_face = '#0000ff55'
                    #clr_edge = '#000099ff'
                    plot_kde(ax, data, label, clr_face)
            else:
                # Separate fit; define specific settings
                pn_green_temp = pn_both + pn_green
                pn_red_temp = pn_both + pn_red
                offset_both = 0
                kde_label_green = 'gfp'
                kde_label_red = 'rfp'

            # Plot green parameters
            if (typeName != 'single') or 'gfp' in fit_types:
                for pi, par_label in enumerate(pn_green_temp):
                    ax = plt.subplot(gs.new_subplotspec((0, pi+offset_both)))
                    data = par_kde[kde_label_green][par_label]
                    clr_face = '#00ff0055'
                    #clr_edge = '#009900ff'
                    plot_kde(ax, data, par_label, clr_face)

            # Plot red parameters
            if (typeName != 'single') or 'rfp' in fit_types:
                i_row = 0 if typeName == 'single' else 1
                for pi, par_label in enumerate(pn_red_temp):
                    ax = plt.subplot(gs.new_subplotspec((i_row, pi+offset_both)))
                    data = par_kde[kde_label_red][par_label]
                    clr_face = '#ff000055'
                    #clr_edge = '#990000ff'
                    plot_kde(ax, data, par_label, clr_face)

            # Show and close figure
            fig.suptitle(getDataLabel(ds) + " (" + typeName + " fit)")
            fig.tight_layout(pad=0, rect=(0, 0, 1, .93))
            pdf.savefig(fig, bbox_inches='tight')
            plt.show(fig)
            plt.close(fig)

In [None]:
# Plot parameter correlations
# NOTE(review): this cell reads the keys 'red' and 'green' from the results,
# whereas the fitting and plotting cells of this notebook use 'rfp' and 'gfp';
# most of the parameter names below are not among the columns of this
# notebook's chi-square tables either. It looks like a leftover from a
# different model -- confirm before running.

# Get parameters to be correlated
# Each pair is (red parameter, green parameter)
par_cor = (('tr', 'tg'), ('m_ktl', 'm_ktl'), ('kmr', 'kmg'),
           ('betr', 'betg'), ('deltr', 'deltg'), ('offr', 'offg'))

for i, r in enumerate(R):
    # NOTE(review): the file name depends only on the time stamp, not on the
    # dataset; files of different datasets may overwrite each other -- verify.
    with PdfPages(os.path.join(getOutpath(), '{:s}_parameter_correlations.pdf'.format(getTimeStamp()))) as pdf:
        for pr, pg in par_cor:
            # Get parameter values
            valr = r['red']['params'].loc[:,pr].values
            valg = r['green']['params'].loc[:,pg].values

            # Sort out outliers
            # Indices of the two largest values (second largest first)
            idx = np.ones(np.size(valr), dtype=np.bool_)
            isr = valr.argsort()[-2:]
            isg = valg.argsort()[-2:]

            # Drop the largest value if the second largest is below 90% of it
            if valr[isr[0]] < 0.9 * valr[isr[1]]:
                idx[isr[1]] = False
            if valg[isg[0]] < 0.9 * valg[isg[1]]:
                idx[isg[1]] = False

            # Plot
            fig = plt.figure(figsize=(4.5,4))
            ax = fig.add_subplot(111)
            ax.set_xscale('log')
            ax.set_yscale('log')
            ax.plot(valr[idx], valg[idx], '.')

            # Freeze the limits, then draw the diagonal between the larger of
            # the lower limits and the smaller of the upper limits
            ax.set_autoscale_on(False)
            lmt = np.array([ax.get_xlim(), ax.get_ylim()])
            diag = (lmt[:,0].max(), lmt[:,1].min())
            ax.plot(diag, diag, '-k')

            ax.set_xlabel(pr, color='r')
            ax.set_ylabel(pg, color='g')
            ax.set_title("{}\nCorrelation {} – {}".format(getDataLabel(i), pr, pg))

            fig.tight_layout(pad=0)
            plt.show(fig)
            pdf.savefig(fig)
            plt.close(fig)

In [None]:
# Plot the parameter distributions for the datasets
# NOTE(review): this cell treats `R` as a mapping (`R.keys()`) with string
# keys, while other cells of this notebook index `R` by integer position and
# iterate it with `enumerate`; it also requires a 'combined' fit -- confirm
# that it is still applicable.
ds_keys = list(R.keys())
ds_keys.sort()
params = R[ds_keys[0]]['combined']['params'].columns
# Grid layout: one row per parameter, one column per dataset
grid = (len(params), len(ds_keys))
i_col = 0

pdffile = os.path.join(getOutpath(), '{:s}_parameters.pdf'.format(getTimeStamp()))
with PdfPages(pdffile) as pdf:
    fig = plt.figure()
    # Scale the figure size with the number of rows and columns
    fig.set_figheight(grid[0] * .8 * fig.get_figheight())
    fig.set_figwidth(grid[1] * .8 * fig.get_figwidth())

    for ds in ds_keys:
        i_row = 0
        for p in params:
            ax = plt.subplot2grid(grid, (i_row, i_col))
            ax.hist(R[ds]['combined']['params'][p], bins=100)
            # Axis labels only on the bottom row and the leftmost column
            if i_row == grid[0] - 1:
                ax.set_xlabel('Value [a.u.]')
            if i_col == 0:
                ax.set_ylabel('Occurrences [#]')
            ax.set_title('{:s}: {:s}'.format(ds, p))
            i_row += 1
        i_col += 1

    pdf.savefig(fig)
    plt.show(fig)
    plt.close(fig)

In [None]:
# Scatter the onset times of both channels against each other, with one
# page per dataset and the identity line as reference
out_dir = getOutpath()
pdf_name = '{:s}_onset_correlations.pdf'.format(getTimeStamp())
pdffile = os.path.join(out_dir, pdf_name)
with PdfPages(pdffile) as pdf:
    for k in R.keys():
        combined_params = R[k]['combined']['params']

        fig = plt.figure()
        plt.plot([0, 30], [0, 30], 'k-')
        plt.plot(combined_params['tr'], combined_params['tg'], '.')
        plt.xlabel('Onset RFP [h]')
        plt.ylabel('Onset GFP [h]')
        plt.title(k)
        pdf.savefig(fig)
        plt.show()
        plt.close()
    

In [None]:
# Degradation rate ratio
def plotHistograms(maxH):
    """Show a histogram of the degradation rate ratio for every dataset.

    For each dataset in the global result dict `R` (in sorted key order), the
    quotient of the `deltg` and `deltr` parameters of the combined fit is
    drawn as a histogram with 150 bins in its own figure.

    Parameters
    ----------
    maxH : number
        Upper edge of the histogram range; the lower edge is fixed at 0.
    """
    for ds in sorted(R.keys()):
        combined_params = R[ds]['combined']['params']
        quot = combined_params['deltg'] / combined_params['deltr']

        fig = plt.figure()
        plt.hist(quot, bins=150, range=(0, maxH))
        plt.title(ds)
        # Raw string: "\d" and "\m" are not valid Python escape sequences
        plt.xlabel(r'$\delta_\mathrm{green} / \delta_\mathrm{red}$ [a.u.]')
        plt.ylabel('Occurrences [#]')
        # pyplot.show() does not take a figure argument
        plt.show()
        plt.close(fig)

# Interactive control for the histogram range. The trailing semicolon keeps
# the notebook from echoing the return value of `interact`.
wdg.interact(
    plotHistograms,
    maxH=wdg.IntSlider(
        value=100,
        min=0,
        max=1000,
        step=10,
        description='Histogram maximum',
        continuous_update=False,
    ),
);

In [None]:
# Fit distribution to degradation rate quotient histograms
def gamma(x, p=2, b=1, s=10):
    """Gamma density with shape `p` and rate `b`, scaled by amplitude `s`.

    Evaluates ``s * b**p * x**(p-1) * exp(-b*x) / Gamma(p)``.
    """
    prefactor = s * b**p
    power_law = x**(p - 1)
    decay = np.exp(-b * x)
    return prefactor * power_law * decay / sc.special.gamma(p)

def gamma2(x, p1=1.9, p2=2.1, b1=0.9, b2=1.1, s1=10, s2=10):
    """Sum of two scaled gamma densities, (`p1`, `b1`, `s1`) and (`p2`, `b2`, `s2`)."""
    first = gamma(x, p=p1, b=b1, s=s1)
    second = gamma(x, p=p2, b=b2, s=s2)
    return first + second

def weibull(x, lmbd=.2, k=2, s=10):
    """Weibull density with inverse scale `lmbd` and shape `k`, scaled by `s`.

    Evaluates ``s * lmbd * k * (lmbd*x)**(k-1) * exp(-(lmbd*x)**k)``.
    """
    scaled_x = lmbd * x
    return s * lmbd * k * scaled_x**(k - 1) * np.exp(-scaled_x**k)

def weibull2(x, lmbd1=.15, lmbd2=.25, k1=1.9, k2=2.1, s1=10, s2=10):
    """Sum of two scaled Weibull densities, (`lmbd1`, `k1`, `s1`) and (`lmbd2`, `k2`, `s2`)."""
    first = weibull(x, lmbd1, k1, s1)
    second = weibull(x, lmbd2, k2, s2)
    return first + second

# Define models
# Every model wraps one of the density functions above; the hints bound the
# fit parameters from below (and the Weibull shape `k` also from above).
def _model_with_hints(func, **hints):
    """Wrap `func` in an `lm.Model` and register the given parameter hints.

    Each keyword maps a parameter name to the keyword arguments that are
    forwarded to `set_param_hint` for that parameter.
    """
    model = lm.Model(func)
    for par_name, limits in hints.items():
        model.set_param_hint(name=par_name, **limits)
    return model

model_gamma = _model_with_hints(
    gamma,
    p=dict(min=.01),
    b=dict(min=.01),
    s=dict(min=1),
)

model_gamma2 = _model_with_hints(
    gamma2,
    p1=dict(min=.01),
    p2=dict(min=.01),
    b1=dict(min=.01),
    b2=dict(min=.01),
    s1=dict(min=1),
    s2=dict(min=1),
)

model_weibull = _model_with_hints(
    weibull,
    lmbd=dict(min=.001),
    k=dict(min=.001, max=5),
    s=dict(min=1),
)

model_weibull2 = _model_with_hints(
    weibull2,
    lmbd1=dict(min=.001),
    lmbd2=dict(min=.001),
    k1=dict(min=.001, max=5),
    k2=dict(min=.001, max=5),
    s1=dict(min=1),
    s2=dict(min=1),
)

# Upper edge of the histogram range used for the distribution fits
maxH = 40

def _prefer_double(single, double, threshold=.7):
    """Tell whether the two-component fit should replace the single one.

    The two-component result is preferred only if its chi-square is below
    `threshold` times the chi-square of the single-component result.
    """
    return double.chisqr < threshold * single.chisqr

# For every dataset: histogram the degradation rate quotient, fit single and
# double gamma/Weibull models to the bin counts, and write one PDF page with
# the histogram, the selected fits and their fit reports.
with PdfPages(os.path.join(getOutpath(), '{:s}_degradation_distribution.pdf'.format(getTimeStamp()))) as pdf:
    for ds in sorted(R.keys()):
        # Calculate degradation rate quotient
        deltg = R[ds]['combined']['params']['deltg']
        deltr = R[ds]['combined']['params']['deltr']
        quot = deltg / deltr

        # Create histogram; keep bin counts and bin edges for fitting
        fig = plt.figure()
        ax = fig.add_subplot(1, 2, 1)
        hist_val, hist_edg = ax.hist(quot, bins=70, range=(0, maxH), label='Histogram')[:2]
        hist_ctr = (hist_edg[:-1] + hist_edg[1:]) / 2

        # Fit models to the bin counts at the bin centres
        result_g = model_gamma.fit(hist_val, x=hist_ctr)
        result_g2 = model_gamma2.fit(hist_val, x=hist_ctr)
        result_w = model_weibull.fit(hist_val, x=hist_ctr)
        result_w2 = model_weibull2.fit(hist_val, x=hist_ctr)

        # Select models: the double model must clearly beat the single one
        if _prefer_double(result_g, result_g2):
            res_g, name_g = result_g2, 'gamma2'
        else:
            res_g, name_g = result_g, 'gamma'

        if _prefer_double(result_w, result_w2):
            res_w, name_w = result_w2, 'weibull2'
        else:
            res_w, name_w = result_w, 'weibull'

        # Plot models
        ax.plot(hist_ctr, res_g.best_fit, '-', label=name_g, color='orange')
        ax.plot(hist_ctr, res_w.best_fit, '-', label=name_w, color='magenta')
        ax.legend()
        # Raw string: "\d" and "\m" are not valid Python escape sequences
        ax.set_xlabel(r'$\delta_\mathrm{green} / \delta_\mathrm{red}$ [a.u.]')
        ax.set_ylabel('Counts [#]')
        ax.set_title(ds)

        # Print fit reports as text into the right half of the figure
        rep = res_g.fit_report(show_correl=False) + '\n' + res_w.fit_report(show_correl=False)
        ax = fig.add_subplot(1, 2, 2)
        ax.set_axis_off()
        ax.text(0, 1, rep, ha='left', va='top', family='monospace', size=5.5)

        # Save, display and close figure; saving first so that the page does
        # not depend on what the backend does to the figure when showing it
        pdf.savefig(fig)
        plt.show()
        plt.close(fig)

In [None]:
# Scatter plot of degradation rates
# One log-log figure per dataset (sorted by key), plotting the `deltg`
# against the `deltr` parameter of the combined fit.
Rkeys = sorted(R.keys())
for ds in Rkeys:
    deltg = R[ds]['combined']['params']['deltg']
    deltr = R[ds]['combined']['params']['deltr']

    fig = plt.figure()
    plt.plot(deltg, deltr, '.')
    plt.title(ds)
    # Raw strings: "\d" and "\m" are not valid Python escape sequences
    plt.xlabel(r'$\delta_\mathrm{green}$ [a.u.]')
    plt.ylabel(r'$\delta_\mathrm{red}$ [a.u.]')
    plt.xscale('log')
    plt.yscale('log')
    # pyplot.show() does not take a figure argument
    plt.show()
    plt.close(fig)