# Analysis of Cxcr7 Response to Bafilomycin Treatment

### Notes

- **Experiment & Data**
    - Scope: 880 Airy-SR
    - Structure: neuromasts
    - Genetic background: wild-type
    - Perturbation1: Bafilomycin treatment (inhibits lysosomal degradation)
    - Perturbation2: Heat-shock expression of Sdf1a
    - Markers:
        - BAC(Cxcr7) expressing WT Cxcr7
        - Red membrane marker (Lyn:Ruby) as counterlabel

### Prep

In [None]:
### Imports

# Generic
from __future__ import division
import os, sys
import numpy as np
np.random.seed(42)
import scipy.ndimage as ndi
import matplotlib.pyplot as plt
%matplotlib inline

# Specific
import re, pickle
from ipywidgets import interact
from util.widget_helpers import savebutton
from scipy.stats import mannwhitneyu

In [None]:
### Settings

# Directory that holds the input data files
fpath = 'data_full'

# Master table of conditions, listed in the order they are loaded/plotted.
# Each row is (condition label, experiment set, matched control label);
# the per-label lookup structures below are all derived from this table.
_conditions = [
    ('WT_20170902',         'WT',     'WT_20170902'),
    ('baf_20170902',        'BAF',    'WT_20170902'),
    ('hs_20170902',         'HS',     'WT_20170902'),
    ('bafANDhs_20170902',   'BAF+HS', 'WT_20170902'),
    ('WT_20170903',         'WT',     'WT_20170903'),
    ('baf_20170903',        'BAF',    'WT_20170903'),
    ('hs_20170903',         'HS',     'WT_20170903'),
    ('bafANDhs_20170903',   'BAF+HS', 'WT_20170903'),
    ('p1WT_20180210',       'WT',     'p1WT_20180210'),
    ('p1baf_20180210',      'BAF',    'p1WT_20180210'),
    ('p2WT_20180210',       'WT',     'p2WT_20180210'),
    ('p2baf_20180210',      'BAF',    'p2WT_20180210'),
    ('p2hs_20180210',       'HS',     'p2WT_20180210'),
    ('p2bafANDhs_20180210', 'BAF+HS', 'p2WT_20180210'),
    ('p3WT_20180211',       'WT',     'p3WT_20180211'),
    ('p3baf_20180211',      'BAF',    'p3WT_20180211'),
    ('p3hs_20180211',       'HS',     'p3WT_20180211'),
    ('p3bafANDhs_20180211', 'BAF+HS', 'p3WT_20180211'),
]

# Ordered list of experiments (condition labels) to load
labels = list(row[0] for row in _conditions)

# Dates to exclude (8-digit date strings as they appear in the labels)
exclude_dates = []

# Ordered list of experiment sets used for grouping
experiments = ['WT', 'HS', 'BAF', 'BAF+HS']

# Colors representing the experiment sets
experiment_color_dict = {
    'WT':     'lightskyblue',
    'BAF':    'limegreen',
    'HS':     'tomato',
    'BAF+HS': 'orange',
}

# Condition label -> experiment set it is merged into
experiment_dict = dict((row[0], row[1]) for row in _conditions)

# Condition label -> color (every condition uses the color of its set)
color_dict = dict((row[0], experiment_color_dict[row[1]])
                  for row in _conditions)

# Condition label -> label of the control it is compared against
control_dict = dict((row[0], row[2]) for row in _conditions)

# Experiment set -> control set (every set is compared against 'WT')
experiment_control_dict = dict((name, 'WT')
                               for name in ('WT', 'BAF', 'HS', 'BAF+HS'))

In [None]:
### Data Loading

# Elaborate data loading & parsing function
def load_data(fpath):
    """Load all selected datasets found in the directory `fpath`.

    Files ending in 'masked_LMs.npz' are selected when the parenthesised
    label in their name is listed in the module-level `labels` and the
    8-digit date inside that label is not in the module-level
    `exclude_dates`. Selected files are ordered by the position of their
    label in `labels` (ties broken by filename).

    For every selected file `fname`, three companions are read:
      - fname[:-8]  + "_memsub_LMs.npz"   (second .npz archive)
      - fname[:-14] + "measurements.pkl"  (pickled dict of measurements)
      - the row of 'metadata.txt' (tab-separated) whose first column equals
        fname[:-20]; its columns 5 and 6 are stored as 'meta_time' and
        'meta_nm' (presumably time point and neuromast id -- TODO confirm).

    Parameters
    ----------
    fpath : str
        Directory containing the data files and 'metadata.txt'.

    Returns
    -------
    data_labels : numpy array of str
        Label of each loaded dataset, in loading order.
    num_lms : int
        First dimension of the 'lm_cx7' array of the first dataset.
    data : dict
        One array per measurement with one entry per dataset: all keys of
        both .npz archives, the pickled keys prefixed with 'intensity_', and
        'meta_time' / 'meta_nm'.

    Raises
    ------
    ValueError
        If no file in `fpath` matches the selection.
    """

    # Patterns for the parenthesised experiment label and its 8-digit date
    label_pattern = re.compile(r'\(([A-Za-z0-9_]+)\)')
    date_pattern  = re.compile(r'_([0-9]{8})\)')

    # Select the desired files in a single pass so that filename, label and
    # date always stay paired (filtering the three lists one after another
    # zipped already-filtered labels against unfiltered dates).
    selected = []
    for fname in os.listdir(fpath):
        if not fname.endswith('masked_LMs.npz'):
            continue
        label_match = label_pattern.search(fname)
        if label_match is None:
            continue  # No label in the name, so it cannot be a desired experiment
        label = label_match.group(1)
        date_match = date_pattern.search(fname)
        date = date_match.group(1) if date_match else None
        if label in labels and date not in exclude_dates:
            selected.append((labels.index(label), fname, label, date))

    # Fail early with a clear message instead of an IndexError further down
    if not selected:
        raise ValueError("No matching '*masked_LMs.npz' files found in " + repr(fpath))

    # Sort according to experiment list (filename breaks ties)
    selected.sort(key=lambda entry: entry[:2])
    fnames      = [entry[1] for entry in selected]
    data_labels = [entry[2] for entry in selected]

    # Metadata prep: index the rows by their first column (first row wins)
    meta_rows = {}
    with open(os.path.join(fpath, r"metadata.txt"), "r") as infile:
        for line in infile:
            fields = line.strip().split('\t')
            meta_rows.setdefault(fields[0], fields)

    # Metadata prep: int conversion function (non-numeric entries become nan)
    def cint(astr):
        try:
            return int(astr)
        except ValueError:
            return np.nan

    # Load the data
    data_masked = []
    data_memsub = []
    data_intens = []
    data_meta   = []
    for fname in fnames:

        # Load masked dataset
        data_masked.append(np.load(os.path.join(fpath, fname)))

        # Load memsubbed dataset
        data_memsub.append(np.load(os.path.join(fpath, fname[:-8]+"_memsub_LMs.npz")))

        # Load intensity values
        # NOTE(security): unpickling executes arbitrary code; only load
        # measurement files from a trusted source.
        with open(os.path.join(fpath, fname[:-14]+"measurements.pkl"),"rb") as infile:
            data_intens.append(pickle.load(infile))

        # Metadata: exactly one entry per dataset so that the meta arrays stay
        # aligned with all other arrays; datasets without a row get nan
        row = meta_rows.get(fname[:-20])
        if row is None:
            data_meta.append([np.nan, np.nan])
        else:
            data_meta.append([cint(row[5]), cint(row[6])])

    # Get the number of landmarks
    num_lms = data_masked[0]['lm_cx7'].shape[0]

    # Unfold the .npz structure
    data = {}
    for key in data_masked[0].files:
        data[key] = np.array([d[key] for d in data_masked])
    for key in data_memsub[0].files:
        data[key] = np.array([d[key] for d in data_memsub])

    # Unfold the intensity dicts
    for key in data_intens[0].keys():
        data['intensity_'+key] = np.array([d[key] for d in data_intens])

    # Unfold the metadata
    for i,key in enumerate(['meta_time','meta_nm']):
        data[key] = np.array([d[i] for d in data_meta])

    # Return results
    return np.array(data_labels), num_lms, data

# Run the loader on the configured data directory
data_labels, num_lms, data = load_data(fpath)

# Alphabetically sorted names of all loaded measurements
data_keys = sorted(data.keys())

# Drop conditions whose date suffix (last 8 characters) is excluded
labels = [label for label in labels if label[-8:] not in exclude_dates]

# Translate each dataset's condition label into its experiment set
data_experiments = np.array([experiment_dict[label] for label in data_labels])

# Report (single-argument print calls produce the same text as the
# equivalent Python 2 print statements)
print(" ".join(["\nLoaded", str(len(data_labels)),
                "datasets with", str(num_lms), "landmarks per set."]))

print("\nSamples per condition:")
for label in labels:
    print("  {0:16}{1:>4}".format(label, np.sum(data_labels==label)))

print("\nSamples per experiment:")
for experiment in experiments:
    print("  {0:2}{1:>4}".format(experiment, np.sum(data_experiments==experiment)))

print("\nAvailable measurements are:")
for key in data_keys:
    print("  {0:<21}{1}".format(key, data[key].shape))

### Point Cloud Distribution Analysis: Apical Distance Histogram

In [None]:
def apical_distance_hist(d, label, min_sampling, color_dict, bins=25,
                         alpha=0.5, xlabel='', ylabel=''):
    """Draw a filled + outlined histogram of the values in `d` on the current axes.

    Before plotting, `d` is randomly subsampled (without replacement) to
    `num_lms` times the size of the smallest group in `min_sampling`, so that
    histograms of different groups are based on the same number of values.
    `num_lms` is read from module scope.

    Parameters
    ----------
    d : 1D numpy array
        Values to histogram.
    label : str
        Legend label; also the key used to look up the color.
    min_sampling : array-like
        Group membership of all datasets; the smallest group count sets the
        subsample size.
    color_dict : dict
        Maps `label` to a matplotlib color.
    bins : int
        Number of bin *edges* between 0 and max(d), i.e. the histogram has
        `bins - 1` bins.
    alpha : float
        Opacity of the filled histogram.
    xlabel, ylabel : str
        Axis labels; applied unconditionally, so the defaults clear any
        existing labels.
    """

    # Nothing to draw for a group without data (np.max and the subsampling
    # below would both raise on an empty array)
    if d.shape[0] == 0:
        return

    # Subsample to get everything to the same number of LMs; never ask for
    # more values than are available, since sampling without replacement
    # would raise a ValueError
    min_sample_nr = num_lms * np.min(np.unique(min_sampling, return_counts=True)[1])
    sample_size   = int(min(min_sample_nr, d.shape[0]))
    d = d[np.random.choice(np.arange(d.shape[0]),
                           size=sample_size, replace=False)]

    # Get color
    c = color_dict[label]

    # Bin edges shared by the filled histogram and its outline
    edges = np.linspace(0, np.max(d), bins)

    # Make hist
    plt.hist(d, bins=edges,
             histtype='stepfilled', color=c, edgecolor=None, alpha=alpha,
             label=label)
    plt.hist(d, bins=edges,
             histtype='step', color=c, alpha=1)

    # Labeling
    plt.legend(fontsize=10)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

In [None]:
### Interactive Apical Distance Histogram

# Initiate
@interact(val=['memsub_lum_dist_cx7'] + [key for key in data_keys
                                         if 'lum_dist' in key
                                         and key != 'memsub_lum_dist_cx7'],
          show_all=False)
#@savebutton
def interactive_hist(val='memsub_lum_dist_cx7',
                     show_all=False):
    """Interactive histogram of the measurement `val`.

    With `show_all` unset, one histogram per experiment set is drawn (all
    conditions of a set pooled); otherwise one histogram per condition label.
    """

    # One wide figure that all histograms are overlaid on
    plt.figure(figsize=(12,4))

    if show_all:
        # Individual plots: one histogram for every condition label
        for label in labels:
            values = data[val][data_labels==label].flatten()
            apical_distance_hist(values, label,
                                 data_labels, color_dict,
                                 alpha=0.1)
    else:
        # Summarized plots: pool all conditions belonging to an experiment set
        for experiment in experiments:
            members = [label for label in labels
                       if experiment_dict[label]==experiment]
            in_set = np.in1d(data_labels, members)
            values = data[val][in_set].flatten()
            apical_distance_hist(values, experiment,
                                 data_experiments, experiment_color_dict,
                                 alpha=0.2)

    # Titles and axis labels for the whole figure (raw string keeps the
    # backslash of the LaTeX command literal)
    plt.title("Apical Distance Histogram ["+val+"]")
    plt.xlabel(r"Distance from Lumen $[\mu m]$")
    plt.ylabel("Number of Landmarks")

### Point Cloud Distribution Analysis: Registered Overlays (2D)

In [None]:
def registered_overlay_scatter(d, label, fig, ax, min_sampling, ylbl='y', clbl='z',
                               xlim=(-50,50), ylim=(-20,20), vlim=(-0.5,1.5)):
    """Scatter a randomly subsampled point cloud onto `ax`, with a colorbar.

    Column 2 of `d` is plotted on the horizontal axis, column 1 on the
    vertical axis and column 0 is encoded as color. The rows of `d` are
    subsampled (without replacement) to `num_lms` times the size of the
    smallest group in `min_sampling`; `num_lms` is read from module scope.

    Parameters
    ----------
    d : 2D numpy array with at least 3 columns
        Points to plot, one per row.
    label : str
        Legend label of the scatter.
    fig, ax : matplotlib Figure and Axes
        Figure that receives the colorbar and axes that are drawn on.
    min_sampling : array-like
        Group membership of all datasets; the smallest group count sets the
        subsample size.
    ylbl, clbl : str
        Labels of the vertical axis and of the colorbar.
    xlim, ylim : pair of numbers
        Axis limits.
    vlim : pair of numbers
        Lower and upper end of the color scale.
    """

    # Subsample to get everything to the same number of LMs; never ask for
    # more rows than are available, since sampling without replacement
    # would raise a ValueError
    min_sample_nr = num_lms * np.min(np.unique(min_sampling, return_counts=True)[1])
    sample_size   = int(min(min_sample_nr, d.shape[0]))
    d = d[np.random.choice(np.arange(d.shape[0]),
                           size=sample_size, replace=False)]

    # Make scatter plot ('none' is the documented way to suppress marker
    # edges; an empty string is not a valid color specification)
    scat = ax.scatter(d[:,2], d[:,1], label=label,
                      c=d[:,0], cmap='inferno', vmin=vlim[0], vmax=vlim[1],
                      s=5, edgecolor='none', alpha=0.5)

    # Colorbar
    cbar = fig.colorbar(scat, ax=ax)
    cbar.set_label(clbl, rotation=270)

    # Set limits
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # Label
    ax.legend()
    ax.set_ylabel(ylbl)

In [None]:
### Interactive Landmark Overlays: XY

# Initiate
@interact(val=['lm_memsub_cx7_tf'] + [key for key in data_keys
                                      if 'tf' in key
                                      and 'lum_dist' not in key
                                      and key != 'lm_memsub_cx7_tf'])
#@savebutton
def interactive_hist(val='lm_memsub_cx7_tf'):
    """Interactive x/y overlay of the landmark set `val`, one panel per experiment set.

    Within each panel, y is shown on the vertical axis and z as color.
    """

    # One row of axes per experiment set, all sharing both axes
    n_rows = len(experiments)
    fig, ax = plt.subplots(n_rows, figsize=(10,3*n_rows),
                           sharex=True, sharey=True)

    # Pool the point clouds of all conditions in a set and plot them
    for experiment, axis in zip(experiments, ax):
        members = [label for label in labels
                   if experiment_dict[label]==experiment]
        in_set = np.in1d(data_labels, members)
        points = np.concatenate(data[val][in_set])
        registered_overlay_scatter(points, experiment, fig, axis, data_experiments,
                                   ylbl='y', clbl='z',
                                   xlim=[-50,50], ylim=[-20,20], vlim=[-0.5,1.5])

    # Figure-wide labels and layout
    plt.xlabel('x')
    plt.suptitle('Registered Landmark Distributions (x,y) of '+val, fontsize=14)
    plt.tight_layout()
    plt.subplots_adjust(top=0.92)

In [None]:
### Interactive Landmark Overlays: XZ

# Initiate
@interact(val=['lm_memsub_cx7_tf'] + [key for key in data_keys
                                      if 'tf' in key
                                      and 'lum_dist' not in key
                                      and key != 'lm_memsub_cx7_tf'])
#@savebutton
def interactive_hist(val='lm_memsub_cx7_tf'):
    """Interactive x/z overlay of the landmark set `val`, one panel per experiment set.

    Within each panel, z is shown on the vertical axis and y as color.
    """

    # One row of axes per experiment set, all sharing both axes
    n_rows = len(experiments)
    fig, ax = plt.subplots(n_rows, figsize=(10,3*n_rows),
                           sharex=True, sharey=True)

    # Pool the point clouds of all conditions in a set and plot them
    for experiment, axis in zip(experiments, ax):
        members = [label for label in labels
                   if experiment_dict[label]==experiment]
        in_set = np.in1d(data_labels, members)
        points = np.concatenate(data[val][in_set])
        # Swap the first two columns so that the plotting helper puts the
        # former color dimension on the vertical axis and vice versa
        points = points[:,[1,0,2]]
        registered_overlay_scatter(points, experiment, fig, axis, data_experiments,
                                   ylbl='z', clbl='y',
                                   xlim=[-50,50], ylim=[-0.5,1.5], vlim=[-20,20])

    # Figure-wide labels and layout
    plt.xlabel('x')
    plt.suptitle('Registered Landmark Distributions (x,z) of '+val, fontsize=14)
    plt.tight_layout()
    plt.subplots_adjust(top=0.92)

### Intensity Analysis: Absolute Intensity Plots

In [None]:
def intensity_absolute_boxplot(d, val, is_normed=False, pool=False):
    """Boxplot with jittered data points of per-condition intensities.

    Also prints the two-sided Mann-Whitney U p-value of every group against
    its control.

    Parameters
    ----------
    d : list of 1D numpy arrays
        One array of values per entry of the module-level `labels`, in the
        same order.
    val : str
        Name of the plotted measurement; its first 10 characters are dropped
        for the plot title.
    is_normed : bool
        Only affects title and axis label (values are plotted as given).
    pool : bool
        If set, the arrays of all conditions belonging to the same experiment
        set are concatenated and the experiment-level colors and controls
        are used.
    """

    # Pool by experiments
    if pool:
        pooled = []
        for exp in experiments:
            members = [dset for dset, lbl in zip(d, labels)
                       if experiment_dict[lbl] == exp]
            # np.concatenate raises on an empty list, so an experiment set
            # without any condition becomes an empty group instead
            pooled.append(np.concatenate(members) if members else np.array([]))
        d = pooled
        lbls  = experiments
        cdict = experiment_color_dict
        ctrldict = experiment_control_dict
    else:
        lbls  = labels
        cdict = color_dict
        ctrldict = control_dict

    # Compute and print p-values (the test needs variation in both groups)
    print("\np-values:")
    for i, label in enumerate(lbls):
        ctrl = d[lbls.index(ctrldict[label])]
        if (len(d[i]) > 0 and len(ctrl) > 0
                and np.std(d[i]) > 0 and np.std(ctrl) > 0):
            p = mannwhitneyu(d[i], ctrl, alternative='two-sided')[1]
        else:
            p = 'NOT ENOUGH DATA'
        print(" ".join([label, '\t', str(p)]))

    # Prep
    fig, ax = plt.subplots(1, figsize=(1.5*len(lbls),4))

    # Create boxplot
    bp = ax.boxplot(d, showfliers=False)

    # Boxplot cosmetics
    for item in ['boxes', 'whiskers']:
        plt.setp(bp[item], color='k')
    ax.set_xticklabels([lbl+" (N="+str(len(d[idx]))+")"
                        for idx,lbl in enumerate(lbls)])
    fig.autofmt_xdate()
    if is_normed:
        ax.set_title("Ctrl-Normed Mean Intensity: "+val[10:])
    else:
        ax.set_title("Mean Intensity: "+val[10:])

    # Add jittered data ('none' is the documented way to suppress marker
    # edges; an empty string is not a valid color specification)
    for i,label in enumerate(lbls):
        y = d[i]                                      # Values
        x = np.random.normal(i+1, 0.04, size=len(y))  # Jitter
        ax.scatter(x, y, c=cdict[label],
                   alpha=0.7, edgecolor='none', s=30)

    # Axis cosmetics: pad both limits by 10% of their magnitude; using the
    # absolute value makes the padding widen the range for negative limits
    # too (identical to the plain product for positive limits)
    y_lo, y_hi = ax.get_ylim()
    ax.set_ylim([y_lo - abs(y_lo) * 0.1,
                 y_hi + abs(y_hi) * 0.1])
    if is_normed:
        ax.set_ylabel("Ctrl-Normed Mean Intensity")
    else:
        ax.set_ylabel("Mean Intensity")

In [None]:
### Interactive Absolute Intensity Boxplot

# Initiate
@interact(val=['intensity_all_cx7'] + [key for key in data_keys
                                       if 'intensity' in key
                                       and key != 'intensity_all_cx7'],
          ctrl_norm=True, pool=True)
#@savebutton
def interactive_rbp(val='intensity_all_cx7',
                    ctrl_norm=True, pool=True):
    """Interactive boxplot of the intensity measurement `val`.

    With `ctrl_norm` set, the values of every condition are divided by the
    mean of its matched control before plotting. `pool` is forwarded to the
    plotting function.
    """

    # Split the measurement into one array per condition label
    per_label = [data[val][data_labels==label] for label in labels]

    # Divide every condition by the mean of its respective control
    if ctrl_norm:
        normed = []
        for label, values in zip(labels, per_label):
            ctrl_values = per_label[labels.index(control_dict[label])]
            normed.append(values / np.mean(ctrl_values))
        per_label = normed

    # Hand over to the plotting function
    intensity_absolute_boxplot(per_label, val, is_normed=ctrl_norm, pool=pool)