# Analysis of Different Cxcr7 Mutants

### Notes

- **Experiment & Data**
    - Scope: 880 Airy-SR
    - Structure: neuromasts
    - Genetic background: wild-type
    - Markers: 
        - BAC(Cxcr7) expressing different Cxcr7 mutants tagged with a green FP
        - Red membrane marker (Lyn:Ruby) as counterlabel

### Prep

In [None]:
### Imports

# Generic
from __future__ import division
import os, sys
import numpy as np
np.random.seed(42)
import scipy.ndimage as ndi
import matplotlib.pyplot as plt
%matplotlib inline

# Specific
import re, pickle
from ipywidgets import interact
from util.widget_helpers import savebutton
from scipy.stats import mannwhitneyu

In [None]:
### Settings

# Directory holding the input files (relative path)
fpath = 'data_full'

# Experiments to load, in the order they should be sorted and reported
labels = [
    # WT_*
    'WT_20170507', 'WT_20170508', 'WT_20170509',
    'WT_20170902', 'WT_20170903',
    'WT_20180605', 'WT_20180606',

    # KA_*
    'KA_20170508', 'KA_20170507',

    # STA_*
    'STA_20180303', 'STA_20170501',

    # hs_*
    'hs_20170722', 'hs_20170723', 'hs_20170724',
    'hs_20170902', 'hs_20170903',
    'hs_20180605', 'hs_20180606',
]

# Dates (8-digit strings as they appear at the end of a label) to leave out
exclude_dates = []

# Dict of colors to represent the different experiments.
# Every label maps to the color of the experiment set it belongs to.
# Each key is listed exactly once: repeated keys in a dict literal are
# silently overwritten, which would hide copy/paste mistakes.
color_dict = {'WT_20170507'     : 'springgreen',
              'WT_20170508'     : 'springgreen',
              'WT_20170509'     : 'springgreen',
              'WT_20170902'     : 'springgreen',
              'WT_20170903'     : 'springgreen',
              'WT_20180605'     : 'springgreen',
              'WT_20180606'     : 'springgreen',
              'hsCTRL_20170722' : 'springgreen',

              'KA_20170508'     : 'lightskyblue',
              'KA_20170507'     : 'lightskyblue',
              'KAwt_20180302'   : 'lightskyblue',

              'STA_20180303'    : 'mediumorchid',
              'STA_20170501'    : 'mediumorchid',
              'STAwt_20180317'  : 'mediumorchid',
              'STAwt_20180318'  : 'mediumorchid',

              'hs_20170722'     : 'red',
              'hs_20170723'     : 'red',
              'hs_20170724'     : 'red',
              'hs_20170902'     : 'red',
              'hs_20170903'     : 'red',
              'hs_20180605'     : 'red',
              'hs_20180606'     : 'red'}

# Ordered list of experiment sets for grouping
experiments = ['WT', 'KA', 'STA', 'HS']

# Dict merging experiments into sets (label -> experiment set).
# Each key is listed exactly once: repeated keys in a dict literal are
# silently overwritten, which would hide copy/paste mistakes.
experiment_dict = {'WT_20170507'     : 'WT',
                   'WT_20170508'     : 'WT',
                   'WT_20170509'     : 'WT',
                   'WT_20170902'     : 'WT',
                   'WT_20170903'     : 'WT',
                   'WT_20180605'     : 'WT',
                   'WT_20180606'     : 'WT',
                   'hsCTRL_20170722' : 'WT',

                   'KA_20170508'     : 'KA',
                   'KA_20170507'     : 'KA',
                   'KAwt_20180302'   : 'KA',

                   'STA_20180303'    : 'STA',
                   'STA_20170501'    : 'STA',
                   'STAwt_20180317'  : 'STA',
                   'STAwt_20180318'  : 'STA',

                   'hs_20170722'     : 'HS',
                   'hs_20170723'     : 'HS',
                   'hs_20170724'     : 'HS',
                   'hs_20170902'     : 'HS',
                   'hs_20170903'     : 'HS',
                   'hs_20180605'     : 'HS',
                   'hs_20180606'     : 'HS'}

# One plotting color per experiment set
experiment_color_dict = dict(WT='springgreen',
                             KA='lightskyblue',
                             STA='mediumorchid',
                             HS='red')

In [None]:
### Data Loading

# Elaborate data loading & parsing function
def load_data(fpath):
    """Load all datasets in `fpath` that belong to the selected experiments.

    A dataset is identified by a file whose name ends in 'masked_LMs.npz' and
    contains a parenthesized label ending in an 8-digit date, e.g.
    '(WT_20170507)'. Only files whose label is in the module-level `labels`
    list and whose date is not in the module-level `exclude_dates` list are
    loaded. Files are ordered by the position of their label in `labels`
    (ties broken by file name).

    For every selected file the following companions are read from `fpath`:
      - the file itself (masked landmarks, must contain the key 'lm_cx7'),
      - the same name with the trailing '_LMs.npz' replaced by
        '_memsub_LMs.npz',
      - the same name with the trailing 'masked_LMs.npz' replaced by
        'measurements.pkl' (a pickled dict),
      - the row of 'metadata.txt' (tab-separated) whose first column equals
        the file name without its last 20 characters; columns 5 and 6 of
        that row become 'meta_time' and 'meta_nm'.

    Parameters
    ----------
    fpath : str
        Directory containing the data files and 'metadata.txt'.

    Returns
    -------
    data_labels : np.ndarray of str
        Label of each loaded dataset, in load order.
    num_lms : int
        Size of the first axis of 'lm_cx7' in the first dataset.
    data : dict
        One stacked array per measurement: all keys of the masked and memsub
        .npz files, 'intensity_<key>' for every key of the measurement dicts,
        plus 'meta_time' and 'meta_nm'. The first axis indexes datasets.

    Raises
    ------
    ValueError
        If no file in `fpath` matches the selected experiments.
    """

    # Patterns for the experiment label and the date embedded in file names
    label_pattern = re.compile(r'\(([A-Za-z0-9_]+)\)')
    date_pattern  = re.compile(r'_([0-9]{8})\)')

    # Select the desired files. Label and date of a file are evaluated
    # together, so the selection can never get out of step with the names.
    selected = []
    for fname in os.listdir(fpath):
        if not fname.endswith('masked_LMs.npz'):
            continue
        label_match = label_pattern.search(fname)
        date_match  = date_pattern.search(fname)
        if label_match is None or date_match is None:
            continue  # Not a labeled dataset; it could not be selected anyway
        label = label_match.group(1)
        date  = date_match.group(1)
        if label in labels and date not in exclude_dates:
            selected.append((labels.index(label), fname, label))

    if not selected:
        raise ValueError("No datasets matching the selected experiments "
                         "were found in " + repr(fpath))

    # Sort according to experiment list (then by file name)
    selected.sort()
    fnames      = [fname for index, fname, label in selected]
    data_labels = [label for index, fname, label in selected]

    # Metadata prep: load data
    with open(os.path.join(fpath, r"metadata.txt"), "r") as infile:
        metadata = [line.strip().split('\t') for line in infile]

    # Metadata prep: int conversion function (NaN marks a missing value)
    def cint(astr):
        try:
            return int(astr)
        except ValueError:
            return np.nan

    # Metadata prep: safe column access. `strip()` above also removes trailing
    # tabs, so rows with empty trailing fields have fewer columns.
    def meta_value(fields, idx):
        return cint(fields[idx]) if idx < len(fields) else np.nan

    # Reads a whole .npz archive into a dict and closes the file handle again
    def read_npz(path):
        with np.load(path) as npz:
            return dict((key, npz[key]) for key in npz.files)

    # Load the data
    data_masked = []
    data_memsub = []
    data_intens = []
    data_meta   = []
    for fname in fnames:

        # Load masked dataset
        data_masked.append(read_npz(os.path.join(fpath, fname)))

        # Load memsubbed dataset
        data_memsub.append(read_npz(os.path.join(fpath, fname[:-8]+"_memsub_LMs.npz")))

        # Load intensity values
        # NOTE: unpickling can execute arbitrary code; only use trusted files.
        with open(os.path.join(fpath, fname[:-14]+"measurements.pkl"),"rb") as infile:
            data_intens.append(pickle.load(infile))

        # Metadata: load corresponding values. Exactly one entry is stored per
        # dataset (first matching row, or NaNs if there is none) so that the
        # metadata arrays stay aligned with all other arrays.
        meta_key = fname[:-20]
        for fields in metadata:
            if fields[0] == meta_key:
                data_meta.append([meta_value(fields, 5), meta_value(fields, 6)])
                break
        else:
            data_meta.append([np.nan, np.nan])

    # Get the number of landmarks
    num_lms = data_masked[0]['lm_cx7'].shape[0]

    # Unfold the .npz structure (memsub keys win if a key exists in both)
    data = {}
    for key in data_masked[0]:
        data[key] = np.array([d[key] for d in data_masked])
    for key in data_memsub[0]:
        data[key] = np.array([d[key] for d in data_memsub])

    # Unfold the intensity dicts
    for key in data_intens[0].keys():
        data['intensity_'+key] = np.array([d[key] for d in data_intens])

    # Unfold the metadata
    for i,key in enumerate(['meta_time','meta_nm']):
        data[key] = np.array([d[i] for d in data_meta])

    # Return results
    return np.array(data_labels), num_lms, data

# Call data loading func
data_labels, num_lms, data = load_data(fpath)

# Get sorted data keys
data_keys = sorted(data.keys())

# Remove labels of excluded dates (the date is the last 8 chars of a label)
labels = [label for label in labels if label[-8:] not in exclude_dates]

# Get experiment labels
data_experiments = np.array([experiment_dict[label] for label in data_labels])

# Report
# Each message is built as ONE string and passed to print in parentheses, so
# the output is the same whether print is a statement or a function.
print("\nLoaded {0} datasets with {1} landmarks per set.".format(len(data_labels), num_lms))
print("\nSamples per condition:")
for label in labels:
    print("  {0:16}{1:>4}".format(label, np.sum(data_labels==label)))
print("\nSamples per experiment:")
for experiment in experiments:
    print("  {0:2}{1:>4}".format(experiment, np.sum(data_experiments==experiment)))
print("\nAvailable measurements are:")
for key in data_keys:
    print("  {0:<21}{1}".format(key, data[key].shape))

### Point Cloud Distribution Analysis: Apical Distance Histogram

In [None]:
def apical_distance_hist(d, label, bins=25, alpha=0.5,
                         xlabel='', ylabel=''):
    """Draw a filled histogram with a solid outline of `d` on the current axes.

    Before plotting, `d` is randomly subsampled (without replacement, using
    the global numpy RNG) to `num_lms` times the smallest number of datasets
    found for any experiment in `data_experiments`, so that all experiments
    are drawn with the same number of values.

    d      : array of values to histogram (indexed along its first axis)
    label  : key into `experiment_color_dict`; also used as legend entry
    bins   : number of bin EDGES, evenly spaced from 0 to the maximum of the
             subsampled values
    alpha  : opacity of the filled histogram
    xlabel : x axis label
    ylabel : y axis label
    """

    # Equalize the sample size across experiments
    _, datasets_per_experiment = np.unique(data_experiments, return_counts=True)
    n_keep = num_lms * np.min(datasets_per_experiment)
    picked = np.random.choice(np.arange(d.shape[0]),
                              size=n_keep, replace=False)
    d = d[picked]

    # Color of this experiment
    color = experiment_color_dict[label]

    # Both histogram layers share the same bin edges
    bin_edges = np.linspace(0, np.max(d), bins)

    # Semi-transparent fill (carries the legend entry)
    plt.hist(d, bins=bin_edges,
             histtype='stepfilled', color=color, edgecolor=None, alpha=alpha,
             label=label)

    # Opaque outline on top
    plt.hist(d, bins=bin_edges,
             histtype='step', color=color, alpha=1)

    # Legend and axis labels
    plt.legend()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

In [None]:
### Interactive Apical Distance Histogram

# Initiate
@interact(val=['memsub_lum_dist_cx7']+[v for v in data_keys if 'lum_dist' in v
                                       and not v=='memsub_lum_dist_cx7'],
          WT=True, KA=True, STA=True, HS=False)
#@savebutton
def interactive_hist(val='memsub_lum_dist_cx7',
                     WT=True, KA=True, STA=True, HS=False):
    """Overlay apical distance histograms of the ticked experiment sets.

    val             : key into `data` selecting the distance measurement;
                      the widget offers every key containing 'lum_dist',
                      with 'memsub_lum_dist_cx7' listed first
    WT, KA, STA, HS : whether to draw the experiment set of that name
                      (matched by position with `experiments`)
    """

    # Prep plot
    plt.figure(figsize=(12,4))

    # Make the plots
    for check, experiment in zip([WT,KA,STA,HS], experiments):
        if check:
            # Pool the values of all datasets of this experiment set
            data_to_plot = data[val][data_experiments==experiment].flatten()
            apical_distance_hist(data_to_plot, experiment)

    # Global plot cosmetics
    # (raw string: `\m` is not a valid escape sequence in a normal literal)
    plt.title("Apical Distance Histogram ["+val+"]")
    plt.xlabel(r"Distance from Lumen $[\mu m]$")
    plt.ylabel("Number of Landmarks")

### Point Cloud Distribution Analysis: Registered Overlays (2D)

In [None]:
def registered_overlay_scatter(d, label, fig, ax, ylbl='y', clbl='z',
                               xlim=(-50,50), ylim=(-20,20), vlim=(-0.5,1.5)):
    """Scatter-plot a point cloud on `ax`, color-coded by its first column.

    Before plotting, the rows of `d` are randomly subsampled (without
    replacement, using the global numpy RNG) to `num_lms` times the smallest
    number of datasets found for any experiment in `data_experiments`, so
    that all experiments are drawn with the same number of points.

    d     : 2D array; column 2 is drawn on the horizontal axis, column 1 on
            the vertical axis and column 0 is mapped to color
    label : legend entry
    fig   : figure that receives the colorbar
    ax    : axes to draw on
    ylbl  : label of the vertical axis
    clbl  : label of the colorbar
    xlim  : (min, max) of the horizontal axis
    ylim  : (min, max) of the vertical axis
    vlim  : (min, max) of the color scale
    """

    # Subsample to get everything to the same number of LMs
    min_sample_nr  = num_lms * np.min(np.unique(data_experiments, return_counts=True)[1])
    d = d[np.random.choice(np.arange(d.shape[0]),
                           size=min_sample_nr, replace=False)]

    # Make scatter plot
    # ('none' is the documented way to switch off marker edges)
    scat = ax.scatter(d[:,2], d[:,1], label=label,
                      c=d[:,0], cmap='inferno', vmin=vlim[0], vmax=vlim[1],
                      s=5, edgecolor='none', alpha=0.5)

    # Colorbar
    cbar = fig.colorbar(scat, ax=ax)
    cbar.set_label(clbl, rotation=270)

    # Set limits
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # Label
    ax.legend()
    ax.set_ylabel(ylbl)

In [None]:
### Interactive Landmark Overlays: XY

# Initiate
@interact(val=['lm_memsub_cx7_tf']+[v for v in data_keys
                                    if 'tf' in v and 'lum_dist' not in v
                                    and v != 'lm_memsub_cx7_tf'])
#@savebutton
def interactive_hist(val="lm_memsub_cx7_tf"):
    """Show one registered (x,y) landmark overlay per experiment set.

    val : key into `data` selecting the landmark set; the widget offers every
          key that contains 'tf' but not 'lum_dist', with 'lm_memsub_cx7_tf'
          listed first
    """

    # One row of axes per experiment set, all sharing both axes
    n_rows = len(experiments)
    fig, ax = plt.subplots(n_rows, figsize=(10, 3*n_rows),
                           sharex=True, sharey=True)

    # Draw every experiment set into its own row
    for axis, experiment in zip(ax, experiments):
        per_dataset = data[val][data_experiments==experiment]
        points = np.concatenate(per_dataset)   # Pool landmarks of all datasets
        registered_overlay_scatter(points, experiment, fig, axis,
                                   ylbl='y', clbl='z',
                                   xlim=[-50,50], ylim=[-20,20],
                                   vlim=[-0.5,1.5])

    # Figure-level cosmetics
    plt.xlabel('x')
    plt.suptitle('Registered Landmark Distributions (x,y) of '+val, fontsize=14)
    plt.tight_layout()
    plt.subplots_adjust(top=0.95)

In [None]:
### Interactive Landmark Overlays: XZ

# Initiate
@interact(val=["lm_memsub_cx7_tf"]+[v for v in data_keys
                                    if 'tf' in v and 'lum_dist' not in v
                                    and v != "lm_memsub_cx7_tf"])
@savebutton
def interactive_hist(val="lm_memsub_cx7_tf"):
    """Show one registered (x,z) landmark overlay per experiment set.

    val : key into `data` selecting the landmark set; the widget offers every
          key that contains 'tf' but not 'lum_dist', with 'lm_memsub_cx7_tf'
          listed first
    """

    # One row of axes per experiment set, all sharing both axes
    n_rows = len(experiments)
    fig, ax = plt.subplots(n_rows, figsize=(10, 3*n_rows),
                           sharex=True, sharey=True)

    # Draw every experiment set into its own row
    for axis, experiment in zip(ax, experiments):
        per_dataset = data[val][data_experiments==experiment]
        points = np.concatenate(per_dataset)   # Pool landmarks of all datasets
        points = points[:,[1,0,2]]             # Swap columns 0 and 1
        registered_overlay_scatter(points, experiment, fig, axis,
                                   ylbl='z', clbl='y',
                                   xlim=[-50,50], ylim=[-0.5,1.5],
                                   vlim=[-20,20])

    # Figure-level cosmetics
    plt.xlabel('x')
    plt.suptitle('Registered Landmark Distributions (x,z) of '+val, fontsize=14)
    plt.tight_layout()
    plt.subplots_adjust(top=0.95)

### Intensity Analysis: Absolute Intensity Plots

In [None]:
def intensity_absolute_boxplot(d, val, pool=False):
    """Boxplot of intensity values with the individual samples overlaid.

    d    : list with one array of values per entry of `labels`, in the same
           order as `labels`
    val  : name of the plotted measurement; its first 10 characters are
           dropped for the plot title
    pool : if True, the per-label arrays are concatenated into one array per
           experiment set (see `experiments` / `experiment_dict`) and the
           experiment set colors are used; otherwise one box is drawn per
           label
    """

    # Pool by experiments
    if pool:
        exp_d = []
        for exp in experiments:
            members = [dset for dset, lbl in zip(d, labels)
                       if experiment_dict[lbl] == exp]
            # An experiment set without any label yields an empty box
            # instead of making np.concatenate fail on an empty list
            exp_d.append(np.concatenate(members) if members else np.array([]))
        d = exp_d
        lbls  = experiments
        cdict = experiment_color_dict
    else:
        lbls  = labels
        cdict = color_dict

    # Prep (figure width scales with the number of boxes)
    fig, ax = plt.subplots(1, figsize=(len(lbls),4))

    # Create boxplot
    bp = ax.boxplot(d, showfliers=False, widths=0.7)

    # Boxplot cosmetics
    for item in ['boxes', 'whiskers']:
        plt.setp(bp[item], color='k', linestyle='-')
    ax.set_xticklabels([lbl+" (N="+str(len(d[idx]))+")"
                        for idx,lbl in enumerate(lbls)])
    plt.setp(bp['medians'], color='r', alpha=0.5, linewidth=1.2)
    fig.autofmt_xdate()
    ax.set_title("Mean Intensity: "+val[10:])

    # Add jittered data
    for i,label in enumerate(lbls):
        y = d[i]                                      # Values
        x = np.random.normal(i+1, 0.08, size=len(y))  # Jitter around box i+1
        # ('none' is the documented way to switch off marker edges)
        ax.scatter(x, y, c=cdict[label],
                   alpha=0.7, edgecolor='none', s=20)

    # Axis cosmetics: move each limit by 10% of its own value
    ymin, ymax = ax.get_ylim()
    ax.set_ylim([ymin - ymin * 0.1,
                 ymax + ymax * 0.1])
    ax.set_ylabel("Mean Intensity")

In [None]:
### Interactive Absolute Intensity Boxplot

# Initiate
# Upper bound of the time slider: largest available time value (NaNs ignored)
time_max = int(np.nanmax(data["meta_time"]))
@interact(val=['intensity_all_cx7']+[v for v in data_keys
                                     if 'intensity' in v
                                     and v != 'intensity_all_cx7'],
          time_thresh=(0,time_max), pool=False)
#@savebutton
def interactive_rbp(val='intensity_all_cx7',
                    time_thresh=0, pool=False):
    """Interactive boxplot of absolute intensities per label.

    val         : key into `data` selecting the intensity measurement; the
                  widget offers every key containing 'intensity', with
                  'intensity_all_cx7' listed first
    time_thresh : if non-zero, keep only samples whose 'meta_time' is greater
                  than this value or not available (NaN)
    pool        : passed on to `intensity_absolute_boxplot`; triggers a
                  warning because pooled data cannot be normalized here
    """

    # Values of the chosen measurement, one array per label
    per_label = [data[val][data_labels==label] for label in labels]

    # Pooling un-normalized data is questionable: tell the user
    if pool:
        import warnings
        warnings.warn("Pooling makes no sense without normalization "
                      "and normalization is not possible here!")

    # Apply the time filter; samples without time information are kept
    if time_thresh != 0:
        times = [data["meta_time"][data_labels==label] for label in labels]
        with np.errstate(invalid='ignore'):   # Comparisons against NaN
            per_label = [values[np.logical_or(stamps > time_thresh,
                                              np.isnan(stamps))]
                         for values, stamps in zip(per_label, times)]

    # Draw
    intensity_absolute_boxplot(per_label, val, pool=pool)