# Analysis of Colocalization

### Notes

- **Experiment & Data**
    - Scope: 880 Airy-SR
    - Structure: neuromasts
    - Genetic background: wild-type
    - Perturbation: Heat-shock expression of Sdf1a
    - Markers: 
        - BAC(Cxcr7) expressing GFP-labeled WT Cxcr7 or Cxcr4
        - Red or far-red vesicle markers
        

- **Note:** I originally thought `abcam` was short for some sort of anti-ubiquitin antibody, but it is actually just the company from which Mie ordered said antibody. Because it would be a pain to go back and change the naming now, just note that throughout this analysis the term `abcam` refers to **(anti-)ubiquitin**.

### Prep

In [None]:
### Imports

# Generic
from __future__ import division
import os, sys
import numpy as np
np.random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline

# Specific
import re, pickle
from ipywidgets import interact
from util.widget_helpers import savebutton
from scipy.stats import mannwhitneyu

In [None]:
### Parameters

# Only files whose name ends with this suffix are considered for loading
fname_suffix = '_8bit_maskcoloc.pkl'

# Directory that is scanned for the input files
fpath = 'data_full'

# Experiments to load; the order given here is the order used for
# sorting the loaded datasets and for plotting
labels = [
    'COLOClamp1CXCR4WT_20180427',
    'COLOClamp1CXCR4HS_20180427',
    'COLOClamp1WT_20180214',
    'COLOClamp1HS_20180214',
    'COLOClamp1WT_20180215',
    'COLOClamp1HS_20180215',
    'COLOClamp1WT_20180520',
    'COLOClamp1HS_20180520',
    'COLOClamp1WT_20180521',
    'COLOClamp1HS_20180521',
    'revCOLOClamp1WT_20191013',
    'revCOLOClamp1KA_20191013',
    'revCOLOCabcamWT_20191027',
    'revCOLOCabcamHS90min_20191027',
    'revCOLOCabcamHS210min_20191027',
    'revCOLOCabcamWT_20191103',
    'revCOLOCabcamHS90min_20191103',
    'revCOLOCabcamHS210min_20191103',
]

# Plot color for each individual experiment
color_dict = {
    'COLOClamp1WT_20180214': 'springgreen',
    'COLOClamp1HS_20180214': 'tomato',
    'COLOClamp1WT_20180215': 'springgreen',
    'COLOClamp1HS_20180215': 'tomato',
    'COLOClamp1WT_20180520': 'springgreen',
    'COLOClamp1HS_20180520': 'tomato',
    'COLOClamp1WT_20180521': 'springgreen',
    'COLOClamp1HS_20180521': 'tomato',
    'COLOClamp1CXCR4WT_20180427': 'greenyellow',
    'COLOClamp1CXCR4HS_20180427': 'orangered',
    'revCOLOClamp1WT_20191013': 'springgreen',
    'revCOLOClamp1KA_20191013': 'cyan',
    'revCOLOCabcamWT_20191027': 'darkgreen',
    'revCOLOCabcamHS90min_20191027': 'darkred',
    'revCOLOCabcamHS210min_20191027': 'darkred',
    'revCOLOCabcamWT_20191103': 'darkgreen',
    'revCOLOCabcamHS90min_20191103': 'darkred',
    'revCOLOCabcamHS210min_20191103': 'darkred',
}

# Experiment sets (pooled groups); the order given here is the plotting
# order when experiments are pooled
experiments = [
    'lamp1_CXCR4_WT',
    'lamp1_CXCR4_HS',
    'lamp1_WT',
    'lamp1_HS',
    'lamp1_WT_rev',
    'lamp1_KA_rev',
    'abcam_WT_rev',
    'abcam_HS90min_rev',
    'abcam_HS210min_rev',
]

# Experiment set that each individual experiment belongs to
experiment_dict = {
    'COLOClamp1WT_20180214': 'lamp1_WT',
    'COLOClamp1HS_20180214': 'lamp1_HS',
    'COLOClamp1WT_20180215': 'lamp1_WT',
    'COLOClamp1HS_20180215': 'lamp1_HS',
    'COLOClamp1WT_20180520': 'lamp1_WT',
    'COLOClamp1HS_20180520': 'lamp1_HS',
    'COLOClamp1WT_20180521': 'lamp1_WT',
    'COLOClamp1HS_20180521': 'lamp1_HS',
    'COLOClamp1CXCR4WT_20180427': 'lamp1_CXCR4_WT',
    'COLOClamp1CXCR4HS_20180427': 'lamp1_CXCR4_HS',
    'revCOLOClamp1WT_20191013': 'lamp1_WT_rev',
    'revCOLOClamp1KA_20191013': 'lamp1_KA_rev',
    'revCOLOCabcamWT_20191027': 'abcam_WT_rev',
    'revCOLOCabcamHS90min_20191027': 'abcam_HS90min_rev',
    'revCOLOCabcamHS210min_20191027': 'abcam_HS210min_rev',
    'revCOLOCabcamWT_20191103': 'abcam_WT_rev',
    'revCOLOCabcamHS90min_20191103': 'abcam_HS90min_rev',
    'revCOLOCabcamHS210min_20191103': 'abcam_HS210min_rev',
}

# Plot color for each experiment set
experiment_color_dict = {
    'lamp1_WT': 'springgreen',
    'lamp1_HS': 'tomato',
    'lamp1_CXCR4_WT': 'greenyellow',
    'lamp1_CXCR4_HS': 'orangered',
    'lamp1_WT_rev': 'springgreen',
    'lamp1_KA_rev': 'cyan',
    'abcam_WT_rev': 'darkgreen',
    'abcam_HS90min_rev': 'darkred',
    'abcam_HS210min_rev': 'darkred',
}

# Control set for each experiment set (a control set maps to itself)
experiment_control_dict = {
    'lamp1_WT': 'lamp1_WT',
    'lamp1_HS': 'lamp1_WT',
    'lamp1_CXCR4_WT': 'lamp1_CXCR4_WT',
    'lamp1_CXCR4_HS': 'lamp1_CXCR4_WT',
    'lamp1_WT_rev': 'lamp1_WT_rev',
    'lamp1_KA_rev': 'lamp1_WT_rev',
    'abcam_WT_rev': 'abcam_WT_rev',
    'abcam_HS90min_rev': 'abcam_WT_rev',
    'abcam_HS210min_rev': 'abcam_WT_rev',
}

# Control experiment for each individual experiment (a control maps to
# itself; every experiment is paired with the control of the same date)
control_dict = {
    'COLOClamp1WT_20180214': 'COLOClamp1WT_20180214',
    'COLOClamp1HS_20180214': 'COLOClamp1WT_20180214',
    'COLOClamp1WT_20180215': 'COLOClamp1WT_20180215',
    'COLOClamp1HS_20180215': 'COLOClamp1WT_20180215',
    'COLOClamp1WT_20180520': 'COLOClamp1WT_20180520',
    'COLOClamp1HS_20180520': 'COLOClamp1WT_20180520',
    'COLOClamp1WT_20180521': 'COLOClamp1WT_20180521',
    'COLOClamp1HS_20180521': 'COLOClamp1WT_20180521',
    'COLOClamp1CXCR4WT_20180427': 'COLOClamp1CXCR4WT_20180427',
    'COLOClamp1CXCR4HS_20180427': 'COLOClamp1CXCR4WT_20180427',
    'revCOLOClamp1WT_20191013': 'revCOLOClamp1WT_20191013',
    'revCOLOClamp1KA_20191013': 'revCOLOClamp1WT_20191013',
    'revCOLOCabcamWT_20191027': 'revCOLOCabcamWT_20191027',
    'revCOLOCabcamHS90min_20191027': 'revCOLOCabcamWT_20191027',
    'revCOLOCabcamHS210min_20191027': 'revCOLOCabcamWT_20191027',
    'revCOLOCabcamWT_20191103': 'revCOLOCabcamWT_20191103',
    'revCOLOCabcamHS90min_20191103': 'revCOLOCabcamWT_20191103',
    'revCOLOCabcamHS210min_20191103': 'revCOLOCabcamWT_20191103',
}

In [None]:
### Data Loading

# Function packaging to keep global namespace clean
def load_data(fpath, suffix=None, keep_labels=None):
    """Load the pickled colocalization measurements found in a directory.

    A file is considered if its name ends with `suffix`. Its experiment
    label is the text between parentheses in the file name; files without
    such a label, or with a label not listed in `keep_labels`, are skipped.
    The loaded datasets are ordered like `keep_labels` (ties are resolved
    by file name).

    Parameters
    ----------
    fpath : str
        Directory to scan for input files.
    suffix : str, optional
        Required file name ending. Defaults to the notebook-level
        `fname_suffix`.
    keep_labels : list of str, optional
        Ordered experiment labels to load. Defaults to the notebook-level
        `labels`.

    Returns
    -------
    data_labels : np.ndarray
        Experiment label of each loaded file.
    data : dict
        One array per measurement key (taken from the first loaded file,
        excluding 't_mask'), stacking that measurement across all files in
        the same order as `data_labels`.

    Raises
    ------
    IOError
        If no file in `fpath` matches both the suffix and a wanted label.
    """

    # Fall back on the notebook-level parameters
    if suffix is None:
        suffix = fname_suffix
    if keep_labels is None:
        keep_labels = labels
    keep_labels = list(keep_labels)

    # Get filenames
    fnames = [fname for fname in os.listdir(fpath) if fname.endswith(suffix)]
    for fname in fnames:
        print(fname)

    # Extract the experiment label of each file and keep only the desired
    # experiments. File name, label and sort index travel together in one
    # record so that filtering and sorting can never get out of step.
    label_pattern = re.compile(r'\(([A-Za-z0-9_]+)\)')
    records = []
    for fname in fnames:
        match = label_pattern.search(fname)
        if match is None:
            continue  # File name carries no '(label)' part
        label = match.group(1)
        if label in keep_labels:
            records.append((keep_labels.index(label), fname, label))

    # Sort according to experiment list (then by file name)
    records.sort()

    if not records:
        raise IOError("No files ending in %r with a requested label found in %r"
                      % (suffix, fpath))

    # Load the data
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    data_coloc = []
    for _, fname, _ in records:
        with open(os.path.join(fpath, fname), "rb") as infile:
            data_coloc.append(pickle.load(infile))

    # Unfold into one array per measurement
    data = {}
    for key in data_coloc[0].keys():
        if key != 't_mask':
            data[key] = np.array([d[key] for d in data_coloc])

    # Return results
    data_labels = [label for _, _, label in records]
    return np.array(data_labels), data
    
# Load everything from the configured data directory
data_labels, data = load_data(fpath)

# Names of the available measurements, alphabetically
data_keys = sorted(data)

# Experiment set of every loaded dataset
data_experiments = np.array([experiment_dict[label] for label in data_labels])

# Summary of what was loaded
print("\nLoaded %s datasets." % len(data_labels))

print("\nSamples per condition:")
for label in labels:
    print("  {0:16}{1:>4}".format(label, np.sum(data_labels == label)))

print("\nSamples per experiment:")
for experiment in experiments:
    print("  {0:2}{1:>4}".format(experiment, np.sum(data_experiments == experiment)))

print("\nAvailable measurements are:")
for key in data_keys:
    print("  {0:<21}{1}".format(key, data[key].shape))

### Comparative Boxplot

In [None]:
### Plot generation function

def nice_boxplot(ax, data_list, colors):
    """Draw a styled boxplot with the individual data points overlaid.

    ax        : axes object to draw on
    data_list : sequence of samples, one box per entry
    colors    : face colors for the boxes, matched to `data_list` by position
    """

    # Boxplot without outlier markers (the raw points are drawn below)
    bp = ax.boxplot(data_list, widths=0.6,
                    patch_artist=True, showfliers=False)

    # Dark grey lines for whiskers, caps and medians
    plt.setp(bp['whiskers'], color='0.25', linewidth=1.2, linestyle='-')
    for part in ('caps', 'medians'):
        plt.setp(bp[part], color='0.25', linewidth=1.2)

    # Filled boxes without outline
    for box, face in zip(bp['boxes'], colors):
        box.set_facecolor(face)
        box.set_edgecolor('none')

    # Overlay the samples, horizontally jittered around each box position
    for position, values in enumerate(data_list, 1):
        jitter_x = np.random.normal(position, 0.06, size=len(values))
        ax.plot(jitter_x, values, '.', color='k', markeredgewidth=0.0, alpha=0.5, zorder=10)

    # Pad the y axis by 2% of its current range on both ends
    y_low, y_high = ax.get_ylim()
    ax.set_ylim([y_low - (y_high - y_low) * 0.02,
                 y_high + (y_high - y_low) * 0.02])

In [None]:
### Interactive boxplot

# Print button stuff
# Widget and display helpers imported for the interactive cell.
from ipywidgets import Button, Text, HBox, fixed
from IPython.display import display, clear_output
# NOTE(review): `button_shown` is not read anywhere in the visible code;
# presumably it is state for the currently disabled `@savebutton`
# decorator -- confirm before removing.
button_shown = False

# Interactive display
@interact(measure=['global_otsu_mean_ratio']+[v for v in sorted(data.keys())
                                              if not v=='global_otsu_mean_ratio'], 
          normalize=True, logFC=True, pool=True)
#@savebutton
def interactive_boxplot(measure='global_otsu_mean_ratio',
                        normalize=True, logFC=True, pool=True):
    
    # Prep data
    plot_data   = []
    plot_colors = []
    plot_labels = labels[:]
    for label in labels:
        if normalize:
            plot_data.append(data[measure][data_labels==label] / np.mean(data[measure][data_labels==control_dict[label]]))
            if logFC:
                plot_data[-1] = np.log(plot_data[-1])
        else:
            plot_data.append(data[measure][data_labels==label])
        plot_colors.append(color_dict[label])

    # Pool datalabels
    if pool:
    
        # Create dict of pooled data
        plot_data_dict = {}
        for li, label in enumerate(labels):
            if not experiment_dict[label] in plot_data_dict.keys():
                plot_data_dict[experiment_dict[label]] = []
            plot_data_dict[experiment_dict[label]].append(plot_data[li])
        
        # Create data, color and label lists of pooled data
        plot_data = []
        plot_colors = []
        plot_labels = []
        for exp_lbl in experiments:
            if exp_lbl in plot_data_dict.keys():
                plot_data.append(np.concatenate(plot_data_dict[exp_lbl]))
                plot_colors.append(experiment_color_dict[exp_lbl])
                plot_labels.append(exp_lbl)   
        
    # Prep figure
    fig, ax = plt.subplots(1, 1, figsize=(10,4))
    
    # Compute and print p-values
    plot_control_dict = control_dict if not pool else experiment_control_dict
    ctrl_idx = [plot_labels.index(plot_control_dict[label]) for label in plot_labels]
    p = [mannwhitneyu(plot_data[i], plot_data[ctrl_idx[i]], alternative='two-sided')[1] 
         if ((np.std(plot_data[i])>0) and (np.std(plot_data[ctrl_idx[i]])>0)) else 'NOT ENOUGH DATA'
         for i,label in enumerate(plot_labels)]
    for i, label in enumerate(plot_labels):
        print label, '\t', p[i]
    
    # Create plot
    nice_boxplot(ax, plot_data, plot_colors)
    ax.set_xticklabels(plot_labels, rotation=45, horizontalalignment='right')
    ax.set_ylabel(measure)