<a id=top></a>

# Analysis of Engineered Features

## Table of Contents

**Note:** In this notebook, the engineered features are referred to as "covariates".

----

1. [Preparations](#prep)
2. [Analysis of Covariates](#covar_analysis)
    1. [Boxplots](#covar_analysis_boxplots)
    2. [Forward Mapping (onto Shape Space)](#covar_analysis_fwdmap)
    3. [Back Mapping (Tissue Consensus Map)](#covar_analysis_backmap)
    4. [Covariate Correlations](#covar_analysis_correlations)
3. [Covariate-Shape Relationships](#covar_fspace)
    1. [Covariate-Shape Correlations](#covar_fspace_correlations)
    2. [Covariate Relation Graph](#covar_fspace_graph)

<a id=prep></a>

## 1. Preparations

----

In [None]:
### Import modules

# External, general
from __future__ import division
import os, sys
import numpy as np
np.random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline

# External, specific
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML
from scipy.stats import linregress, pearsonr, gaussian_kde
from scipy.spatial import cKDTree
import seaborn as sns
sns.set_style('white')
import networkx as nx

# Internal
import katachi.utilities.loading as ld
import katachi.utilities.plotting as kp

In [None]:
### Load data

# Prep loader
loader = ld.DataLoaderIDR()
loader.find_imports(r"data/experimentA/extracted_measurements/", recurse=True, verbose=True)

# Import embedded feature space
dataset_suffix = "shape_TFOR_pca_measured.tsv"
#dataset_suffix = "shape_CFOR_pca_measured.tsv"
#dataset_suffix = "tagRFPtUtrCH_TFOR_pca_measured.tsv"
#dataset_suffix = "mKate2GM130_TFOR_pca_measured.tsv"
fspace_pca, prim_IDs, fspace_idx = loader.load_dataset(dataset_suffix)
print "Imported feature space of shape:", fspace_pca.shape

# Import TFOR centroid locations
centroids = loader.load_dataset("_other_measurements.tsv", IDs=prim_IDs)[0][:,3:6][:,::-1]
print "Imported TFOR centroids of shape:", centroids.shape
    
# Import engineered features
covar_df, _, _ = loader.load_dataset("_other_measurements.tsv", IDs=prim_IDs, force_df=True)
del covar_df['Centroids RAW X']; del covar_df['Centroids RAW Y']; del covar_df['Centroids RAW Z']
covar_names = list(covar_df.columns)
print "Imported covariates of shape:", covar_df.shape

In [None]:
### Report
print "\ncovar_df.head()"
display(covar_df.head())
print "\ncovar_df.describe()"
display(covar_df.describe())

In [None]:
### Z-standardize the covariates

# Per column: subtract the column mean, then divide by the column std
covar_center = covar_df.mean()
covar_scale  = covar_df.std()
covar_df_z = (covar_df - covar_center) / covar_scale

<a id=covar_analysis></a>

## 2. Analysis of Covariates

----

### Boxplots <a id=covar_analysis_boxplots></a>

In [None]:
### General boxplot of Covariates

# Multi-select widget; all covariates are selected initially
wid = widgets.SelectMultiple(options=covar_names,
                             value=covar_names,
                             description='Covars')

# Interactive plot
@widgets.interact(selected=wid, standardized=True)
def covariate_boxplot(selected=covar_names,
                      standardized=True):
    """Draw one boxplot per selected covariate.

    selected     : iterable of column names to include.
    standardized : if True, plot the z-standardized values (`covar_df_z`),
                   otherwise the raw values (`covar_df`).
    """

    # Choose data source and matching title in one go
    if standardized:
        source_df  = covar_df_z
        plot_title = "Boxplot of Covariates [standardized]"
    else:
        source_df  = covar_df
        plot_title = "Boxplot of Covariates [raw]"
    covar_df_plot = source_df[list(selected)]

    # Plot
    fig = plt.figure(figsize=(12,3))
    covar_df_plot.boxplot(grid=False)
    plt.tick_params(axis='both', which='major', labelsize=6)
    fig.autofmt_xdate()  # Rotates the x tick labels
    plt.title(plot_title)
    plt.show()

### Forward Mapping (onto Shape Space) <a id=covar_analysis_fwdmap></a>

In [None]:
### Interactive mapping of covariates onto PCA-transformed shape space

# Set interactions
@widgets.interact(covariate=covar_names,
                  prim_ID=prim_IDs,
                  PCx=(1, fspace_pca.shape[1], 1),
                  PCy=(1, fspace_pca.shape[1], 1),
                  standardized=False,
                  show_all_prims=True)
def show_PCs(covariate=covar_names[0], prim_ID=prim_IDs[0], 
             PCx=1, PCy=2, standardized=False, show_all_prims=True): 
    """Scatter two PCs of the feature space, colored by a covariate.

    covariate      : column of `covar_df` / `covar_df_z` used for coloring.
    prim_ID        : prim to show; only used if `show_all_prims` is False.
    PCx, PCy       : 1-based indices of the PCs on the x and y axis.
    standardized   : color by z-standardized instead of raw values.
    show_all_prims : show the cells of all prims, or only those of `prim_ID`.
    """

    # Select covariate data
    covar_df_plot = covar_df_z[covariate] if standardized else covar_df[covariate]

    # Prep
    plt.figure(figsize=(9,7))
    x_col, y_col = PCx-1, PCy-1  # PCs are 1-based in the UI

    if show_all_prims:
        # All cells; the color range follows the plotted data
        plt.scatter(fspace_pca[:,x_col], fspace_pca[:,y_col],
                    c=covar_df_plot, cmap=plt.cm.plasma,
                    s=10, edgecolor='', alpha=0.75)
    else:
        # Only cells of the selected prim; the color range is pinned to the
        # full data set so different prims remain comparable
        in_prim = fspace_idx==prim_IDs.index(prim_ID)
        plt.scatter(fspace_pca[in_prim, x_col], 
                    fspace_pca[in_prim, y_col],
                    c=covar_df_plot[in_prim], 
                    cmap=plt.cm.plasma, s=10, edgecolor='',
                    vmin=covar_df_plot.min(), vmax=covar_df_plot.max())

    # Cosmetics (shared by both modes)
    cbar = plt.colorbar()
    if standardized:
        cbar.set_label(covariate+" [standardized]", rotation=270, labelpad=15)
    else:
        cbar.set_label(covariate+" [raw]", rotation=270, labelpad=15)
    plt.xlabel("PC "+str(PCx))
    plt.ylabel("PC "+str(PCy))
    if show_all_prims:
        plt.title("PCA-Transformed Shape Space [All Prims]")
    else:
        plt.title("PCA-Transformed Shape Space [prim "+prim_ID+"]")
    plt.show()

### Back Mapping (Tissue Consensus Map) <a id=covar_analysis_backmap></a>

In [None]:
### Interactive mapping of covariates onto centroids in TFOR

# Axis range
xlim = (-175, 15)
ylim = (- 25, 25)

# Set interactions
@widgets.interact(covariate=covar_names,
                  standardized=['no','z'])
def centroid_backmap(covariate=covar_names[0],
                     standardized='no'):  
    """Scatter the centroids in the TFOR x/y plane, colored by one covariate.

    Parameters
    ----------
    covariate : str
        Column of `covar_df` / `covar_df_z` used for coloring.
    standardized : {'no', 'z'}
        'no' colors by the raw values, 'z' by the z-standardized values.

    Raises
    ------
    ValueError
        If `standardized` is neither 'no' nor 'z'.
    """

    # Select covariate data.
    # NOTE: `standardized` is a string, so it must be compared explicitly;
    #       a plain truth test is True for 'no' as well.
    if standardized=='no':
        covar_df_plot = covar_df[covariate]
        is_standardized = False
    elif standardized=='z':
        covar_df_plot = covar_df_z[covariate]
        is_standardized = True
    else:
        raise ValueError("standardized must be 'no' or 'z', got: "
                         + repr(standardized))

    # Init
    fig,ax = plt.subplots(1, figsize=(12,5))

    # Back-mapping plot
    # Points are drawn in random order (rather than sorted by value);
    # the original author found this to give the better picture.
    zord = np.arange(len(covar_df_plot)); np.random.shuffle(zord)
    # `.values` makes the reordering positional, matching `centroids[zord]`
    # regardless of the index of the DataFrame. The values go in through `c`,
    # which is the scatter parameter that is mapped through `cmap`.
    scat = ax.scatter(centroids[zord,2], centroids[zord,1],
                      c=covar_df_plot.values[zord], cmap=plt.cm.plasma,
                      edgecolor='', s=15, alpha=0.75)

    # Cosmetics
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.invert_yaxis()  # To match images
    ax.set_xlabel('TFOR x')
    ax.set_ylabel('TFOR y')
    cbar = plt.colorbar(scat,ax=ax)
    if is_standardized:
        ax.set_title('Centroid Back-Mapping of '+covariate+' [standardized]')
        cbar.set_label(covariate+' [standardized]', rotation=270, labelpad=10)
    else:
        ax.set_title('Centroid Back-Mapping of '+covariate+' [raw]')
        cbar.set_label(covariate+' [raw]', rotation=270, labelpad=20)

    # Done
    plt.tight_layout()
    plt.show()

In [None]:
### Contour plot backmapping plot for publication

# Set interactions
@widgets.interact(covariate=covar_names,
                  standardized=['no','z'])
def contour_backmap(covariate=covar_names[0],
                     standardized='no'):  
    """Filled contour plot of a smoothed covariate over the TFOR x/y plane.

    covariate    : column of `covar_df` / `covar_df_z` to map.
    standardized : 'no' for raw values, 'z' for z-standardized values.
    """

    # Tools for smoothing on scatter
    from scipy.spatial.distance import pdist, squareform
    from katachi.utilities.pcl_helpers import pcl_gaussian_smooth

    # Settings
    xlim = (-130, 8)
    ylim = ( -19, 19)

    # Select covariate data
    if standardized=='no':
        covar_df_plot = covar_df[covariate]
    elif standardized=='z':
        covar_df_plot = covar_df_z[covariate]

    # Cut off at prim contour outline:
    # estimate the 2D centroid density and keep only centroids whose density
    # lies more than 10% of the density range above the minimum
    coords_2d = centroids[:,1:]
    density = gaussian_kde(coords_2d.T)(coords_2d.T)
    density_cutoff = density.min() + (density.max()-density.min())*0.1
    inside_outline = density > density_cutoff
    plot_centroids = centroids[inside_outline]
    plot_values    = covar_df_plot[inside_outline]

    # Smoothen (based on the pairwise distances of the retained centroids)
    pairwise_dists = squareform(pdist(plot_centroids[:,1:]))
    plot_values = pcl_gaussian_smooth(pairwise_dists, plot_values[:,np.newaxis],
                                      sg_percentile=0.5)[:,0]

    # Initialize figure
    fig, ax = plt.subplots(1, figsize=(8, 3.25))

    # Contourf plot
    cfset = ax.tricontourf(plot_centroids[:,2], plot_centroids[:,1], plot_values, 20, 
                           cmap='plasma')

    # Illustrative centroids from a single prim (the first one in prim_IDs)
    first_prim = fspace_idx==prim_IDs.index(prim_IDs[0])
    plt.scatter(centroids[first_prim, 2], 
                centroids[first_prim, 1],
                c='', alpha=0.5)

    # Cosmetics
    ax.set_xlabel('TFOR x', fontsize=16)
    ax.set_ylabel('TFOR y', fontsize=16)
    plt.tick_params(axis='both', which='major', labelsize=13)
    plt.xlim(xlim)
    plt.ylim(ylim)
    ax.invert_yaxis()  # To match images

    # Colorbar
    cbar = plt.colorbar(cfset, ax=ax, pad=0.01)
    cbar.set_label(covariate, rotation=270, labelpad=10, fontsize=16)
    cbar.ax.tick_params(labelsize=13)

    # Done
    plt.tight_layout()
    plt.show()

### Covariate Correlations <a id=covar_analysis_correlations></a>

In [None]:
### Interactive linear fitting plot

# Set interaction
@widgets.interact(covar_x=covar_names, 
                  covar_y=covar_names)

# Plotting function
def corr_plot_covar(covar_x=covar_names[0], 
                    covar_y=covar_names[1]):

    # Prep
    plt.figure(figsize=(5,3))
    
    # Scatterplot
    plt.scatter(covar_df[covar_x], covar_df[covar_y],
                facecolor='darkblue', edgecolor='',
                s=5, alpha=0.5)
    plt.xlabel(covar_x)
    plt.ylabel(covar_y)
    
    # Linear regression and pearson
    fitted  = linregress(covar_df[covar_x], covar_df[covar_y])
    pearson = pearsonr(covar_df[covar_x], covar_df[covar_y])
    
    # Report
    print "Linear regression:"
    for param,value in zip(["slope","intercept","rvalue","pvalue","stderr"], fitted):
        print "  {}:\t{:.2e}".format(param,value)
    print "Pearson:"
    print "  r:\t{:.2e}".format(pearson[0])
    print "  p:\t{:.2e}".format(pearson[1])
    
    # Add fit to plot
    xmin,xmax = (covar_df[covar_x].min(), covar_df[covar_x].max())
    ymin,ymax = (covar_df[covar_y].min(), covar_df[covar_y].max())
    ybot,ytop = (xmin*fitted[0]+fitted[1], xmax*fitted[0]+fitted[1])
    plt.plot([xmin,xmax], [ybot,ytop], c='blue', lw=2, alpha=0.5)
    
    # Cosmetics and show
    plt.xlim([xmin,xmax])
    plt.ylim([ymin,ymax])
    plt.show()

In [None]:
### Full pairwise correlation plot

# Pearson correlation between all pairs of (z-standardized) covariates
covar_corr = covar_df_z.corr(method='pearson')

# Create the plot
mclust = sns.clustermap(covar_corr, figsize=(10, 10), cmap='RdBu')
heatmap_ax = mclust.ax_heatmap

# Fix the y axis orientation
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), rotation=0)

# Other cosmetics
heatmap_ax.set_title("Pairwise Correlations Cluster Plot", y=1.275)
plt.ylabel("Pearson\nCorr. Coef.")  # Goes onto pyplot's current axes
plt.show()

<a id=covar_fspace></a>

## 3. Covariate-Shape Relationships

----

### Covariate-Shape Correlations <a id=covar_fspace_correlations></a>

In [None]:
### Interactive linear fitting plot

# Set interaction
@widgets.interact(covar_x=covar_names, 
                  PC_y=range(1,fspace_pca.shape[1]+1))

# Plotting function
def corr_plot_covar(covar_x=covar_names[0], 
                    PC_y=1):
    
    # Prep
    PC_y = int(PC_y)
    plt.figure(figsize=(5,3))
    
    # Scatterplot
    plt.scatter(covar_df[covar_x], fspace_pca[:, PC_y-1],
                facecolor='darkred', edgecolor='',
                s=5, alpha=0.5)
    plt.xlabel(covar_x)
    plt.ylabel("PC "+str(PC_y))
    
    # Linear regression and pearson
    fitted  = linregress(covar_df[covar_x], fspace_pca[:, PC_y-1])
    pearson = pearsonr(covar_df[covar_x], fspace_pca[:, PC_y-1])
    
    # Report
    print "Linear regression:"
    for param,value in zip(["slope","intercept","rvalue","pvalue","stderr"], fitted):
        print "  {}:\t{:.2e}".format(param,value)
    print "Pearson:"
    print "  r:\t{:.2e}".format(pearson[0])
    print "  p:\t{:.2e}".format(pearson[1])
    
    # Add fit to plot
    xmin,xmax = (covar_df[covar_x].min(), covar_df[covar_x].max())
    ymin,ymax = (fspace_pca[:, PC_y-1].min(), fspace_pca[:, PC_y-1].max())
    ybot,ytop = (xmin*fitted[0]+fitted[1], xmax*fitted[0]+fitted[1])
    plt.plot([xmin,xmax], [ybot,ytop], c='red', lw=2, alpha=0.5)
    
    # Cosmetics and show
    plt.xlim([xmin,xmax])
    plt.ylim([ymin,ymax])
    plt.show()

In [None]:
### Selected linear fits

# Settings for TFOR PC 3
if 'TFOR' in dataset_suffix:
    covar_x = 'Z Axis Length'
    PC_y    = 3
    x_reduc = 0
    lbl_x   = 'TFOR PC 3'
    lbl_y   = 'Z Axis Length\n(Cell Height)'

# Settings for CFOR PC 1
if 'CFOR' in dataset_suffix:
    covar_x = 'Sphericity'
    PC_y    = 1
    x_reduc = 2
    lbl_x   = 'CFOR PC 1'
    lbl_y   = 'Sphericity'

# Prep
plt.figure(figsize=(6,4))

# Scatterplot
plt.scatter(fspace_pca[:, PC_y-1], covar_df[covar_x],
            facecolor='darkblue', edgecolor='',
            s=5, alpha=0.25)
plt.xlabel(covar_x)
plt.ylabel("PC "+str(PC_y))

# Linear regression and pearson
fitted  = linregress(fspace_pca[:, PC_y-1], covar_df[covar_x])
pearson = pearsonr(fspace_pca[:, PC_y-1], covar_df[covar_x])

# Report
print "Linear regression:"
for param,value in zip(["slope","intercept","rvalue","pvalue","stderr"], fitted):
    print "  {}:\t{:.2e}".format(param,value)
print "Pearson:"
print "  r:\t{:.2e}".format(pearson[0])
print "  p:\t{:.2e}".format(pearson[1])

# Add fit to plot
ymin,ymax = (covar_df[covar_x].min(), covar_df[covar_x].max())
xmin,xmax = (fspace_pca[:, PC_y-1].min()-x_reduc, fspace_pca[:, PC_y-1].max())
ybot,ytop = (xmin*fitted[0]+fitted[1], xmax*fitted[0]+fitted[1])
plt.plot([xmin,xmax], [ybot,ytop], c='black', lw=1, alpha=0.5)

# Cosmetics
plt.tick_params(axis='both', which='major', labelsize=16)
plt.xlabel(lbl_x, fontsize=18)
plt.ylabel(lbl_y, fontsize=18)
plt.xlim([xmin,xmax])
plt.ylim([ymin,ymax+0.05])
plt.tight_layout()

# Done
plt.show()

In [None]:
### Full pairwise correlation plot (covariates vs. principal components)

# Prepare the pairwise correlation
# Z-standardize each column of the feature space; only the first 25 columns
# are kept for the correlation analysis.
fspace_pca_z = (fspace_pca - fspace_pca.mean(axis=0)) / fspace_pca.std(axis=0)
fspace_pca_z_df = pd.DataFrame(fspace_pca_z[:,:25])
# NOTE(review): The 3-axis `.iloc[-1, :, :]` implies that the `corr` call
#               returns a 3D object here (presumably a legacy pandas Panel)
#               and that its last slice is the covariate-by-PC correlation
#               matrix over all rows -- confirm against the pandas version
#               in use. `pairwise_corr` is re-used by the relation graph
#               cell further below.
pairwise_corr = covar_df_z.expanding(axis=1).corr(fspace_pca_z_df, pairwise=True).iloc[-1, :, :]  # Ouf, pandas...

# Create the plot
# Only the rows are clustered; the columns keep their original order
mclust = sns.clustermap(pairwise_corr,
                        figsize=(10, 10),
                        col_cluster=False,
                        cmap='RdBu')

# Fix the y axis orientation
mclust.ax_heatmap.set_yticklabels(mclust.ax_heatmap.get_yticklabels(),
                                  rotation=0)

# Other cosmetics
mclust.ax_heatmap.set_title("Pairwise Correlations Cluster Plot", y=1.275)
# Label the columns 1..N instead of using the 0-based DataFrame column names
mclust.ax_heatmap.set_xticklabels(range(1,fspace_pca_z_df.shape[1]+1))
plt.ylabel("Pearson\nCorr. Coef.")  # Goes onto pyplot's current axes

# Done
plt.show()

### Covariate Relation Graph <a id=covar_fspace_graph></a>

In [None]:
### Bigraph of relevant covariate-PC correlations

# Parameters
num_PCs      = 8           # Number of PCs to include
corr_measure = 'pearsonr'  # Correlation measure to use
threshold    = 0.30        # Threshold to include a correlation as relevant

# Only the Pearson correlation is available
if corr_measure != 'pearsonr':
    raise NotImplementedError("Unsupported corr_measure: " + repr(corr_measure))

# Get relevant data
# `pairwise_corr` comes from the covariate-PC correlation cell above, which
# must have been run first. `.values` is used instead of `.get_values()`
# because the latter is deprecated in newer pandas versions.
covar_fspace_dists = pairwise_corr.values[:, :num_PCs]

# Generate the plot
# NOTE(review): This assumes that the rows of `pairwise_corr` are in the
#               same order as `covar_names` -- confirm.
kp.covar_pc_bigraph(covar_fspace_dists, threshold, covar_names,
                    height=0.6, verbose=True, show=False)

# Done
plt.show()

----
[back to top](#top)