## RUN: Feature Engineering

This notebook runs the extraction of a bunch of engineered features from segmented cells or ISLA point clouds.

### Prep

In [None]:
### Imports

# Generic
from __future__ import division
import os, sys, pickle
import numpy as np

# Internal
from katachi.pipelines import feature_engineering as feat

In [None]:
### Function to parse relevant IDs from IDR bulk data

def parse_from_IDR(dir_path, target):
    """Select the sample IDs in an IDR bulk data directory that match a target.

    Sample directories are the immediate sub-directories of `dir_path` whose
    name is exactly 10 characters long. Within each of them, only `.tif`
    files whose name starts with the sample directory's name are considered.

    Parameters
    ----------
    dir_path : str
        Directory containing one sub-directory per sample.
    target : str
        Either the special value 'membranes_only', which selects samples
        that have at least one image and where every image file name
        contains 'lynEGFP', or a channel suffix, which selects samples
        that have at least one image file name ending in `target + '.tif'`.

    Returns
    -------
    relevant_samples : list of str
        Names of the matching sample directories, in sorted order.
    """

    # Length a sub-directory name must have to be treated as a sample
    sample_id_length = 10

    relevant_samples = []

    # Sorted so that the result does not depend on the arbitrary order
    # in which os.listdir returns directory entries
    for entry in sorted(os.listdir(dir_path)):

        # Skip anything that is not a sample directory
        sample_dir = os.path.join(dir_path, entry)
        if len(entry) != sample_id_length or not os.path.isdir(sample_dir):
            continue

        # Get image files
        images = [fname for fname in os.listdir(sample_dir)
                  if fname.startswith(entry) and fname.endswith('.tif')]

        # Special case for membranes only
        # Note: all() is True for an empty sequence, so samples without any
        #       image files have to be excluded explicitly.
        if target == 'membranes_only':
            is_relevant = (bool(images)
                           and all('lynEGFP' in fname for fname in images))

        # All other cases
        else:
            is_relevant = any(fname.endswith(target + '.tif')
                              for fname in images)

        if is_relevant:
            relevant_samples.append(entry)

    return relevant_samples

### Feature Engineering for `cldnB:lyn-EGFP`

In [None]:
# Directory holding the image data of experiment A
dir_path = r'data\experimentA\image_data'

# Find the IDs of the samples that only have a membrane channel
relevant_samples = parse_from_IDR(dir_path, 'membranes_only')
print("Found %i relevant samples!" % len(relevant_samples))

# Channels to extract features for
channels = ["seg"]

# Run feature engineering pipeline
feat.feature_engineering(
    dir_path,
    channels,
    IDs=relevant_samples,
    recurse=True,
    overwrite_previous=True,
    processes=14,
    profiling=True,
    verbose=True,
)

### Feature Engineering for `cldnB:lyn-EGFP + cxcr4b:NLS-tdTomato`

In [None]:
# Target directory
dir_path = r'data\experimentA\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'NLStdTomato')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
channels = ["seg", "NLStdTomato"]

# Run feature engineering pipeline
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         processes=1,  # For some reason, this stalled when multiprocessed!
                         recurse=True,
                         profiling=True, verbose=True)

### Feature Engineering for `cldnB:lyn-EGFP + Actb2:mKate-Rab11a`

In [None]:
# Target directory
dir_path = r'data\experimentA\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'mKate2rab11')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
channels = ["seg", "mKate2rab11"]

# Run feature engineering pipeline
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)

### Feature Engineering for `cldnB:lyn-EGFP + RNA:mKate2-Rab5a`

In [None]:
# Target directory
dir_path = r'data\experimentA\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'mKate2rab5')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
channels = ["seg", "mKate2rab5"]

# Run feature engineering pipeline
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)

### Feature Engineering for `cldnB:lyn-EGFP + RNA:mKate2-GM130(rat)`

In [None]:
# Target directory
dir_path = r'data\experimentA\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'mKate2GM130')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
channels = ["seg", "mKate2GM130"]

# Run feature engineering pipeline
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)

### Feature Engineering for `cldnB:lyn-EGFP + lexOP:CDMPR-tagRFPt`

In [None]:
# Target directory
dir_path = r'data\experimentA\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'CDMPRtagRFPt')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
channels = ["seg", "CDMPRtagRFPt"]

# Run feature engineering pipeline
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)

### Feature Engineering for `cldnB:lyn-EGFP + LexOP:B4GalT1(1-55Q)-tagRFPt`

In [None]:
# Target directory
dir_path = r'data\experimentA\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'b4galT1tagRFPt')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
channels = ["seg", "b4galT1tagRFPt"]

# Run feature engineering pipeline
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)

### Feature Engineering for `cldnB:lyn-EGFP + atoh1a:dtomato`

In [None]:
# Target directory
dir_path = r'data\experimentA\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'atoh1a')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
channels = ["seg"]

# Run feature engineering pipeline on the segmentation channel
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)

# Additional arguments
channels = ["atoh1a"]

# Run feature engineering pipeline on the atoh1a channel
# Note: Unlike the run above, this one does not pass `overwrite_previous`.
# NOTE(review): `no_lms` is passed through unchanged from the original call;
#               confirm that `feat.feature_engineering` accepts this keyword.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         no_lms=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)

### Feature Engineering for `cldnB:lyn-EGFP + 6xUAS:tagRFPt-UtrCH`

In [None]:
# Target directory
dir_path = r'data\experimentA\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'tagRFPtUtrCH')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
channels = ["seg", "tagRFPtUtrCH"]

# Run feature engineering pipeline
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)

### Feature Engineering for `cldnB:lyn-EGFP + LysoTracker Deep Red`

In [None]:
# Target directory
dir_path = r'data\experimentA\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'lysotrackerdeepred')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
channels = ["seg", "lysotrackerdeepred"]

# Run feature engineering pipeline
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)

### Feature Engineering for `pea3 smFISH`

In [None]:
# Target directory
dir_path = r'data\experimentB\image_data'

# Parse relevant IDs from IDR bulk data
relevant_samples = parse_from_IDR(dir_path, 'pea3smFISH')
print("Found %i relevant samples!" % len(relevant_samples))

# Additional arguments
# Note: Only the segmentation channel is processed for this data set.
channels = ["seg"]

# Run feature engineering pipeline
# Note: Uses the `feat` module imported at the top of the notebook.
feat.feature_engineering(dir_path, channels,
                         IDs=relevant_samples,
                         overwrite_previous=True,
                         recurse=True, processes=14,
                         profiling=True, verbose=True)