# Comp Screening - cell painting Scale Features
#### _BEM 09-20-2021_

### Metadata
* U2OS cells screened with single / full plate(s) of KI FDA library
* Assayed with 5-color cell painting assay
* 9x fields captured per well @ 20X magnification
* All plates from all runs

### Preprocessing
CellProfiler was used for image correction, QC metric capture, segmentation, and feature extraction (AWS); cytominer was used for aggregation.

### What this does
* 1) On a per batch basis
    * 1) remove EMPTY wells, and DMSO / TREAT wells with < 50 cells
    * 2) drop uninformative features (same value, 0s)
    * 3) integrate all batches into single consistent full dataset
    
    
* 2) On a per plate basis
    * 1) scale each feature with sklearn RobustScaler
    * 2) -OR- Perform transformation of features with sklearn PowerTransformer (Yeo-Johnson)


In [1]:
import os
import sys
import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer

# silence pandas' chained-assignment warnings for the whole session
pd.set_option('mode.chained_assignment', None)

# output location handed to st_by_plate in the scaling cell; st_by_plate
# concatenates it directly with the file name, so the trailing '/' matters
scale_path = '../1_SCALED_median_aggregated/'

## Ingestion and Scaling functions

In [2]:
## function for cleaning up perturbation names
def remove_ion(lst):
    """Cut each name at its first '·' and return the results as a new list.

    Names that contain no '·' are passed through unchanged.
    """
    cleaned = []
    for name in lst:
        cut = name.find('·')
        cleaned.append(name if cut == -1 else name[:cut])
    return cleaned

## function for gathering metadata and feature names and returning as set
def extract_feat_meta(table):
    """Split the column names of ``table`` into feature and metadata sets.

    A column whose name contains 'Metadata' is metadata, except that any
    name containing 'Metadata_run' is left out of both sets. Every other
    column is a feature unless its name contains 'Number_of_Cells'.

    Returns a ``(features, metadata)`` tuple of sets of column names.
    """
    features, metadata = set(), set()

    for col in table.columns:
        if 'Metadata' in col:
            # the run label is handled separately when batches are merged
            if 'Metadata_run' not in col:
                metadata.add(col)
        elif 'Number_of_Cells' not in col:
            features.add(col)

    return features, metadata

In [3]:
## functions for iterating scaling / transforming by plate
def st_by_plate(table, batch, date, out_dir, oper, DMSO = False):
    """Scale or power-transform the feature columns plate by plate, then save.

    ``table`` is modified in place: for every distinct ``Metadata_Plate``
    value the feature columns of that plate's rows are overwritten with the
    output of ``robscale`` (oper == 'scale') or ``ptrans``
    (oper == 'transform'). The full table is then written as a gzip CSV.

    INPUTS:
      table   - pandas DataFrame holding a ``Metadata_Plate`` column
      batch   - str, batch name used in the output file name
      date    - str, date used as the output file name prefix
      out_dir - str, prepended verbatim to the file name (include the
                trailing path separator)
      oper    - str, 'scale' applies robust scaling, 'transform' applies the
                power transformation; any other value prints an error and
                returns without touching ``table``
      DMSO    - truthy/falsy flag forwarded (as a bool) to the per-plate
                helper; it also adds 'DMSO' to the output file name

    Returns None.
    """
    if oper == 'scale':
        print('Starting feature scaling by plate')
    elif oper == 'transform':
        print('Starting feature transformation by plate')
    else:
        print('ERROR - speficy valid oper argument')
        return

    # Normalise the flag once so the per-plate helpers and the file name
    # always agree; previously a truthy non-bool value fell through every
    # `== True` / `== False` branch and crashed only after all the scaling.
    use_dmso = bool(DMSO)

    # every column that is neither metadata nor the cell count is a feature
    features = [col for col in table.columns
                if 'Metadata' not in col and 'Number_of_Cells' not in col]

    # run over plates
    for p in set(table.Metadata_Plate.values):
        on_plate = table.Metadata_Plate == p
        if oper == 'scale':
            table.loc[on_plate, features] = robscale(p, table, features, use_dmso)
        else:
            table.loc[on_plate, features] = ptrans(p, table, features, use_dmso)

    print('Saving final tables')
    tag = 'robscale' if oper == 'scale' else 'ptrans'
    if use_dmso:
        tag += 'DMSO'
    save_stan = out_dir+date+'_'+tag+'_QC_median_'+batch+'_feature_table.gz'
    table.to_csv(save_stan, index=False, compression='gzip')
    print('Finished with %s' %batch)
    
def robscale(p, table, features, DMSO):
    """Robust-scale the feature columns of one plate.

    INPUTS:
      p        - plate identifier matched against ``table.Metadata_Plate``
      table    - pandas DataFrame with all plates
      features - names of the columns to scale
      DMSO     - if truthy, centre on the plate's DMSO wells (rows whose
                 ``Metadata_perturbation`` is 'DMSO') and then scale with
                 the 1-99 quantile range of all wells on the plate;
                 if falsy, centre and scale using all wells on the plate

    Returns the scaled values as produced by ``RobustScaler.transform``,
    one row per well of the plate, columns in table order.

    Raises ValueError if DMSO centring is requested but the plate has no
    DMSO wells.
    """
    on_plate = table.Metadata_Plate == p
    feat_cols = table.columns.isin(features)
    X = table.loc[on_plate, feat_cols]

    if not DMSO:
        # center and robust scale by all data
        scaler = RobustScaler(quantile_range = (1,99), unit_variance = True).fit(X)
        return scaler.transform(X)

    # extract dmso samples
    X_DMSO = table.loc[on_plate & (table.Metadata_perturbation == 'DMSO'), feat_cols]
    if len(X_DMSO) == 0:
        # fail with the plate name instead of sklearn's generic
        # "0 sample(s)" error, so the offending plate is easy to find
        raise ValueError('plate %r has no DMSO wells to center on' % (p,))

    # center all data by dmso (with_scaling=False, so only centering is applied)
    DMSO_scaler = RobustScaler(with_scaling = False, quantile_range = (0,100)).fit(X_DMSO)
    X_center = DMSO_scaler.transform(X)

    # robust scale all data without re-centering
    scaler = RobustScaler(with_centering = False, quantile_range = (1,99),
                          unit_variance = True).fit(X_center)
    return scaler.transform(X_center)
    
def ptrans(p, table, features, DMSO):
    """Power-transform, then robust-scale, the feature columns of one plate.

    The ``PowerTransformer`` (standardize=False) is always fitted on all
    wells of the plate; ``DMSO`` only changes the centring/scaling step.

    INPUTS:
      p        - plate identifier matched against ``table.Metadata_Plate``
      table    - pandas DataFrame with all plates
      features - names of the columns to transform
      DMSO     - if truthy, centre the transformed data on the plate's
                 transformed DMSO wells and scale with the 1-99 quantile
                 range of all wells; if falsy, centre and scale using all
                 transformed wells with RobustScaler's default range

    Returns the transformed and scaled values as produced by
    ``RobustScaler.transform``, one row per well of the plate.

    Raises ValueError if DMSO centring is requested but the plate has no
    DMSO wells.
    """
    on_plate = table.Metadata_Plate == p
    feat_cols = table.columns.isin(features)
    X = table.loc[on_plate, feat_cols]

    if DMSO:
        # extract dmso samples up front so a plate without controls fails
        # before the (expensive) power-transform fit
        X_DMSO = table.loc[on_plate & (table.Metadata_perturbation == 'DMSO'), feat_cols]
        if len(X_DMSO) == 0:
            raise ValueError('plate %r has no DMSO wells to center on' % (p,))

    # power transform without scaling
    transformer = PowerTransformer(standardize=False).fit(X)
    trans_X = transformer.transform(X)

    if not DMSO:
        # center and robust scale by all transformed data
        # NOTE(review): this branch uses RobustScaler's default quantile
        # range, whereas every other scaler here uses (1,99) - confirm
        # whether that difference is intended
        scaler = RobustScaler(unit_variance = True).fit(trans_X)
        return scaler.transform(trans_X)

    # center all data by transformed dmso
    trans_X_DMSO = transformer.transform(X_DMSO)
    DMSO_scaler = RobustScaler(with_scaling = False,
                               quantile_range = (0,100)).fit(trans_X_DMSO)
    trans_X_center = DMSO_scaler.transform(trans_X)

    # robust scale all data without re-centering
    scaler = RobustScaler(with_centering = False, quantile_range = (1,99),
                          unit_variance = True).fit(trans_X_center)
    return scaler.transform(trans_X_center)

## Data Ingestion and merging

#### Load batch-wise median aggregated files, clean up

In [4]:
# first 2 ground truth batches
#
# Each batch goes through the same steps: load the aggregated table, drop
# EMPTY wells, keep wells with more than 50 cells, drop rows containing any
# NaN, add the metadata columns the other runs carry, and rename the
# compound column to the shared 'Metadata_perturbation' name.
# NOTE(review): `Number_of_Cells > 50` also drops wells with exactly 50
# cells, while the notes say "< 50" - confirm the intended threshold.

# batch 1
GT_run1_batch1 = (
    pd.read_csv('10022020_QCfiltered_median_aggragated_batch_1_feature_table.gz',
                low_memory=False)
    .loc[lambda df: (df.Metadata_compound_name != 'EMPTY') & (df.Number_of_Cells > 50)]
    .dropna()
    .assign(Metadata_compression=1,
            Metadata_replicates=2,
            Metadata_run='GT_run1_batch1')
    .rename(columns={'Metadata_compound_name': 'Metadata_perturbation'})
)

# feature / metadata column names of batch 1
feat_gtr1b1, meta_gtr1b1 = extract_feat_meta(GT_run1_batch1)


# batch 2
GT_run1_batch2 = (
    pd.read_csv('10022020_QCfiltered_median_aggragated_batch_2_feature_table.gz',
                low_memory=False)
    .loc[lambda df: (df.Metadata_compound_name != 'EMPTY') & (df.Number_of_Cells > 50)]
    .dropna()
    .assign(Metadata_compression=1,
            Metadata_replicates=2,
            Metadata_run='GT_run1_batch2')
    .rename(columns={'Metadata_compound_name': 'Metadata_perturbation'})
)

# feature / metadata column names of batch 2
feat_gtr1b2, meta_gtr1b2 = extract_feat_meta(GT_run1_batch2)

In [5]:
# first 2 compressed screen batches
#
# Both runs follow the same recipe: load, keep wells with more than 50 cells,
# drop rows containing any NaN, translate plate barcodes to picklist names,
# remove the failed plate, and add the columns the ground-truth tables carry.

# ---- run 1 ----
CS_run1 = pd.read_csv(
    '04122021_QCfiltered_median_aggragated_compressed_screen_run1_feature_table.gz',
    low_memory=False,
)
CS_run1 = CS_run1[CS_run1['Number_of_Cells'] > 50].dropna()

# barcode -> list of the remaining columns of the reference sheet;
# the first of those is used as the new plate name
plate_ref = (
    pd.read_csv('compressed_screen_run1.csv')
    .set_index('Assay_Plate_Barcode')
    .T
    .to_dict('list')
)
for key in plate_ref:
    CS_run1['Metadata_Plate'] = CS_run1['Metadata_Plate'].replace(key, str(plate_ref[key][0]))

# drop FAILED ECHO plates
# plate1 == 2x3r random
drop_plate = ['plate1']
CS_run1 = CS_run1[~CS_run1['Metadata_Plate'].isin(drop_plate)]

# columns present in the other tables but missing here
CS_run1 = CS_run1.assign(Metadata_KI_ID='NA', Metadata_run='CS_run1')

# feature / metadata column names of run 1
feat_csr1, meta_csr1 = extract_feat_meta(CS_run1)

# ---- run 2 ----
CS_run2 = pd.read_csv(
    '04122021_QCfiltered_median_aggragated_compressed_screen_run2_feature_table.gz',
    low_memory=False,
)
CS_run2 = CS_run2[CS_run2['Number_of_Cells'] > 50].dropna()

# barcode -> picklist name lookup for run 2
plate_ref = (
    pd.read_csv('compressed_screen_run2.csv')
    .set_index('Assay_Plate_Barcode')
    .T
    .to_dict('list')
)
for key in plate_ref:
    CS_run2['Metadata_Plate'] = CS_run2['Metadata_Plate'].replace(key, str(plate_ref[key][0]))

# drop FAILED ECHO plates
# plate1 == 2x3r random
drop_plate = ['plate1']
CS_run2 = CS_run2[~CS_run2['Metadata_Plate'].isin(drop_plate)]

# columns present in the other tables but missing here
CS_run2 = CS_run2.assign(Metadata_KI_ID='NA', Metadata_run='CS_run2')

# feature / metadata column names of run 2
feat_csr2, meta_csr2 = extract_feat_meta(CS_run2)

In [6]:
# final comp screen & ground truth batches
run3 = pd.read_csv('06232021_QCfiltered_median_aggragated_compressed_screen_run3_feature_table.gz',
                   low_memory=False)

# fill missing metadata for ground truth & compressed plates
# (assigned back to run3 instead of `run3.<col>.fillna(..., inplace=True)`:
# in-place fills on a column selected from the frame are not guaranteed to
# reach run3, and unfilled NaNs would make the dropna() below discard rows)
run3 = run3.fillna({'Metadata_compression': 1,
                    'Metadata_replicates': 4,
                    'Metadata_KI_ID': 'NA'})

# drop extra index column (only if present)
run3 = run3.drop(columns=['Unnamed: 0'], errors='ignore')

# keep wells with more than 50 cells, then drop rows containing any NaN
run3 = run3.loc[(run3.Number_of_Cells>50)].dropna()

# rename plates to match picklist names
plate_ref = pd.read_csv('compressed_screen_run3_plate_rename.csv')
plate_ref = plate_ref.set_index('Assay_Plate_Barcode').T.to_dict('list')

for key in plate_ref:
    run3['Metadata_Plate'] = run3['Metadata_Plate'].replace(key,str(plate_ref[key][0]))

# Fix perturbation name before separating
run3['Metadata_perturbation'] = remove_ion(run3['Metadata_perturbation'])
run3.loc[run3.Metadata_KI_ID=='KI-ENZO-FDA-299', 'Metadata_perturbation'] = 'Methyldopa Sesquihydrate (L-Α-Methyl-Dopa Sesquihyrate)'

# Split into compressed screen (KI_ID filled with 'NA') and ground truth
is_compressed = run3['Metadata_KI_ID'] == 'NA'
CS_run3 = run3[is_compressed].reset_index(drop=True)
GT_run2 = run3[~is_compressed].reset_index(drop=True)
del run3, is_compressed

# Fix missing columns
GT_run2['Metadata_run'] = 'GT_run2'
CS_run3['Metadata_run'] = 'CS_run3'

# drop EMPTY wells
GT_run2 = GT_run2.loc[(GT_run2.Metadata_perturbation!='EMPTY')].reset_index(drop=True)

# get feats & metadata
feat_gtr2, meta_gtr2 = extract_feat_meta(GT_run2)
feat_csr3, meta_csr3 = extract_feat_meta(CS_run3)

#### Merge all batches

In [7]:
# define mutual features and metadata across batches
# (sorted so the column order of the merged table is the same on every run;
# iterating a set of strings directly gives an arbitrary order)
features = sorted(set.intersection(feat_gtr1b1, feat_gtr1b2, feat_csr1, feat_csr2, feat_gtr2, feat_csr3))
metadata = sorted(set.intersection(meta_gtr1b1, meta_gtr1b2, meta_csr1, meta_csr2, meta_gtr2, meta_csr3))

# merge into one big data table, keeping only the shared columns
frame = [GT_run1_batch1, GT_run1_batch2, GT_run2, CS_run1, CS_run2, CS_run3]
data = pd.concat(frame)[['Metadata_run'] + metadata + ['Number_of_Cells'] + features]

# prefix plate names with the run label so plates from different runs stay distinct
data['Metadata_Plate'] = data.Metadata_run + '_' + data.Metadata_Plate

# drop every column (feature or metadata) that holds a single value;
# nunique is computed once and reused
nunique = data.nunique()
cols_to_drop = nunique[nunique == 1].index
data = data.drop(cols_to_drop, axis=1)

# note: the "features" count printed here is the total number of columns
print('merged dataset has '+str(data.shape[0])+' samples and '+str(data.shape[1])+ ' features')

merged dataset has 20686 samples and 3402 features


In [8]:
# free the per-batch tables; their rows now live in `data`
del GT_run1_batch1, GT_run1_batch2, GT_run2
del CS_run1, CS_run2, CS_run3

In [9]:
# write the merged table as a gzip-compressed CSV without the index column
data.to_csv(
    '09202021_QC_median_ALL_feature_table.gz',
    compression='gzip',
    index=False,
)

## Data Scaling

In [40]:
# reload the merged table written above
# NOTE(review): with read_csv's default NA handling the literal 'NA' strings
# stored in Metadata_KI_ID should come back as NaN - confirm nothing
# downstream relies on the 'NA' text
data = pd.read_csv(
    '09202021_QC_median_ALL_feature_table.gz',
    low_memory=False,
)

In [10]:
# robust-scale every plate (centred on its DMSO wells) and save the result
# NOTE(review): the date string '0920201' has one digit fewer than the
# '09202021' used for the merged table - confirm before relying on the
# output file name
st_by_plate(
    data,
    batch='all',
    date='0920201',
    out_dir=scale_path,
    oper='scale',
    DMSO=True,
)

Starting feature scaling by plate
Saving final tables
Finished with all
