# Comp Screening - cell painting: PCA reduction + batch correction
## FOR COMPLETE (datalock) cell painting dataset
#### _BEM 09-27-2021_


### Metadata
* U2OS cells screened with single / full plate(s) of KI FDA library
* 57 plates
* 9x fields captured per well @ 20X magnification

### What this does
Performs PCA reduction on each feature table (reporting the number of PCs that account for 95% of the variance explained), and also saves a harmony batch-corrected PCA space

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import scanpy as sc
import anndata

# Turn off pandas' chained-assignment check for this notebook
pd.options.mode.chained_assignment = None

# Directory prefix for the input feature tables read by the cells below
data_path = '../2_DENOISED_median_aggregated/'

## Define Functions

#### AnnData ingest

In [2]:
def df_anndata(df):
    """Convert a pandas comp-screen DataFrame to a scanpy/anndata ``AnnData``.

    Columns whose name contains ``'Metadata'`` or ``'Number_of_Cells'`` are
    treated as per-row annotations and become ``adata.obs``; every other
    column is treated as a feature and goes into ``adata.X``.  Column order
    within each group follows ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per observation.

    Returns
    -------
    anndata.AnnData
        Object built from the feature columns, with the annotation columns
        assigned to ``.obs``.
    """

    def _is_annotation(name):
        # A column is an annotation if either marker appears anywhere in it
        return 'Metadata' in name or 'Number_of_Cells' in name

    feature_cols = []
    annotation_cols = []
    for name in df.columns:
        if _is_annotation(name):
            annotation_cols.append(name)
        else:
            feature_cols.append(name)

    adata = anndata.AnnData(X=df[feature_cols])
    adata.obs = df[annotation_cols]
    return adata

#### PCA and harmony

In [3]:
def PCA_harmony(adata, batch_var, var_threshold=0.95, max_iter_harmony=25):
    """Run PCA followed by harmony batch correction on ``adata``.

    ``sc.tl.pca`` and ``sc.external.pp.harmony_integrate`` are called on the
    object that is passed in, and that same object is returned.  The number
    of PCs needed to reach ``var_threshold`` cumulative explained variance is
    printed for information.

    Parameters
    ----------
    adata : anndata.AnnData
        Object to run PCA and harmony on.
    batch_var : str
        Batch key handed to ``harmony_integrate`` (the notebook cells pass
        the name of an ``adata.obs`` column).
    var_threshold : float, optional
        Cumulative explained-variance fraction used for the reported PC
        count.  Defaults to 0.95.
    max_iter_harmony : int, optional
        Forwarded to ``harmony_integrate`` as ``max_iter_harmony``.
        Defaults to 25.

    Returns
    -------
    anndata.AnnData
        The ``adata`` argument, after PCA and harmony have been run on it.
    """

    # run PCA (scanpy defaults; results are read back from adata.uns['pca'])
    sc.tl.pca(adata)

    # Cumulative fraction of variance explained by the computed PCs
    cumulative = np.cumsum(adata.uns['pca']['variance_ratio'])
    n_computed = len(cumulative)

    # Smallest number of leading PCs whose cumulative ratio reaches the threshold
    num_pcs = int((cumulative < var_threshold).sum()) + 1
    pct_label = f'{var_threshold * 100:g}%'

    if num_pcs > n_computed:
        # The computed PCs never reach the threshold: report what is actually
        # available instead of a count larger than the number of PCs.
        num_pcs = n_computed
        reached = float(cumulative[-1]) if n_computed else 0.0
        print(pct_label + ' of variance not reached: all ' + str(num_pcs)
              + ' computed PCs explain ' + f'{reached * 100:.1f}%')
    else:
        print(pct_label + ' of variance explained by ' + str(num_pcs) + ' PCs')

    # NOTE(review): num_pcs is only reported. Nothing here subsets the PCA
    # result, so harmony presumably runs on every computed PC (scanpy's
    # default basis is 'X_pca') -- confirm whether truncation was intended.
    sc.external.pp.harmony_integrate(
        adata, batch_var, max_iter_harmony=max_iter_harmony
    )

    return adata

## Ingest, PCA, harmony, and save datasets

#### Robust DMSO scale

In [9]:
# DT
data = pd.read_csv(
    data_path + '09242021_QC_both_rd_DT_feature_table.gz',
    low_memory=False,
)

# Build the AnnData object and run PCA + harmony on it (batch key: Metadata_run)
adata = PCA_harmony(df_anndata(data), batch_var='Metadata_run')

# Save the processed object as a gzip-compressed .h5ad file
adata.write(
    filename='09272021_PCH_rd_DT_table.h5ad',
    compression='gzip',
)



95% of variance explained by 39 PCs


2021-09-27 12:22:03,588 - harmonypy - INFO - Iteration 1 of 25
2021-09-27 12:22:04,985 - harmonypy - INFO - Iteration 2 of 25
2021-09-27 12:22:06,422 - harmonypy - INFO - Iteration 3 of 25
2021-09-27 12:22:08,102 - harmonypy - INFO - Iteration 4 of 25
2021-09-27 12:22:09,737 - harmonypy - INFO - Iteration 5 of 25
2021-09-27 12:22:11,281 - harmonypy - INFO - Iteration 6 of 25
2021-09-27 12:22:12,765 - harmonypy - INFO - Iteration 7 of 25
2021-09-27 12:22:14,236 - harmonypy - INFO - Iteration 8 of 25
2021-09-27 12:22:15,656 - harmonypy - INFO - Iteration 9 of 25
2021-09-27 12:22:17,076 - harmonypy - INFO - Iteration 10 of 25
2021-09-27 12:22:18,535 - harmonypy - INFO - Iteration 11 of 25
2021-09-27 12:22:20,039 - harmonypy - INFO - Converged after 11 iterations
... storing 'Metadata_run' as categorical
... storing 'Metadata_Well' as categorical
... storing 'Metadata_perturbation' as categorical
... storing 'Metadata_Plate' as categorical
... storing 'Metadata_KI_ID' as categorical


In [4]:
# FL
data = pd.read_csv(
    data_path + '09242021_QC_both_rd_FL_feature_table.gz',
    low_memory=False,
)

# Build the AnnData object and run PCA + harmony on it (batch key: Metadata_run)
adata = PCA_harmony(df_anndata(data), batch_var='Metadata_run')

# Save the processed object as a gzip-compressed .h5ad file
adata.write(
    filename='09272021_PCH_rd_FL_table.h5ad',
    compression='gzip',
)



95% of variance explained by 32 PCs


2021-10-01 08:10:01,532 - harmonypy - INFO - Iteration 1 of 25
2021-10-01 08:10:03,076 - harmonypy - INFO - Iteration 2 of 25
2021-10-01 08:10:05,238 - harmonypy - INFO - Iteration 3 of 25
2021-10-01 08:10:07,307 - harmonypy - INFO - Iteration 4 of 25
2021-10-01 08:10:09,020 - harmonypy - INFO - Iteration 5 of 25
2021-10-01 08:10:11,563 - harmonypy - INFO - Iteration 6 of 25
2021-10-01 08:10:13,412 - harmonypy - INFO - Iteration 7 of 25
2021-10-01 08:10:15,443 - harmonypy - INFO - Iteration 8 of 25
2021-10-01 08:10:17,088 - harmonypy - INFO - Iteration 9 of 25
2021-10-01 08:10:18,737 - harmonypy - INFO - Converged after 9 iterations
... storing 'Metadata_run' as categorical
... storing 'Metadata_Well' as categorical
... storing 'Metadata_perturbation' as categorical
... storing 'Metadata_Plate' as categorical
... storing 'Metadata_KI_ID' as categorical


In [11]:
# OG316
data = pd.read_csv(
    data_path + '09242021_QC_both_rd_OG316_feature_table.gz',
    low_memory=False,
)

# Build the AnnData object and run PCA + harmony on it (batch key: Metadata_run)
adata = PCA_harmony(df_anndata(data), batch_var='Metadata_run')

# Save the processed object as a gzip-compressed .h5ad file
adata.write(
    filename='09272021_PCH_rd_OG316_table.h5ad',
    compression='gzip',
)



95% of variance explained by 30 PCs


2021-09-27 12:22:37,101 - harmonypy - INFO - Iteration 1 of 25
2021-09-27 12:22:39,094 - harmonypy - INFO - Iteration 2 of 25
2021-09-27 12:22:41,281 - harmonypy - INFO - Iteration 3 of 25
2021-09-27 12:22:43,492 - harmonypy - INFO - Iteration 4 of 25
2021-09-27 12:22:45,686 - harmonypy - INFO - Iteration 5 of 25
2021-09-27 12:22:47,742 - harmonypy - INFO - Iteration 6 of 25
2021-09-27 12:22:49,677 - harmonypy - INFO - Iteration 7 of 25
2021-09-27 12:22:51,756 - harmonypy - INFO - Iteration 8 of 25
2021-09-27 12:22:53,158 - harmonypy - INFO - Converged after 8 iterations
... storing 'Metadata_run' as categorical
... storing 'Metadata_Well' as categorical
... storing 'Metadata_perturbation' as categorical
... storing 'Metadata_Plate' as categorical
... storing 'Metadata_KI_ID' as categorical


#### mRMR Robust DMSO scale

In [5]:
# FL (mrmr feature table)
data = pd.read_csv(
    data_path + '09252021_QC_mrmr_rd_FL_feature_table.gz',
    low_memory=False,
)

# Build the AnnData object and run PCA + harmony on it (batch key: Metadata_run)
adata = PCA_harmony(df_anndata(data), batch_var='Metadata_run')

# Save the processed object as a gzip-compressed .h5ad file
adata.write(
    filename='09272021_PCH_mrmr_rd_FL_table.h5ad',
    compression='gzip',
)



95% of variance explained by 45 PCs


2021-10-01 08:10:24,544 - harmonypy - INFO - Iteration 1 of 25
2021-10-01 08:10:26,251 - harmonypy - INFO - Iteration 2 of 25
2021-10-01 08:10:28,707 - harmonypy - INFO - Iteration 3 of 25
2021-10-01 08:10:30,355 - harmonypy - INFO - Iteration 4 of 25
2021-10-01 08:10:31,931 - harmonypy - INFO - Iteration 5 of 25
2021-10-01 08:10:33,564 - harmonypy - INFO - Iteration 6 of 25
2021-10-01 08:10:35,658 - harmonypy - INFO - Iteration 7 of 25
2021-10-01 08:10:38,283 - harmonypy - INFO - Iteration 8 of 25
2021-10-01 08:10:41,067 - harmonypy - INFO - Iteration 9 of 25
2021-10-01 08:10:44,235 - harmonypy - INFO - Converged after 9 iterations
... storing 'Metadata_run' as categorical
... storing 'Metadata_Well' as categorical
... storing 'Metadata_perturbation' as categorical
... storing 'Metadata_Plate' as categorical
... storing 'Metadata_KI_ID' as categorical


In [14]:
# OG316 (mrmr feature table)
data = pd.read_csv(
    data_path + '09252021_QC_mrmr_rd_OG316_feature_table.gz',
    low_memory=False,
)

# Build the AnnData object and run PCA + harmony on it (batch key: Metadata_run)
adata = PCA_harmony(df_anndata(data), batch_var='Metadata_run')

# Save the processed object as a gzip-compressed .h5ad file
adata.write(
    filename='09272021_PCH_mrmr_rd_OG316_table.h5ad',
    compression='gzip',
)



95% of variance explained by 43 PCs


2021-09-27 12:24:30,967 - harmonypy - INFO - Iteration 1 of 25
2021-09-27 12:24:33,003 - harmonypy - INFO - Iteration 2 of 25
2021-09-27 12:24:35,070 - harmonypy - INFO - Iteration 3 of 25
2021-09-27 12:24:37,069 - harmonypy - INFO - Iteration 4 of 25
2021-09-27 12:24:38,699 - harmonypy - INFO - Iteration 5 of 25
2021-09-27 12:24:40,304 - harmonypy - INFO - Iteration 6 of 25
2021-09-27 12:24:41,696 - harmonypy - INFO - Iteration 7 of 25
2021-09-27 12:24:42,845 - harmonypy - INFO - Iteration 8 of 25
2021-09-27 12:24:43,924 - harmonypy - INFO - Iteration 9 of 25
2021-09-27 12:24:44,913 - harmonypy - INFO - Iteration 10 of 25
2021-09-27 12:24:45,931 - harmonypy - INFO - Iteration 11 of 25
2021-09-27 12:24:46,843 - harmonypy - INFO - Iteration 12 of 25
2021-09-27 12:24:47,745 - harmonypy - INFO - Converged after 12 iterations
... storing 'Metadata_run' as categorical
... storing 'Metadata_Well' as categorical
... storing 'Metadata_perturbation' as categorical
... storing 'Metadata_Plate' a