# Process single cell morphology features for CellProfiler readouts - All CellProfiler Method

## Import Libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import cells, output

import sys
sys.path.append("../../")
import sc_count_add_save_util as sc_util

## Set up paths to CellProfiler directory and outputs

In [2]:
# Directory constants, given relative to this notebook.
# cp_dir is the root used to locate the SQLite export and the platemap;
# output_dir is the folder the processed single-cell tables are written to.
cp_dir: str = "../../../CellProfiler_pipelines"
output_dir: str = "../../data/Plate1/CellProfiler"

## Set up paths to sqlite files and outputs

In [3]:
# Name of the CellProfiler .sqlite export and the "sqlite:///" connection
# string that points at it inside the CellProfiler directory
sql_file = "NF1_data_allcp_plate1.sqlite"
single_cell_file = f"sqlite:///{cp_dir}/Analysis_Output/Plate1_Output/{sql_file}"

# Platemap CSV holding the per-well metadata used to annotate the cells
platemap_file = f"{cp_dir}/Metadata/platemap_NF1_CP.csv"

# Create the output folder up front. Nothing else in this notebook creates it,
# and writing a CSV into a missing directory raises an error, so a fresh
# checkout would otherwise fail at the first save. exist_ok keeps re-runs safe.
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Destination files: merged single cells, normalized, and
# normalized + feature-selected
sc_output_file = pathlib.Path(f"{output_dir}/nf1_sc_cellprofiler.csv.gz")
sc_norm_output_file = pathlib.Path(f"{output_dir}/nf1_sc_norm_cellprofiler.csv.gz")
sc_norm_fs_output_file = pathlib.Path(f"{output_dir}/nf1_sc_norm_fs_cellprofiler.csv.gz")

## Set up names for linking columns between tables in the database file

In [4]:
# Custom linking columns between compartment tables.
# Outer key: a compartment table; inner key: the table it is linked to;
# value: the column name used for that link.
linking_cols = {
    # Cytoplasm rows link to both other compartments through their parent columns
    "Per_Cytoplasm": {
        "Per_Cells": "Cytoplasm_Parent_Cells",
        "Per_Nuclei": "Cytoplasm_Parent_Nuclei",
    },
    # Cells and Nuclei each link back to Cytoplasm through their object number
    "Per_Cells": {
        "Per_Cytoplasm": "Cells_Number_Object_Number",
    },
    "Per_Nuclei": {
        "Per_Cytoplasm": "Nuclei_Number_Object_Number",
    },
}

## Load in platemap

In [5]:
# Read the platemap CSV into a DataFrame; it is passed to the merge step
# below to annotate the single cells
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
platemap_df

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,C,6,C6,NF1,WT
1,C,7,C7,NF1,Null
2,D,6,D6,NF1,WT
3,D,7,D7,NF1,Null
4,E,6,E6,NF1,WT
5,E,7,E7,NF1,Null
6,F,6,F6,NF1,WT
7,F,7,F7,NF1,Null


## Set up `SingleCells` class from Pycytominer

In [6]:
# Instantiate the pycytominer SingleCells class on the CellProfiler SQLite export.
# The table names carry a "Per_" prefix, so the compartments, their linking
# columns, and the image table name are all passed explicitly.
sc = cells.SingleCells(
    sql_file=single_cell_file,
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    image_table_name="Per_Image",
    # Image-table columns identifying well and plate
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
    # Column shared by the image and compartment tables, used for merging
    merge_cols=["ImageNumber"],
    # Passed as a list (not a bare string) to match the other column arguments;
    # NOTE(review): pycytominer documents image_cols as a list of str — confirm
    # against the installed version
    image_cols=["ImageNumber"],
    load_image_data=True,
)



## Merge single cells 

In [7]:
# Extra keyword arguments forwarded through merge_single_cells: join the
# platemap to the single cells on platemap well position == image well
anno_kwargs = dict(join_on=["Metadata_well_position", "Image_Metadata_Well"])

# Merge single cells across compartments and annotate them with the platemap
sc_df = sc.merge_single_cells(platemap=platemap_df, **anno_kwargs)

# Save level 2 data as a csv
output(sc_df, sc_output_file)

# Report the table dimensions, then preview the first rows
print(sc_df.shape)
sc_df.head()

(242, 1209)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,NF1,WT,1,1,C6,1,3,1,...,1393.410857,1331.583275,653.826838,618.063979,606.832257,590.114791,147.195839,144.355017,148.179465,148.875403
1,C,6,NF1,WT,1,1,C6,2,4,2,...,1369.498111,1276.305513,332.941295,317.56745,321.873215,292.116754,60.632767,61.876198,65.202076,60.022847
2,C,6,NF1,WT,1,1,C6,3,5,3,...,1338.091947,1299.373271,432.829034,398.306003,401.091835,358.84984,74.837374,71.033793,80.523205,80.845266
3,C,6,NF1,WT,1,1,C6,4,7,4,...,899.439956,874.837386,211.898029,189.348918,186.3333,188.292692,113.059608,113.194846,110.997393,109.83439
4,C,6,NF1,WT,1,1,C6,5,8,5,...,1231.630414,1218.998954,306.13973,295.581509,310.469726,287.78839,496.084704,502.046808,490.259298,491.171009


## Normalize Data

In [8]:
# Normalize the merged single-cell table with pycytominer's "standardize"
# method (all other normalize() options are left at their defaults)
normalize_sc_df = normalize(sc_df, method="standardize")

# Write the normalized table to file
output(normalize_sc_df, sc_norm_output_file)

# Report the table dimensions, then preview the first rows
print(normalize_sc_df.shape)
normalize_sc_df.head()

(242, 1209)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,NF1,WT,1,1,C6,1,3,1,...,-0.277354,-0.278153,0.437881,0.406082,0.358158,0.358081,0.760504,0.736324,0.765755,0.781857
1,C,6,NF1,WT,1,1,C6,2,4,2,...,-0.308203,-0.353348,-0.069536,-0.073583,-0.088878,-0.116121,0.001109,0.015249,0.039691,-0.000841
2,C,6,NF1,WT,1,1,C6,3,5,3,...,-0.34872,-0.321969,0.088417,0.055295,0.035398,-0.009929,0.125722,0.09531,0.173753,0.182583
3,C,6,NF1,WT,1,1,C6,4,7,4,...,-0.91462,-0.899471,-0.260942,-0.278251,-0.301509,-0.281335,0.461035,0.463905,0.440407,0.437947
4,C,6,NF1,WT,1,1,C6,5,8,5,...,-0.486065,-0.431303,-0.111917,-0.108678,-0.106767,-0.123008,3.821213,3.86346,3.759005,3.797123


## Feature Selection

In [9]:
# Feature selection operations applied to the normalized table
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# Run feature selection on the normalized single-cell table
feature_select_norm_sc_df = feature_select(normalize_sc_df, operation=feature_select_ops)

# Write the normalized + feature-selected table to file
output(feature_select_norm_sc_df, sc_norm_fs_output_file)

# Report the table dimensions, then preview the first rows
print(feature_select_norm_sc_df.shape)
feature_select_norm_sc_df.head()

  c /= stddev[:, None]
  c /= stddev[None, :]


(242, 441)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InfoMeas2_GFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DAPI_3_03_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_SumVariance_RFP_3_01_256
0,C,6,NF1,WT,1,1,C6,1,3,1,...,0.904672,-0.005997,-0.036913,0.685169,1.281912,-1.006389,-0.319286,1.117939,-0.09163,0.533778
1,C,6,NF1,WT,1,1,C6,2,4,2,...,0.922557,-0.591964,0.074769,0.884695,0.027386,-1.045552,-0.724098,1.115956,-0.156061,0.001415
2,C,6,NF1,WT,1,1,C6,3,5,3,...,0.737608,-0.484682,-0.522132,0.29387,0.922769,-0.963727,-0.292361,1.026312,-0.249043,0.030412
3,C,6,NF1,WT,1,1,C6,4,7,4,...,0.278456,0.144388,0.742653,1.295467,0.554441,-0.824492,0.319486,0.779574,-0.774104,0.456474
4,C,6,NF1,WT,1,1,C6,5,8,5,...,0.526774,0.897291,1.016776,1.223951,1.373739,-0.83796,-0.057416,0.991402,-0.42024,3.365358


## Add single cell count metadata and save csv

In [10]:
# Run the project's single-cell count helper on each saved file, in the order
# the files were produced (merged, normalized, normalized + feature-selected)
for saved_file in (sc_output_file, sc_norm_output_file, sc_norm_fs_output_file):
    sc_util.add_sc_count_metadata(saved_file)

---

### Visualize basic count statistics

In [11]:
# Number of single cells per genotype
sc_df["Metadata_genotype"].value_counts()

Null    173
WT       69
Name: Metadata_genotype, dtype: int64

In [12]:
# Single-cell counts broken down by genotype (rows) and well (columns)
pd.crosstab(sc_df["Metadata_genotype"], sc_df["Metadata_Well"])

Metadata_Well,C6,C7,D6,D7,E6,E7,F6,F7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Null,0,21,0,29,0,56,0,67
WT,25,0,7,0,14,0,23,0
