# Process single-cell morphology features from CellProfiler readouts - All CellProfiler method

## Import Libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import cells, output

## Set up paths to CellProfiler directory and outputs

In [2]:
# Directory constants (relative to this notebook's location):
#   cp_dir     - root directory of the CellProfiler pipelines
#   output_dir - directory that receives the processed single-cell files
output_dir = "../../data/Plate2/CellProfiler"
cp_dir = "../../../CellProfiler_pipelines"

## Set paths to the SQLite file, platemap, and output files

In [3]:
# Inputs: the CellProfiler .sqlite database (as a connection string) and the platemap CSV
sql_file = "NF1_data_allcp_plate2.sqlite"
platemap_file = f"{cp_dir}/Metadata/platemap_NF1_CP_Plate2.csv"
single_cell_file = f"sqlite:///{cp_dir}/Analysis_Output/Plate2_Output/{sql_file}"

# Outputs: one gzipped CSV per processing stage (merged, normalized, feature selected)
sc_output_file = pathlib.Path(
    f"{output_dir}/nf1_sc_cellprofiler_plate2.csv.gz"
)
sc_norm_output_file = pathlib.Path(
    f"{output_dir}/nf1_sc_norm_cellprofiler_plate2.csv.gz"
)
sc_norm_fs_output_file = pathlib.Path(
    f"{output_dir}/nf1_sc_norm_fs_cellprofiler_plate2.csv.gz"
)

## Set up names for linking columns between tables in the database file

In [4]:
# Map each compartment table to the column that links it to the other
# compartment tables in the database file
linking_cols = {
    "Per_Cytoplasm": {
        "Per_Cells": "Cytoplasm_Parent_Cells",
        "Per_Nuclei": "Cytoplasm_Parent_Nuclei",
    },
    "Per_Cells": {
        "Per_Cytoplasm": "Cells_Number_Object_Number",
    },
    "Per_Nuclei": {
        "Per_Cytoplasm": "Nuclei_Number_Object_Number",
    },
}

## Load and view platemap file

In [5]:
# Read the platemap CSV into a data frame and preview the first rows
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
platemap_df.head()

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,A,1,A1,NF1,WT
1,A,6,A6,NF1,WT
2,A,7,A7,NF1,Null
3,A,12,A12,NF1,Null
4,B,1,B1,NF1,WT


## Set up `SingleCells` class from Pycytominer

In [6]:
# Build the SingleCells object that reads the CellProfiler database
sc = cells.SingleCells(
    # database connection string
    sql_file=single_cell_file,
    # image table settings
    image_table_name="Per_Image",
    image_cols="ImageNumber",
    load_image_data=True,
    # compartment tables and the columns linking them
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    merge_cols=["ImageNumber"],
    # grouping columns
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
)



## Merge single cells across compartments and annotate with the platemap

In [7]:
# Merge single cells across compartments and annotate them with the platemap.
# join_on lists the two well columns used to match platemap rows to single cells.
anno_kwargs = {"join_on": ["Metadata_well_position", "Image_Metadata_Well"]}

sc_df = sc.merge_single_cells(
    platemap=platemap_df,
    **anno_kwargs,
)

# Make sure the output directory exists so the write below does not fail
# on a fresh checkout where the data folder has not been created yet
sc_output_file.parent.mkdir(parents=True, exist_ok=True)

# Save level 2 data as a csv
output(sc_df, sc_output_file)

print(sc_df.shape)
sc_df.head()

(1681, 1209)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,5,1,A1,1,5,1,...,274.128177,256.161575,56.498892,52.447198,60.283678,56.1033,23.386889,24.063035,23.415663,23.086004
1,A,1,NF1,WT,5,1,A1,2,6,2,...,394.820673,399.201706,71.658509,73.598676,75.209149,73.926833,27.980848,27.97031,27.940705,27.981245
2,A,1,NF1,WT,5,1,A1,3,7,3,...,214.041285,212.730558,39.903007,41.244305,40.393178,39.882608,42.105095,40.796027,41.380481,42.911391
3,A,1,NF1,WT,5,1,A1,4,8,4,...,228.770343,218.898123,67.635275,64.743992,62.738745,64.846021,42.515068,43.729601,43.846305,42.286463
4,A,1,NF1,WT,5,1,A1,5,9,5,...,229.947359,227.989432,64.592939,66.0357,69.603941,68.479207,37.257204,35.536674,36.570249,37.428376


## Normalize Data

In [8]:
# Standardize the merged single-cell profiles
normalize_sc_df = normalize(sc_df, method="standardize")

# Write the normalized profiles to file
output(normalize_sc_df, sc_norm_output_file)

# Report the dimensions and preview the first rows
print(normalize_sc_df.shape)
normalize_sc_df.head()

(1681, 1209)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,5,1,A1,1,5,1,...,-0.49535,-0.514045,-0.55561,-0.559995,-0.548735,-0.555726,-0.348458,-0.340556,-0.350308,-0.34604
1,A,1,NF1,WT,5,1,A1,2,6,2,...,-0.36401,-0.35587,-0.526935,-0.519503,-0.520462,-0.52141,-0.326066,-0.321458,-0.328143,-0.322053
2,A,1,NF1,WT,5,1,A1,3,7,3,...,-0.560737,-0.562072,-0.587002,-0.581442,-0.586415,-0.586955,-0.257219,-0.258769,-0.262309,-0.248892
3,A,1,NF1,WT,5,1,A1,4,8,4,...,-0.544709,-0.555251,-0.534545,-0.536454,-0.544085,-0.538894,-0.255221,-0.244431,-0.250231,-0.251954
4,A,1,NF1,WT,5,1,A1,5,9,5,...,-0.543428,-0.545198,-0.5403,-0.533981,-0.53108,-0.531899,-0.28085,-0.284476,-0.285872,-0.27576


## Feature Selection

In [9]:
# Feature selection operations to apply to the normalized profiles
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# Run feature selection on the normalized single-cell profiles
feature_select_norm_sc_df = feature_select(
    normalize_sc_df, operation=feature_select_ops
)

# Write the feature-selected profiles to file
output(feature_select_norm_sc_df, sc_norm_fs_output_file)

# Report the dimensions and preview the first rows
print(feature_select_norm_sc_df.shape)
feature_select_norm_sc_df.head()

  c /= stddev[:, None]
  c /= stddev[None, :]


(1681, 408)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DAPI_3_00_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_SumEntropy_DAPI_3_02_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_03_256
0,A,1,NF1,WT,5,1,A1,1,5,1,...,-0.364244,1.094257,0.321026,-0.492136,-0.175,0.858842,1.222413,-0.242247,-0.528095,-0.306101
1,A,1,NF1,WT,5,1,A1,2,6,2,...,0.631454,-1.713055,-2.008387,-1.501222,-0.193657,0.810932,-0.794624,0.25753,-0.508496,-0.315519
2,A,1,NF1,WT,5,1,A1,3,7,3,...,0.884551,-0.875895,-1.017064,-0.192425,0.103023,0.874961,-0.657868,-0.390421,-0.56538,-0.223078
3,A,1,NF1,WT,5,1,A1,4,8,4,...,1.137054,1.20559,1.279489,1.366709,0.164872,0.626944,3.100735,-0.332243,-0.512645,-0.186787
4,A,1,NF1,WT,5,1,A1,5,9,5,...,-0.535551,-1.56283,-1.54328,0.469955,0.191673,0.095901,-0.724712,-0.415192,-0.510776,-0.23283


---

### Summarize basic count statistics

In [10]:
# Number of single cells per genotype
sc_df["Metadata_genotype"].value_counts()

Null    994
WT      687
Name: Metadata_genotype, dtype: int64

In [11]:
# Single-cell counts for each genotype (rows) in each well (columns)
pd.crosstab(
    index=sc_df["Metadata_genotype"],
    columns=sc_df["Metadata_Well"],
)

Metadata_Well,A1,A12,A6,A7,B1,B12,B6,B7,C1,C12,...,F6,F7,G1,G12,G6,G7,H1,H12,H6,H7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Null,0,44,0,52,0,47,0,61,0,62,...,0,74,0,53,0,73,0,52,0,58
WT,47,0,52,0,48,0,47,0,54,0,...,45,0,54,0,32,0,27,0,29,0
