# Process single-cell morphology features for CellProfiler readouts - PyBaSiC and CellProfiler Cellpose plugin method

## Import Libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import cells, output

## Set up paths to CellProfiler directory and outputs

In [2]:
# Directory constants (relative paths): where outputs are written and where
# the CellProfiler pipelines folder lives
output_dir = "../../data/Plate2/CellProfiler"
cp_dir = "../../../CellProfiler_pipelines"

## Set up paths to SQLite files and outputs

In [3]:
# Platemap (metadata) csv stored under the CellProfiler pipelines directory
platemap_file = f"{cp_dir}/Metadata/platemap_NF1_CP.csv"

# File name of the .sqlite database and the "sqlite:///"-prefixed string pointing at it
sql_file_pbcellpose = "NF1_data_pybasic_cellpose_plate2.sqlite"
single_cell_file_pbcellpose = (
    f"sqlite:///{cp_dir}/Analysis_Output/Plate2_Output/{sql_file_pbcellpose}"
)

# Destination files inside output_dir: merged single cells, normalized single
# cells, and normalized + feature-selected single cells
sc_output_file_pbcellpose = pathlib.Path(
    f"{output_dir}/nf1_sc_pybasic_cellpose.csv.gz"
)
sc_norm_output_file_pbcellpose = pathlib.Path(
    f"{output_dir}/nf1_sc_norm_pybasic_cellpose.csv.gz"
)
sc_norm_fs_output_file_pbcellpose = pathlib.Path(
    f"{output_dir}/nf1_sc_norm_fs_pybasic_cellpose.csv.gz"
)

## Set up names for linking columns between tables in the database file

In [4]:
# Custom linking columns between compartment tables: for each table, the
# column that points at each table it is linked to
linking_cols = {
    "Per_Cytoplasm": {
        "Per_Cells": "Cytoplasm_Parent_Cells",
        "Per_Nuclei": "Cytoplasm_Parent_Nuclei",
    },
    "Per_Cells": {
        "Per_Cytoplasm": "Cells_Number_Object_Number",
    },
    "Per_Nuclei": {
        "Per_Cytoplasm": "Nuclei_Number_Object_Number",
    },
}

## Load in platemap

In [5]:
# Read the platemap csv into a dataframe and display it
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
platemap_df

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,C,6,C6,NF1,WT
1,C,7,C7,NF1,Null
2,D,6,D6,NF1,WT
3,D,7,D7,NF1,Null
4,E,6,E6,NF1,WT
5,E,7,E7,NF1,Null
6,F,6,F6,NF1,WT
7,F,7,F7,NF1,Null


## Set up `SingleCells` class from Pycytominer

In [6]:
# Build the SingleCells object for the PyBaSiC + Cellpose database:
# three compartment tables linked through the custom linking columns,
# plus the Per_Image table keyed on ImageNumber
sc_pbcellpose = cells.SingleCells(
    sql_file=single_cell_file_pbcellpose,
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    merge_cols=["ImageNumber"],
    image_cols="ImageNumber",
    image_table_name="Per_Image",
    load_image_data=True,
)



## Merge single cells

In [7]:
# Extra keyword arguments passed along with the platemap; "join_on" names the
# pair of columns used for the join (presumably platemap column, then image column)
anno_kwargs = {"join_on": ["Metadata_well_position", "Image_Metadata_Well"]}

# Merge single cells across compartments, supplying the platemap file
sc_df_pbcellpose = sc_pbcellpose.merge_single_cells(platemap=platemap_file, **anno_kwargs)

# Save the merged single-cell table (level 2 data) to its output path
output(df=sc_df_pbcellpose, output_filename=sc_output_file_pbcellpose)

# Report the table dimensions and preview the first rows
print(sc_df_pbcellpose.shape)
sc_df_pbcellpose.head()

(489, 1207)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,NF1,WT,41,1,C6,1,3,1,...,473.89152,524.922002,70.289585,72.963011,71.267385,69.964042,16.701252,16.650668,16.775116,16.813829
1,C,6,NF1,WT,41,1,C6,2,2,2,...,1096.467269,1116.783813,218.281644,192.695133,196.077555,203.201875,58.786191,59.901258,60.431127,57.922474
2,C,6,NF1,WT,41,1,C6,3,7,3,...,197.2589,188.661448,39.4026,41.377134,39.426123,37.220615,21.631577,21.218124,21.627348,21.617526
3,C,6,NF1,WT,41,1,C6,4,4,4,...,436.000193,418.975407,212.12951,206.372847,190.983243,177.085857,59.971583,62.281709,63.020135,58.880631
4,C,6,NF1,WT,41,1,C6,5,6,5,...,486.185578,505.904836,174.081441,175.894007,196.251293,178.208564,19.810783,19.607358,20.160891,20.279006


## Normalize data

In [8]:
# Apply the "standardize" normalization method to the merged single-cell table
normalize_sc_pbcellpose = normalize(profiles=sc_df_pbcellpose, method="standardize")

# Save the normalized table to its output path
output(df=normalize_sc_pbcellpose, output_filename=sc_norm_output_file_pbcellpose)

# Report the table dimensions and preview the first rows
print(normalize_sc_pbcellpose.shape)
normalize_sc_pbcellpose.head()

(489, 1207)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,NF1,WT,41,1,C6,1,3,1,...,-0.593832,-0.555318,-0.739697,-0.726651,-0.744189,-0.733498,-0.550942,-0.54641,-0.552055,-0.546
1,C,6,NF1,WT,41,1,C6,2,2,2,...,-0.199442,-0.169527,-0.464921,-0.499042,-0.510183,-0.480366,-0.392736,-0.382545,-0.386521,-0.389145
2,C,6,NF1,WT,41,1,C6,3,7,3,...,-0.769074,-0.774501,-0.797044,-0.786696,-0.803888,-0.795705,-0.532408,-0.529105,-0.533656,-0.527671
3,C,6,NF1,WT,41,1,C6,4,4,4,...,-0.617836,-0.624377,-0.476344,-0.473041,-0.519734,-0.529983,-0.38828,-0.373527,-0.376704,-0.385489
4,C,6,NF1,WT,41,1,C6,5,6,5,...,-0.586044,-0.567714,-0.546987,-0.530981,-0.509857,-0.52785,-0.539253,-0.535208,-0.539217,-0.532778


## Feature selection

In [9]:
# Feature-selection operations to run on the normalized single-cell table
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# Run the listed operations, then save the feature-selected table
feature_select_norm_sc_pbcellpose = feature_select(
    profiles=normalize_sc_pbcellpose,
    operation=feature_select_ops,
)
output(
    df=feature_select_norm_sc_pbcellpose,
    output_filename=sc_norm_fs_output_file_pbcellpose,
)

# Report the table dimensions and preview the first rows
print(feature_select_norm_sc_pbcellpose.shape)
feature_select_norm_sc_pbcellpose.head()

  c /= stddev[:, None]
  c /= stddev[None, :]


(489, 430)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DAPI_3_00_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_02_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_03_256
0,C,6,NF1,WT,41,1,C6,1,3,1,...,-0.729167,-0.143921,1.57063,0.826506,0.865021,0.520608,2.399301,-0.495785,-0.686681,-0.488469
1,C,6,NF1,WT,41,1,C6,2,2,2,...,0.437948,-0.420335,0.165652,-0.130393,-0.125498,0.078749,1.090608,0.758471,-0.434826,-0.338086
2,C,6,NF1,WT,41,1,C6,3,7,3,...,-1.273252,0.228035,2.136978,0.611131,0.855641,1.109937,1.600118,-1.749432,-0.743925,-0.477399
3,C,6,NF1,WT,41,1,C6,4,4,4,...,-0.009157,0.050257,0.150516,-0.335615,-0.654451,-0.647629,0.674194,-1.031921,-0.53544,-0.354371
4,C,6,NF1,WT,41,1,C6,5,6,5,...,-1.742837,-0.063784,1.116793,2.112533,0.825494,0.629049,1.311322,-0.713744,-0.48421,-0.485137


---

### Visualize basic count statistics

In [10]:
# Count single cells per genotype
sc_df_pbcellpose["Metadata_genotype"].value_counts()

Null    323
WT      166
Name: Metadata_genotype, dtype: int64

In [11]:
# Cross-tabulate single-cell counts: genotype (rows) by well (columns)
pd.crosstab(
    index=sc_df_pbcellpose["Metadata_genotype"],
    columns=sc_df_pbcellpose["Metadata_Well"],
)

Metadata_Well,C6,C7,D6,D7,E6,E7,F6,F7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Null,0,85,0,79,0,81,0,78
WT,33,0,53,0,39,0,41,0
