# Process single cell morphology features for CellProfiler readouts - Plate 2

## Import Libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import cells, output

## Set up paths to CellProfiler directory and outputs

In [2]:
# Relative directory constants used throughout the notebook:
# - output_dir: folder that receives the Plate 2 CellProfiler-derived outputs
# - cp_dir: root of the CellProfiler pipelines directory (inputs)
output_dir = "../../data/Plate2/CellProfiler"
cp_dir = "../../../CellProfiler_pipelines"

## Set paths to the SQLite file, platemap, and output files

In [3]:
# Inputs: the CellProfiler SQLite database for Plate 2 (written with the
# "sqlite:///" prefix) and the platemap CSV holding the well metadata
sql_file_pbcp = "NF1_data_pybasic_cp_plate2.sqlite"
platemap_file = f"{cp_dir}/Metadata/platemap_NF1_CP_Plate2.csv"
single_cell_file_pbcp = (
    f"sqlite:///{cp_dir}/Analysis_Output/Plate2_Output/{sql_file_pbcp}"
)

# Outputs: one gzip-suffixed CSV path per processing stage
# (merged single cells -> normalized -> normalized + feature selected)
sc_output_file_pbcp = pathlib.Path(
    f"{output_dir}/nf1_sc_pybasic_cp_plate2.csv.gz"
)
sc_norm_output_file_pbcp = pathlib.Path(
    f"{output_dir}/nf1_sc_norm_pybasic_cp_plate2.csv.gz"
)
sc_norm_fs_output_file_pbcp = pathlib.Path(
    f"{output_dir}/nf1_sc_norm_fs_pybasic_cp_plate2.csv.gz"
)

## Set up names for linking columns between tables in the database file

In [4]:
# Custom linking columns between the compartment tables.
# Outer key: a compartment table; inner key: the table it links to;
# inner value: the column used for that link.
linking_cols = {
    # Cytoplasm objects record their parent cell and parent nucleus
    "Per_Cytoplasm": {
        "Per_Cells": "Cytoplasm_Parent_Cells",
        "Per_Nuclei": "Cytoplasm_Parent_Nuclei",
    },
    # Cells and nuclei link back to the cytoplasm through their object number
    "Per_Cells": {
        "Per_Cytoplasm": "Cells_Number_Object_Number",
    },
    "Per_Nuclei": {
        "Per_Cytoplasm": "Nuclei_Number_Object_Number",
    },
}

## Load and view platemap file

In [5]:
# Read the platemap CSV into a data frame, then preview its first five rows
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
platemap_df.head(n=5)

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,A,1,A1,NF1,WT
1,A,6,A6,NF1,WT
2,A,7,A7,NF1,Null
3,A,12,A12,NF1,Null
4,B,1,B1,NF1,WT


## Set up `SingleCells` class from Pycytominer

In [6]:
# Instantiate the pycytominer SingleCells class on the CellProfiler SQLite file.
# - compartments / compartment_linking_cols: the three per-object tables and
#   the custom columns (from `linking_cols`) that connect them
# - image_table_name: the image-level table is named "Per_Image" here
# - strata: well and plate columns taken from the image table
# - merge_cols / image_cols: both use ImageNumber only; image_cols is passed
#   as a list (the type the pycytominer API documents) so it matches merge_cols
# NOTE(review): load_image_data is not accepted by every pycytominer release —
# confirm against the installed version before upgrading.
sc_pbcp = cells.SingleCells(
    sql_file=single_cell_file_pbcp,
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    image_table_name="Per_Image",
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
    merge_cols=["ImageNumber"],
    image_cols=["ImageNumber"],
    load_image_data=True,
)



## Merge single cells across compartments and annotate with the platemap

In [7]:
# Extra keyword arguments forwarded to merge_single_cells.
# NOTE(review): join_on presumably pairs the platemap well column with the
# image-table well column — confirm against the pycytominer annotate docs.
anno_kwargs = {
    "join_on": [
        "Metadata_well_position",
        "Image_Metadata_Well",
    ],
}

# Merge the compartment tables into one single-cell data frame, passing the
# platemap so its metadata is attached
sc_df_pbcp = sc_pbcp.merge_single_cells(platemap=platemap_df, **anno_kwargs)

# Write the merged (level 2) single-cell data to its .csv.gz output path
output(sc_df_pbcp, sc_output_file_pbcp)

# Report (rows, columns), then preview the first rows
print(sc_df_pbcp.shape)
sc_df_pbcp.head()

(1693, 1209)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,7,1,A1,1,5,1,...,280.520646,273.351254,53.686773,55.955022,64.611044,54.947705,55.544501,54.2913,55.150837,55.177395
1,A,1,NF1,WT,7,1,A1,2,6,2,...,365.11279,362.980311,70.90259,63.88953,66.37022,74.359993,19.934434,20.208669,20.080833,19.993572
2,A,1,NF1,WT,7,1,A1,3,7,3,...,71.222079,73.047405,66.100169,68.757536,73.428729,70.643762,66.539723,62.693809,64.268234,64.801753
3,A,1,NF1,WT,7,1,A1,4,8,4,...,254.032758,241.14965,34.454894,36.484509,40.140021,34.244751,30.783978,30.197546,31.491141,31.491939
4,A,1,NF1,WT,7,1,A1,5,9,5,...,300.624328,298.341883,219.88761,221.224759,226.438659,224.156695,113.106798,108.369583,110.599379,113.678538


## Normalize Data

In [8]:
# Normalize the merged single-cell data with pycytominer's "standardize" method
normalize_sc_df_pbcp = normalize(sc_df_pbcp, method="standardize")

# Write the normalized single-cell data to its .csv.gz output path
output(normalize_sc_df_pbcp, sc_norm_output_file_pbcp)

# Report (rows, columns), then preview the first rows
print(normalize_sc_df_pbcp.shape)
normalize_sc_df_pbcp.head()

(1693, 1209)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,7,1,A1,1,5,1,...,-0.473274,-0.47987,-0.534766,-0.52669,-0.508151,-0.530588,-0.272119,-0.273001,-0.275923,-0.268837
1,A,1,NF1,WT,7,1,A1,2,6,2,...,-0.375152,-0.374361,-0.490004,-0.505724,-0.503552,-0.479163,-0.442185,-0.436948,-0.44492,-0.438218
2,A,1,NF1,WT,7,1,A1,3,7,3,...,-0.716049,-0.715662,-0.502491,-0.492861,-0.485096,-0.489007,-0.219608,-0.232583,-0.231987,-0.222504
3,A,1,NF1,WT,7,1,A1,4,8,4,...,-0.503999,-0.517777,-0.58477,-0.578139,-0.572135,-0.585433,-0.39037,-0.388899,-0.389935,-0.382863
4,A,1,NF1,WT,7,1,A1,5,9,5,...,-0.449955,-0.450451,-0.102636,-0.089981,-0.085027,-0.082334,0.002787,-0.012869,-0.008725,0.012797


## Feature Selection

In [9]:
# Feature selection operations passed to pycytominer, in the order listed
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# Apply the selected operations to the normalized single-cell data
feature_select_norm_sc_df_pbcp = feature_select(
    normalize_sc_df_pbcp, operation=feature_select_ops
)

# Write the normalized, feature-selected data to its .csv.gz output path
output(feature_select_norm_sc_df_pbcp, sc_norm_fs_output_file_pbcp)

# Report (rows, columns), then preview the first rows
print(feature_select_norm_sc_df_pbcp.shape)
feature_select_norm_sc_df_pbcp.head()

(1693, 411)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DAPI_3_02_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_SumEntropy_DAPI_3_02_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_03_256
0,A,1,NF1,WT,7,1,A1,1,5,1,...,1.291065,1.225985,1.173539,1.381689,0.195047,1.088877,2.57988,-0.208679,-0.500045,-0.188397
1,A,1,NF1,WT,7,1,A1,2,6,2,...,0.425598,-0.234417,-0.588608,-0.627445,-0.161176,0.654427,1.241258,0.16231,-0.444274,-0.38888
2,A,1,NF1,WT,7,1,A1,3,7,3,...,0.775996,-0.425222,-0.684639,0.005761,0.563572,0.454405,-0.865679,-1.895556,-0.481339,-0.225384
3,A,1,NF1,WT,7,1,A1,4,8,4,...,1.126244,0.815001,0.68548,0.824516,0.117949,1.269003,1.735603,-0.356326,-0.552462,-0.320972
4,A,1,NF1,WT,7,1,A1,5,9,5,...,0.918591,0.53317,0.505952,0.811728,-0.242043,-0.940439,-0.13922,-0.009618,-0.095167,0.040681


---

### Summarize basic cell count statistics

In [10]:
# Count the rows (single cells) of the merged data per genotype
sc_df_pbcp["Metadata_genotype"].value_counts()

Null    1000
WT       693
Name: Metadata_genotype, dtype: int64

In [11]:
# Cross-tabulate row (single cell) counts: genotype as rows, well as columns
pd.crosstab(
    index=sc_df_pbcp["Metadata_genotype"],
    columns=sc_df_pbcp["Metadata_Well"],
)

Metadata_Well,A1,A12,A6,A7,B1,B12,B6,B7,C1,C12,...,F6,F7,G1,G12,G6,G7,H1,H12,H6,H7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Null,0,44,0,53,0,48,0,59,0,64,...,0,77,0,54,0,72,0,53,0,59
WT,48,0,51,0,49,0,48,0,55,0,...,44,0,55,0,33,0,26,0,29,0
