# Process single cell morphology features for CellProfiler readouts - PyBaSiC and CellProfiler Method

## Import Libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import cells, output, util

## Set up paths to CellProfiler directory and outputs

In [2]:
# Directory (relative to this notebook) that receives the processed Plate 1 single-cell files
output_dir = "../../data/Plate1/CellProfiler"
# Root of the CellProfiler pipelines folder; the SQLite output and platemap paths are built from it
cp_dir = "../../../CellProfiler_pipelines"

## Set up paths to the SQLite file, platemap, and output files

In [3]:
# Name of the CellProfiler .sqlite file, its location written as a sqlite:/// URL,
# and the path to the platemap metadata
sql_file_pbcp = "NF1_data_pybasic_cp_plate1.sqlite"
single_cell_file_pbcp = f"sqlite:///{cp_dir}/Analysis_Output/Plate1_Output/{sql_file_pbcp}"
platemap_file = f"{cp_dir}/Metadata/platemap_NF1_CP.csv"

# Create the output directory up front: the CSV writes later in the notebook fail
# if the target directory does not exist (e.g. on a fresh checkout)
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Output files for the merged, normalized, and normalized + feature-selected data
sc_output_file_pbcp = pathlib.Path(f"{output_dir}/nf1_sc_pybasic_cellprofiler.csv.gz")
sc_norm_output_file_pbcp = pathlib.Path(f"{output_dir}/nf1_sc_norm_pybasic_cellprofiler.csv.gz")
sc_norm_fs_output_file_pbcp = pathlib.Path(f"{output_dir}/nf1_sc_norm_fs_pybasic_cellprofiler.csv.gz")

## Set up names for linking columns between tables in the database file

In [4]:
# Custom linking columns between compartment tables.
# Outer key: a compartment table; inner key: the table it links to;
# value: the column used for that link.
linking_cols = {
    "Per_Cytoplasm": {
        "Per_Cells": "Cytoplasm_Parent_Cells",
        "Per_Nuclei": "Cytoplasm_Parent_Nuclei",
    },
    "Per_Cells": {
        "Per_Cytoplasm": "Cells_Number_Object_Number",
    },
    "Per_Nuclei": {
        "Per_Cytoplasm": "Nuclei_Number_Object_Number",
    },
}

## Load in platemap

In [5]:
# Read the platemap CSV into a DataFrame and display it for a visual check
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
platemap_df

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,C,6,C6,NF1,WT
1,C,7,C7,NF1,Null
2,D,6,D6,NF1,WT
3,D,7,D7,NF1,Null
4,E,6,E6,NF1,WT
5,E,7,E7,NF1,Null
6,F,6,F6,NF1,WT
7,F,7,F7,NF1,Null


## Set up `SingleCells` class from Pycytominer

In [6]:
# Build the pycytominer SingleCells object for the PyBaSiC + CellProfiler database
sc_pbcp = cells.SingleCells(
    # Database location and the compartment tables to read from it
    sql_file=single_cell_file_pbcp,
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    # Image table settings
    image_table_name="Per_Image",
    load_image_data=True,
    image_cols=["ImageNumber"],
    merge_cols=["ImageNumber"],
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
    # Image-level feature settings
    add_image_features=True,
    image_feature_categories=["Correlation", "Texture", "Granularity"],
)



## Merge single cells and annotate with platemap metadata

In [7]:
# Extra keyword arguments forwarded through merge_single_cells: the pair of columns
# used to join the platemap onto the single-cell table
anno_kwargs = dict(join_on=["Metadata_well_position", "Image_Metadata_Well"])

# Merge the compartment tables into one row per cell and attach the platemap metadata
sc_df_pbcp = sc_pbcp.merge_single_cells(platemap=platemap_file, **anno_kwargs)

# Write the merged (level 2) single-cell data to disk
output(sc_df_pbcp, sc_output_file_pbcp)

# Report dimensions and preview the first rows
print(sc_df_pbcp.shape)
sc_df_pbcp.head()

(241, 1209)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,NF1,WT,1,1,C6,1,3,1,...,1238.72285,1179.250712,655.943302,615.439093,607.532978,583.378878,262.401005,257.912781,263.508342,265.420711
1,C,6,NF1,WT,1,1,C6,2,4,2,...,1321.816347,1230.24571,514.140384,494.307862,498.533943,448.867667,96.646848,98.695691,103.741923,95.047508
2,C,6,NF1,WT,1,1,C6,3,5,3,...,1186.118097,1152.224625,291.519216,265.656395,266.725373,238.545128,91.570219,87.388137,98.279571,97.734207
3,C,6,NF1,WT,1,1,C6,4,7,4,...,865.094181,839.097694,218.430012,194.968446,191.831872,193.43147,93.765059,93.809022,91.933528,91.105865
4,C,6,NF1,WT,1,1,C6,5,8,5,...,1159.793785,1147.506816,303.06524,296.152842,306.75981,282.561259,616.983573,620.487106,606.813743,608.887347


## Normalize data

In [8]:
# Standardize the merged single-cell features
normalize_sc_pbcp = normalize(profiles=sc_df_pbcp, method="standardize")

# Write the normalized data to disk
output(normalize_sc_pbcp, sc_norm_output_file_pbcp)

# Report dimensions and preview the first rows
print(normalize_sc_pbcp.shape)
normalize_sc_pbcp.head()

(241, 1209)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,NF1,WT,1,1,C6,1,3,1,...,-0.44921,-0.45687,0.795275,0.73886,0.676636,0.658115,1.079249,1.051704,1.084199,1.111437
1,C,6,NF1,WT,1,1,C6,2,4,2,...,-0.329431,-0.379423,0.474068,0.460919,0.431895,0.351245,0.080868,0.096177,0.123054,0.075878
2,C,6,NF1,WT,1,1,C6,3,5,3,...,-0.525041,-0.497915,-0.030206,-0.063733,-0.088596,-0.12858,0.05029,0.028316,0.090193,0.092208
3,C,6,NF1,WT,1,1,C6,4,7,4,...,-0.987799,-0.973467,-0.195765,-0.22593,-0.256758,-0.231501,0.06351,0.06685,0.052016,0.05192
4,C,6,NF1,WT,1,1,C6,5,8,5,...,-0.562987,-0.50508,-0.004053,0.006243,0.001295,-0.028163,3.214995,3.227661,3.149502,3.19909


## Feature selection

In [9]:
# Feature selection operations to run on the normalized data
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# Run the selected operations on the normalized single-cell data
feature_select_norm_sc_pbcp = feature_select(
    profiles=normalize_sc_pbcp,
    operation=feature_select_ops,
)

# Write the normalized, feature-selected data to disk
output(feature_select_norm_sc_pbcp, sc_norm_fs_output_file_pbcp)

# Report dimensions and preview the first rows
print(feature_select_norm_sc_pbcp.shape)
feature_select_norm_sc_pbcp.head()

(241, 446)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InfoMeas2_DAPI_3_03_256,Nuclei_Texture_InfoMeas2_GFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_00_256,Nuclei_Texture_SumEntropy_DAPI_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_SumVariance_RFP_3_03_256
0,C,6,NF1,WT,1,1,C6,1,3,1,...,0.700737,0.965755,0.438415,0.534472,0.943146,1.380947,-0.347949,1.057741,-0.273991,1.118008
1,C,6,NF1,WT,1,1,C6,2,4,2,...,0.697642,1.122196,-0.156684,0.382543,1.009916,0.293588,-1.463565,1.105301,-0.168553,0.042082
2,C,6,NF1,WT,1,1,C6,3,5,3,...,0.505022,0.664303,-0.671115,-0.703113,0.135464,0.82772,-0.240754,0.971345,-0.42414,0.116954
3,C,6,NF1,WT,1,1,C6,4,7,4,...,0.41972,0.305376,-0.10989,0.498398,1.252383,0.271598,-0.113838,0.768712,-0.837286,0.021548
4,C,6,NF1,WT,1,1,C6,5,8,5,...,0.68671,0.586884,1.036074,1.178635,1.28966,1.428171,-0.765977,0.963674,-0.493886,3.089762


---

### Summarize basic count statistics

In [10]:
# Number of single cells per genotype
sc_df_pbcp["Metadata_genotype"].value_counts()

Null    172
WT       69
Name: Metadata_genotype, dtype: int64

In [11]:
# Single-cell counts broken down by genotype (rows) and well (columns)
pd.crosstab(
    index=sc_df_pbcp["Metadata_genotype"],
    columns=sc_df_pbcp["Metadata_Well"],
)

Metadata_Well,C6,C7,D6,D7,E6,E7,F6,F7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Null,0,22,0,27,0,56,0,67
WT,25,0,7,0,15,0,22,0
