# Process single-cell morphology features from CellProfiler readouts

## Import Libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import cells, output

## Set up paths to CellProfiler directory and outputs

In [2]:
# Set file and directory constants
# Relative path to the directory holding the CellProfiler pipelines, outputs and metadata
cp_dir = "../CellProfiler_pipelines"
# Directory that receives the processed single-cell files written by this notebook
output_dir = "data"

# Create the output directory up front so that writing results does not fail
# on a fresh checkout where it does not exist yet (no-op if already present)
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

### Plate 1

In [3]:
# Plate 1 inputs: the CellProfiler SQLite database (as a sqlite:/// URL) and the platemap CSV
sql_file = "NF1_data.sqlite"
single_cell_file = "sqlite:///" + "/".join(
    [cp_dir, "Analysis_Output", "Plate1_Output", sql_file]
)
platemap_file = "/".join([cp_dir, "Metadata", "platemap_NF1_CP.csv"])

# Plate 1 outputs: merged, normalized, and normalized + feature-selected single-cell files
sc_output_file = pathlib.Path(output_dir, "nf1_sc_cellprofiler.csv.gz")
sc_norm_output_file = pathlib.Path(output_dir, "nf1_sc_norm_cellprofiler.csv.gz")
sc_norm_fs_output_file = pathlib.Path(output_dir, "nf1_sc_norm_fs_cellprofiler.csv.gz")

### Plate 2

In [4]:
# Plate 2 inputs: the CellProfiler SQLite database (as a sqlite:/// URL) and the platemap CSV
sql_file2 = "NF1_data_plate2.sqlite"
single_cell_file2 = "sqlite:///" + "/".join(
    [cp_dir, "Analysis_Output", "Plate2_Output", sql_file2]
)
platemap_file2 = "/".join([cp_dir, "Metadata", "platemap_NF1_CP_Plate2.csv"])

# Plate 2 outputs: merged, normalized, and normalized + feature-selected single-cell files
sc_output_file2 = pathlib.Path(output_dir, "nf1_sc_cellprofiler_plate2.csv.gz")
sc_norm_output_file2 = pathlib.Path(output_dir, "nf1_sc_norm_cellprofiler_plate2.csv.gz")
sc_norm_fs_output_file2 = pathlib.Path(output_dir, "nf1_sc_norm_fs_cellprofiler_plate2.csv.gz")

## Set up names for linking columns between tables in the database file

In [5]:
# Custom linking columns between compartment tables.
# Outer key: compartment table; inner key: the table it links to;
# inner value: the column used for that link.
linking_cols = dict(
    Per_Cytoplasm=dict(
        Per_Cells="Cytoplasm_Parent_Cells",
        Per_Nuclei="Cytoplasm_Parent_OrigNuclei",
    ),
    Per_Cells=dict(Per_Cytoplasm="Cells_Number_Object_Number"),
    Per_Nuclei=dict(Per_Cytoplasm="Nuclei_Number_Object_Number"),
)

## Plate 1

### Load and view platemap file

In [6]:
# Read the Plate 1 platemap CSV into a DataFrame and display it
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
platemap_df

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,C,6,C6,NF1,WT
1,C,7,C7,NF1,Null
2,D,6,D6,NF1,WT
3,D,7,D7,NF1,Null
4,E,6,E6,NF1,WT
5,E,7,E7,NF1,Null
6,F,6,F6,NF1,WT
7,F,7,F7,NF1,Null


### Set up `SingleCells` class from Pycytominer

In [7]:
# Build the pycytominer SingleCells object for the Plate 1 database
sc = cells.SingleCells(
    sql_file=single_cell_file,
    # Image-level table settings
    image_table_name="Per_Image",
    image_cols="ImageNumber",
    load_image_data=True,
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
    # Compartment tables, the columns linking them, and the merge key
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    merge_cols=["ImageNumber"],
)



### Merge single cells 

In [8]:
# Columns used to join the platemap onto the single-cell table
# (platemap well position <-> well recorded in the image table)
anno_kwargs = dict(join_on=["Metadata_well_position", "Image_Metadata_Well"])

# Merge the compartment tables into one table and annotate it with the platemap
sc_df = sc.merge_single_cells(platemap=platemap_df, **anno_kwargs)

# Write the merged (level 2) table to the Plate 1 output file
output(sc_df, sc_output_file)

# Report the table dimensions, then preview the first rows
print(sc_df.shape)
sc_df.head()

(149, 1054)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_OrigNuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,NF1,WT,1,1,C6,1,4,1,...,1778.355949,1715.661141,306.13973,295.581509,310.469726,287.78839,496.084704,502.046808,490.259298,491.171009
1,C,6,NF1,WT,1,1,C6,2,5,2,...,366.696473,320.304744,312.669442,314.123609,330.563627,295.428066,99.874165,100.19489,104.700258,99.916735
2,C,6,NF1,WT,1,1,C6,3,7,3,...,356.359632,379.334116,419.277399,366.291857,365.844449,341.137003,104.292865,102.844307,103.764869,103.749468
3,C,6,NF1,WT,1,1,C6,4,8,4,...,784.257119,747.557748,390.160802,398.535455,394.923449,359.749244,213.883176,223.214126,225.159172,210.879537
4,C,6,NF1,WT,4,1,C6,1,3,1,...,558.440195,495.532894,75.455753,72.548299,75.497862,70.903668,149.088921,146.259081,149.11093,149.976102


### Normalize data

In [9]:
# Standardize the merged Plate 1 single-cell table
normalize_sc_df = normalize(profiles=sc_df, method="standardize")

# Write the normalized table to the Plate 1 normalized output file
output(normalize_sc_df, sc_norm_output_file)

# Report the table dimensions, then preview the first rows
print(normalize_sc_df.shape)
normalize_sc_df.head()

(149, 1054)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_OrigNuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,NF1,WT,1,1,C6,1,4,1,...,3.14154,3.202273,-0.097356,-0.096165,-0.094202,-0.106456,3.337969,3.350528,3.278168,3.310371
1,C,6,NF1,WT,1,1,C6,2,5,2,...,0.315924,0.258633,-0.087971,-0.069493,-0.065539,-0.095377,0.314776,0.31392,0.34842,0.318693
2,C,6,NF1,WT,1,1,C6,3,7,3,...,0.295233,0.383161,0.065251,0.00555,-0.015212,-0.029087,0.348492,0.33394,0.341312,0.347999
3,C,6,NF1,WT,1,1,C6,4,8,4,...,1.151725,1.159965,0.023403,0.051931,0.026268,-0.002094,1.184695,1.243519,1.263751,1.167156
4,C,6,NF1,WT,4,1,C6,1,3,1,...,0.699723,0.628294,-0.428904,-0.416992,-0.429383,-0.420997,0.690298,0.662006,0.685883,0.701466


### Feature selection

In [10]:
# Feature selection operations handed to pycytominer
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# Apply feature selection to the normalized Plate 1 table.
# NOTE(review): the saved output of this cell contains numpy warning fragments
# (`c /= stddev[...]`); presumably some features have zero variance when
# correlations are computed - confirm.
feature_select_norm_sc_df = feature_select(
    profiles=normalize_sc_df, operation=feature_select_ops
)

# Write the feature-selected table to the Plate 1 feature-selected output file
output(feature_select_norm_sc_df, sc_norm_fs_output_file)

# Report the table dimensions, then preview the first rows
print(feature_select_norm_sc_df.shape)
feature_select_norm_sc_df.head()

  c /= stddev[:, None]
  c /= stddev[None, :]


(149, 443)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_OrigNuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InfoMeas1_RFP_3_03_256,Nuclei_Texture_InfoMeas2_GFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_01_256
0,C,6,NF1,WT,1,1,C6,1,4,1,...,-1.197488,0.289091,0.969456,1.131385,1.30368,1.416917,-0.079438,-2.315521,-1.69321,2.881199
1,C,6,NF1,WT,1,1,C6,2,5,2,...,0.188414,0.611666,0.481954,0.748184,0.750277,0.511083,-0.065958,-1.460076,-1.427579,0.304121
2,C,6,NF1,WT,1,1,C6,3,7,3,...,-1.087258,0.843883,-0.214887,0.238299,0.482832,1.26495,-0.069749,-1.841707,-0.798368,0.257284
3,C,6,NF1,WT,1,1,C6,4,8,4,...,-1.250742,0.638684,1.163023,1.062039,1.082605,1.38685,-0.272864,-1.789888,-1.432404,1.083761
4,C,6,NF1,WT,4,1,C6,1,3,1,...,-0.258815,-2.222128,-0.048779,0.504843,1.34083,0.924382,0.612704,-2.158178,-1.781201,0.518641


---

### Summarize cell counts per genotype and well for Plate 1

In [11]:
# Count rows of the merged Plate 1 table per genotype
sc_df["Metadata_genotype"].value_counts()

Null    116
WT       33
Name: Metadata_genotype, dtype: int64

In [12]:
# Cross-tabulate row counts: genotype (rows) by well (columns) for Plate 1
pd.crosstab(index=sc_df["Metadata_genotype"], columns=sc_df["Metadata_Well"])

Metadata_Well,C6,C7,D6,D7,E6,E7,F6,F7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Null,0,12,0,14,0,44,0,46
WT,12,0,5,0,9,0,7,0


---

## Plate 2

### Load and view platemap file

In [13]:
# Read the Plate 2 platemap CSV into a DataFrame and display it
platemap_df2 = pd.read_csv(filepath_or_buffer=platemap_file2)
platemap_df2

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,A,1,A1,NF1,WT
1,A,6,A6,NF1,WT
2,A,7,A7,NF1,Null
3,A,12,A12,NF1,Null
4,B,1,B1,NF1,WT
5,B,6,B6,NF1,WT
6,B,7,B7,NF1,Null
7,B,12,B12,NF1,Null
8,C,1,C1,NF1,WT
9,C,6,C6,NF1,WT


### Set up `SingleCells` class from Pycytominer

In [14]:
# Build the pycytominer SingleCells object for the Plate 2 database
sc2 = cells.SingleCells(
    sql_file=single_cell_file2,
    # Image-level table settings
    image_table_name="Per_Image",
    image_cols="ImageNumber",
    load_image_data=True,
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
    # Compartment tables, the columns linking them, and the merge key
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    merge_cols=["ImageNumber"],
)



### Merge single cells 

In [15]:
# Columns used to join the platemap onto the single-cell table
# (platemap well position <-> well recorded in the image table)
anno_kwargs = dict(join_on=["Metadata_well_position", "Image_Metadata_Well"])

# Merge the compartment tables into one table and annotate it with the platemap
sc_df2 = sc2.merge_single_cells(platemap=platemap_df2, **anno_kwargs)

# Write the merged (level 2) table to the Plate 2 output file
output(sc_df2, sc_output_file2)

# Report the table dimensions, then preview the first rows
print(sc_df2.shape)
sc_df2.head()

(1078, 1054)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_OrigNuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,5,1,A1,1,5,1,...,99.246431,127.269878,64.592939,66.0357,69.603941,68.479207,37.257204,35.536674,36.570249,37.428376
1,A,1,NF1,WT,5,1,A1,2,6,2,...,160.7511,192.118563,53.48185,55.783816,56.522494,54.131093,53.59517,51.265407,52.753667,55.693539
2,A,1,NF1,WT,5,1,A1,3,7,3,...,132.813377,133.856803,363.894439,336.67715,389.377029,428.473729,42.007024,41.871654,42.511226,42.716327
3,A,1,NF1,WT,5,1,A1,4,8,4,...,99.931645,103.010625,149.103688,150.211513,146.167236,147.193631,33.981317,34.0892,33.52989,34.066777
4,A,1,NF1,WT,5,1,A1,5,9,5,...,313.800603,306.992919,80.046977,81.488832,94.438209,81.618378,108.396733,108.807666,109.200151,108.207648


### Normalize data

In [16]:
# Standardize the merged Plate 2 single-cell table
normalize_sc_df2 = normalize(profiles=sc_df2, method="standardize")

# Write the normalized table to the Plate 2 normalized output file
output(normalize_sc_df2, sc_norm_output_file2)

# Report the table dimensions, then preview the first rows
print(normalize_sc_df2.shape)
normalize_sc_df2.head()

(1078, 1054)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_OrigNuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,5,1,A1,1,5,1,...,-0.276551,-0.222624,-0.561033,-0.555776,-0.553759,-0.554094,-0.264862,-0.267745,-0.270101,-0.260436
1,A,1,NF1,WT,5,1,A1,2,6,2,...,-0.210215,-0.150903,-0.581121,-0.574565,-0.577509,-0.580572,-0.19729,-0.202614,-0.202862,-0.184812
2,A,1,NF1,WT,5,1,A1,3,7,3,...,-0.240347,-0.215339,-0.01991,-0.059766,0.026809,0.110239,-0.245217,-0.241513,-0.245417,-0.238542
3,A,1,NF1,WT,5,1,A1,4,8,4,...,-0.275812,-0.249454,-0.408241,-0.401505,-0.414754,-0.408835,-0.278411,-0.273739,-0.282733,-0.274354
4,A,1,NF1,WT,5,1,A1,5,9,5,...,-0.045142,-0.023855,-0.533093,-0.527455,-0.508671,-0.529847,0.029364,0.035663,0.031663,0.032616


### Feature selection

In [17]:
# Feature selection operations handed to pycytominer
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# Apply feature selection to the normalized Plate 2 table.
# NOTE(review): the saved output of this cell contains numpy warning fragments
# (`c /= stddev[...]`); presumably some features have zero variance when
# correlations are computed - confirm.
feature_select_norm_sc_df2 = feature_select(
    profiles=normalize_sc_df2, operation=feature_select_ops
)

# Write the feature-selected table to the Plate 2 feature-selected output file
output(feature_select_norm_sc_df2, sc_norm_fs_output_file2)

# Report the table dimensions, then preview the first rows
print(feature_select_norm_sc_df2.shape)
feature_select_norm_sc_df2.head()

  c /= stddev[:, None]
  c /= stddev[None, :]


(1078, 397)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_OrigNuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InfoMeas1_RFP_3_03_256,Nuclei_Texture_InfoMeas2_GFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_SumVariance_GFP_3_01_256,Nuclei_Texture_SumVariance_RFP_3_03_256
0,A,1,NF1,WT,5,1,A1,1,5,1,...,-0.098599,-1.244455,-0.578066,-1.607087,-1.574069,0.447549,0.141551,-0.733453,-0.5425,-0.222624
1,A,1,NF1,WT,5,1,A1,2,6,2,...,-0.187982,-1.572139,-0.251567,-0.672769,-0.494294,0.570318,1.162374,-0.712842,-0.543692,-0.150903
2,A,1,NF1,WT,5,1,A1,3,7,3,...,0.438222,0.591007,-0.856059,-1.347109,-0.530479,-0.109228,-0.243271,-0.386558,-0.139043,-0.215339
3,A,1,NF1,WT,5,1,A1,4,8,4,...,0.359058,-0.145504,-0.097314,-0.715362,-0.718491,-0.17737,-0.159728,0.063771,-0.421018,-0.249454
4,A,1,NF1,WT,5,1,A1,5,9,5,...,0.425018,-0.28526,0.815302,0.054524,-0.22338,0.114204,0.703012,-1.357233,-0.508258,-0.023855


---

### Summarize cell counts per genotype and well for Plate 2

In [18]:
# Count rows of the merged Plate 2 table per genotype
sc_df2["Metadata_genotype"].value_counts()

Null    678
WT      400
Name: Metadata_genotype, dtype: int64

In [19]:
# Cross-tabulate row counts: genotype (rows) by well (columns) for Plate 2
pd.crosstab(index=sc_df2["Metadata_genotype"], columns=sc_df2["Metadata_Well"])

Metadata_Well,A1,A12,A6,A7,B1,B12,B6,B7,C1,C12,...,F6,F7,G1,G12,G6,G7,H1,H12,H6,H7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Null,0,27,0,32,0,32,0,36,0,41,...,0,50,0,33,0,49,0,35,0,45
WT,30,0,35,0,23,0,30,0,34,0,...,24,0,35,0,12,0,14,0,19,0
