# Process single cell morphology features for CellProfiler readouts - Plate 2

## Import Libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import cells, output

## Set up paths to CellProfiler directory and outputs

In [2]:
# Set file and directory constants
cp_dir = "../../CellProfiler_pipelines"
output_dir = "../data/Plate2/"

## Set paths to sqlite files

### All CellProfiler Method

In [3]:
# Set name and path of .sqlite file and path to metadata
sql_file = "NF1_data_allcp_plate2.sqlite"
single_cell_file = f"sqlite:///{cp_dir}/Analysis_Output/Plate2_Output/{sql_file}"
platemap_file = f"{cp_dir}/Metadata/platemap_NF1_CP_Plate2.csv"

# Set path with name for outputted data
sc_output_file = pathlib.Path(f"{output_dir}/nf1_sc_cellprofiler_plate2.csv.gz")
sc_norm_output_file = pathlib.Path(f"{output_dir}/nf1_sc_norm_cellprofiler_plate2.csv.gz")
sc_norm_fs_output_file = pathlib.Path(f"{output_dir}/nf1_sc_norm_fs_cellprofiler_plate2.csv.gz")

### PyBaSiC and CellProfiler Method

In [4]:
# Set name and path of .sqlite file and path to metadata
sql_file_pbcp = "NF1_data_pybasic_cp_plate2.sqlite"
single_cell_file_pbcp = f"sqlite:///{cp_dir}/Analysis_Output/Plate2_Output/{sql_file}"
platemap_file = f"{cp_dir}/Metadata/platemap_NF1_CP_Plate2.csv"

# Set path with name for outputted data
sc_output_file_pbcp = pathlib.Path(f"{output_dir}/nf1_sc_pybasic_cp_plate2.csv.gz")
sc_norm_output_file_pbcp = pathlib.Path(f"{output_dir}/nf1_sc_norm_pybasic_cp_plate2.csv.gz")
sc_norm_fs_output_file_pbcp = pathlib.Path(f"{output_dir}/nf1_sc_norm_fs_pybasic_cp_plate2.csv.gz")

## Set up names for linking columns between tables in the database file

In [5]:
# Define custom linking columns between compartments
linking_cols = {
    "Per_Cytoplasm": {
        "Per_Cells": "Cytoplasm_Parent_Cells",
        "Per_Nuclei": "Cytoplasm_Parent_Nuclei",
    },
    "Per_Cells": {"Per_Cytoplasm": "Cells_Number_Object_Number"},
    "Per_Nuclei": {"Per_Cytoplasm": "Nuclei_Number_Object_Number"},
}

## All CellProfiler Method

### Load and view platemap file

In [6]:
# Load platemap file
platemap_df = pd.read_csv(platemap_file)
platemap_df.head()

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,A,1,A1,NF1,WT
1,A,6,A6,NF1,WT
2,A,7,A7,NF1,Null
3,A,12,A12,NF1,Null
4,B,1,B1,NF1,WT


### Set up `SingleCells` class from Pycytominer

In [7]:
# Instantiate SingleCells class
sc = cells.SingleCells(
    sql_file=single_cell_file,
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    image_table_name="Per_Image",
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
    merge_cols=["ImageNumber"],
    image_cols="ImageNumber",
    load_image_data=True
)



### Merge single cells 

In [8]:
# Merge single cells across compartments
anno_kwargs = {"join_on": ["Metadata_well_position", "Image_Metadata_Well"]}

sc_df = sc.merge_single_cells(
    platemap=platemap_df,
    **anno_kwargs,
)

# Save level 2 data as a csv
output(sc_df, sc_output_file)

print(sc_df.shape)
sc_df.head()

(1681, 1053)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,7,1,A1,1,5,1,...,326.30527,326.598282,81.232293,84.845421,95.18677,82.720187,83.470121,82.159449,83.742652,83.407585
1,A,1,NF1,WT,7,1,A1,2,6,2,...,80.508695,75.817435,111.349822,100.684921,104.498224,116.178237,24.678482,25.071391,24.79476,24.584216
2,A,1,NF1,WT,7,1,A1,3,7,3,...,188.830897,213.251092,92.415912,95.454254,102.414212,99.273349,75.785872,71.502887,73.222713,73.84663
3,A,1,NF1,WT,7,1,A1,4,8,4,...,131.651607,130.380536,51.557684,54.548247,60.101483,51.289874,35.641896,34.998177,36.353194,36.357001
4,A,1,NF1,WT,7,1,A1,5,9,5,...,409.97624,428.771555,286.379839,288.316078,296.102255,293.522567,122.720753,117.844431,120.696927,124.334695


### Normalize Data

In [9]:
# Normalize single cell data and write to file
normalize_sc_df = normalize(
    sc_df,
    method="standardize"
)

output(normalize_sc_df, sc_norm_output_file)

print(normalize_sc_df.shape)
normalize_sc_df.head()

(1681, 1053)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,7,1,A1,1,5,1,...,-0.004132,0.030298,-0.508826,-0.497972,-0.482617,-0.504481,-0.055592,-0.056597,-0.054803,-0.050452
1,A,1,NF1,WT,7,1,A1,2,6,2,...,-0.318141,-0.300751,-0.451858,-0.467648,-0.464979,-0.440064,-0.342162,-0.335627,-0.343553,-0.338699
2,A,1,NF1,WT,7,1,A1,3,7,3,...,-0.179758,-0.119329,-0.487672,-0.477662,-0.468926,-0.472611,-0.093048,-0.108683,-0.106334,-0.097302
3,A,1,NF1,WT,7,1,A1,4,8,4,...,-0.252806,-0.228724,-0.564957,-0.555973,-0.549081,-0.564993,-0.288723,-0.287108,-0.286935,-0.28101
4,A,1,NF1,WT,7,1,A1,5,9,5,...,0.102759,0.165174,-0.120782,-0.108444,-0.102018,-0.098625,0.135728,0.117822,0.126213,0.1501


### Feature Selection

In [10]:
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
]

feature_select_norm_sc_df = feature_select(
    normalize_sc_df,
    operation=feature_select_ops
)

output(feature_select_norm_sc_df, sc_norm_fs_output_file)

print(feature_select_norm_sc_df.shape)
feature_select_norm_sc_df.head()

  c /= stddev[:, None]
  c /= stddev[None, :]


(1681, 380)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InfoMeas1_RFP_3_03_256,Nuclei_Texture_InfoMeas2_GFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_SumVariance_GFP_3_01_256,Nuclei_Texture_SumVariance_RFP_3_03_256
0,A,1,NF1,WT,7,1,A1,1,5,1,...,-3.007891,0.334534,1.366869,1.346246,1.26446,1.483208,0.51941,1.598186,-0.458471,0.030298
1,A,1,NF1,WT,7,1,A1,2,6,2,...,0.596768,0.226732,0.493606,-0.157697,-0.486096,-0.485824,0.400789,0.478036,-0.482441,-0.300751
2,A,1,NF1,WT,7,1,A1,3,7,3,...,0.2565,-0.333882,0.881035,-0.121555,-0.435451,0.231137,0.341268,-1.156038,-0.475448,-0.119329
3,A,1,NF1,WT,7,1,A1,4,8,4,...,-0.701806,0.002554,1.18288,0.839324,0.67178,0.839661,0.89711,0.950847,-0.519997,-0.228724
4,A,1,NF1,WT,7,1,A1,5,9,5,...,-0.605042,0.528578,1.002495,0.623923,0.613636,0.936982,-0.839145,-0.421461,-0.140123,0.165174


---

### Visualize basic count statistics for All CellProfiler Method

In [11]:
sc_df.Metadata_genotype.value_counts()

Null    994
WT      687
Name: Metadata_genotype, dtype: int64

In [12]:
pd.crosstab(sc_df.Metadata_genotype, sc_df.Metadata_Well)

Metadata_Well,A1,A12,A6,A7,B1,B12,B6,B7,C1,C12,...,F6,F7,G1,G12,G6,G7,H1,H12,H6,H7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Null,0,44,0,52,0,47,0,61,0,62,...,0,74,0,53,0,73,0,52,0,58
WT,47,0,52,0,48,0,47,0,54,0,...,45,0,54,0,32,0,27,0,29,0


---

## PyBaSiC and CellProfiler Method

### Load and view platemap file

In [13]:
# Load platemap file
platemap_df = pd.read_csv(platemap_file)
platemap_df.head()

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,A,1,A1,NF1,WT
1,A,6,A6,NF1,WT
2,A,7,A7,NF1,Null
3,A,12,A12,NF1,Null
4,B,1,B1,NF1,WT


### Set up `SingleCells` class from Pycytominer

In [14]:
# Instantiate SingleCells class
sc_pbcp = cells.SingleCells(
    sql_file=single_cell_file_pbcp,
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    image_table_name="Per_Image",
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
    merge_cols=["ImageNumber"],
    image_cols="ImageNumber",
    load_image_data=True
)



### Merge single cells 

In [15]:
# Merge single cells across compartments
anno_kwargs = {"join_on": ["Metadata_well_position", "Image_Metadata_Well"]}

sc_df_pbcp = sc_pbcp.merge_single_cells(
    platemap=platemap_df,
    **anno_kwargs,
)

# Save level 2 data as a csv
output(sc_df_pbcp, sc_output_file_pbcp)

print(sc_df_pbcp.shape)
sc_df_pbcp.head()

(1681, 1053)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,7,1,A1,1,5,1,...,326.30527,326.598282,81.232293,84.845421,95.18677,82.720187,83.470121,82.159449,83.742652,83.407585
1,A,1,NF1,WT,7,1,A1,2,6,2,...,80.508695,75.817435,111.349822,100.684921,104.498224,116.178237,24.678482,25.071391,24.79476,24.584216
2,A,1,NF1,WT,7,1,A1,3,7,3,...,188.830897,213.251092,92.415912,95.454254,102.414212,99.273349,75.785872,71.502887,73.222713,73.84663
3,A,1,NF1,WT,7,1,A1,4,8,4,...,131.651607,130.380536,51.557684,54.548247,60.101483,51.289874,35.641896,34.998177,36.353194,36.357001
4,A,1,NF1,WT,7,1,A1,5,9,5,...,409.97624,428.771555,286.379839,288.316078,296.102255,293.522567,122.720753,117.844431,120.696927,124.334695


### Normalize Data

In [16]:
# Normalize single cell data and write to file
normalize_sc_df_pbcp = normalize(
    sc_df_pbcp,
    method="standardize"
)

output(normalize_sc_df_pbcp, sc_norm_output_file_pbcp)

print(normalize_sc_df_pbcp.shape)
normalize_sc_df_pbcp.head()

(1681, 1053)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,A,1,NF1,WT,7,1,A1,1,5,1,...,-0.004132,0.030298,-0.508826,-0.497972,-0.482617,-0.504481,-0.055592,-0.056597,-0.054803,-0.050452
1,A,1,NF1,WT,7,1,A1,2,6,2,...,-0.318141,-0.300751,-0.451858,-0.467648,-0.464979,-0.440064,-0.342162,-0.335627,-0.343553,-0.338699
2,A,1,NF1,WT,7,1,A1,3,7,3,...,-0.179758,-0.119329,-0.487672,-0.477662,-0.468926,-0.472611,-0.093048,-0.108683,-0.106334,-0.097302
3,A,1,NF1,WT,7,1,A1,4,8,4,...,-0.252806,-0.228724,-0.564957,-0.555973,-0.549081,-0.564993,-0.288723,-0.287108,-0.286935,-0.28101
4,A,1,NF1,WT,7,1,A1,5,9,5,...,0.102759,0.165174,-0.120782,-0.108444,-0.102018,-0.098625,0.135728,0.117822,0.126213,0.1501


### Feature Selection

In [17]:
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
]

feature_select_norm_sc_df_pbcp = feature_select(
    normalize_sc_df_pbcp,
    operation=feature_select_ops
)

output(feature_select_norm_sc_df_pbcp, sc_norm_fs_output_file_pbcp)

print(feature_select_norm_sc_df_pbcp.shape)
feature_select_norm_sc_df_pbcp.head()

  c /= stddev[:, None]
  c /= stddev[None, :]


(1681, 380)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InfoMeas1_RFP_3_03_256,Nuclei_Texture_InfoMeas2_GFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_SumVariance_GFP_3_01_256,Nuclei_Texture_SumVariance_RFP_3_03_256
0,A,1,NF1,WT,7,1,A1,1,5,1,...,-3.007891,0.334534,1.366869,1.346246,1.26446,1.483208,0.51941,1.598186,-0.458471,0.030298
1,A,1,NF1,WT,7,1,A1,2,6,2,...,0.596768,0.226732,0.493606,-0.157697,-0.486096,-0.485824,0.400789,0.478036,-0.482441,-0.300751
2,A,1,NF1,WT,7,1,A1,3,7,3,...,0.2565,-0.333882,0.881035,-0.121555,-0.435451,0.231137,0.341268,-1.156038,-0.475448,-0.119329
3,A,1,NF1,WT,7,1,A1,4,8,4,...,-0.701806,0.002554,1.18288,0.839324,0.67178,0.839661,0.89711,0.950847,-0.519997,-0.228724
4,A,1,NF1,WT,7,1,A1,5,9,5,...,-0.605042,0.528578,1.002495,0.623923,0.613636,0.936982,-0.839145,-0.421461,-0.140123,0.165174


---

### Visualize basic count statistics for PyBaSiC and CellProfiler Method

In [18]:
sc_df_pbcp.Metadata_genotype.value_counts()

Null    994
WT      687
Name: Metadata_genotype, dtype: int64

In [19]:
pd.crosstab(sc_df_pbcp.Metadata_genotype, sc_df_pbcp.Metadata_Well)

Metadata_Well,A1,A12,A6,A7,B1,B12,B6,B7,C1,C12,...,F6,F7,G1,G12,G6,G7,H1,H12,H6,H7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Null,0,44,0,52,0,47,0,61,0,62,...,0,74,0,53,0,73,0,52,0,58
WT,47,0,52,0,48,0,47,0,54,0,...,45,0,54,0,32,0,27,0,29,0
