# Process single cell morphology features for CellProfiler readouts

## Import Libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import cells, output

## Set up paths to CellProfiler directory and outputs

In [2]:
# Set file and directory constants
cp_dir = "../3.cellprofiler_analysis"  # directory that contains the analysis_output folder
output_dir = "data"  # relative directory that receives the processed csv.gz files

# Create the output directory up front so that writing the processed files
# does not fail on a fresh checkout after the merge has already been computed
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

## Set paths to the sqlite file, platemap, and output files

In [3]:
# Plate layout metadata (one row per well)
platemap_file = "../../metadata/Interstellar_platemap.csv"

# CellProfiler database: file name and the sqlite:/// URL built from it
sql_file = "interstellar_wave3.sqlite"
single_cell_file = f"sqlite:///{cp_dir}/analysis_output/{sql_file}"

# Destinations for the merged, normalized, and feature-selected single-cell data
sc_output_file = pathlib.Path(
    f"{output_dir}/interstellar_wave3_sc.csv.gz"
)
sc_norm_output_file = pathlib.Path(
    f"{output_dir}/interstellar_wave3_sc_norm_cellprofiler.csv.gz"
)
sc_norm_fs_output_file = pathlib.Path(
    f"{output_dir}/interstellar_wave3_sc_norm_fs_cellprofiler.csv.gz"
)

## Set up names for linking columns between tables in the database file

In [4]:
# Custom linking columns between compartments: the outer key is a compartment
# table, the inner key is the partner table, and the value is the column in the
# outer table used to link the two
linking_cols = dict(
    Per_Cytoplasm=dict(
        Per_Cells="Cytoplasm_Parent_Cells",
        Per_Nuclei="Cytoplasm_Parent_Nuclei",
    ),
    Per_Cells=dict(Per_Cytoplasm="Cells_Number_Object_Number"),
    Per_Nuclei=dict(Per_Cytoplasm="Nuclei_Number_Object_Number"),
)

## Load and view platemap file

In [5]:
# Read the platemap csv into a dataframe and preview its first five rows
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
platemap_df.head(n=5)

Unnamed: 0,well,wellName,row,col,alias,treatment,dose
0,A01,A1,1,1,1,LPS,10µg/ml
1,B01,B1,2,1,2,LPS,1µg/ml
2,C01,C1,3,1,3,H2O2,500µM
3,D01,D1,4,1,4,H2O2,50µM
4,E01,E1,5,1,5,ATP,1mM


## Set up `SingleCells` class from Pycytominer

In [6]:
# Gather the SingleCells settings in one place, then build the object from them
single_cell_kwargs = dict(
    # database connection string for the CellProfiler output
    sql_file=single_cell_file,
    # compartment tables and the columns that link them to each other
    compartments=["Per_Cells", "Per_Cytoplasm", "Per_Nuclei"],
    compartment_linking_cols=linking_cols,
    # image-level table and the columns taken from / merged on it
    image_table_name="Per_Image",
    strata=["Image_Metadata_Well", "Image_Metadata_Plate"],
    merge_cols=["ImageNumber"],
    image_cols="ImageNumber",
    load_image_data=True,
)

sc = cells.SingleCells(**single_cell_kwargs)



## Merge single cells 

In [7]:
# Columns used to join the platemap onto the single-cell data
# (platemap well column, image-table well column)
anno_kwargs = dict(join_on=["Metadata_well", "Image_Metadata_Well"])

# Merge the compartments into one row per cell and annotate with the platemap
sc_df = sc.merge_single_cells(platemap=platemap_df, **anno_kwargs)

# Write the merged (level 2) single-cell table to the csv.gz output path
output(sc_df, sc_output_file)

# Report the table dimensions and preview the first rows
print(sc_df.shape)
sc_df.head()

(265283, 2473)


Unnamed: 0,Metadata_wellName,Metadata_row,Metadata_col,Metadata_alias,Metadata_treatment,Metadata_dose,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_Variance_CorrMito_3_02_256,Nuclei_Texture_Variance_CorrMito_3_03_256,Nuclei_Texture_Variance_CorrPM_3_00_256,Nuclei_Texture_Variance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrPM_3_02_256,Nuclei_Texture_Variance_CorrPM_3_03_256,Nuclei_Texture_Variance_CorrRNA_3_00_256,Nuclei_Texture_Variance_CorrRNA_3_01_256,Nuclei_Texture_Variance_CorrRNA_3_02_256,Nuclei_Texture_Variance_CorrRNA_3_03_256
0,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,1,...,0.146833,0.150225,0.401413,0.375217,0.383468,0.416597,0.408288,0.381595,0.411175,0.452748
1,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,2,...,0.222861,0.220533,0.117626,0.121171,0.117034,0.10911,0.367092,0.362007,0.354774,0.329226
2,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,3,...,0.087708,0.089044,0.241971,0.243451,0.241474,0.238959,0.276973,0.276883,0.281929,0.272285
3,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,4,...,0.193027,0.195484,0.138033,0.157668,0.152092,0.133185,0.372212,0.402119,0.393876,0.376437
4,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,5,...,0.239165,0.241708,0.381533,0.398054,0.393185,0.388422,0.338986,0.328554,0.321786,0.319527


## Normalize data using the DMSO 0.1% treatment as the reference

In [9]:
# List the distinct treatment labels; the normalization cell below selects its
# reference samples by the exact label string 'DMSO 0.1%', so it must appear here
sc_df["Metadata_treatment"].unique()

array(['LPS', 'Disulfiram', 'H2O2', 'Thapsi', 'ATP', 'LPS + Nigericin',
       'Flagellin', 'DMSO 0.1%', 'Media only'], dtype=object)

In [10]:
# Query selecting the DMSO 0.1% cells that act as the normalization reference
control_query = "Metadata_treatment == 'DMSO 0.1%'"

# Standardize the single cell data against the reference cells
normalize_sc_df = normalize(sc_df, samples=control_query, method="standardize")

# Write the normalized table to file
output(normalize_sc_df, sc_norm_output_file)

# Report the table dimensions and preview the first rows
print(normalize_sc_df.shape)
normalize_sc_df.head()

(265283, 2473)


Unnamed: 0,Metadata_wellName,Metadata_row,Metadata_col,Metadata_alias,Metadata_treatment,Metadata_dose,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_Variance_CorrMito_3_02_256,Nuclei_Texture_Variance_CorrMito_3_03_256,Nuclei_Texture_Variance_CorrPM_3_00_256,Nuclei_Texture_Variance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrPM_3_02_256,Nuclei_Texture_Variance_CorrPM_3_03_256,Nuclei_Texture_Variance_CorrRNA_3_00_256,Nuclei_Texture_Variance_CorrRNA_3_01_256,Nuclei_Texture_Variance_CorrRNA_3_02_256,Nuclei_Texture_Variance_CorrRNA_3_03_256
0,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,1,...,-0.20622,-0.204173,-0.181774,-0.196815,-0.194551,-0.165967,-0.047242,-0.048761,-0.048065,-0.04521
1,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,2,...,-0.191783,-0.190637,-0.396634,-0.389459,-0.394518,-0.401155,-0.049133,-0.049689,-0.050713,-0.051065
2,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,3,...,-0.217447,-0.215953,-0.30249,-0.296733,-0.301122,-0.301837,-0.05327,-0.053722,-0.054132,-0.053764
3,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,4,...,-0.197448,-0.19546,-0.381183,-0.361783,-0.368207,-0.38274,-0.048898,-0.047789,-0.048877,-0.048827
4,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,5,...,-0.188688,-0.18656,-0.196825,-0.179498,-0.187258,-0.187517,-0.050424,-0.051274,-0.052261,-0.051525


## Feature selection

In [11]:
# Feature selection operations passed to pycytominer
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# Apply the selected operations to the normalized single cell data
feature_select_norm_sc_df = feature_select(normalize_sc_df, operation=feature_select_ops)

# Write the normalized, feature-selected table to file
output(feature_select_norm_sc_df, sc_norm_fs_output_file)

# Report the table dimensions and preview the first rows
print(feature_select_norm_sc_df.shape)
feature_select_norm_sc_df.head()

(265283, 568)


Unnamed: 0,Metadata_wellName,Metadata_row,Metadata_col,Metadata_alias,Metadata_treatment,Metadata_dose,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_DifferenceVariance_CorrPM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrDNA_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256
0,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,1,...,-0.699198,-0.351559,-0.146933,-0.593356,-0.146447,0.02235,0.396804,0.382902,-0.353635,-0.788263
1,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,2,...,0.63004,0.535725,-0.794853,-0.328332,-0.055755,-0.785782,0.141279,0.271795,0.77774,0.779324
2,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,3,...,0.205534,1.045614,0.680502,0.826678,0.806227,0.781439,0.738322,0.683853,-0.014283,-0.012623
3,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,4,...,0.43197,1.092862,-0.110188,0.518159,0.301195,-0.365383,0.137225,0.207344,0.704001,0.732725
4,A13,1,13,1,LPS,10µg/ml,1,70117_20230118MM1_CellPainting_A700_20X_V1,A13,5,...,-0.237877,0.852926,1.129569,0.844405,0.546343,0.646782,0.188985,0.421418,-0.069492,0.243017


## View info of the dataframe for single cell data

In [12]:
# Summarize index, column count, dtypes, and memory use of the merged dataframe.
# NOTE(review): the saved output reports 2470 object-dtype columns and 4.9+ GB of
# memory; the feature columns are presumably meant to be numeric. Confirm, and
# consider casting them before normalization.
sc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 265283 entries, 0 to 265282
Columns: 2473 entries, Metadata_wellName to Nuclei_Texture_Variance_CorrRNA_3_03_256
dtypes: int64(3), object(2470)
memory usage: 4.9+ GB


---

### Visualize basic count statistics

In [13]:
# Number of single cells recorded for each treatment
sc_df["Metadata_treatment"].value_counts()

Flagellin          38000
LPS                36930
ATP                36192
LPS + Nigericin    34315
Disulfiram         33049
Thapsi             25624
H2O2               22989
DMSO 0.1%          19210
Media only         18974
Name: Metadata_treatment, dtype: int64

In [14]:
# Cell counts per treatment (rows) and well (columns)
pd.crosstab(index=sc_df["Metadata_treatment"], columns=sc_df["Metadata_Well"])

Metadata_Well,A13,A14,A15,A16,A17,A18,A19,A20,A21,A22,...,P15,P16,P17,P18,P19,P20,P21,P22,P23,P24
Metadata_treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ATP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DMSO 0.1%,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Disulfiram,0,0,0,0,0,0,0,0,0,0,...,1860,0,1013,0,0,0,1450,0,703,0
Flagellin,0,1621,1830,0,1535,0,0,1512,1369,0,...,0,1674,0,0,0,0,0,1523,0,0
H2O2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LPS,1810,0,0,0,0,0,1432,0,0,0,...,0,0,0,1562,0,0,0,0,0,1089
LPS + Nigericin,0,0,0,1841,0,0,0,0,0,1562,...,0,0,0,0,0,1710,0,0,0,0
Media only,0,0,0,0,0,1474,0,0,0,0,...,0,0,0,0,1690,0,0,0,0,0
Thapsi,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
