In [1]:
import pathlib

import pandas as pd
from pycytominer import feature_select
from pycytominer.cyto_utils import output

In [2]:
# set path to normalized data
normalized_data_path = pathlib.Path(
    "../data/20231017ChromaLive_6hr_4ch_MaxIP_normalized_combined_data.parquet"
).resolve(strict=True)

# set the outout file path
feature_selected_output_file_path = pathlib.Path(
    "../data/20231017ChromaLive_6hr_4ch_MaxIP_normalized_combined_data_feature_selected.parquet"
).resolve()

# read in the normalized data
normalized_data = pd.read_parquet(normalized_data_path)
print(normalized_data.shape)
normalized_data.head()

(240048, 3476)


Unnamed: 0,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,C-02,1,10,10,10,1,7914,Staurosporine,0.0,negative,...,-0.442636,-0.611063,2.300909,0.528285,1.19859,-0.012867,-0.334217,0.865129,1.109296,1.342877
1,C-02,1,10,10,100,1,7914,Staurosporine,0.0,negative,...,1.64888,0.479085,-1.315989,-0.2175,0.925141,1.166212,0.189229,1.501307,-0.877394,-1.800525
2,C-02,1,10,10,101,1,7914,Staurosporine,0.0,negative,...,0.343164,1.349414,2.497744,-0.083206,-1.125241,-0.802084,0.880478,-0.025921,-0.643299,-1.579514
3,C-02,1,10,10,102,1,7914,Staurosporine,0.0,negative,...,-1.059285,-0.73472,-0.217822,-0.243605,1.012593,0.707333,0.664375,-0.575244,1.772152,-0.601933
4,C-02,1,10,10,103,1,7914,Staurosporine,0.0,negative,...,-0.56043,-0.308253,0.776449,0.609137,0.347549,-0.149095,2.776645,-0.608087,-1.178395,-0.124372


In [3]:
# define operations to be performed on the data
# list of operations for feature select function to use on input profile
feature_select_ops = [
    "variance_threshold",
    "blocklist",
    "drop_na_columns",
    "correlation_threshold",
]

# Get columns that contain "Metadata"
metadata_features = normalized_data.columns[
    normalized_data.columns.str.contains("Metadata")
].tolist()

# get the feature columns
feature_columns = normalized_data.columns.difference(metadata_features).to_list()

In [4]:
manual_block_list = [
    "Nuclei_TrackObjects_Displacement_50",
    "Nuclei_TrackObjects_DistanceTraveled_50",
    "Nuclei_TrackObjects_IntegratedDistance_50",
    "Nuclei_TrackObjects_Label_50",
    "Nuclei_TrackObjects_Linearity_50",
    "Nuclei_TrackObjects_ParentObjectNumber_50",
    "Nuclei_AreaShape_BoundingBoxArea",
    "Nuclei_AreaShape_BoundingBoxMinimum_X",
    "Cells_AreaShape_BoundingBoxArea",
]

In [5]:
feature_select_df = feature_select(
    normalized_data,
    operation=feature_select_ops,
    features=feature_columns,
)
# add "Metadata_" to the beginning of each column name in the list
feature_select_df.columns = [
    "Metadata_" + column if column in manual_block_list else column
    for column in feature_select_df.columns
]
print("Feature selection complete, saving to parquet file!")
# save features selected df as parquet file
output(
    df=feature_select_df,
    output_filename=feature_selected_output_file_path,
    output_type="parquet",
)
print(f"Features have been selected!")
# check to see if the shape of the df has changed indicating feature selection occurred
print(normalized_data.shape)
print(feature_select_df.shape)
print(f"{normalized_data.shape[1] - feature_select_df.shape[1]} features were removed.")
print(f"{feature_select_df.shape[1]} features remain.")
feature_select_df.head()

Feature selection complete, saving to parquet file!
Features have been selected!
(240048, 3476)
(240048, 2470)
1006 features were removed.
2470 features remain.


Unnamed: 0,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,C-02,1,10,10,10,1,7914,Staurosporine,0.0,negative,...,-0.442636,-0.611063,2.300909,0.528285,1.19859,-0.012867,-0.334217,0.865129,1.109296,1.342877
1,C-02,1,10,10,100,1,7914,Staurosporine,0.0,negative,...,1.64888,0.479085,-1.315989,-0.2175,0.925141,1.166212,0.189229,1.501307,-0.877394,-1.800525
2,C-02,1,10,10,101,1,7914,Staurosporine,0.0,negative,...,0.343164,1.349414,2.497744,-0.083206,-1.125241,-0.802084,0.880478,-0.025921,-0.643299,-1.579514
3,C-02,1,10,10,102,1,7914,Staurosporine,0.0,negative,...,-1.059285,-0.73472,-0.217822,-0.243605,1.012593,0.707333,0.664375,-0.575244,1.772152,-0.601933
4,C-02,1,10,10,103,1,7914,Staurosporine,0.0,negative,...,-0.56043,-0.308253,0.776449,0.609137,0.347549,-0.149095,2.776645,-0.608087,-1.178395,-0.124372
