In [1]:
import pathlib

import pandas as pd
from pycytominer import feature_select
from pycytominer.cyto_utils import output

In [2]:
# destination for the feature-selected profiles (resolved non-strictly, so the
# file does not have to exist yet)
feature_selected_output_file_path = pathlib.Path(
    "../data/CP_scDINO_features/combined_CP_scDINO_norm_fs.parquet"
).resolve()

# source: the normalized profiles; strict=True raises right away if the file
# is missing instead of failing later inside the parquet reader
normalized_data_path = pathlib.Path(
    "../data/CP_scDINO_features/combined_CP_scDINO_norm.parquet"
).resolve(strict=True)

# load the normalized profiles, report their dimensions, and preview the rows
normalized_data = pd.read_parquet(normalized_data_path)
print(normalized_data.shape)
normalized_data.head()

(140235, 3870)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,1.0,...,0.455522,0.39468,1.649389,0.010468,0.426432,-0.339073,1.587193,-0.362701,0.812067,1.215035
1,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,15.0,...,1.055508,0.703437,-1.417191,0.261842,1.097767,-1.700814,0.564109,0.238833,0.718067,0.768177
2,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,16.0,...,-0.834625,0.309778,-0.677732,1.00843,-0.144541,-1.514545,-0.609423,0.447713,1.357439,1.000067
3,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,26.0,...,-1.519487,1.510775,-0.939313,-0.584098,2.944786,-0.227158,0.076995,-1.431423,-0.097869,0.970456
4,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,37.0,...,-1.171937,0.214552,2.372796,-0.591718,-0.306193,0.506419,-0.360647,1.294736,1.949604,0.107606


In [3]:
# operations that the feature select function will apply to the input profile
feature_select_ops = [
    "variance_threshold",
    "blocklist",
    "drop_na_columns",
    "correlation_threshold",
]

# metadata columns: every column whose name contains "Metadata"
metadata_features = [
    column_name
    for column_name in normalized_data.columns
    if "Metadata" in column_name
]

# feature columns: everything that is not metadata
# (Index.difference returns the remaining labels de-duplicated and sorted)
remaining_columns = normalized_data.columns.difference(metadata_features)
feature_columns = remaining_columns.to_list()

In [4]:
# run the configured feature-selection operations on the feature columns only
feature_select_df = feature_select(
    normalized_data,
    operation=feature_select_ops,
    features=feature_columns,
)

# Columns describing object position (bounding boxes and X/Y centers) are
# relabelled as metadata. Matching is case-insensitive: each column name is
# lowercased, so the search terms MUST be lowercase as well -- a mixed-case
# term such as "Location_Center_X" can never be found inside x.lower().
position_terms = ("bounding", "location_center_x", "location_center_y")
manual_block_list = [
    x
    for x in normalized_data.columns
    # skip columns that already carry the prefix to avoid "Metadata_Metadata_..."
    if not x.startswith("Metadata_")
    and any(term in x.lower() for term in position_terms)
]

# add "Metadata_" to the beginning of each surviving column name in the list
# (set lookup keeps the membership test cheap for wide profiles)
manual_block_set = set(manual_block_list)
feature_select_df.columns = [
    "Metadata_" + column if column in manual_block_set else column
    for column in feature_select_df.columns
]
print("Feature selection complete, saving to parquet file!")
# save features selected df as parquet file
output(
    df=feature_select_df,
    output_filename=feature_selected_output_file_path,
    output_type="parquet",
)
print("Features have been selected!")
# check to see if the shape of the df has changed indicating feature selection occurred
print(normalized_data.shape)
print(feature_select_df.shape)
print(f"{normalized_data.shape[1] - feature_select_df.shape[1]} features were removed.")
print(f"{feature_select_df.shape[1]} features remain.")
feature_select_df.head()

Feature selection complete, saving to parquet file!
Features have been selected!
(140235, 3870)
(140235, 2448)
1422 features were removed.
2448 features remain.


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,1.0,...,0.455522,0.39468,1.649389,0.010468,0.426432,-0.339073,1.587193,-0.362701,0.812067,1.215035
1,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,15.0,...,1.055508,0.703437,-1.417191,0.261842,1.097767,-1.700814,0.564109,0.238833,0.718067,0.768177
2,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,16.0,...,-0.834625,0.309778,-0.677732,1.00843,-0.144541,-1.514545,-0.609423,0.447713,1.357439,1.000067
3,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,26.0,...,-1.519487,1.510775,-0.939313,-0.584098,2.944786,-0.227158,0.076995,-1.431423,-0.097869,0.970456
4,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,37.0,...,-1.171937,0.214552,2.372796,-0.591718,-0.306193,0.506419,-0.360647,1.294736,1.949604,0.107606
