# Perform feature selection on normalized data

## Import libraries

In [1]:
import sys
import pathlib

import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# directory where the normalized parquet file is located
data_dir = pathlib.Path("./data/normalized_data")

# directory where the feature selected parquet file is saved to;
# parents=True creates any missing parent directory (e.g. ./data) instead of
# raising FileNotFoundError, and exist_ok=True keeps re-runs of this cell safe
output_dir = pathlib.Path("./data/feature_selected_data")
output_dir.mkdir(parents=True, exist_ok=True)

# define input path (stored as a plain string)
normalized_file_path = str(data_dir / "SHSY5Y_sc_norm.parquet")

# define output path (stored as a plain string)
feature_select_output_file = str(output_dir / "SHSY5Y_sc_norm_fs.parquet")

## Perform feature selection

In [3]:
# feature selection operations that are passed to pycytominer's feature_select
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# load the normalized data from the parquet file
normalized_df = pd.read_parquet(normalized_file_path)

print("Performing feature selection on normalized annotated merged single cells!")

# run the listed operations on the loaded data frame
feature_select_df = feature_select(normalized_df, operation=feature_select_ops)

# write the feature selected data frame out as a parquet file
output(
    df=feature_select_df,
    output_filename=feature_select_output_file,
    output_type="parquet",
)

# only the file name (not the full path) is reported in the message
saved_file_name = pathlib.Path(feature_select_output_file).name
print(f"Features have been selected for SHSY5Y cells and saved to {saved_file_name}!")

Performing feature selection on normalized annotated merged single cells!
Features have been selected for SHSY5Y cells and saved to SHSY5Y_sc_norm_fs.parquet!


In [4]:
# print the (rows, columns) of the feature selected data frame, then preview
# its first rows as the cell output
n_rows, n_cols = feature_select_df.shape
print((n_rows, n_cols))
feature_select_df.head()

(597902, 1271)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_Site,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256
0,SH-SY5Y,B13,3765,1,6,Media ctr,,,media ctr,,...,-1.841853,-0.597438,-1.295016,-3.127056,-3.004511,-1.481117,1.851482,0.024721,0.307472,0.092086
1,SH-SY5Y,B13,3765,1,6,Media ctr,,,media ctr,,...,-0.841272,-0.681935,-0.330951,-3.725197,-0.827474,-0.461348,0.897731,-0.041156,1.443262,0.009843
2,SH-SY5Y,B13,3765,1,6,Media ctr,,,media ctr,,...,0.500885,-0.229003,-1.254209,-0.691997,-1.374967,-1.337252,0.82597,-0.044386,-0.020445,0.000848
3,SH-SY5Y,B13,3765,1,6,Media ctr,,,media ctr,,...,0.612343,1.059158,0.801677,-0.48718,0.050679,-0.2008,0.459088,-0.060584,-0.069607,-0.029195
4,SH-SY5Y,B13,3765,1,6,Media ctr,,,media ctr,,...,0.188362,-0.538742,-0.802034,-0.507851,0.419403,-0.053121,-0.488278,-0.052383,0.006078,-0.060355
