# Perform feature selection on normalized data

## Import libraries

In [1]:
import sys
import pathlib
import gc
import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# directory where normalized parquet file is located
data_dir = pathlib.Path("./data/normalized_data")

# directory where the feature selected parquet file is saved to
output_dir = pathlib.Path("./data/feature_selected_data")
# parents=True so a missing ./data directory is created instead of raising FileNotFoundError
output_dir.mkdir(parents=True, exist_ok=True)

# define input path (kept as a plain string, same type as before)
normalized_file_path = str(data_dir / "PBMC_sc_norm.parquet")

# define output path (also a plain string)
feature_select_output_file = str(output_dir / "PBMC_sc_norm_fs.parquet")

In [3]:
# load the normalized profiles from the parquet file into memory
normalized_df = pd.read_parquet(normalized_file_path)

In [4]:
# show the (rows, columns) dimensions of the loaded dataframe
normalized_df.shape

(8318751, 2926)

## Perform feature selection

In [5]:
# operations the feature_select function applies to the input profile in this pass
feature_select_ops = [
    "variance_threshold",
    "blocklist",
    "drop_na_columns",
]

# plain string literal: the message has no placeholders, so no f-prefix is needed
print("Performing feature selection on normalized annotated merged single cells!")

# perform feature selection with the operations specified
feature_select_df = feature_select(
    normalized_df,
    operation=feature_select_ops,
)

# drop the reference to the full normalized frame and trigger a collection pass;
# the trailing semicolon keeps gc.collect()'s integer return value out of the cell output
del normalized_df
gc.collect();

Performing feature selection on normalized annotated merged single cells!
Starting variance thresholding
Finished variance thresholding
Dropping NA Columns
Finished dropping NA columns
Finished


0

In [6]:
# print the (rows, columns) dimensions of the dataframe returned by feature selection
print(feature_select_df.shape)

(8318751, 2164)


In [7]:
# Stratified subsample: draw 1% of the rows from every well (grouped by 'Metadata_Well'),
# seeding each well's draw with random_state=0 so the selection is reproducible.
# The groups are iterated explicitly instead of going through groupby().apply():
# apply() operating on the grouping column is deprecated since pandas 2.2, whereas
# iterating the groupby always yields each complete sub-frame (Metadata_Well included).
sampled_per_well = [
    well_df.sample(frac=0.01, random_state=0)
    for _well, well_df in feature_select_df.groupby("Metadata_Well")
]
# stitch the per-well samples back together (in group order) with a fresh 0..n-1 index
feature_select_df = pd.concat(sampled_per_well).reset_index(drop=True)
print(feature_select_df.shape)

(83191, 2164)


In [8]:
# this pass uses a single operation: correlation thresholding
feature_select_ops = ["correlation_threshold"]

# run feature selection with that operation on the current dataframe
feature_select_df = feature_select(feature_select_df, operation=feature_select_ops)
print(feature_select_df.shape)

# keep the names of the remaining columns as a plain list
feature_select_df_columns = list(feature_select_df.columns)

Starting correlation thresholding
Starting pairwise correlations
Finished correlation calculations
Finshed correlation computation - continuing to feature selection
Finished correlation thresholding
Finished
(83191, 1216)


In [12]:
# reload the full normalized dataframe
normalized_df = pd.read_parquet(normalized_file_path)
# Keep only the selected columns, in the selected order. Label-based selection is used
# rather than reindex(): reindex would silently add an all-NaN column for any name
# missing from the reloaded file, whereas this raises a KeyError naming the column.
feature_select_df = normalized_df[feature_select_df_columns]
print(feature_select_df.shape)

(8318751, 1216)


In [13]:
# write the feature-selected dataframe to disk in parquet format
output(
    df=feature_select_df,
    output_filename=feature_select_output_file,
    output_type="parquet",
)
# report the name of the file that was written
print(f"Features have been selected for PBMC cells and saved to {pathlib.Path(feature_select_output_file).name}!")

Features have been selected for PBMC cells and saved to PBMC_sc_norm_fs.parquet!


In [14]:
# print the (rows, columns) dimensions of the feature-selected dataframe so the column
# count can be compared with the shape printed after loading, then preview the first rows
print(feature_select_df.shape)
feature_select_df.head()

(8318751, 1216)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_03_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrER_3_00_256
0,PBMC,B02,47666,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,1.148187,1.069975,0.368181,-0.117756,0.45826,0.020577,-0.361602,0.41709,-0.027274,-0.186865
1,PBMC,B02,47666,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,0.795113,0.66126,-0.008886,0.727637,0.590452,0.383109,0.59952,-0.560967,-0.141698,-0.178472
2,PBMC,B02,47666,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,0.621216,-0.102527,1.04818,0.535328,0.446772,0.625449,0.430404,-0.842064,-0.147543,-0.136735
3,PBMC,B02,47666,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,-1.15784,-1.593708,0.252187,-3.017132,-1.380525,-1.310792,-3.024079,2.345765,0.802267,0.391562
4,PBMC,B02,47666,6,DMSO,0.025,%,LPS,0.01,µg/ml,...,-0.407971,-1.085826,-0.680803,-0.325516,-0.02155,-0.227969,-0.04475,-0.210408,-0.12913,-0.083951


In [18]:
# split the column names into metadata and profile groups;
# a column counts as metadata when its name contains "Metadata"
is_metadata = feature_select_df.columns.str.contains("Metadata")
metadata_cols = feature_select_df.columns[is_metadata]
profile_cols = feature_select_df.columns[~is_metadata]

# number of metadata columns, then number of profile columns in the selected dataframe
print(len(metadata_cols))
print(len(profile_cols))
# column count of the reloaded normalized dataframe minus the metadata columns counted above
print(len(normalized_df.columns) - len(metadata_cols))

19
1197
2907
