# Perform feature selection on normalized data

## Import libraries

In [1]:
import sys
import pathlib
import gc
import pandas as pd

from pycytominer import feature_select
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# directory where the normalized parquet file is located
data_dir = pathlib.Path("./data/normalized_data")

# directory where the feature selected parquet file is saved to
# parents=True so that a missing ./data directory does not raise FileNotFoundError
output_dir = pathlib.Path("./data/feature_selected_data")
output_dir.mkdir(parents=True, exist_ok=True)

# define input path (kept as a plain string, as the later cells expect)
normalized_file_path = str(data_dir / "SHSY5Y_sc_norm.parquet")

# define output path (kept as a plain string, as the later cells expect)
feature_select_output_file = str(output_dir / "SHSY5Y_sc_norm_fs.parquet")

In [3]:
# load the normalized profiles from the parquet file defined above
normalized_df = pd.read_parquet(normalized_file_path)

In [4]:
# show the (rows, columns) shape of the loaded normalized dataframe
normalized_df.shape

(600816, 2926)

## Perform feature selection

In [5]:
# feature selection operations to run on the input profile
feature_select_ops = ["variance_threshold", "blocklist", "drop_na_columns"]

print("Performing feature selection on normalized annotated merged single cells!")

# run the operations listed above on the normalized dataframe
feature_select_df = feature_select(normalized_df, operation=feature_select_ops)

# drop the reference to the normalized dataframe and run a garbage collection pass
del normalized_df
gc.collect()

Performing feature selection on normalized annotated merged single cells!
Starting variance thresholding
Finished variance thresholding
Dropping NA Columns
Finished dropping NA columns
Finished


0

In [6]:
# print the (rows, columns) shape after the first feature selection pass
print(feature_select_df.shape)

(600816, 2339)


In [7]:
# Draw a 10% subsample of rows from every well (stratified by Metadata_Well).
# Each well is sampled with its own random_state, so the selected rows and their
# order match groupby("Metadata_Well").apply(lambda x: x.sample(...)).
# Iterating over the groups instead of calling groupby.apply avoids the pandas
# deprecation of passing the grouping column to the applied function, so
# Metadata_Well always stays in the subsample (its columns are reused later to
# filter the full dataframe).
sample_fraction = 0.1
sample_seed = 0
feature_select_df = pd.concat(
    [
        well_df.sample(frac=sample_fraction, random_state=sample_seed)
        for _, well_df in feature_select_df.groupby("Metadata_Well")
    ]
).reset_index(drop=True)
print(feature_select_df.shape)

(60078, 2339)


In [8]:
# only the correlation-based filter is applied in this pass
feature_select_ops = ["correlation_threshold"]

# apply the operation to the current feature selected dataframe
feature_select_df = feature_select(feature_select_df, operation=feature_select_ops)
print(feature_select_df.shape)

# keep the remaining column names as a plain list
feature_select_df_columns = list(feature_select_df.columns)

Starting correlation thresholding
Starting pairwise correlations
Finished correlation calculations
Finshed correlation computation - continuing to feature selection
Finished correlation thresholding
Finished
(60078, 1249)


In [9]:
# reload the full normalized dataframe
normalized_df = pd.read_parquet(normalized_file_path)
# keep only the columns that were selected; unlike reindex(columns=...), plain
# label selection raises a KeyError when a selected column is absent instead of
# silently creating an all-NaN column
feature_select_df = normalized_df[feature_select_df_columns]
print(feature_select_df.shape)

(600816, 1249)


In [10]:
# write the feature selected dataframe to a parquet file
output(
    df=feature_select_df,
    output_filename=feature_select_output_file,
    output_type="parquet",
)

# report only the file name, not the full path
saved_file_name = pathlib.Path(feature_select_output_file).name
print(f"Features have been selected for SHSY5Y cells and saved to {saved_file_name}!")

Features have been selected for SHSY5Y cells and saved to SHSY5Y_sc_norm_fs.parquet!


In [11]:
# print the shape of the feature selected dataframe (fewer columns than the
# normalized dataframe indicates feature selection occurred) and preview the first rows
print(feature_select_df.shape)
feature_select_df.head()

(600816, 1249)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_SumEntropy_CorrPM_3_03_256,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_00_256
0,SH-SY5Y,B13,3780,6,Media ctr,,,media ctr,,,...,-1.83711,-0.594605,-1.290898,-3.120176,-3.002434,-1.479849,1.993159,0.024644,0.307726,0.071916
1,SH-SY5Y,B13,3780,6,Media ctr,,,media ctr,,,...,-0.838109,-0.678986,-0.328452,-3.717218,-0.826658,-0.460672,0.843387,-0.041344,1.445562,0.014207
2,SH-SY5Y,B13,3780,6,Media ctr,,,media ctr,,,...,0.50193,-0.226678,-1.250159,-0.689588,-1.373834,-1.336068,0.632187,-0.04458,-0.020781,0.015121
3,SH-SY5Y,B13,3780,6,Media ctr,,,media ctr,,,...,0.613212,1.059709,0.802273,-0.485147,0.050987,-0.200275,0.468929,-0.060804,-0.070032,-0.031396
4,SH-SY5Y,B13,3780,6,Media ctr,,,media ctr,,,...,0.189899,-0.53599,-0.798744,-0.50578,0.419497,-0.052682,-0.632097,-0.05259,0.00579,-0.058913


In [12]:
# separate the metadata and profile data
is_metadata = feature_select_df.columns.str.contains("Metadata")
metadata_cols = feature_select_df.columns[is_metadata]
profile_cols = feature_select_df.columns[~is_metadata]
print(len(metadata_cols))
print(len(profile_cols))
# column count of the normalized dataframe minus the metadata column count
print(normalized_df.shape[1] - len(metadata_cols))

19
1230
2907
