# Perform feature selection on normalized merged single cells for each plate

## Import libraries

In [1]:
import pathlib
import yaml
import pprint

import pandas as pd
from pycytominer import feature_select
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# output directory for feature selected data
output_dir = pathlib.Path("./data/feature_selected_data")
# parents=True so this cell also succeeds when ./data has not been created yet
output_dir.mkdir(parents=True, exist_ok=True)

# load in dictionary from yaml file (maps each plate name to its file paths)
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with open(dictionary_path) as file:
    plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary)

{'Plate_1': {'annotated_path': 'data/annotated_data/Plate_1_sc.parquet',
             'dest_path': 'data/converted_data/Plate_1.parquet',
             'normalized_path': 'data/normalized_data/Plate_1_sc_norm.parquet',
             'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',
             'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},
 'Plate_2': {'annotated_path': 'data/annotated_data/Plate_2_sc.parquet',
             'dest_path': 'data/converted_data/Plate_2.parquet',
             'normalized_path': 'data/normalized_data/Plate_2_sc_norm.parquet',
             'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',
             'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},
 'Plate_3': {'annotated_path': 'data/annotated_d

## Perform feature selection

The operations that we are using for feature selection are:

- `variance_threshold`: creates a list of excluded features that have very low variance of values between single cells
  
- `correlation_threshold`: creates a list of excluded features with a correlation to at least one other feature greater than the default threshold (`threshold=0.9`)
  
- `blocklist`: creates a list of excluded features using the [standard blocklist file](https://github.com/cytomining/pycytominer/blob/master/pycytominer/data/blocklist_features.txt) for CellProfiler features from Pycytominer

For more information regarding these operations, please visit [the Pycytominer operations folder](https://github.com/cytomining/pycytominer/tree/master/pycytominer/operations) on GitHub.
To view how `blocklist` works, please visit [the separate file](https://github.com/cytomining/pycytominer/blob/a5ae6c81a275b692ef5d4c85cfeb37696bf69242/pycytominer/cyto_utils/features.py#L13) for that function.

In [3]:
# operations that feature_select applies to every input profile
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# run feature selection one plate at a time
for plate, info in plate_info_dictionary.items():
    # this is the final processing step, so the output path is not written back to the dictionary
    output_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_norm_fs.parquet"))
    normalized_df = pd.read_parquet(info["normalized_path"])
    print(f"Performing feature selection on normalized annotated merged single cells for {plate}!")

    # apply the selected operations; the result is kept in memory as a data frame
    # and written to disk by the explicit output() call below
    feature_select_df = feature_select(
        normalized_df,
        operation=feature_select_ops,
        output_file="none",
    )

    # write the feature selected data frame out as a parquet file
    output(df=feature_select_df, output_filename=output_file, output_type="parquet")
    print(f"Features have been selected for {plate} and saved!")

Performing feature selection on normalized annotated merged single cells for Plate_1!
Features have been selected for Plate_1 and saved!
Performing feature selection on normalized annotated merged single cells for Plate_2!
Features have been selected for Plate_2 and saved!
Performing feature selection on normalized annotated merged single cells for Plate_3!
Features have been selected for Plate_3 and saved!
Performing feature selection on normalized annotated merged single cells for Plate_3_prime!
Features have been selected for Plate_3_prime and saved!
Performing feature selection on normalized annotated merged single cells for Plate_4!
Features have been selected for Plate_4 and saved!


In [4]:
# feature_select_df still holds the result for the last plate processed by the loop above;
# print its shape to confirm that feature selection occurred (fewer columns than the input)
print(feature_select_df.shape)
feature_select_df.head()

(7502, 635)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_siRNA,Metadata_RNAiMax,...,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_CY5_3_00_256,Nuclei_Texture_SumEntropy_DAPI_3_03_256,Nuclei_Texture_SumEntropy_RFP_3_00_256,Nuclei_Texture_SumVariance_CY5_3_01_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_01_256
0,B,2,B2,11,115,NF1,WT,1000,,0,...,-0.853646,-0.593777,-0.287816,-0.307117,0.078565,2.150259,-0.548795,-1.056228,-0.47354,2.716088
1,B,2,B2,11,115,NF1,WT,1000,,0,...,-0.056489,0.159731,-0.05393,-0.321569,0.779409,-0.907817,-0.562408,1.374649,-0.434801,-0.678851
2,B,2,B2,11,115,NF1,WT,1000,,0,...,-0.629045,-0.34121,-0.587309,0.342038,0.153377,0.525008,-0.068374,-1.011699,-0.132183,0.077051
3,B,2,B2,11,115,NF1,WT,1000,,0,...,-0.201772,0.572489,-0.088943,-0.149306,0.345108,-0.546746,-0.400875,1.902985,0.079753,-0.402336
4,B,2,B2,14,115,NF1,WT,1000,,0,...,-0.379375,0.724884,-0.371386,-0.954809,-0.247454,-0.39532,-0.71315,-1.447722,-0.577792,-0.543383
