# Perform feature selection on normalized merged single cells for each plate

## Import libraries

In [1]:
import sys
import pathlib
import os
import yaml
import json

import pandas as pd
from pycytominer import feature_select
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# directory that will hold the feature selected parquet files
output_dir = pathlib.Path("./data/feature_selected_data")
# create the directory (and any missing parents); no error is raised if it already exists
output_dir.mkdir(parents=True, exist_ok=True)

# read the per-plate info dictionary from the yaml file
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with dictionary_path.open() as file:
    plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)

# pretty-print the dictionary to confirm the info needed for feature selection
# (e.g., each plate's "normalized_path") is included
print(json.dumps(plate_info_dictionary, indent=4))

{
    "Plate_1": {
        "annotated_path": "data/annotated_data/Plate_1_sc.parquet",
        "dest_path": "data/converted_data/Plate_1.parquet",
        "normalized_path": "data/normalized_data/Plate_1_sc_norm.parquet",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate1.csv",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite"
    },
    "Plate_2": {
        "annotated_path": "data/annotated_data/Plate_2_sc.parquet",
        "dest_path": "data/converted_data/Plate_2.parquet",
        "normalized_path": "data/normalized_data/Plate_2_sc_norm.parquet",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate2.csv",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite"
    },
    "Plate_3": {
        "annotated_path": "data/annotated_data/Plate_3_sc.parquet",
        "dest_path": "data/converted_data/Plate_3.parquet",
        "normalized_path": "data/normalized_data/Plate_3_sc_norm.parquet"

## Perform feature selection

The operations that we are using for feature selection are:

- `variance_threshold`: creates a list of excluded features whose values have very low variance across single cells
  
- `correlation_threshold`: creates a list of excluded features with a correlation to at least one other feature greater than the default threshold (`threshold=0.9`)
  
- `blocklist`: creates a list of excluded features using the [standard blocklist file](https://github.com/cytomining/pycytominer/blob/master/pycytominer/data/blocklist_features.txt) for CellProfiler features from Pycytominer

For more information regarding these operations, please visit [the Pycytominer operations folder](https://github.com/cytomining/pycytominer/tree/master/pycytominer/operations) on GitHub.
To view how `blocklist` works, please visit [the separate file](https://github.com/cytomining/pycytominer/blob/a5ae6c81a275b692ef5d4c85cfeb37696bf69242/pycytominer/cyto_utils/features.py#L13) for that function.

In [3]:
# operations that the feature_select function will apply to each input profile
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
]

# process each plate listed in the plate info dictionary
for plate, info in plate_info_dictionary.items():
    # load the normalized single-cell profiles for this plate
    normalized_df = pd.read_parquet(info["normalized_path"])
    # output_file does not need to be saved to the dictionary as there are no more processing steps after this
    output_file = str(output_dir / f"{plate}_sc_norm_fs.parquet")
    print(f"Performing feature selection on normalized annotated merged single cells for {plate}!")

    # perform feature selection with the operations specified; output_file is left
    # at its default so the selected dataframe is returned instead of being written
    # NOTE(review): the "none" string sentinel appears to be honored only by older
    # pycytominer releases (newer ones use None) -- confirm against the installed version
    feature_select_df = feature_select(
        normalized_df,
        operation=feature_select_ops,
    )

    # save the feature selected df as a parquet file
    output(
        df=feature_select_df,
        output_filename=output_file,
        output_type="parquet",
    )
    print(f"Features have been selected for {plate} and saved!")

Performing feature selection on normalized annotated merged single cells for Plate_1!
Features have been selected for Plate_1 and saved!
Performing feature selection on normalized annotated merged single cells for Plate_2!
Features have been selected for Plate_2 and saved!
Performing feature selection on normalized annotated merged single cells for Plate_3!
Features have been selected for Plate_3 and saved!
Performing feature selection on normalized annotated merged single cells for Plate_3_prime!
Features have been selected for Plate_3_prime and saved!


In [4]:
# feature_select_df still holds the result from the final loop iteration; print its
# shape to assess whether feature selection occurred (fewer columns), then preview the rows
print(feature_select_df.shape)
feature_select_df.head(n=5)

(14495, 595)


Unnamed: 0,Metadata_WellRow,Metadata_Well,Metadata_number_of_singlecells,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_ImageNumber,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_01_256,Nuclei_Texture_SumEntropy_RFP_3_00_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_01_256
0,B,B1,42,1,NF1,WT,500,78,1,1,...,-0.191334,0.67142,0.93676,1.458638,0.948377,1.10768,-1.864463,1.259309,0.415479,-0.697022
1,B,B1,42,1,NF1,WT,500,81,2,2,...,-0.524947,0.637803,0.186882,-0.152737,-0.279862,0.11599,-0.425139,-0.595902,-0.245817,-0.465088
2,B,B1,42,1,NF1,WT,500,82,1,1,...,-0.230874,0.104657,0.039143,0.272484,1.296262,0.984872,-0.957025,1.404316,0.49481,-0.577513
3,B,B1,42,1,NF1,WT,500,82,2,2,...,-0.314377,0.665691,0.216409,0.121547,1.337763,1.116384,-0.960571,1.663014,0.50399,-0.547628
4,B,B1,42,1,NF1,WT,500,83,1,1,...,-0.589706,0.528876,0.854686,1.324683,0.926693,0.043019,-1.360293,-0.635271,0.100997,-0.639121
