# Perform feature selection on normalized merged single cells for each plate

## Import libraries

In [1]:
import sys
import pathlib
import os

import pandas as pd
from pycytominer import feature_select
from pycytominer.cyto_utils import output

sys.path.append("../utils")
import extraction_utils as sc_utils

## Set paths and variables

In [2]:
# directory that receives the feature selected parquet files
output_dir = pathlib.Path("./data/feature_selected_data")
# create the directory (and any missing parents); nothing happens if it already exists
output_dir.mkdir(parents=True, exist_ok=True)

# one entry per plate; each entry holds the path to that plate's parquet file
# from the normalize function
plate_info_dictionary = {
    plate_name: {
        "normalized_path": str(
            pathlib.Path(f"./data/normalized_data/{plate_name}_sc_norm.parquet")
        )
    }
    for plate_name in ("Plate_1", "Plate_2", "Plate_3", "Plate_3_prime")
}

## Perform feature selection

The operations that we are using for feature selection are:

- `variance_threshold`: creates a list of excluded features that have very low variance of values between single cells
  
- `correlation_threshold`: creates a list of excluded features with a correlation to at least one other feature greater than the default threshold (`threshold=0.9`)
  
- `blocklist`: creates a list of excluded features using the [standard blocklist file](https://github.com/cytomining/pycytominer/blob/master/pycytominer/data/blocklist_features.txt) for CellProfiler features from Pycytominer

For more information regarding these operations, please visit [the Pycytominer operations folder](https://github.com/cytomining/pycytominer/tree/master/pycytominer/operations) on GitHub.
To view how `blocklist` works, please visit [the separate file](https://github.com/cytomining/pycytominer/blob/a5ae6c81a275b692ef5d4c85cfeb37696bf69242/pycytominer/cyto_utils/features.py#L13) for that function.

In [3]:
# operations handed to feature_select for every plate
feature_select_ops = ["variance_threshold", "correlation_threshold", "blocklist"]

# walk through every plate listed in the plate dictionary
for plate, info in plate_info_dictionary.items():
    # load the normalized single cells for this plate
    normalized_df = pd.read_parquet(info["normalized_path"])
    # destination of this plate's feature selected profile inside output_dir
    output_file = str(output_dir / f"{plate}_sc_norm_fs.parquet")
    print(f"Performing feature selection on normalized annotated merged single cells for {plate}!")

    # keep the returned data frame so it can be written out in a separate step below
    # NOTE(review): output_file="none" presumably tells feature_select not to write a file
    # itself -- confirm against the pinned pycytominer version
    feature_select_df = feature_select(
        normalized_df, operation=feature_select_ops, output_file="none"
    )

    # write the feature selected data frame to the parquet file chosen above
    output(df=feature_select_df, output_filename=output_file, output_type="parquet")
    print(f"Features have been selected for {plate} and saved!")

Performing feature selection on normalized annotated merged single cells for Plate_1!
Features have been selected for Plate_1 and saved!
Performing feature selection on normalized annotated merged single cells for Plate_2!
Features have been selected for Plate_2 and saved!
Performing feature selection on normalized annotated merged single cells for Plate_3!
Features have been selected for Plate_3 and saved!
Performing feature selection on normalized annotated merged single cells for Plate_3_prime!
Features have been selected for Plate_3_prime and saved!


In [4]:
# feature_select_df still holds the plate from the final loop iteration; show its
# (rows, columns) shape and preview the first five rows to check that feature
# selection left fewer columns
print(feature_select_df.shape)
feature_select_df.head(n=5)

(4098, 631)


Unnamed: 0,Metadata_WellRow,Metadata_Well,Metadata_number_of_singlecells,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_ImageNumber,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_01_256,Nuclei_Texture_SumEntropy_RFP_3_00_256,Nuclei_Texture_SumVariance_CY5_3_01_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_SumVariance_GFP_3_01_256,Nuclei_Texture_SumVariance_RFP_3_01_256
0,G,G4,388,4,NF1,WT,4000,1651,1,1,...,-0.308726,-0.069612,1.000387,0.047658,0.341889,-0.488085,-0.214034,-0.212931,-0.365031,-0.3766
1,G,G4,388,4,NF1,WT,4000,1651,2,2,...,-0.113777,1.239559,-0.064648,-0.060977,0.719764,-0.981186,-0.640715,0.45509,-0.400824,-0.422417
2,G,G4,388,4,NF1,WT,4000,1651,3,3,...,-0.154915,0.562972,0.169635,-0.159897,0.185859,-0.747536,-0.563811,-0.489082,-0.433693,-0.479389
3,G,G4,388,4,NF1,WT,4000,1651,4,4,...,-0.169044,0.535015,-0.096458,-0.235037,0.107373,-0.509046,-0.573363,-0.572271,-0.446369,-0.34446
4,G,G4,388,4,NF1,WT,4000,1651,5,5,...,-0.38446,0.798174,-0.349656,-0.434121,0.202104,-0.614142,0.026261,-0.179076,-0.341966,-0.388416
