## Perform traditional bulk pycytominer pipeline

After single-cell curation with CytoTable, we create bulk (well-level) profiles by applying these pycytominer steps in order:

1. aggregation
2. annotation
3. normalization
4. feature_selection

In [1]:
import pathlib
import yaml
import pprint

import pandas as pd

from pycytominer import aggregate, annotate, normalize, feature_select
from pycytominer.cyto_utils import load_profiles

In [2]:
# Set constants
# operations passed to pycytominer's feature_select in the pipeline loop
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
    "drop_na_columns",
]

# Set paths
# directory the bulk profiles are written to; parents=True also creates the
# enclosing "data" directory so the notebook runs from a fresh checkout
output_dir = pathlib.Path("data/bulk_profiles")
output_dir.mkdir(parents=True, exist_ok=True)
metadata_dir = pathlib.Path("../0.download_data/metadata/")

# load in plate information
# NOTE(review): the printed dictionary only contains plain strings, so
# yaml.safe_load would presumably be sufficient here -- confirm before switching
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with open(dictionary_path) as file:
    plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)

In [3]:
# add path to platemaps for each plate
for plate in plate_info_dictionary:
    # Plate_3_prime has the same platemap as Plate_3, so look up the Plate_3
    # file directly; this works regardless of the order of the dictionary keys
    platemap_plate = "Plate_3" if plate == "Plate_3_prime" else plate

    # match the naming format of the plates to the platemap file
    # (e.g. "Plate_1" -> "platemap_NF1_plate1.csv")
    platemap_name = f"platemap_NF1_{platemap_plate.replace('_', '').lower()}.csv"

    # sort the matches so the selected file is deterministic if several exist,
    # and fail with a descriptive error instead of a bare IndexError if none do
    platemap_matches = sorted(metadata_dir.rglob(platemap_name))
    if not platemap_matches:
        raise FileNotFoundError(
            f"No platemap named {platemap_name} found under {metadata_dir} "
            f"(needed for {plate})"
        )

    # store the absolute path as a string alongside the other plate info
    plate_info_dictionary[plate]["platemap_path"] = str(
        platemap_matches[0].resolve(strict=True)
    )

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'Plate_1': {   'dest_path': 'data/converted_data/Plate_1.parquet',
                   'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',
                   'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},
    'Plate_2': {   'dest_path': 'data/converted_data/Plate_2.parquet',
                   'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',
                   'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},
    'Plate_3': {   'dest_path': 'data/converted_data/Plate_3.parquet',
                   'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',
                   'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/an

## Perform pycytominer pipeline

In [4]:
# Run the four pycytominer steps for every plate; each step writes a parquet
# file that the next step reads back in.
for plate, info in plate_info_dictionary.items():
    print(f"Now performing pycytominer pipeline for {plate}")

    # One output file per plate and pipeline stage (kept as str for pycytominer)
    output_aggregated_file = str(output_dir / f"{plate}_bulk.parquet")
    output_annotated_file = str(output_dir / f"{plate}_bulk_annotated.parquet")
    output_normalized_file = str(output_dir / f"{plate}_bulk_normalized.parquet")
    output_feature_select_file = str(
        output_dir / f"{plate}_bulk_feature_selected.parquet"
    )

    # Load single-cell profiles
    single_cell_df = pd.read_parquet(info["dest_path"])

    # Load platemap
    platemap_df = pd.read_csv(info["platemap_path"])

    # Step 1: Aggregation (median of the single cells per plate and well)
    aggregate(
        population_df=single_cell_df,
        operation="median",
        strata=["Image_Metadata_Plate", "Image_Metadata_Well"],
        output_file=output_aggregated_file,
        output_type="parquet",
    )

    # Step 2: Annotation (merge the platemap onto the aggregated profiles)
    annotate(
        profiles=output_aggregated_file,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=output_annotated_file,
        output_type="parquet",
    )

    # Step 3: Normalization
    # the result is written to output_file, so the return value is not kept
    normalize(
        profiles=output_annotated_file,
        method="standardize",
        output_file=output_normalized_file,
        output_type="parquet",
    )

    # Step 4: Feature selection
    feature_select(
        output_normalized_file,
        operation=feature_select_ops,
        na_cutoff=0,
        output_file=output_feature_select_file,
        output_type="parquet",
    )

Now performing pycytominer pipeline for Plate_1
Now performing pycytominer pipeline for Plate_2
Now performing pycytominer pipeline for Plate_3
Now performing pycytominer pipeline for Plate_3_prime
Now performing pycytominer pipeline for Plate_4


In [5]:
# Check output file
# `output_feature_select_file` is not redefined in this cell: it still holds the
# value assigned during the final iteration of the pipeline loop, so only the
# feature-selected profiles of the last plate processed are inspected here.
test_df = load_profiles(output_feature_select_file)

# print the shape of the loaded profiles, then preview the first two rows
print(test_df.shape)
test_df.head(2)

(60, 1174)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_siRNA,Metadata_RNAiMax,Metadata_Concentration,Metadata_Plate,Metadata_Well,...,Nuclei_Texture_InverseDifferenceMoment_DAPI_3_00_256,Nuclei_Texture_InverseDifferenceMoment_DAPI_3_01_256,Nuclei_Texture_InverseDifferenceMoment_DAPI_3_02_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_02_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_Variance_RFP_3_01_256
0,B,2,NF1,WT,1000,,0,0.0,Plate_4,B2,...,0.041461,-0.481716,-0.590904,-0.538406,-1.757806,-2.256964,-2.035733,-1.120269,-0.097843,2.515517
1,B,3,NF1,WT,1000,Scramble,1,0.05,Plate_4,B3,...,-0.9399,-1.382558,-0.931205,0.718546,0.022127,-0.971826,-0.285359,1.478716,0.387565,0.906714
