# Perform aggregation on all plates to output bulk profiles

## Import libraries

In [1]:
import pathlib
import yaml
import pprint

import pandas as pd
from pycytominer import aggregate
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# output directory for the aggregated (bulk) profiles
output_dir = pathlib.Path("./data/aggregated_data")
# parents=True so that a fresh checkout without ./data does not raise FileNotFoundError
output_dir.mkdir(parents=True, exist_ok=True)

# load in dictionary from yaml file
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
# NOTE(review): FullLoader can construct some Python-specific tags; if this file only
# ever holds plain strings, yaml.safe_load would be the stricter choice — confirm.
with open(dictionary_path) as file:
    plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'Plate_1': {   'dest_path': 'data/converted_data/Plate_1.parquet',
                   'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},
    'Plate_2': {   'dest_path': 'data/converted_data/Plate_2.parquet',
                   'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},
    'Plate_3': {   'dest_path': 'data/converted_data/Plate_3.parquet',
                   'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},
    'Plate_3_prime': {   'dest_path': 'data/converted_data/Plate_3_prime.parquet',
                         'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},
    'Plate_4': {   'dest_path': 'data/converted_data/Plate_4.parquet',
                   'sou

## Perform aggregation

**Note:** We do not pass an `operation` to `aggregate`, so the single-cell data are aggregated with the default operation, `median`.

In [3]:
for plate, plate_paths in plate_info_dictionary.items():
    # load the single-cell profiles from this plate's converted parquet file
    single_cell_df = pd.read_parquet(plate_paths["dest_path"])

    # record where the bulk profile will be written so downstream steps can find it
    bulk_path = pathlib.Path(f"{output_dir}/{plate}_bulk.parquet")
    plate_paths["bulk_path"] = str(bulk_path)
    print(f"Performing aggregation on {plate}!")

    # collapse the single cells to one row per plate/well; no operation is passed,
    # so the pycytominer default (median, per the note above this cell) is used
    aggregate_df = aggregate(
        population_df=single_cell_df,
        strata=["Image_Metadata_Plate", "Image_Metadata_Well"],
    )

    # write the aggregated profile to disk as a parquet file
    output(df=aggregate_df, output_filename=str(bulk_path), output_type="parquet")
    print(f"The bulk profile for {plate} has been created and saved!")

Performing aggregation on Plate_1!
The bulk profile for Plate_1 has been created and saved!
Performing aggregation on Plate_2!
The bulk profile for Plate_2 has been created and saved!
Performing aggregation on Plate_3!
The bulk profile for Plate_3 has been created and saved!
Performing aggregation on Plate_3_prime!
The bulk profile for Plate_3_prime has been created and saved!
Performing aggregation on Plate_4!
The bulk profile for Plate_4 has been created and saved!


In [4]:
# print the shape and preview the last aggregated df (left over from the final
# loop iteration) to check that aggregation occurred
print(aggregate_df.shape)
aggregate_df.head()

(60, 2306)


Unnamed: 0,Image_Metadata_Plate,Image_Metadata_Well,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_BoundingBoxArea,Cytoplasm_AreaShape_BoundingBoxMaximum_X,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Cytoplasm_AreaShape_BoundingBoxMinimum_Y,Cytoplasm_AreaShape_Center_X,Cytoplasm_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,Plate_4,B10,18439.0,54378.0,721.0,596.0,459.0,344.0,589.322731,467.639272,...,1402.176746,1390.439448,343.450989,339.348995,361.993801,354.468871,402.157284,408.136027,406.956549,401.988368
1,Plate_4,B11,14858.0,43289.0,792.5,555.0,607.5,325.0,693.164644,450.720216,...,1403.549344,1402.376148,267.296946,256.675966,262.744632,261.21941,353.841985,351.173117,366.253475,365.439917
2,Plate_4,B2,15746.0,42441.0,705.0,606.0,453.0,363.0,599.574838,505.885777,...,1360.189103,1334.643687,266.632008,266.805205,260.368846,255.193937,518.194881,511.552822,506.730303,504.779113
3,Plate_4,B3,17612.5,46376.0,728.0,502.0,479.5,268.5,604.916368,374.651,...,1510.003889,1515.435019,270.317856,263.647022,266.04719,255.156976,431.85225,428.683538,423.755592,420.139912
4,Plate_4,B4,16847.5,48662.0,734.0,477.5,522.0,264.5,627.978123,372.34319,...,1519.37528,1480.309106,332.66277,331.642057,337.357941,327.187538,429.636663,434.896642,433.045802,436.874798


## Write updated dictionary to yaml file for use in downstream steps

In [5]:
# Serialize before opening the file: opening with "w" truncates it immediately,
# so a failure during dumping would otherwise leave the shared plate info file empty.
updated_yaml = yaml.dump(plate_info_dictionary)
with open(dictionary_path, "w") as file:
    file.write(updated_yaml)