# Process bulk profiles

## Import libraries

In [1]:
import pathlib
import pprint

import pandas as pd

from pycytominer import aggregate, annotate, normalize, feature_select
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# Condition for how to normalize the plates
normalize_with_U2OS = (
    True  # Set to False if normalizing to whole plate versus just `U2-OS` cell line
)

# Set round to be processed
round_id = "Round_3_data"

# Path to dir with cleaned data from single-cell QC
cleaned_dir = pathlib.Path(f"./data/cleaned_profiles/{round_id}")

# output path for bulk profiles
output_dir = pathlib.Path(f"./data/bulk_profiles/{round_id}")
output_dir.mkdir(parents=True, exist_ok=True)

# extract the plate names from the file name
plate_names = [file.stem.split("_")[0] for file in cleaned_dir.glob("*.parquet")]

# path for platemap directory
platemap_dir = pathlib.Path("../0.download_data/metadata/platemaps")

# load in barcode platemap
barcode_platemap = pd.read_csv(
    pathlib.Path(f"{platemap_dir}/Barcode_platemap_pilot_data.csv")
)

# operations to perform for feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
    "drop_na_columns",
]

plate_names

['BR00147496',
 'BR00147497',
 'BR00146998',
 'BR00147003',
 'BR00147002',
 'BR00147001',
 'BR00146999',
 'BR00147000',
 'BR00147262',
 'BR00147263',
 'BR00147261',
 'BR00147495']

## Set dictionary with plates to process

In [3]:
# create plate info dictionary
plate_info_dictionary = {
    name: {
        "profile_path": (
            str(
                pathlib.Path(
                    list(cleaned_dir.rglob(f"{name}_cleaned.parquet"))[0]
                ).resolve(strict=True)
            )
            if list(cleaned_dir.rglob(f"{name}_cleaned.parquet"))
            else None
        ),
        # Find the platemap file based on barcode match and append .csv
        "platemap_path": (
            str(
                pathlib.Path(
                    list(
                        platemap_dir.rglob(
                            f"{barcode_platemap.loc[barcode_platemap['barcode'] == name, 'platemap_file'].values[0]}.csv"
                        )
                    )[0]
                ).resolve(strict=True)
            )
            if name in barcode_platemap["barcode"].values
            else None
        ),
        # Get the time_point based on the barcode match
        "time_point": (
            barcode_platemap.loc[
                barcode_platemap["barcode"] == name, "time_point"
            ].values[0]
            if name in barcode_platemap["barcode"].values
            else None
        ),
    }
    for name in plate_names
}

# Display the dictionary to verify the entries
pprint.pprint(plate_info_dictionary, indent=4)

{   'BR00146998': {   'platemap_path': '/home/jenna/pediatric_cancer_atlas_profiling/0.download_data/metadata/platemaps/Assay_Plate5_platemap.csv',
                      'profile_path': '/home/jenna/pediatric_cancer_atlas_profiling/3.preprocessing_features/data/cleaned_profiles/Round_3_data/BR00146998_cleaned.parquet',
                      'time_point': 24},
    'BR00146999': {   'platemap_path': '/home/jenna/pediatric_cancer_atlas_profiling/0.download_data/metadata/platemaps/Assay_Plate5_platemap.csv',
                      'profile_path': '/home/jenna/pediatric_cancer_atlas_profiling/3.preprocessing_features/data/cleaned_profiles/Round_3_data/BR00146999_cleaned.parquet',
                      'time_point': 48},
    'BR00147000': {   'platemap_path': '/home/jenna/pediatric_cancer_atlas_profiling/0.download_data/metadata/platemaps/Assay_Plate5_platemap.csv',
                      'profile_path': '/home/jenna/pediatric_cancer_atlas_profiling/3.preprocessing_features/data/cleaned_profil

## Process data with pycytominer

In [4]:
# Determine suffix based on normalize_with_U2OS
u2os_suffix = "_U2OS_samples" if normalize_with_U2OS else ""

# If normalizing with U2-OS, create a subfolder named 'U2OS_samples'
if normalize_with_U2OS:
    U2OS_output_dir = output_dir / "U2OS_samples"
    U2OS_output_dir.mkdir(exist_ok=True)
else:
    U2OS_output_dir = (
        output_dir  # Otherwise, use the root output_dir for whole plate normalization
    )

for plate, info in plate_info_dictionary.items():
    print(f"Now performing pycytominer pipeline for {plate}")

    # Output file paths for each file
    output_aggregated_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk.parquet"))
    output_annotated_file = str(
        pathlib.Path(f"{output_dir}/{plate}_bulk_annotated.parquet")
    )

    # Save normalized and feature-selected files in U2OS_samples folder if needed
    output_normalized_file = str(
        pathlib.Path(f"{U2OS_output_dir}/{plate}_bulk_normalized{u2os_suffix}.parquet")
    )
    output_feature_select_file = str(
        pathlib.Path(
            f"{U2OS_output_dir}/{plate}_bulk_feature_selected{u2os_suffix}.parquet"
        )
    )

    # Load single-cell profiles
    single_cell_df = pd.read_parquet(info["profile_path"])

    # Load platemap
    platemap_df = pd.read_csv(info["platemap_path"])

    # Step 1: Aggregation
    aggregate(
        population_df=single_cell_df,
        operation="median",
        strata=["Image_Metadata_Plate", "Image_Metadata_Well"],
        output_file=output_aggregated_file,
        output_type="parquet",
    )

    # Step 2: Annotation
    annotated_df = annotate(
        profiles=output_aggregated_file,
        platemap=platemap_df,
        join_on=["Metadata_well", "Image_Metadata_Well"],
    )

    # Step 2.5: Add 'Metadata_time_point' column based on the plate's time_point from dict
    annotated_df["Metadata_time_point"] = info["time_point"]

    # Step 3: Output annotated DataFrame
    output(
        df=annotated_df,
        output_filename=output_annotated_file,
        output_type="parquet",
    )

    # Step 4: Normalization
    if normalize_with_U2OS:
        if "Metadata_cell_line" in annotated_df.columns:
            if (
                annotated_df["Metadata_cell_line"]
                .astype(str)
                .str.contains("U2-OS")
                .any()
            ):
                samples = "Metadata_cell_line == 'U2-OS'"
            else:
                raise ValueError(
                    "U2-OS not found in 'Metadata_cell_line'. Please ensure it is spelled exactly as 'U2-OS'."
                )
        else:
            raise ValueError("'Metadata_cell_line' column not found in the dataframe.")
    else:
        samples = "all"

    print(f"Normalizing using samples: {samples}")

    normalize(
        profiles=annotated_df,
        method="standardize",
        output_file=output_normalized_file,
        output_type="parquet",
        samples=samples,
    )

    # Step 5: Feature selection
    feature_select(
        output_normalized_file,
        operation=feature_select_ops,
        na_cutoff=0,
        output_file=output_feature_select_file,
        output_type="parquet",
    )

Now performing pycytominer pipeline for BR00147496
Normalizing using samples: Metadata_cell_line == 'U2-OS'
Now performing pycytominer pipeline for BR00147497
Normalizing using samples: Metadata_cell_line == 'U2-OS'
Now performing pycytominer pipeline for BR00146998
Normalizing using samples: Metadata_cell_line == 'U2-OS'
Now performing pycytominer pipeline for BR00147003
Normalizing using samples: Metadata_cell_line == 'U2-OS'
Now performing pycytominer pipeline for BR00147002
Normalizing using samples: Metadata_cell_line == 'U2-OS'
Now performing pycytominer pipeline for BR00147001
Normalizing using samples: Metadata_cell_line == 'U2-OS'
Now performing pycytominer pipeline for BR00146999
Normalizing using samples: Metadata_cell_line == 'U2-OS'
Now performing pycytominer pipeline for BR00147000
Normalizing using samples: Metadata_cell_line == 'U2-OS'
Now performing pycytominer pipeline for BR00147262
Normalizing using samples: Metadata_cell_line == 'U2-OS'
Now performing pycytominer p

In [5]:
# Check output file
test_df = pd.read_parquet(output_feature_select_file)

print(test_df.shape)
test_df.head(2)

(178, 703)


Unnamed: 0,Metadata_cell_line,Metadata_row,Metadata_column,Metadata_seeding_density,Metadata_condition,Metadata_Plate,Metadata_Well,Metadata_time_point,Cytoplasm_AreaShape_BoundingBoxMaximum_X,Cytoplasm_AreaShape_Compactness,...,Nuclei_Texture_Correlation_CorrBrightfield_3_02_256,Nuclei_Texture_Correlation_CorrBrightfield_3_03_256,Nuclei_Texture_Correlation_CorrDNA_3_00_256,Nuclei_Texture_Correlation_CorrMito_3_01_256,Nuclei_Texture_Correlation_CorrRNA_3_03_256,Nuclei_Texture_Entropy_CorrRNA_3_03_256,Nuclei_Texture_InfoMeas1_CorrDNA_3_00_256,Nuclei_Texture_InfoMeas2_CorrMito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrDNA_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_01_256
0,D283,C,3,1000,synthemax,BR00147495,C03,24,-3.935267,-0.032544,...,0.533248,5.385786,0.874489,-3.076594,-3.76208,0.747645,-11.419717,7.341162,-5.506212,722.71893
1,D283,C,4,1000,synthemax,BR00147495,C04,24,-2.27206,0.026117,...,-3.317288,1.555258,0.89828,-4.234099,-4.839643,0.852402,-10.725627,7.348269,-5.54702,978.564112
