# Process single cell profiles

## Import libraries

In [1]:
import gc
import pathlib
import pprint

import pandas as pd

from pycytominer import annotate, normalize, feature_select
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# Set round of plates to process
round_id = "Round_2_data"

# Path to dir with cleaned data from single-cell QC
cleaned_dir = pathlib.Path(f"./data/cleaned_profiles/{round_id}")

# output path for bulk profiles
output_dir = pathlib.Path(f"./data/single_cell_profiles/{round_id}")
output_dir.mkdir(parents=True, exist_ok=True)

# extract the plate names from the file name
plate_names = [file.stem.split("_")[0] for file in cleaned_dir.glob("*.parquet")]

# path for platemap directory
platemap_dir = pathlib.Path("../0.download_data/metadata/platemaps")

# load in barcode platemap
barcode_platemap = pd.read_csv(
    pathlib.Path(f"{platemap_dir}/Barcode_platemap_pilot_data.csv")
)

# operations to perform for feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
    "drop_na_columns",
]

plate_names

['BR00145438',
 'BR00145818',
 'BR00145439',
 'BR00145816',
 'BR00145440',
 'BR00145817']

## Set dictionary with plates to process

In [3]:
# create plate info dictionary
plate_info_dictionary = {
    name: {
        "profile_path": (
            str(
                pathlib.Path(
                    list(cleaned_dir.rglob(f"{name}_cleaned.parquet"))[0]
                ).resolve(strict=True)
            )
            if list(cleaned_dir.rglob(f"{name}_cleaned.parquet"))
            else None
        ),
        # Find the platemap file based on barcode match and append .csv
        "platemap_path": (
            str(
                pathlib.Path(
                    list(
                        platemap_dir.rglob(
                            f"{barcode_platemap.loc[barcode_platemap['barcode'] == name, 'platemap_file'].values[0]}.csv"
                        )
                    )[0]
                ).resolve(strict=True)
            )
            if name in barcode_platemap["barcode"].values
            else None
        ),
        # Get the time_point based on the barcode match
        "time_point": (
            barcode_platemap.loc[
                barcode_platemap["barcode"] == name, "time_point"
            ].values[0]
            if name in barcode_platemap["barcode"].values
            else None
        ),
    }
    for name in plate_names
}

# Display the dictionary to verify the entries
pprint.pprint(plate_info_dictionary, indent=4)

{   'BR00145438': {   'platemap_path': '/home/jenna/pediatric_cancer_atlas_profiling/0.download_data/metadata/platemaps/Assay_Plate3_platemap.csv',
                      'profile_path': '/home/jenna/pediatric_cancer_atlas_profiling/3.preprocessing_features/data/cleaned_profiles/Round_2_data/BR00145438_cleaned.parquet',
                      'time_point': 24},
    'BR00145439': {   'platemap_path': '/home/jenna/pediatric_cancer_atlas_profiling/0.download_data/metadata/platemaps/Assay_Plate3_platemap.csv',
                      'profile_path': '/home/jenna/pediatric_cancer_atlas_profiling/3.preprocessing_features/data/cleaned_profiles/Round_2_data/BR00145439_cleaned.parquet',
                      'time_point': 48},
    'BR00145440': {   'platemap_path': '/home/jenna/pediatric_cancer_atlas_profiling/0.download_data/metadata/platemaps/Assay_Plate3_platemap.csv',
                      'profile_path': '/home/jenna/pediatric_cancer_atlas_profiling/3.preprocessing_features/data/cleaned_profil

## Process data with pycytominer

In [4]:
# Set up map for renaming metadata column(s)
column_name_mapping = {
    "Image_Metadata_Site": "Metadata_Site",
}

for plate, info in plate_info_dictionary.items():
    print(f"Performing pycytominer pipeline for {plate}")

    # Set output paths
    output_annotated_file = str(
        pathlib.Path(f"{output_dir}/{plate}_sc_annotated.parquet")
    )
    output_normalized_file = str(
        pathlib.Path(f"{output_dir}/{plate}_sc_normalized.parquet")
    )
    output_feature_select_file = str(
        pathlib.Path(f"{output_dir}/{plate}_sc_feature_selected.parquet")
    )

    # Load in profile and platemap
    profile_df = pd.read_parquet(info["profile_path"])
    platemap_df = pd.read_csv(info["platemap_path"])

    print("Performing annotation for", plate, "...")
    # Step 1: Annotation
    annotated_df = annotate(
        profiles=profile_df,
        platemap=platemap_df,
        join_on=["Metadata_well", "Image_Metadata_Well"],
    )

    # Add 'Metadata_time_point' column based on the plate's time_point from dict
    annotated_df["Metadata_time_point"] = info["time_point"]

    # Rename Metadata column(s) using the rename() function
    annotated_df.rename(columns=column_name_mapping, inplace=True)

    # Save the modified annotated DataFrame
    output(
        df=annotated_df,
        output_filename=output_annotated_file,
        output_type="parquet",
    )

    # Clear memory
    del profile_df, platemap_df, annotated_df
    gc.collect()

    print("Performing normalization for", plate, "...")
    # Step 2: Normalization
    normalized_df = normalize(
        profiles=output_annotated_file,
        method="standardize",
        output_file=output_normalized_file,
        output_type="parquet",
    )

    # Clear memory
    del normalized_df
    gc.collect()

    print("Performing feature selection for", plate, "...")
    # Step 3: Feature selection
    feature_select(
        profiles=output_normalized_file,
        operation=feature_select_ops,
        na_cutoff=0,
        output_file=output_feature_select_file,
        output_type="parquet",
    )

    # Clear memory
    gc.collect()

    print(f"Preprocessing features completed for {plate}!")

Performing pycytominer pipeline for BR00145438
Performing annotation for BR00145438 ...
Performing normalization for BR00145438 ...
Performing feature selection for BR00145438 ...
Preprocessing features completed for BR00145438!
Performing pycytominer pipeline for BR00145818
Performing annotation for BR00145818 ...
Performing normalization for BR00145818 ...
Performing feature selection for BR00145818 ...
Preprocessing features completed for BR00145818!
Performing pycytominer pipeline for BR00145439
Performing annotation for BR00145439 ...
Performing normalization for BR00145439 ...
Performing feature selection for BR00145439 ...
Preprocessing features completed for BR00145439!
Performing pycytominer pipeline for BR00145816
Performing annotation for BR00145816 ...
Performing normalization for BR00145816 ...
Performing feature selection for BR00145816 ...
Preprocessing features completed for BR00145816!
Performing pycytominer pipeline for BR00145440
Performing annotation for BR00145440 

In [5]:
# Check output file
test_df = pd.read_parquet(output_feature_select_file)

print(test_df.shape)
test_df.head(2)

(82987, 1169)


Unnamed: 0,Metadata_cell_line,Metadata_row,Metadata_column,Metadata_seeding_density,Metadata_condition,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,...,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrRNA_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrRNA_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrRNA_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrRNA_3_03_256,Nuclei_Texture_SumVariance_CorrAGP_3_03_256,Nuclei_Texture_SumVariance_CorrBrightfield_3_01_256,Nuclei_Texture_SumVariance_CorrDNA_3_01_256,Nuclei_Texture_SumVariance_CorrER_3_03_256,Nuclei_Texture_SumVariance_CorrRNA_3_03_256
0,CHLA-10,B,3,1000,synthemax,11.614486,57.014019,15.196178,50.388535,15,...,2.172167,0.9567,0.451038,1.068224,1.617094,-0.286858,-0.133131,-0.424782,-0.218685,-0.267844
1,CHLA-10,B,3,1000,synthemax,925.485866,202.238516,929.105121,195.681941,21,...,0.674003,0.017141,0.155338,-0.033407,-0.098907,-0.341274,-0.111363,-0.305924,-0.182183,-0.264831
