# Process single cell profiles

NOTE: We are normalizing the plates for all samples as we only have three wells associated with the healthy controls, which is insufficient for normalization.

## Import libraries

In [1]:
import os
import pathlib
import pprint

import pandas as pd

from pycytominer import annotate, normalize, feature_select

## Set paths and variables

In [2]:
# get the batch to process from environment variable
batch_to_process = os.environ.get("BATCH", "batch_1")
if batch_to_process is None:
    raise ValueError(
        "Please set the BATCH environment variable before running this script."
    )

# base directory where batches are located
base_dir = pathlib.Path("./data/").resolve(strict=True)

# Decide what to process
if batch_to_process:
    print(f"Processing {batch_to_process}")
    batch_dirs = [base_dir / batch_to_process]
else:
    print("No specific batch set, processing all available batches")
    batch_dirs = [p for p in base_dir.glob("batch_*") if p.is_dir()]

# path for platemap directory
platemap_dir = pathlib.Path("../metadata/updated_platemaps/")

# Load the barcode_platemap file
barcode_platemap_df = pd.read_csv(
    (platemap_dir / "updated_barcode_platemap.csv").resolve()
)

# operations to perform for feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
    "drop_na_columns",
]

Processing batch_1


## Set dictionary with plates to process

In [3]:
plate_info_dictionary = {}

# Loop over batches and layouts
for batch_dir in batch_dirs:
    layouts = [p for p in batch_dir.iterdir() if p.is_dir()]  # all layouts
    for layout_dir in layouts:
        cleaned_dir = layout_dir / "cleaned_profiles"
        output_dir = layout_dir / "single_cell_profiles"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Extract plate names from parquet files
        parquet_files = list(cleaned_dir.glob("*.parquet"))
        plate_names = [
            "_".join(f.stem.split("_")[:2]) if len(f.stem.split("_")) >= 2 else f.stem
            for f in parquet_files
        ]

        for name in plate_names:
            # Find the corresponding parquet file
            matching_files = [f for f in parquet_files if name in f.stem]
            if not matching_files:
                continue
            profile_path = matching_files[0].resolve(strict=True)

            # Find corresponding platemap CSV
            platemap_row = barcode_platemap_df.loc[
                barcode_platemap_df["plate_barcode"] == name
            ]
            if platemap_row.empty:
                raise ValueError(f"No platemap found for plate {name}")
            platemap_path = (
                platemap_dir / f"{platemap_row['platemap_file'].values[0]}.csv"
            ).resolve(strict=True)

            # Add to dictionary
            plate_info_dictionary[name] = {
                "profile_path": profile_path,
                "platemap_path": platemap_path,
                "output_dir": output_dir,
            }

# View dictionary
print("Number of plates to process:", len(plate_info_dictionary))
pprint.pprint(plate_info_dictionary, indent=4)

Number of plates to process: 16
{   'CARD-CelIns-CX7_251023210001': {   'output_dir': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/3.preprocessing_features/data/batch_1/platemap_1/single_cell_profiles'),
                                        'platemap_path': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/metadata/updated_platemaps/Target_Selective_Library_Screen_Plate_1_with_pathways.csv'),
                                        'profile_path': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/3.preprocessing_features/data/batch_1/platemap_1/cleaned_profiles/CARD-CelIns-CX7_251023210001_cleaned.parquet')},
    'CARD-CelIns-CX7_251124150001': {   'output_dir': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/3.preprocessing_features/data/batch_1/platemap_1/single_cell_profiles'),
                                        'platemap_path': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/metadata/updated_platemaps/Target_Selective_Library_Screen_Plate_1_with_path

## Process data with pycytominer

In [4]:
for plate, info in plate_info_dictionary.items():
    output_dir = info["output_dir"]
    print("Performing preprocessing on", plate, output_dir)

    # Use the output_dir from the dictionary for this specific plate
    output_dir = info["output_dir"]
    output_dir.mkdir(parents=True, exist_ok=True)

    output_annotated_file = str(output_dir / f"{plate}_sc_annotated.parquet")
    output_normalized_file = str(output_dir / f"{plate}_sc_normalized.parquet")
    output_feature_select_file = str(
        output_dir / f"{plate}_sc_feature_selected.parquet"
    )

    profile_df = pd.read_parquet(info["profile_path"])
    platemap_df = pd.read_csv(info["platemap_path"])

    print("Performing annotation for", plate, "...")
    # Step 1: Annotation
    annotate(
        profiles=profile_df,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=output_annotated_file,
        output_type="parquet",
    )

    # Load the annotated parquet file to fix metadata columns names
    annotated_df = pd.read_parquet(output_annotated_file)

    # Rename columns
    annotated_df.rename(columns={"Image_Metadata_Site": "Metadata_Site"}, inplace=True)

    # Save back
    annotated_df.to_parquet(output_annotated_file, index=False)

    # Step 2: Normalization
    normalized_df = normalize(
        profiles=output_annotated_file,
        method="standardize",
        output_file=output_normalized_file,
        output_type="parquet",
        samples="Metadata_treatment == 'DMSO' and Metadata_cell_type == 'failing'",
    )

    # Step 3: Feature selection
    print("Performing feature selection for", plate, "...")
    feature_select(
        normalized_df,
        operation=feature_select_ops,
        na_cutoff=0,
        output_file=output_feature_select_file,
        output_type="parquet",
        blocklist_file="./blocklist_features.txt",
    )

    print(f"Annotation, normalization, and feature selection complete for {plate}")

Performing preprocessing on CARD-CelIns-CX7_251205100001 /home/jenna/targeted_fibrosis_drug_screen/3.preprocessing_features/data/batch_1/platemap_3/single_cell_profiles
Performing annotation for CARD-CelIns-CX7_251205100001 ...
Performing feature selection for CARD-CelIns-CX7_251205100001 ...
Annotation, normalization, and feature selection complete for CARD-CelIns-CX7_251205100001
Performing preprocessing on CARD-CelIns-CX7_251210180001 /home/jenna/targeted_fibrosis_drug_screen/3.preprocessing_features/data/batch_1/platemap_3/single_cell_profiles
Performing annotation for CARD-CelIns-CX7_251210180001 ...
Performing feature selection for CARD-CelIns-CX7_251210180001 ...
Annotation, normalization, and feature selection complete for CARD-CelIns-CX7_251210180001
Performing preprocessing on CARD-CelIns-CX7_251203170001 /home/jenna/targeted_fibrosis_drug_screen/3.preprocessing_features/data/batch_1/platemap_3/single_cell_profiles
Performing annotation for CARD-CelIns-CX7_251203170001 ...
Pe

In [5]:
# Check output file
test_df = pd.read_parquet(output_feature_select_file)

print(test_df.shape)
print("Plate:", test_df.Metadata_Plate.unique())
print(
    "Metadata columns:", [col for col in test_df.columns if col.startswith("Metadata_")]
)
test_df.head(2)

(6361, 925)
Plate: ['CARD-CelIns-CX7_251201110001']
Metadata columns: ['Metadata_WellRow', 'Metadata_WellCol', 'Metadata_heart_number', 'Metadata_cell_type', 'Metadata_heart_failure_type', 'Metadata_treatment', 'Metadata_Pathway', 'Metadata_Nuclei_Location_Center_X', 'Metadata_Nuclei_Location_Center_Y', 'Metadata_Cells_Location_Center_X', 'Metadata_Cells_Location_Center_Y', 'Metadata_Image_Count_Cells', 'Metadata_ImageNumber', 'Metadata_Plate', 'Metadata_Well', 'Metadata_Cells_Number_Object_Number', 'Metadata_Cytoplasm_Parent_Cells', 'Metadata_Cytoplasm_Parent_Nuclei', 'Metadata_Nuclei_Number_Object_Number', 'Metadata_Site']


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Pathway,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,...,Nuclei_Texture_InfoMeas2_Mito_3_00_256,Nuclei_Texture_InfoMeas2_Mito_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mito_3_03_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_00_256,Nuclei_Texture_SumEntropy_Mito_3_00_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mito_3_01_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,B,2,7,healthy,,DMSO,,504.459638,382.729819,505.968841,...,0.94299,0.572058,-2.518711,-2.320037,-1.736429,-2.338841,1.545899,1.671922,0.114597,0.214213
1,B,2,7,healthy,,DMSO,,961.839763,183.838773,963.348589,...,1.268513,1.408434,-0.526834,-1.193505,0.070259,-0.349963,1.070199,0.254882,-0.11901,-0.025557


In [6]:
# Check output file
test_df = pd.read_parquet(output_annotated_file)

print(test_df.shape)
print("Plate:", test_df.Metadata_Plate.unique())
print(
    "Metadata columns:", [col for col in test_df.columns if col.startswith("Metadata_")]
)
test_df.head(2)

(6361, 2327)
Plate: ['CARD-CelIns-CX7_251201110001']
Metadata columns: ['Metadata_WellRow', 'Metadata_WellCol', 'Metadata_heart_number', 'Metadata_cell_type', 'Metadata_heart_failure_type', 'Metadata_treatment', 'Metadata_Pathway', 'Metadata_Nuclei_Location_Center_X', 'Metadata_Nuclei_Location_Center_Y', 'Metadata_Cells_Location_Center_X', 'Metadata_Cells_Location_Center_Y', 'Metadata_Image_Count_Cells', 'Metadata_ImageNumber', 'Metadata_Plate', 'Metadata_Well', 'Metadata_Cells_Number_Object_Number', 'Metadata_Cytoplasm_Parent_Cells', 'Metadata_Cytoplasm_Parent_Nuclei', 'Metadata_Nuclei_Number_Object_Number', 'Metadata_Site']


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Pathway,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,...,Nuclei_Texture_Variance_ER_3_02_256,Nuclei_Texture_Variance_ER_3_03_256,Nuclei_Texture_Variance_Mito_3_00_256,Nuclei_Texture_Variance_Mito_3_01_256,Nuclei_Texture_Variance_Mito_3_02_256,Nuclei_Texture_Variance_Mito_3_03_256,Nuclei_Texture_Variance_PM_3_00_256,Nuclei_Texture_Variance_PM_3_01_256,Nuclei_Texture_Variance_PM_3_02_256,Nuclei_Texture_Variance_PM_3_03_256
0,B,2,7,healthy,,DMSO,,504.459638,382.729819,505.968841,...,37.051868,36.655899,26.097537,23.765346,26.085721,26.605978,10.214589,9.59304,10.489005,10.453427
1,B,2,7,healthy,,DMSO,,961.839763,183.838773,963.348589,...,8.714344,8.637393,6.238527,6.01879,5.876572,5.670404,2.656007,2.511783,2.471226,2.437532
