# Process single cell profiles

NOTE: We are normalizing the plates for all samples as we only have three wells associated with the healthy controls, which is insufficient for normalization.

## Import libraries

In [1]:
import os
import pathlib
import pprint

import pandas as pd

from pycytominer import annotate, normalize, feature_select

## Set paths and variables

In [None]:
# Optional: set `PLATEMAP_LAYOUT` env var to process only a single platemap (e.g. 'platemap_1')
platemap_to_process = os.environ.get("PLATEMAP_LAYOUT")
# platemap_to_process = "platemap_1"  # for testing only

# set base directory for where the SQLite files are located (should be local to repo)
base_dir = pathlib.Path("../2.cellprofiler_processing/cp_output/").resolve(strict=True)

# Decide what to process
if platemap_to_process:
    print(f"Processing only {platemap_to_process}")
    layouts = [platemap_to_process]
else:
    print("No specific layout set, processing all available platemaps")
    layouts = [p.name for p in base_dir.glob("platemap_*") if p.is_dir()]

pprint.pprint(layouts)

Processing only platemap_1
['platemap_1']


In [None]:
# Path to dir with cleaned data from single-cell QC
converted_dir = pathlib.Path(f"./data/{platemap_to_process}/cleaned_profiles/").resolve(
    strict=True
)

# output path for single-cell profiles
output_dir = pathlib.Path(f"./data/{platemap_to_process}/single_cell_profiles")
output_dir.mkdir(parents=True, exist_ok=True)

# Extract the plate names from the file name
plate_names = [
    "_".join(parts[:2]) if len(parts) >= 2 else parts[0]
    for parts in (file.stem.split("_") for file in converted_dir.glob("*.parquet"))
]


# path for platemap directory
platemap_dir = pathlib.Path("../metadata/updated_platemaps/")

# operations to perform for feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
    "drop_na_columns",
]

## Set dictionary with plates to process

In [4]:
# Load the barcode_platemap file
barcode_platemap_df = pd.read_csv(
    pathlib.Path(f"{platemap_dir}/updated_barcode_platemap.csv").resolve()
)

# Create plate info dictionary
plate_info_dictionary = {
    name: {
        "profile_path": (converted_dir / f"{name}_cleaned.parquet").resolve(
            strict=True
        ),
        "platemap_path": (
            platemap_dir
            / f"{barcode_platemap_df.loc[barcode_platemap_df['plate_barcode'] == name, 'platemap_file'].values[0]}.csv"
        ).resolve(strict=True),
    }
    for name in plate_names
}

# View the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'CARD-CelIns-CX7_251023210001': {   'platemap_path': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/metadata/updated_platemaps/Target_Selective_Library_Screen_Plate_1_with_pathways.csv'),
                                        'profile_path': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/3.preprocessing_features/data/platemap_1/cleaned_profiles/CARD-CelIns-CX7_251023210001_cleaned.parquet')},
    'CARD-CelIns-CX7_251124150001': {   'platemap_path': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/metadata/updated_platemaps/Target_Selective_Library_Screen_Plate_1_with_pathways.csv'),
                                        'profile_path': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/3.preprocessing_features/data/platemap_1/cleaned_profiles/CARD-CelIns-CX7_251124150001_cleaned.parquet')},
    'CARD-CelIns-CX7_251125110001': {   'platemap_path': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/metadata/updated_platemaps/Target_Selective_Library_Screen_Pla

## Process data with pycytominer

In [5]:
for plate, info in plate_info_dictionary.items():
    print(f"Performing pycytominer pipeline for {plate}")
    output_annotated_file = str(
        pathlib.Path(f"{output_dir}/{plate}_sc_annotated.parquet")
    )
    output_normalized_file = str(
        pathlib.Path(f"{output_dir}/{plate}_sc_normalized.parquet")
    )
    output_feature_select_file = str(
        pathlib.Path(f"{output_dir}/{plate}_sc_feature_selected.parquet")
    )

    profile_df = pd.read_parquet(info["profile_path"])
    platemap_df = pd.read_csv(info["platemap_path"])

    print("Performing annotation for", plate, "...")
    # Step 1: Annotation
    annotate(
        profiles=profile_df,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=output_annotated_file,
        output_type="parquet",
    )

    # Load the annotated parquet file to fix metadata columns names
    annotated_df = pd.read_parquet(output_annotated_file)

    # Rename columns using the rename() function
    column_name_mapping = {
        "Image_Metadata_Site": "Metadata_Site",
    }

    annotated_df.rename(columns=column_name_mapping, inplace=True)

    # Save the modified DataFrame back to the same location
    annotated_df.to_parquet(output_annotated_file, index=False)

    # Step 2: Normalization
    normalized_df = normalize(
        profiles=output_annotated_file,
        method="standardize",
        output_file=output_normalized_file,
        output_type="parquet",
    )

    print("Performing feature selection for", plate, "...")
    # Step 3: Feature selection
    feature_select(
        output_normalized_file,
        operation=feature_select_ops,
        na_cutoff=0,
        output_file=output_feature_select_file,
        output_type="parquet",
    )
    print(
        f"Annotation, normalization, and feature selection have been performed for {plate}"
    )

Performing pycytominer pipeline for CARD-CelIns-CX7_251124150001
Performing annotation for CARD-CelIns-CX7_251124150001 ...
Performing feature selection for CARD-CelIns-CX7_251124150001 ...
Annotation, normalization, and feature selection have been performed for CARD-CelIns-CX7_251124150001
Performing pycytominer pipeline for CARD-CelIns-CX7_251126130001
Performing annotation for CARD-CelIns-CX7_251126130001 ...
Performing feature selection for CARD-CelIns-CX7_251126130001 ...
Annotation, normalization, and feature selection have been performed for CARD-CelIns-CX7_251126130001
Performing pycytominer pipeline for CARD-CelIns-CX7_251023210001
Performing annotation for CARD-CelIns-CX7_251023210001 ...
Performing feature selection for CARD-CelIns-CX7_251023210001 ...
Annotation, normalization, and feature selection have been performed for CARD-CelIns-CX7_251023210001
Performing pycytominer pipeline for CARD-CelIns-CX7_251125110001
Performing annotation for CARD-CelIns-CX7_251125110001 ...


In [6]:
# Check output file
test_df = pd.read_parquet(output_feature_select_file)

print(test_df.shape)
print("Plate:", test_df.Metadata_Plate.unique())
print(
    "Metadata columns:", [col for col in test_df.columns if col.startswith("Metadata_")]
)
test_df.head(2)

(7693, 968)
Plate: ['CARD-CelIns-CX7_251125110001']
Metadata columns: ['Metadata_WellRow', 'Metadata_WellCol', 'Metadata_heart_number', 'Metadata_cell_type', 'Metadata_heart_failure_type', 'Metadata_treatment', 'Metadata_Pathway', 'Metadata_Nuclei_Location_Center_X', 'Metadata_Nuclei_Location_Center_Y', 'Metadata_Cells_Location_Center_X', 'Metadata_Cells_Location_Center_Y', 'Metadata_Image_Count_Cells', 'Metadata_ImageNumber', 'Metadata_Plate', 'Metadata_Well', 'Metadata_Cells_Number_Object_Number', 'Metadata_Cytoplasm_Parent_Cells', 'Metadata_Cytoplasm_Parent_Nuclei', 'Metadata_Nuclei_Number_Object_Number', 'Metadata_Site']


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Pathway,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,...,Nuclei_Texture_InfoMeas2_Mito_3_01_256,Nuclei_Texture_InfoMeas2_Mito_3_02_256,Nuclei_Texture_InfoMeas2_Mito_3_03_256,Nuclei_Texture_InfoMeas2_PM_3_00_256,Nuclei_Texture_InfoMeas2_PM_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_Mito_3_01_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,B,2,7,healthy,,DMSO,,785.101016,189.283921,779.231342,...,-0.481068,-0.455141,-0.198944,-1.038263,-0.211829,0.789941,0.879992,-0.311151,-0.226121,-0.248144
1,B,2,7,healthy,,DMSO,,277.944215,170.593492,299.687732,...,-0.296389,-0.993186,-1.259688,-0.688871,-0.766972,0.947502,0.988154,-0.024926,-0.183829,-0.201708


In [7]:
# Check output file
test_df = pd.read_parquet(output_annotated_file)

print(test_df.shape)
print("Plate:", test_df.Metadata_Plate.unique())
print(
    "Metadata columns:", [col for col in test_df.columns if col.startswith("Metadata_")]
)
test_df.head(2)

(7693, 2322)
Plate: ['CARD-CelIns-CX7_251125110001']
Metadata columns: ['Metadata_WellRow', 'Metadata_WellCol', 'Metadata_heart_number', 'Metadata_cell_type', 'Metadata_heart_failure_type', 'Metadata_treatment', 'Metadata_Pathway', 'Metadata_Nuclei_Location_Center_X', 'Metadata_Nuclei_Location_Center_Y', 'Metadata_Cells_Location_Center_X', 'Metadata_Cells_Location_Center_Y', 'Metadata_Image_Count_Cells', 'Metadata_ImageNumber', 'Metadata_Plate', 'Metadata_Well', 'Metadata_Cells_Number_Object_Number', 'Metadata_Cytoplasm_Parent_Cells', 'Metadata_Cytoplasm_Parent_Nuclei', 'Metadata_Nuclei_Number_Object_Number', 'Metadata_Site']


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Pathway,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,...,Nuclei_Texture_Variance_ER_3_02_256,Nuclei_Texture_Variance_ER_3_03_256,Nuclei_Texture_Variance_Mito_3_00_256,Nuclei_Texture_Variance_Mito_3_01_256,Nuclei_Texture_Variance_Mito_3_02_256,Nuclei_Texture_Variance_Mito_3_03_256,Nuclei_Texture_Variance_PM_3_00_256,Nuclei_Texture_Variance_PM_3_01_256,Nuclei_Texture_Variance_PM_3_02_256,Nuclei_Texture_Variance_PM_3_03_256
0,B,2,7,healthy,,DMSO,,785.101016,189.283921,779.231342,...,3.07341,3.092606,1.191286,0.957049,0.971634,0.978932,3.792291,3.887335,4.131333,3.701426
1,B,2,7,healthy,,DMSO,,277.944215,170.593492,299.687732,...,3.415442,3.228637,3.078634,2.701166,2.561604,2.590654,5.180889,5.047245,5.087999,5.230235
