# Process single cell profiles

In [1]:
import pathlib
import pprint

import pandas as pd

from pycytominer import annotate, normalize, feature_select

In [2]:
# Path to dir with nuclei feature files
converted_dir = pathlib.Path("./data/converted_profiles")

# output path for single cell profiles 
output_dir = pathlib.Path("./data/single_cell_profiles")
output_dir.mkdir(parents=True, exist_ok=True)  

# Extract the plate names from the file name
plate_names = [file.stem.replace("_converted", "") for file in converted_dir.glob("*.parquet")]

# path for platemap directory
platemap_dir = pathlib.Path("../metadata/")

# operations to perform for feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
]

In [3]:
# create plate info dictionary 
plate_info_dictionary = {
    name: {
        "profile_path": str(
            pathlib.Path(list(converted_dir.rglob(f"{name}_converted.parquet"))[0]).resolve(
                strict=True
            )
        ),
        "platemap_path": str(
            pathlib.Path(list(platemap_dir.rglob(f"{name}_platemap.csv"))[0]).resolve(
                strict=True
            )
        ),
    }
    for name in plate_names
}

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'localhost220512140003_KK22-05-198': {   'platemap_path': '/home/jenna/CFReT_data/metadata/localhost220512140003_KK22-05-198_platemap.csv',
                                             'profile_path': '/home/jenna/CFReT_data/3.process_cfret_features/data/converted_profiles/localhost220512140003_KK22-05-198_converted.parquet'},
    'localhost220513100001_KK22-05-198_FactinAdjusted': {   'platemap_path': '/home/jenna/CFReT_data/metadata/localhost220513100001_KK22-05-198_FactinAdjusted_platemap.csv',
                                                            'profile_path': '/home/jenna/CFReT_data/3.process_cfret_features/data/converted_profiles/localhost220513100001_KK22-05-198_FactinAdjusted_converted.parquet'},
    'localhost230405150001': {   'platemap_path': '/home/jenna/CFReT_data/metadata/localhost230405150001_platemap.csv',
                                 'profile_path': '/home/jenna/CFReT_data/3.process_cfret_features/data/converted_profiles/localhost230405150001_converted.

In [4]:
for plate, info in plate_info_dictionary.items():
    print(f"Performing pycytominer pipeline for {plate}")
    output_annotated_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_annotated.parquet"))
    output_normalized_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_normalized.parquet"))
    output_feature_select_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_feature_selected.parquet"))

    profile_df = pd.read_parquet(info["profile_path"])
    platemap_df = pd.read_csv(info["platemap_path"])

    # Step 1: Annotation
    annotate(
        profiles=profile_df,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=output_annotated_file,
        output_type="parquet",
    )

    # Load the annotated parquet file to fix metadata columns names
    annotated_df = pd.read_parquet(output_annotated_file)

    # Rename columns using the rename() function
    column_name_mapping = {
        "Image_Metadata_Site": "Metadata_Site",
    }

    annotated_df.rename(columns=column_name_mapping, inplace=True)

    # Save the modified DataFrame back to the same location
    annotated_df.to_parquet(output_annotated_file, index=False)
    
    # Only for Plate 4, we want to normalize to the DMSO treatments
    if plate == "localhost231120090001":
        samples = "Metadata_heart_number == 7 and Metadata_treatment == 'DMSO'"
    # For all other plates, use the default
    else:
        samples = "all"

    # Step 2: Normalization
    normalized_df = normalize(
        profiles=output_annotated_file,
        method="standardize",
        output_file=output_normalized_file,
        output_type="parquet",
        samples=samples,
    )
    

    # Step 3: Feature selection
    feature_select(
        output_normalized_file,
        operation=feature_select_ops,
        output_file=output_feature_select_file,
        output_type="parquet"
    )
    print(f"Annotation, normalization, and feature selection have been performed for {plate}")

Performing pycytominer pipeline for localhost230405150001
Annotation, normalization, and feature selection have been performed for localhost230405150001
Performing pycytominer pipeline for localhost231120090001
Annotation, normalization, and feature selection have been performed for localhost231120090001
Performing pycytominer pipeline for localhost220512140003_KK22-05-198
Annotation, normalization, and feature selection have been performed for localhost220512140003_KK22-05-198
Performing pycytominer pipeline for localhost220513100001_KK22-05-198_FactinAdjusted
Annotation, normalization, and feature selection have been performed for localhost220513100001_KK22-05-198_FactinAdjusted


In [5]:
# Check output file
test_df = pd.read_parquet(output_feature_select_file)

print(test_df.shape)
test_df.head(2)

(17536, 642)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_treatment,Metadata_dose,Metadata_dose_unit,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,Metadata_ImageNumber,...,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_03_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumEntropy_Hoechst_3_00_256,Nuclei_Texture_SumEntropy_Mitochondria_3_03_256,Nuclei_Texture_SumEntropy_PM_3_01_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,A,9,9,drug_x,5.0,uM,424.22186,102.784001,34,1.0,...,0.905773,0.124091,0.116207,-0.501642,-1.161273,-0.399491,-0.44867,-0.316325,-0.188458,-0.19853
1,A,9,9,drug_x,5.0,uM,162.360089,173.149134,34,1.0,...,-1.942802,-1.409078,-1.847424,0.362271,1.905731,1.979446,1.417736,-0.202229,0.697392,0.765973
