# Process single cell profiles

## Import libraries

In [1]:
import pathlib
import pprint

import pandas as pd

from pycytominer import annotate, normalize, feature_select

## Set paths and variables

In [2]:
# Feature selection operations handed to pycytominer's feature_select
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
    "drop_na_columns",
]

# Input directory: converted profiles (cleaned data from single-cell QC)
converted_dir = pathlib.Path("./data/converted_profiles")

# Directory searched for the per-plate platemap files
platemap_dir = pathlib.Path("../metadata/")

# Output directory for the single-cell profiles (created if it does not exist yet)
output_dir = pathlib.Path("./data/single_cell_profiles")
output_dir.mkdir(parents=True, exist_ok=True)

# Plate names are the parquet file stems with the "_converted" tag removed
plate_names = [
    parquet_path.stem.replace("_converted", "")
    for parquet_path in converted_dir.glob("*.parquet")
]

## Set dictionary with plates to process

In [3]:
# The only plate processed by this notebook; every other plate found in
# `converted_dir` is skipped when building the dictionary below.
plate_to_process = "localhost230405150001"


def _first_match_path(search_dir, file_name):
    """Return the resolved path of the first `file_name` found under `search_dir`.

    Parameters
    ----------
    search_dir : pathlib.Path
        Directory that is searched recursively.
    file_name : str
        Exact file name to look for.

    Returns
    -------
    str
        Absolute path of the first match, as a string.

    Raises
    ------
    FileNotFoundError
        If no file with that name exists under `search_dir`. This replaces the
        bare IndexError that indexing an empty result list would raise.
    """
    first_match = next(search_dir.rglob(file_name), None)
    if first_match is None:
        raise FileNotFoundError(
            f"Could not find '{file_name}' anywhere under '{search_dir}'"
        )
    return str(first_match.resolve(strict=True))


# create plate info dictionary: plate name -> paths of its profile and platemap
plate_info_dictionary = {
    name: {
        "profile_path": _first_match_path(converted_dir, f"{name}_converted.parquet"),
        "platemap_path": _first_match_path(platemap_dir, f"{name}_platemap.csv"),
    }
    for name in plate_names
    if name == plate_to_process
}

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'localhost230405150001': {   'platemap_path': '/home/jenna/cellpainting_predicts_cardiac_fibrosis/metadata/localhost230405150001_platemap.csv',
                                 'profile_path': '/home/jenna/cellpainting_predicts_cardiac_fibrosis/3.process_cfret_features/data/converted_profiles/localhost230405150001_converted.parquet'}}


## Process data with pycytominer

In [4]:
# Plates that are normalized against a subset of cells instead of all cells.
# Maps plate name -> value passed to the `samples` parameter of normalize().
plate_specific_samples = {
    # Only for Plate 4, we want to normalize to the DMSO treatments
    "localhost231120090001": "Metadata_heart_number == 7 and Metadata_treatment == 'DMSO'",
}

# Run annotation -> normalization -> feature selection for every selected plate.
# Each step writes a parquet file into `output_dir`; the next step reads that file.
for plate, info in plate_info_dictionary.items():
    print(f"Performing pycytominer pipeline for {plate}")
    output_annotated_file = str(output_dir / f"{plate}_sc_annotated_no_QC.parquet")
    output_normalized_file = str(output_dir / f"{plate}_sc_normalized_no_QC.parquet")
    # NOTE: this name is read again after the loop to check the final output
    output_feature_select_file = str(
        output_dir / f"{plate}_sc_feature_selected_no_QC.parquet"
    )

    profile_df = pd.read_parquet(info["profile_path"])
    platemap_df = pd.read_csv(info["platemap_path"])

    print("Performing annotation for", plate, "...")
    # Step 1: Annotation (join the platemap onto the profiles by well)
    annotate(
        profiles=profile_df,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=output_annotated_file,
        output_type="parquet",
    )

    # Reload the annotated file and give the site column a "Metadata_" prefix
    # (presumably so downstream steps treat it as metadata, not a feature),
    # then overwrite the annotated file with the corrected column name.
    annotated_df = pd.read_parquet(output_annotated_file).rename(
        columns={"Image_Metadata_Site": "Metadata_Site"}
    )
    annotated_df.to_parquet(output_annotated_file, index=False)

    # Use all samples for normalization unless the plate has a specific override
    samples = plate_specific_samples.get(plate, "all")

    print("Performing normalization for", plate, "using this samples parameter:", samples)

    # Step 2: Normalization
    # The result is written to `output_normalized_file`; the return value is not
    # needed because feature selection reads the file directly.
    normalize(
        profiles=output_annotated_file,
        method="standardize",
        output_file=output_normalized_file,
        output_type="parquet",
        samples=samples,
    )

    print("Performing feature selection for", plate, "...")
    # Step 3: Feature selection
    feature_select(
        output_normalized_file,
        operation=feature_select_ops,
        na_cutoff=0,
        output_file=output_feature_select_file,
        output_type="parquet",
    )
    print(f"Annotation, normalization, and feature selection have been performed for {plate}")

Performing pycytominer pipeline for localhost230405150001
Performing annotation for localhost230405150001 ...
Performing normalization for localhost230405150001 using this samples parameter: all
Performing feature selection for localhost230405150001 ...
Annotation, normalization, and feature selection have been performed for localhost230405150001


In [5]:
# Sanity check: load the feature-selected profiles that were just written.
# `output_feature_select_file` is left over from the processing loop, so it
# points at the file of the last plate that was processed.
test_df = pd.read_parquet(output_feature_select_file)

# Report (rows, columns), then preview the first two rows
n_rows, n_columns = test_df.shape
print((n_rows, n_columns))
test_df.head(2)

(25859, 587)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InfoMeas2_PM_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_03_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_01_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,B,2,9,failing,rejected,DMSO,221.046761,137.115493,246.6028,109.285755,...,-0.036033,0.309281,-0.612973,0.906721,0.418794,0.385735,-0.133843,-0.363257,-0.262538,-0.309825
1,B,2,9,failing,rejected,DMSO,690.596142,183.067828,716.170091,177.132195,...,0.289544,-0.057546,-0.419084,-0.068447,0.43727,0.335719,-0.216157,-0.371891,2.526579,-0.183663
