# Perform data preprocessing with pycytominer on single cell features

Note: Single cell is represented by only nuclei compartment features which was used to extract features across all three channels.

## Import libraries

In [1]:
import pathlib
import pprint

import pandas as pd
from pycytominer import annotate, normalize, feature_select

## Set paths and variables

In [2]:
# Path to dir with converted profiles per plate (each plate as a folder)
converted_dir = pathlib.Path("./data/converted_profiles")

# path for plate map directory
platemap_dir = pathlib.Path(f"../0.download_data/metadata/platemaps")

# Output dir for the files to be saved to
output_dir = pathlib.Path("./data/single_cell_profiles")
output_dir.mkdir(exist_ok=True, parents=True)

# Extract the plate names from the file names
plate_names = [file.stem.split("_")[0] for file in platemap_dir.glob("*_platemap.csv")]

# operations to perform for feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
]

# print the plate names and how many plates there are (confirmation)
print(f"There are {len(plate_names)} plates in this dataset. Below are the names:")
for name in plate_names:
    print(name)

There are 4 plates in this dataset. Below are the names:
slide4
slide2
slide1
slide3


## Create dictionary with unique paths for each plate

In [3]:
# create plate info dictionary
plate_info_dictionary = {
    name: {
        "profile_path": str(
            pathlib.Path(f"{converted_dir}/{name}/per_nuclei.parquet").resolve(strict=True)
        ),
        "platemap_path": str(
            pathlib.Path(f"{platemap_dir}/{name}_platemap.csv").resolve(strict=True)
        ),
    }
    for name in plate_names
}

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'slide1': {   'platemap_path': '/home/jenna/nuclear_speckle_image_analysis/0.download_data/metadata/platemaps/slide1_platemap.csv',
                  'profile_path': '/home/jenna/nuclear_speckle_image_analysis/4.preprocess_features/data/converted_profiles/slide1/per_nuclei.parquet'},
    'slide2': {   'platemap_path': '/home/jenna/nuclear_speckle_image_analysis/0.download_data/metadata/platemaps/slide2_platemap.csv',
                  'profile_path': '/home/jenna/nuclear_speckle_image_analysis/4.preprocess_features/data/converted_profiles/slide2/per_nuclei.parquet'},
    'slide3': {   'platemap_path': '/home/jenna/nuclear_speckle_image_analysis/0.download_data/metadata/platemaps/slide3_platemap.csv',
                  'profile_path': '/home/jenna/nuclear_speckle_image_analysis/4.preprocess_features/data/converted_profiles/slide3/per_nuclei.parquet'},
    'slide4': {   'platemap_path': '/home/jenna/nuclear_speckle_image_analysis/0.download_data/metadata/platemaps/slide4_platemap.csv

## Perform preprocessing on single cell features

In [4]:
for plate, info in plate_info_dictionary.items():
    print(f"Performing pycytominer pipeline for {plate}")
    # Set output paths per preprocessing step
    output_annotated_file = str(
        pathlib.Path(f"{output_dir}/{plate}_sc_annotated.parquet")
    )
    output_normalized_file = str(
        pathlib.Path(f"{output_dir}/{plate}_sc_normalized.parquet")
    )
    output_feature_select_file = str(
        pathlib.Path(f"{output_dir}/{plate}_sc_feature_selected.parquet")
    )

    # Load in the converted profile to be used in the first step
    profile_df = pd.read_parquet(info["profile_path"])

    # Load in platemap file with most relevant columns for annotation
    platemap_df = pd.read_csv(info["platemap_path"], usecols=["Well", "CellLine", "Condition"])

    # Step 1: Annotation
    annotate(
        profiles=profile_df,
        platemap=platemap_df,
        join_on=["Metadata_Well", "Image_Metadata_Well"],
        output_file=output_annotated_file,
        output_type="parquet",
    )

    # Load the annotated parquet file to fix metadata columns names
    annotated_df = pd.read_parquet(output_annotated_file)

    # Rename columns using the rename() function
    column_name_mapping = {
        "Image_Metadata_Site": "Metadata_Site",
        "Image_Count_Nuclei": "Metadata_Nuclei_Site_Count",
    }

    annotated_df.rename(columns=column_name_mapping, inplace=True)

    # Save the modified DataFrame back to the same location
    annotated_df.to_parquet(output_annotated_file, index=False)

    # Step 2: Normalization
    normalized_df = normalize(
        profiles=output_annotated_file,
        method="standardize",
        output_file=output_normalized_file,
        output_type="parquet",
    )

    # Step 3: Feature selection
    feature_select(
        output_normalized_file,
        operation=feature_select_ops,
        output_file=output_feature_select_file,
        output_type="parquet",
    )
    print(
        f"Annotation, normalization, and feature selection have been performed for {plate}"
    )

Performing pycytominer pipeline for slide4
Annotation, normalization, and feature selection have been performed for slide4
Performing pycytominer pipeline for slide2
Annotation, normalization, and feature selection have been performed for slide2
Performing pycytominer pipeline for slide1
Annotation, normalization, and feature selection have been performed for slide1
Performing pycytominer pipeline for slide3
Annotation, normalization, and feature selection have been performed for slide3


## Check example output file to confirm that the process worked

In [8]:
# Check output file
test_df = pd.read_parquet("./data/single_cell_profiles/slide1_sc_annotated.parquet")

print(test_df.shape)
test_df.head(2000)

(71962, 586)


Unnamed: 0,Metadata_CellLine,Metadata_Condition,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Nuclei_Site_Count,Nuclei_AreaShape_Area,Nuclei_AreaShape_BoundingBoxArea,Nuclei_AreaShape_BoundingBoxMaximum_X,...,Nuclei_Texture_Variance_A647_3_02_256,Nuclei_Texture_Variance_A647_3_03_256,Nuclei_Texture_Variance_DAPI_3_00_256,Nuclei_Texture_Variance_DAPI_3_01_256,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GOLD_3_00_256,Nuclei_Texture_Variance_GOLD_3_01_256,Nuclei_Texture_Variance_GOLD_3_02_256,Nuclei_Texture_Variance_GOLD_3_03_256
0,786O,NTC,1,slide1,A1,M10,25,1483.0,2550.0,53.0,...,14.731878,13.950219,2.508336,2.486800,2.539577,2.484139,2.586139,2.685200,2.649136,2.497674
1,786O,NTC,1,slide1,A1,M10,25,1378.0,1974.0,2239.0,...,50.715492,50.584647,11.958722,11.757222,11.748338,12.026326,8.242161,8.072166,8.239189,8.313167
2,786O,NTC,1,slide1,A1,M10,25,1345.0,1974.0,2073.0,...,121.489707,122.096126,50.735363,51.805624,50.608420,51.418746,28.538125,29.204032,28.407457,28.739371
3,786O,NTC,1,slide1,A1,M10,25,1403.0,5130.0,2013.0,...,0.000000,0.000000,47.224646,50.686761,46.432874,47.049561,0.000000,0.000000,0.000000,0.000000
4,786O,NTC,1,slide1,A1,M10,25,1157.0,1950.0,1805.0,...,197.081725,197.985838,116.485446,121.235388,115.894563,117.135912,40.604219,41.407578,40.214090,41.106731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,786O,ALY kd5,163,slide1,B1,M32,55,1553.0,1961.0,1686.0,...,107.584991,109.206173,37.862779,37.835819,39.500949,38.099296,63.644421,64.436695,64.814966,65.115703
1996,786O,ALY kd5,163,slide1,B1,M32,55,3095.0,5160.0,1783.0,...,91.344365,92.329751,81.684182,81.445826,81.180288,80.423571,46.176275,46.805049,46.749427,47.222665
1997,786O,ALY kd5,163,slide1,B1,M32,55,2465.0,3337.0,1871.0,...,177.873785,179.445514,90.494332,85.457200,85.681774,86.164297,83.299113,83.183737,82.601901,82.641290
1998,786O,ALY kd5,163,slide1,B1,M32,55,731.0,1008.0,1954.0,...,101.290119,102.016210,26.125288,24.188620,23.024083,23.186760,55.382516,54.539857,54.456355,55.640187
