## Normalize merged single cells for each plate using the standardize method

## Import libraries

In [1]:
import sys
import pathlib
import os
import yaml
import json

import pandas as pd
from pycytominer import normalize
from pycytominer.cyto_utils import output

In [2]:
# directory where the normalized parquet files will be written
output_dir = pathlib.Path("./data/normalized_data")
# create the directory (and any missing parents); no error is raised if it already exists
output_dir.mkdir(parents=True, exist_ok=True)

# load the plate info dictionary from the yaml file
# NOTE(review): yaml.FullLoader is used here; if this file only ever holds plain
# mappings/strings, yaml.safe_load would likely be sufficient -- confirm before changing
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with dictionary_path.open() as yaml_file:
    plate_info_dictionary = yaml.load(yaml_file, Loader=yaml.FullLoader)

# pretty-print the dictionary to confirm all info needed for normalization is present
print(json.dumps(plate_info_dictionary, indent=4))

{
    "Plate_1": {
        "annotated_path": "data/annotated_data/Plate_1_sc.parquet",
        "dest_path": "data/converted_data/Plate_1.parquet",
        "normalized_path": "data/normalized_data/Plate_1_sc_norm.parquet",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate1.csv",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite"
    },
    "Plate_2": {
        "annotated_path": "data/annotated_data/Plate_2_sc.parquet",
        "dest_path": "data/converted_data/Plate_2.parquet",
        "normalized_path": "data/normalized_data/Plate_2_sc_norm.parquet",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate2.csv",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite"
    },
    "Plate_3": {
        "annotated_path": "data/annotated_data/Plate_3_sc.parquet",
        "dest_path": "data/converted_data/Plate_3.parquet",
        "normalized_path": "data/normalized_data/Plate_3_sc_norm.parquet"

In [3]:
# normalize the annotated single cells of every plate listed in the dictionary
for plate, info in plate_info_dictionary.items():
    # load the annotated raw merged single cell features for this plate
    annotated_df = pd.read_parquet(info["annotated_path"])

    # build the output path for the normalized file and record it in the dictionary
    # for downstream use (`info` is the same nested dict as plate_info_dictionary[plate])
    output_file = str(output_dir / f"{plate}_sc_norm.parquet")
    info["normalized_path"] = output_file
    print(f"Normalizing annotated merged single cells for {plate}!")

    # apply the "standardize" normalization method to the annotated profiles
    normalized_df = normalize(profiles=annotated_df, method="standardize")

    # write the normalized df to a parquet file at the recorded path
    output(df=normalized_df, output_filename=output_file, output_type="parquet")
    print(f"Single cells have been normalized for {plate} and saved!")

Normalizing annotated merged single cells for Plate_1!
Single cells have been normalized for Plate_1 and saved!
Normalizing annotated merged single cells for Plate_2!
Single cells have been normalized for Plate_2 and saved!
Normalizing annotated merged single cells for Plate_3!
Single cells have been normalized for Plate_3 and saved!
Normalizing annotated merged single cells for Plate_3_prime!
Single cells have been normalized for Plate_3_prime and saved!


In [4]:
# sanity check on the df left over from the final loop iteration (the last plate processed):
# print its shape, then preview the first rows to see whether normalization appears to have occurred
normalized_shape = normalized_df.shape
print(normalized_shape)
normalized_df.head()

(14495, 1596)


Unnamed: 0,Metadata_WellRow,Metadata_Well,Metadata_number_of_singlecells,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_ImageNumber,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,B,B1,42,1,NF1,WT,500,78,1,1,...,1.25603,1.227913,0.531897,0.487729,0.49828,0.547909,-0.751284,-0.75004,-0.746242,-0.742364
1,B,B1,42,1,NF1,WT,500,81,2,2,...,-0.649737,-0.63574,-0.267391,-0.243767,-0.262017,-0.25555,-0.500182,-0.494528,-0.511217,-0.505461
2,B,B1,42,1,NF1,WT,500,82,1,1,...,1.482894,1.666617,0.783631,0.535143,0.474602,0.502886,-0.569328,-0.590753,-0.602282,-0.590615
3,B,B1,42,1,NF1,WT,500,82,2,2,...,1.999033,2.351937,0.742802,0.693465,0.661268,0.549678,-0.566042,-0.566538,-0.561955,-0.576103
4,B,B1,42,1,NF1,WT,500,83,1,1,...,-0.643806,-0.653913,0.143588,0.159361,0.118046,0.150589,-0.702865,-0.701488,-0.705585,-0.702083


## Write updated dictionary to yaml file for use in downstream steps

In [5]:
# overwrite the yaml file with the dictionary, which now carries each plate's normalized_path
with dictionary_path.open("w") as yaml_file:
    yaml.dump(plate_info_dictionary, yaml_file)