## Normalize bulk profiles and merged single cells with the standardize method for each plate

## Import libraries

In [1]:
import pathlib
import yaml
import pprint

import pandas as pd
from pycytominer import normalize
from pycytominer.cyto_utils import output

## Set paths and load in dictionary from annotated run

In [2]:
# output directory for normalized data
output_dir = pathlib.Path("./data/normalized_data")
# parents=True so the cell also works when ./data has not been created yet
# (without it, mkdir raises FileNotFoundError on a missing parent directory)
output_dir.mkdir(parents=True, exist_ok=True)

# load in dictionary from yaml file (maps each plate name to a dict of file paths)
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with open(dictionary_path) as file:
    # NOTE(review): the file appears to hold only plain mappings of strings, so
    # yaml.safe_load would presumably be sufficient and is the safer loader —
    # confirm no custom tags are needed before switching
    plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary)

{'Plate_1': {'annotated_path': 'data/annotated_data/Plate_1_sc.parquet',
             'bulk_annotated_path': 'data/annotated_data/Plate_1_bulk_annotated.parquet',
             'bulk_path': 'data/aggregated_data/Plate_1_bulk.parquet',
             'dest_path': 'data/converted_data/Plate_1.parquet',
             'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',
             'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},
 'Plate_2': {'annotated_path': 'data/annotated_data/Plate_2_sc.parquet',
             'bulk_annotated_path': 'data/annotated_data/Plate_2_bulk_annotated.parquet',
             'bulk_path': 'data/aggregated_data/Plate_2_bulk.parquet',
             'dest_path': 'data/converted_data/Plate_2.parquet',
             'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',
             'source_path': 

## Normalize annotated bulk profiles from each plate

In [3]:
# normalize the annotated bulk profiles of every plate in the dictionary
for plate, info in plate_info_dictionary.items():
    # load the annotated bulk profiles for this plate
    bulk_annotated_df = pd.read_parquet(info["bulk_annotated_path"])

    # build the output path and record it in the plate's entry for downstream use
    # (`info` is the same object as plate_info_dictionary[plate])
    bulk_norm_path = pathlib.Path(f"{output_dir}/{plate}_bulk_norm.parquet")
    bulk_norm_file = str(bulk_norm_path)
    info["bulk_normalized_path"] = bulk_norm_file
    print(f"Normalizing annotated bulk profiles for {plate}!")

    # normalize the annotated bulk profiles with the "standardize" method
    bulk_normalized_df = normalize(profiles=bulk_annotated_df, method="standardize")

    # write the normalized profiles to a parquet file
    output(df=bulk_normalized_df, output_filename=bulk_norm_file, output_type="parquet")
    print(f"Bulk profiles have been normalized for {plate} and saved!")

Normalizing annotated bulk profiles for Plate_1!
Bulk profiles have been normalized for Plate_1 and saved!
Normalizing annotated bulk profiles for Plate_2!
Bulk profiles have been normalized for Plate_2 and saved!
Normalizing annotated bulk profiles for Plate_3!
Bulk profiles have been normalized for Plate_3 and saved!
Normalizing annotated bulk profiles for Plate_3_prime!
Bulk profiles have been normalized for Plate_3_prime and saved!
Normalizing annotated bulk profiles for Plate_4!
Bulk profiles have been normalized for Plate_4 and saved!


## Normalize annotated single cells from each plate

**Note:** Path to normalized data for each plate is added to the dictionary in this step to be used during feature selection.

In [4]:
# normalize the annotated merged single cells of every plate in the dictionary
for plate, info in plate_info_dictionary.items():
    # load the annotated single-cell profiles for this plate
    sc_annotated_df = pd.read_parquet(info["annotated_path"])

    # build the output path and record it in the plate's entry for downstream use
    # (`info` is the same object as plate_info_dictionary[plate])
    sc_norm_path = pathlib.Path(f"{output_dir}/{plate}_sc_norm.parquet")
    sc_norm_file = str(sc_norm_path)
    info["normalized_path"] = sc_norm_file
    print(f"Normalizing annotated merged single cells for {plate}!")

    # normalize the annotated single cells with the "standardize" method;
    # the name `normalized_df` is kept because the next cell inspects it
    normalized_df = normalize(profiles=sc_annotated_df, method="standardize")

    # write the normalized single cells to a parquet file
    output(df=normalized_df, output_filename=sc_norm_file, output_type="parquet")
    print(f"Single cells have been normalized for {plate} and saved!")

Normalizing annotated merged single cells for Plate_1!
Single cells have been normalized for Plate_1 and saved!
Normalizing annotated merged single cells for Plate_2!
Single cells have been normalized for Plate_2 and saved!
Normalizing annotated merged single cells for Plate_3!
Single cells have been normalized for Plate_3 and saved!
Normalizing annotated merged single cells for Plate_3_prime!
Single cells have been normalized for Plate_3_prime and saved!
Normalizing annotated merged single cells for Plate_4!
Single cells have been normalized for Plate_4 and saved!


In [5]:
# sanity check: show the dimensions and first rows of the most recently
# normalized data frame (the one left over from the final loop iteration above)
normalized_shape = normalized_df.shape
print(normalized_shape)
normalized_df.head()

(7502, 2321)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_siRNA,Metadata_RNAiMax,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,B,2,B2,11,115,NF1,WT,1000,,0,...,-1.015567,-0.991013,-0.427085,-0.492375,-0.505686,-0.492483,3.287944,3.324396,3.234285,3.337852
1,B,2,B2,11,115,NF1,WT,1000,,0,...,1.254595,1.181604,-0.467826,-0.486035,-0.481063,-0.457716,-0.661907,-0.668559,-0.634783,-0.644279
2,B,2,B2,11,115,NF1,WT,1000,,0,...,-0.840056,-0.88246,-0.052123,-0.097227,-0.132127,-0.097907,0.252911,0.251474,0.33791,0.296604
3,B,2,B2,11,115,NF1,WT,1000,,0,...,1.85642,1.740204,0.198178,0.191218,0.17341,0.210927,-0.340967,-0.328878,-0.17326,-0.293306
4,B,2,B2,14,115,NF1,WT,1000,,0,...,-1.399975,-1.412046,-0.597744,-0.601125,-0.628829,-0.621264,-0.491339,-0.487596,-0.440303,-0.472316


## Write updated dictionary to yaml file for use in downstream steps

In [6]:
# overwrite the original yaml file with the dictionary, which now also holds
# the normalized-data paths added in the loops above
with dictionary_path.open("w") as yaml_file:
    yaml.dump(plate_info_dictionary, yaml_file)