# Normalize annotated single cells using negative control

## Import libraries

In [1]:
import pathlib
import pprint

import pandas as pd
from pycytominer import normalize
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# directory where combined parquet file are located
data_dir = pathlib.Path("../data/annotated_data")

# directory where the normalized parquet file is saved to
output_dir = pathlib.Path("../data/normalized_data")
output_dir.mkdir(exist_ok=True)

## Define dict of paths

In [3]:
# dictionary with each run for the cell type
dict_of_inputs = {
    # "run_20230920ChromaLiveTL_24hr4ch_MaxIP": {
    #     "annotated_file_path": pathlib.Path(
    #         f"{data_dir}/run_20230920ChromaLiveTL_24hr4ch_MaxIP_sc.parquet"
    #     ).resolve(),
    #     "output_file_path": pathlib.Path(
    #         f"{output_dir}/run_20230920ChromaLiveTL_24hr4ch_MaxIP_norm.parquet"
    #     ).resolve(),
    # },
    "run_20231017ChromaLive_6hr_4ch_MaxIP": {
        "annotated_file_path": pathlib.Path(
            f"{data_dir}/run_20231017ChromaLive_6hr_4ch_MaxIP_sc.parquet"
        ).resolve(),
        "output_file_path": pathlib.Path(
            f"{output_dir}/run_20231017ChromaLive_6hr_4ch_MaxIP_norm_pan_time_norm.parquet"
        ).resolve(),
    },
    # "run_20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP": {
    #     "annotated_file_path": pathlib.Path(
    #         f"{data_dir}/run_20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP_sc.parquet"
    #     ).resolve(),
    #     "output_file_path": pathlib.Path(
    #         f"{output_dir}/run_20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP_norm.parquet"
    #     ).resolve(),
    # },
}

## Normalize with standardize method with negative control on annotated data

The normalization needs to occur per time step. 
This code cell will split the data into time steps and normalize each time step separately.
Then each normalized time step will be concatenated back together. 

In [5]:
for info, input_path in dict_of_inputs.items():
    # read in the annotated file
    print(input_path)
    annotated_df = pd.read_parquet(input_path["annotated_file_path"])

    # Normalize the single cell data per time point

    # make the time column an integer
    annotated_df.Metadata_Time = annotated_df.Metadata_Time.astype(int)

    # get the unique time points
    time_points = annotated_df.Metadata_Time.unique()

    output_dict_of_normalized_dfs = {}

    print(f"Shape of the annotated data: {annotated_df.shape}")

    normalized_df = normalize(
        # df with annotated raw merged single cell features
        profiles=annotated_df,
        # specify samples used as normalization reference (negative control)
        samples=f"Metadata_compound == 'Staurosporine' and Metadata_dose == 0.0",
        # normalization method used
        method="standardize",
    )

    output(
        normalized_df,
        output_filename=input_path["output_file_path"],
        output_type="parquet",
    )
    print(
        f"Single cells have been normalized for PBMC cells and saved to {pathlib.Path(info).name} !"
    )
    # check to see if the features have been normalized
    print(normalized_df.shape)
    normalized_df.head()

{'annotated_file_path': PosixPath('/home/lippincm/Documents/live_cell_timelapse_apoptosis/5.process_CP_features/data/annotated_data/run_20231017ChromaLive_6hr_4ch_MaxIP_sc.parquet'), 'output_file_path': PosixPath('/home/lippincm/Documents/live_cell_timelapse_apoptosis/5.process_CP_features/data/normalized_data/run_20231017ChromaLive_6hr_4ch_MaxIP_norm_pan_time_norm.parquet')}
Shape of the annotated data: (0, 2102)


ValueError: Found array with 0 sample(s) (shape=(0, 2077)) while a minimum of 1 is required by StandardScaler.