In [1]:
import pathlib

import pandas as pd
from pycytominer import normalize
from pycytominer.cyto_utils import output

In [2]:
# Input: combined single-cell profiles.
# Resolved strictly, so a missing file fails here instead of at read time.
combined_data_path = pathlib.Path(
    "../data/20231017ChromaLive_6hr_4ch_MaxIP_combined_data.parquet"
)
combined_data_path = combined_data_path.resolve(strict=True)

# Output: destination of the normalized profiles (resolved non-strictly,
# since the file does not have to exist yet).
normalized_data_output_path = pathlib.Path(
    "../data/20231017ChromaLive_6hr_4ch_MaxIP_normalized_combined_data.parquet"
)
normalized_data_output_path = normalized_data_output_path.resolve()

# Read the combined profiles, report (rows, columns), and preview the first rows.
combined_data = pd.read_parquet(combined_data_path)
print(combined_data.shape)
combined_data.head()

(240048, 3476)


Unnamed: 0,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,C-02,1,10,10,10,1,7914,Staurosporine,0.0,negative,...,-0.026835,-0.025143,0.022814,0.044236,-0.048172,0.003977,0.005565,0.033877,0.082223,0.009103
1,C-02,1,10,10,100,1,7914,Staurosporine,0.0,negative,...,0.02401,0.002705,-0.059467,0.032855,-0.05753,0.031927,0.017482,0.051654,0.024463,-0.034733
2,C-02,1,10,10,101,1,7914,Staurosporine,0.0,negative,...,-0.007732,0.024938,0.027292,0.034904,-0.127702,-0.014732,0.033218,0.008977,0.031269,-0.031651
3,C-02,1,10,10,102,1,7914,Staurosporine,0.0,negative,...,-0.041826,-0.028302,-0.034485,0.032456,-0.054537,0.021049,0.028299,-0.006374,0.101494,-0.018018
4,C-02,1,10,10,103,1,7914,Staurosporine,0.0,negative,...,-0.029698,-0.017408,-0.011866,0.045469,-0.077298,0.000747,0.076386,-0.007291,0.015712,-0.011359


In [3]:
# Any column whose name contains "TrackObjects" is renamed with a "Metadata_"
# prefix so that the "Metadata" column selection picks it up.
# Names that already start with "Metadata_" are left untouched: the renamed
# columns still contain "TrackObjects", so without this guard re-running the
# cell would produce "Metadata_Metadata_..." columns.
combined_data.columns = [
    "Metadata_" + x if "TrackObjects" in x and not x.startswith("Metadata_") else x
    for x in combined_data.columns
]

In [4]:
# Boolean mask over the column index: True where the name contains "Metadata"
is_metadata_column = combined_data.columns.str.contains("Metadata")

# Metadata columns, selected with the mask
metadata_features = combined_data.columns[is_metadata_column].tolist()

# Every column that is not metadata is treated as a feature.
# NOTE: Index.difference is a set operation, so the resulting list is not
# guaranteed to follow the original column order of `combined_data`.
feature_columns = combined_data.columns.difference(metadata_features).to_list()

In [5]:
# Normalize the single cell data per time point.
# Each time point is standardized separately, using that time point's
# Staurosporine dose-0.0 cells as the reference population.

# make the time column an integer
combined_data["Metadata_Time"] = combined_data["Metadata_Time"].astype(int)

# get the unique time points
time_points = combined_data["Metadata_Time"].unique()

# per-time-point normalized profiles, keyed by time point
output_dict_of_normalized_dfs = {}

# normalize each time point independently
for time_point in time_points:
    # subset the data to the time point
    time_point_df = combined_data.loc[combined_data["Metadata_Time"] == time_point]

    # Use a dedicated name for the per-time-point result so that `normalized_df`
    # only ever refers to the final, combined frame.
    time_point_normalized_df = normalize(
        # df with annotated raw merged single cell features
        profiles=time_point_df,
        features=feature_columns,
        meta_features=metadata_features,
        # specify samples used as normalization reference (negative control)
        samples=f"Metadata_compound == 'Staurosporine' and Metadata_dose == 0.0 and Metadata_Time == {time_point}",
        # normalization method used
        method="standardize",
    )

    output_dict_of_normalized_dfs[time_point] = time_point_normalized_df

# combine the normalized dataframes
normalized_df = pd.concat(output_dict_of_normalized_dfs.values()).reset_index(drop=True)

# Sanity check before anything is written: normalization rescales feature
# values and must neither drop nor duplicate cells.
assert normalized_df.shape[0] == combined_data.shape[0], (
    f"Row count changed during normalization: "
    f"{combined_data.shape[0]} rows in, {normalized_df.shape[0]} rows out"
)

output(
    normalized_df,
    output_filename=normalized_data_output_path,
    output_type="parquet",
)
print("Single cells have been normalized!")
# check to see if the features have been normalized
print(normalized_df.shape)
normalized_df.head()

Single cells have been normalized!
(240048, 3476)


Unnamed: 0,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,C-02,1,10,10,10,1,7914,Staurosporine,0.0,negative,...,-0.442636,-0.611063,2.300909,0.528285,1.19859,-0.012867,-0.334217,0.865129,1.109296,1.342877
1,C-02,1,10,10,100,1,7914,Staurosporine,0.0,negative,...,1.64888,0.479085,-1.315989,-0.2175,0.925141,1.166212,0.189229,1.501307,-0.877394,-1.800525
2,C-02,1,10,10,101,1,7914,Staurosporine,0.0,negative,...,0.343164,1.349414,2.497744,-0.083206,-1.125241,-0.802084,0.880478,-0.025921,-0.643299,-1.579514
3,C-02,1,10,10,102,1,7914,Staurosporine,0.0,negative,...,-1.059285,-0.73472,-0.217822,-0.243605,1.012593,0.707333,0.664375,-0.575244,1.772152,-0.601933
4,C-02,1,10,10,103,1,7914,Staurosporine,0.0,negative,...,-0.56043,-0.308253,0.776449,0.609137,0.347549,-0.149095,2.776645,-0.608087,-1.178395,-0.124372
