### Import Libraries


In [1]:
import pathlib
import joblib
import importlib

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pycytominer.cyto_utils import DeepProfiler_processing

normalization_utils = importlib.import_module("normalization-utils")

### Set Load/Save Paths


In [2]:
# --- Inputs ---------------------------------------------------------------
# Folder holding the merged single-cell feature files (iterated per plate).
merged_features_save_path = pathlib.Path(
    "/media/roshankern/63af2010-c376-459e-a56e-576b170133b6/data/cell-health-nuc-merged/"
)
# index.csv from the DP inputs/metadata folder.
dp_index_path = pathlib.Path(
    "/media/roshankern/63af2010-c376-459e-a56e-576b170133b6/data/cell-health-nuc-DP/inputs/metadata/index.csv"
)
# Screen annotation manifest, located relative to this notebook.
annotations_path = pathlib.Path(
    "../0.image-download/manifest/idr0080-screenA-annotation.csv"
)

# --- Outputs --------------------------------------------------------------
# Destination folder for the normalized merged features; created if missing.
normalized_merged_features_save_path = pathlib.Path(
    "/media/roshankern/63af2010-c376-459e-a56e-576b170133b6/data/cell-health-nuc-merged-normalized/"
)
normalized_merged_features_save_path.mkdir(exist_ok=True, parents=True)

# Destination folder for the fitted per-plate scalers; created if missing.
scaler_save_dir = pathlib.Path("normalization-scalers/")
scaler_save_dir.mkdir(exist_ok=True, parents=True)

### Normalize Merged Single-Cell Data


In [3]:
# Normalize every plate found in the merged-features folder.
#
# For each plate file this cell:
#   1. derives a per-plate scaler via normalization_utils.get_normalization_scaler,
#   2. saves that scaler with joblib,
#   3. applies it to every column whose name contains "P__",
#   4. writes metadata + normalized features to the normalized output folder.

# iterdir() yields entries in arbitrary order and also returns directories.
# Keeping only regular files and sorting them makes the run deterministic and
# prevents a stray sub-directory from crashing read_csv.
merged_single_cell_plate_paths = sorted(
    path for path in merged_features_save_path.iterdir() if path.is_file()
)

# Pre-bind the preview variable: when every plate is skipped by the resume guard
# (or the input folder is empty) the last line of this cell would otherwise raise
# a NameError on a fresh kernel.
plate_merged_single_cells = None

for merged_single_cell_plate_path in merged_single_cell_plate_paths:
    # plate name is the part of the file name before the first "-"
    plate = merged_single_cell_plate_path.name.split("-")[0]

    # REMOVE LATER: resume guard that skips plates whose output already exists.
    # TODO: when switching back to compressed output, save to
    # f"{plate}-normalized-merged-single-cell.csv.gz" with compression="gzip".
    normalized_merged_plate_single_cells_save_path = (
        normalized_merged_features_save_path
        / f"{plate}-normalized-merged-single-cell.csv"
    )
    if normalized_merged_plate_single_cells_save_path.is_file():
        # report the skip explicitly instead of claiming the plate is being normalized
        print(f"Skipping plate {plate} (already normalized)...")
        continue

    print(f"Normalizing plate {plate}...")

    # load plate single-cell data (uncompressed version)
    # TODO: REMOVE LATER and read the gzip-compressed file instead, i.e.
    # pd.read_csv(merged_single_cell_plate_path, compression="gzip", engine="pyarrow")
    # low_memory is a C-parser option, so it is not passed to the pyarrow engine.
    print("Loading single-cell data...")
    plate_merged_single_cells = pd.read_csv(
        merged_single_cell_plate_path, engine="pyarrow"
    )
    # NOTE(review): the saved output contains an "Unnamed: 0" column, which looks
    # like an index written by an upstream to_csv call -- confirm whether it should
    # be dropped here or fixed where the merged files are produced.

    # create per-plate normalization scaler from the normalization population
    print("Deriving normalization scaler...")
    plate_scaler = normalization_utils.get_normalization_scaler(
        plate_merged_single_cells
    )
    # save normalization scaler
    scaler_save_path = scaler_save_dir / f"{plate}-merged-normalization-scaler.joblib"
    joblib.dump(plate_scaler, scaler_save_path)

    # split columns once: feature columns contain "P__", everything else is metadata
    print("Applying normalization scaler...")
    all_cols = plate_merged_single_cells.columns.to_list()
    feature_cols = [col for col in all_cols if "P__" in col]
    metadata_cols = [col for col in all_cols if "P__" not in col]

    # apply scaler to all single-cell feature data; reuse the source index so the
    # concat below aligns rows by construction rather than by coincidence
    features = pd.DataFrame(
        plate_scaler.transform(plate_merged_single_cells[feature_cols].values),
        columns=feature_cols,
        index=plate_merged_single_cells.index,
    )
    metadata = plate_merged_single_cells[metadata_cols]

    # combine metadata and normalized features for all single cells
    # (rebind the same name so two full-size frames are not kept in memory)
    plate_merged_single_cells = pd.concat([metadata, features], axis=1)

    # save merged single-cell data
    # (REMOVE LATER AND USE COMPRESSED VERSION, see TODO at the resume guard)
    print("Saving normalized features...")
    plate_merged_single_cells.to_csv(
        normalized_merged_plate_single_cells_save_path, index=False
    )


# preview of the last plate normalized in this run (None if all were skipped)
plate_merged_single_cells

Normalizing plate SQ00014617...
Normalizing plate SQ00014615...
Loading single-cell data...
Deriving normalization scaler...
Applying normalization scaler...
Saving normalized features...


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Site,Metadata_Well,Metadata_Plate,Metadata_Plate_Map_Name,Metadata_Reagent,CP__AreaShape_Area,CP__AreaShape_BoundingBoxArea,CP__AreaShape_BoundingBoxMaximum_X,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,1815.609682,43.198714,4,G18,SQ00014615,SQ00014615_G18_04,ARID1B-2,-0.096297,-0.163551,1.194070,...,0.321367,-0.160800,-0.154379,0.249125,-1.200569,-1.171868,-0.516436,-0.263926,0.012264,-0.539202
1,203.655413,54.296624,4,G18,SQ00014615,SQ00014615_G18_04,ARID1B-2,0.858966,0.983053,-1.459682,...,1.043064,-1.074946,0.092682,1.057359,0.097871,-0.668327,1.338019,-0.053791,-0.330383,-0.348268
2,1263.738986,75.333333,4,G18,SQ00014615,SQ00014615_G18_04,ARID1B-2,0.707941,0.414643,0.278188,...,-0.680397,-0.552156,0.595718,2.963250,-0.131678,0.868194,-0.922562,-1.832542,-1.290931,-0.434676
3,740.074125,68.358656,4,G18,SQ00014615,SQ00014615_G18_04,ARID1B-2,-1.056471,-1.092221,-0.586629,...,0.417610,-1.206015,-0.272243,1.690450,-0.757688,0.729346,0.458857,-1.727744,-0.896909,0.929340
4,286.182737,88.632970,4,G18,SQ00014615,SQ00014615_G18_04,ARID1B-2,-0.565333,-0.461544,-1.341079,...,1.053714,-1.004377,-0.407010,1.237923,-0.206471,-0.520280,1.023079,0.180764,0.223022,-0.625990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
774088,416.805636,2085.631579,9,M10,SQ00014615,SQ00014615_M10_09,ERBB3-1,-0.760560,-0.432190,-1.113756,...,0.337738,1.160665,-0.746095,-0.320377,-0.062196,-1.088852,-0.878784,-0.361299,-0.673130,1.200795
774089,776.992817,2100.031657,9,M10,SQ00014615,SQ00014615_M10_09,ERBB3-1,0.892118,0.862966,-0.522385,...,0.898036,1.961982,-0.551475,0.066605,-1.551065,0.349752,-0.035760,-1.857165,0.393916,-1.034071
774090,532.853734,2084.372594,9,M10,SQ00014615,SQ00014615_M10_09,ERBB3-1,-0.533409,-0.651903,-0.917730,...,0.685445,-0.169195,-0.955133,0.172606,-0.665295,-0.394254,-0.348886,0.034876,0.266203,-0.298906
774091,1218.812360,2098.119436,9,M10,SQ00014615,SQ00014615_M10_09,ERBB3-1,0.111209,0.134441,0.205708,...,2.097482,-0.559661,-0.497399,-1.122383,-0.271534,-1.256304,0.200224,-1.947731,0.096987,-0.204066
