### Import Libraries

In [1]:
import pathlib
import joblib
import importlib

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pycytominer.cyto_utils import DeepProfiler_processing

# "normalization-utils" contains a hyphen, so it cannot be named in a plain `import`
# statement; importlib.import_module loads it from its string name instead.
normalization_utils = importlib.import_module("normalization-utils")

### Set Load/Save Paths

In [2]:
# Root of the external data directory that holds the large inputs/outputs of this step.
# Defined once so that relocating the data only requires editing this single line.
data_root = pathlib.Path(
    "/media/roshankern/63af2010-c376-459e-a56e-576b170133b6/data"
)

# paths to load merged features, index, and annotations from
merged_features_save_path = data_root / "cell-health-nuc-merged"
dp_index_path = data_root / "cell-health-nuc-DP" / "inputs" / "metadata" / "index.csv"
annotations_path = pathlib.Path("../0.image-download/manifest/idr0080-screenA-annotation.csv")

# path to save normalized merged features to (created if missing)
normalized_merged_features_save_path = data_root / "cell-health-nuc-merged-normalized"
normalized_merged_features_save_path.mkdir(parents=True, exist_ok=True)

# directory for the saved per-plate normalization scalers, relative to the
# working directory the notebook is run from (created if missing)
scaler_save_dir = pathlib.Path("normalization-scalers/")
scaler_save_dir.mkdir(parents=True, exist_ok=True)

### Normalize merged single-cell data

In [3]:
# Normalize every merged single-cell plate file found in merged_features_save_path.
# For each plate: load the merged CSV, derive a scaler with
# normalization_utils.get_normalization_scaler, save the scaler with joblib, apply it
# to the feature columns, and write metadata + normalized features to a new CSV.

# Holds the most recently normalized plate so it can be displayed after the loop.
# Starts as None so the final expression cannot raise NameError when every plate is
# skipped (already normalized) or the input directory is empty.
plate_merged_single_cells = None

# sorted() makes the processing order (and therefore the plate displayed below)
# deterministic; Path.iterdir() yields entries in arbitrary order.
for merged_single_cell_plate_path in sorted(merged_features_save_path.iterdir()):
    # get only plate name from merged features file name (text before the first "-")
    plate = merged_single_cell_plate_path.name.split("-")[0]
    print(f"Normalizing plate {plate}...")

    # TODO (REMOVE LATER): output is temporarily written as an uncompressed CSV.
    # Switch the file name to "...-normalized-merged-single-cell.csv.gz" and pass
    # compression="gzip" to to_csv once compressed files are used again.
    normalized_merged_plate_single_cells_save_path = (
        normalized_merged_features_save_path / f"{plate}-normalized-merged-single-cell.csv"
    )
    # TODO (REMOVE LATER): skip plates whose normalized output already exists
    if normalized_merged_plate_single_cells_save_path.is_file():
        continue

    # load plate single-cell data
    print("Loading single-cell data...")
    # TODO (REMOVE LATER): input is temporarily read as an uncompressed CSV; add
    # compression="gzip" once the merged files are compressed again.
    # low_memory is a C-parser option and has no effect with the pyarrow engine,
    # so it is not passed here.
    plate_merged_single_cells = pd.read_csv(merged_single_cell_plate_path, engine="pyarrow")

    # create per-plate normalization scaler from the normalization population
    # (the population is selected inside normalization_utils, not visible here)
    print("Deriving normalization scaler...")
    plate_scaler = normalization_utils.get_normalization_scaler(plate_merged_single_cells)
    # save normalization scaler
    scaler_save_path = scaler_save_dir / f"{plate}-merged-normalization-scaler.joblib"
    joblib.dump(plate_scaler, scaler_save_path)

    # apply scaler to all single cell feature data
    print("Applying normalization scaler...")
    # feature columns are those whose name contains "P__" (e.g. the CP__/DP__ columns);
    # every other column is treated as metadata and passed through unchanged
    all_cols = plate_merged_single_cells.columns.to_list()
    feature_cols = [col for col in all_cols if "P__" in col]
    metadata_cols = [col for col in all_cols if "P__" not in col]

    # Reuse the source index for the normalized features so the column-wise concat
    # below aligns row-for-row with the metadata even if the loaded frame does not
    # have a default RangeIndex.
    features = pd.DataFrame(
        plate_scaler.transform(plate_merged_single_cells[feature_cols].values),
        columns=feature_cols,
        index=plate_merged_single_cells.index,
    )
    # get metadata for all single cells
    metadata = plate_merged_single_cells[metadata_cols]

    # combine metadata and normalized features for all single cells
    # (rebind the same name so the un-normalized dataframe is not kept in memory)
    plate_merged_single_cells = pd.concat([metadata, features], axis=1)

    # save merged single-cell data
    print("Saving normalized features...")
    plate_merged_single_cells.to_csv(normalized_merged_plate_single_cells_save_path, index=False)

# display the last plate normalized in this run (nothing is shown if none was processed)
plate_merged_single_cells

Normalizing plate SQ00014617...
Loading single-cell data...
Deriving normalization scaler...
(1400, 1445)
Applying normalization scaler...
Saving normalized features...


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Cell_UUID,Metadata_Site,Metadata_Well,Metadata_Plate,Metadata_Plate_Map_Name,Metadata_Reagent,CP__AreaShape_Area,CP__AreaShape_BoundingBoxArea,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,646.773973,4.075342,7e4c9500-6a62-44c0-94a8-627101437637,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,-3.178102,-3.049616,...,2.040826,-0.928666,-0.732369,-0.716077,1.777127,-0.822307,-0.680574,-3.159894,0.336856,-0.198668
1,141.984490,42.527520,983e4aed-5bde-44e6-99f4-88a7cfbbbc44,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,0.151968,0.145733,...,3.867919,-1.381463,-0.234981,-1.026105,-0.573023,-1.037705,-0.235298,-0.493560,-0.308460,-1.453437
2,1661.244888,59.342876,3e14db6b-6bf8-4c75-a339-d5d28cbfdee2,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,0.687008,0.683293,...,-0.080554,-1.469361,-0.261997,-0.069439,-1.157915,-0.626537,-1.386309,0.070525,-1.037719,-1.085060
3,1067.377054,73.602479,d525a18f-8a14-4925-b383-2adefabb0f93,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,-1.025637,-1.074012,...,-0.128028,-0.778191,0.424923,0.676423,0.748800,-0.160183,-0.018367,-1.691272,-0.557239,0.255424
4,1305.941113,72.588061,c9c4e64b-08cf-4b44-98a3-f174cc00c3e7,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,-0.874712,-0.348168,...,0.755920,-1.329953,-0.644758,-0.819649,-1.084737,0.700078,-0.461652,-0.145646,0.073457,-0.384974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,343.307203,2017.323319,24df339f-2923-40b0-85f7-68d5159b7bbf,5,G18,SQ00014617,SQ00014617_G18_05,ARID1B-2,-0.017094,-0.090303,...,-0.865847,-1.223206,-0.057534,1.484394,-0.447836,0.111733,-0.197428,0.894394,-1.035642,-2.278883
1396,690.239186,2030.182619,5ce5e5a5-a4cd-4dd8-aa4c-0d9f01e54ba0,5,G18,SQ00014617,SQ00014617_G18_05,ARID1B-2,0.036669,-0.091667,...,-0.656927,-1.135758,0.078466,1.930767,-0.415688,2.118195,-0.328309,0.659557,-0.348843,-0.896331
1397,1191.724266,2072.115130,a4484121-0065-4c19-92b4-01acf9152b6d,5,G18,SQ00014617,SQ00014617_G18_05,ARID1B-2,-0.138871,0.021120,...,0.119796,-0.239162,-0.933527,0.416274,-0.729560,1.044537,-0.580519,-0.191460,0.753107,0.308743
1398,615.209564,2103.762131,8aacb2a3-24e3-455b-af9a-9ddc48adc219,5,G18,SQ00014617,SQ00014617_G18_05,ARID1B-2,0.411715,0.354026,...,1.077876,-1.092056,-0.632069,3.619677,-0.193044,1.935252,0.195808,0.071442,-0.657765,0.335121
