### Import Libraries

In [18]:
import pathlib
import joblib
import importlib

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pycytominer.cyto_utils import DeepProfiler_processing


### Set Load/Save Paths

In [19]:
# NOTE(review): the data locations below are absolute, machine-specific paths;
# they must be adjusted before running this notebook on another machine.

# --- inputs: merged features, DeepProfiler index, and screen annotations ---
merged_features_save_path = pathlib.Path(
    "/media/roshankern/63af2010-c376-459e-a56e-576b170133b6/data/cell-health-nuc-merged/"
)
dp_index_path = pathlib.Path(
    "/media/roshankern/63af2010-c376-459e-a56e-576b170133b6/data/cell-health-nuc-DP/inputs/metadata/index.csv"
)
annotations_path = pathlib.Path(
    "../0.image-download/manifest/idr0080-screenA-annotation.csv"
)

# --- outputs: normalized merged features and the fitted per-plate scalers ---
normalized_merged_features_save_path = pathlib.Path(
    "/media/roshankern/63af2010-c376-459e-a56e-576b170133b6/data/cell-health-nuc-merged-normalized/"
)
scaler_save_dir = pathlib.Path("normalization-scalers/")

# make sure both output directories exist (no error if they are already there)
for output_dir in (normalized_merged_features_save_path, scaler_save_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

In [20]:
# Per-plate normalization:
#   1. load the plate's merged single-cell data,
#   2. fit a StandardScaler on the plate's normalization population,
#   3. save the fitted scaler to scaler_save_dir,
#   4. apply the scaler to the feature columns of every single cell.
#
# iterdir() yields entries in arbitrary order, so the directory listing is sorted to make
# the processing order (and, while the temporary `break` below is in place, the single
# plate that gets processed) reproducible across runs.
for merged_single_cell_plate_path in sorted(merged_features_save_path.iterdir()):
    # the plate name is the part of the file name before the first "-"
    plate = merged_single_cell_plate_path.name.split("-")[0]
    print(f"Normalizing plate {plate}...")

    # load plate single-cell data
    print("Loading single-cell data...")
    # TEMPORARY (from the original note): only the first 10000 rows of an uncompressed
    # file are read here. REMOVE LATER and switch back to:
    #   pd.read_csv(merged_single_cell_plate_path, compression="gzip")
    plate_merged_single_cells = pd.read_csv(merged_single_cell_plate_path, nrows=10000)

    # feature columns are the ones containing "P__" (e.g. the CP__ and DP__ prefixes);
    # every other column is treated as metadata
    all_cols = plate_merged_single_cells.columns.to_list()
    feature_cols = [col for col in all_cols if "P__" in col]
    metadata_cols = [col for col in all_cols if "P__" not in col]

    # create per-plate normalization scaler from the normalization population
    print("Deriving normalization scaler...")
    # TEMPORARY: "ARID1B-2" is a stand-in reagent.
    # TODO (from the original note): CHANGE ARID1B-2 TO "no reagent" so that the
    # normalization population is the cells that have had no reagent applied.
    negative_control_single_cells = plate_merged_single_cells.loc[
        plate_merged_single_cells["Metadata_Reagent"] == "ARID1B-2"
    ]
    # fail early with an actionable message instead of an opaque error from the scaler fit
    if negative_control_single_cells.empty:
        raise ValueError(
            f"Plate {plate} has no cells with Metadata_Reagent == 'ARID1B-2'; "
            "cannot derive a normalization scaler."
        )
    # fit normalization scaler on the negative control feature values
    plate_scaler = StandardScaler()
    plate_scaler.fit(negative_control_single_cells[feature_cols].to_numpy())
    # save normalization scaler
    scaler_save_path = scaler_save_dir / f"{plate}-merged-normalization-scaler.joblib"
    joblib.dump(plate_scaler, scaler_save_path)

    # apply scaler to all single cell feature data
    print("Applying normalization scaler...")
    # reuse the source frame's index so the axis=1 concat below lines rows up by label
    # rather than relying on both frames happening to share a default RangeIndex
    features = pd.DataFrame(
        plate_scaler.transform(plate_merged_single_cells[feature_cols].to_numpy()),
        columns=feature_cols,
        index=plate_merged_single_cells.index,
    )
    # get metadata for all single cells
    metadata = plate_merged_single_cells[metadata_cols]

    # combine metadata and normalized features for all single cells
    # (rebinds the same name so two full dataframes are not kept in memory)
    plate_merged_single_cells = pd.concat([metadata, features], axis=1)

    # TEMPORARY: stop after the first plate while the pipeline is under development
    break


# display the normalized single cells of the last processed plate
plate_merged_single_cells

Normalizing plate SQ00014617...
Loading single-cell data...
Deriving normalization scaler...
Applying normalization scaler...


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Cell_UUID,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_Plate_Map_Name,Metadata_Reagent,CP__AreaShape_Area,CP__AreaShape_BoundingBoxArea,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,646.773973,4.075342,63b7aca5-b627-44f6-b3fc-6b44fff1e5b0,SQ00014617,4,G18,SQ00014617_G18_04,ARID1B-2,146.0,216.0,...,-0.040406,-0.043776,-0.198162,-0.100171,-0.038737,-0.119610,-0.087806,-0.246079,-0.113472,1.177017
1,141.984490,42.527520,054eb489-1013-4030-adaa-a005a5dc8e33,SQ00014617,4,G18,SQ00014617_G18_04,ARID1B-2,5287.0,7242.0,...,0.027649,-0.190173,-0.157589,-0.174300,-0.209957,-0.188459,-0.072498,-0.143237,-0.155022,0.704835
2,1661.244888,59.342876,a48e08d3-2d98-4ee3-859a-8e6c8e49cb17,SQ00014617,4,G18,SQ00014617_G18_04,ARID1B-2,6113.0,8424.0,...,-0.119423,-0.218592,-0.159793,0.054443,-0.252569,-0.057034,-0.112069,-0.121480,-0.201976,0.843459
3,1067.377054,73.602479,08ac8c53-461e-40cf-ab39-e3a612c29209,SQ00014617,4,G18,SQ00014617_G18_04,ARID1B-2,3469.0,4560.0,...,-0.121192,0.004875,-0.103759,0.232781,-0.113656,0.092030,-0.065040,-0.189433,-0.171040,1.347896
4,1305.941113,72.588061,e4cb3cbb-bde0-4b27-b3b6-acff8b48c246,SQ00014617,4,G18,SQ00014617_G18_04,ARID1B-2,3702.0,6156.0,...,-0.088266,-0.173519,-0.191016,-0.124935,-0.247238,0.367001,-0.080280,-0.129818,-0.130432,1.106908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,738.769039,842.882896,64cb3cbb-08a2-483a-aaf5-42e55d59ee92,SQ00014617,3,E4,SQ00014617_E4_03,COX5B-1,4005.0,6552.0,...,-0.123361,0.446960,-0.205098,-0.032001,-0.203061,0.225039,-0.040105,-0.076200,-0.100055,1.383069
9996,1740.961868,867.028377,2744b917-e0c5-466f-bb37-24328e73b355,SQ00014617,3,E4,SQ00014617_E4_03,COX5B-1,6766.0,10500.0,...,-0.177398,0.427478,-0.219608,-0.189091,-0.231993,-0.244524,-0.038276,-0.133600,-0.109975,1.607932
9997,693.619533,869.986577,086c85f6-2630-4a79-9068-74bf05f1f5b4,SQ00014617,3,E4,SQ00014617_E4_03,COX5B-1,4321.0,6080.0,...,-0.137096,0.440343,-0.218115,-0.132853,-0.187084,0.362652,-0.091707,-0.077344,-0.110200,1.085277
9998,1280.414865,864.946846,f51fed19-72f0-4eed-bb32-1eb3b2ace769,SQ00014617,3,E4,SQ00014617_E4_03,COX5B-1,4534.0,5760.0,...,-0.080137,0.400217,-0.154349,-0.164959,-0.211136,-0.187373,-0.051681,-0.137273,-0.183101,1.489531
