In [1]:
import sys
import json
import pathlib

import joblib
import pandas as pd
from pycytominer.cyto_utils.features import infer_cp_features

# project module imports
sys.path.append("../../")  # noqa
from src.utils import check_feature_order  # noqa

## Setting up file paths and parameters 

In [2]:
# Input directories.
# `resolve(strict=True)` raises FileNotFoundError immediately when an expected
# input is missing, so the notebook fails here rather than midway through.
results_dir = pathlib.Path("../../results")
data_split_dir = (results_dir / "1.data_splits/").resolve(strict=True)
jump_data_dir = pathlib.Path("../../data/JUMP_data").resolve(strict=True)
# derived from `results_dir` so every results sub-folder shares one root
modeling_dir = (results_dir / "2.modeling").resolve(strict=True)

# Input files (all must already exist).
jump_data_path = (jump_data_dir / "JUMP_all_plates_normalized_negcon.csv.gz").resolve(
    strict=True
)
multi_class_model_path = (modeling_dir / "multi_class_model.joblib").resolve(
    strict=True
)
shuffled_multi_class_model_path = (
    modeling_dir / "shuffled_multi_class_model.joblib"
).resolve(strict=True)
# path to the JSON file holding the selected feature names
feature_col_names = (data_split_dir / "feature_cols.json").resolve(strict=True)


# Output directory for this notebook; not resolved strictly because it may not
# exist yet. `parents=True` keeps the call safe if an intermediate folder is missing.
jump_analysis_dir = (results_dir / "3.jump_analysis").resolve()
jump_analysis_dir.mkdir(parents=True, exist_ok=True)

## loading files

In [3]:
# JUMP profiles normalized to the negative controls
jump_df = pd.read_csv(jump_data_path)

# selected feature names, stored as JSON
cell_injury_cp_feature_cols = json.loads(feature_col_names.read_text())

# injury label encoder / decoder mappings, stored as JSON
injury_codes_path = data_split_dir / "injury_codes.json"
injury_codes = json.loads(injury_codes_path.read_text())

# report the size of the JUMP profiles and preview the first rows
print("JUMP dataset size:", jump_df.shape)
jump_df.head()

JUMP dataset size: (19498, 5805)


Unnamed: 0,Metadata_broad_sample,Metadata_Plate,Metadata_Well,Metadata_gene,Metadata_pert_type,Metadata_control_type,Metadata_target_sequence,Metadata_negcon_control_type,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,Metadata_solvent,Metadata_InChIKey,Metadata_pert_iname,Metadata_pubchem_cid,Metadata_smiles
0,BRDN0001480888,BR00116997,A01,HIF1A,trt,,TATGTGTGAATTACGTTGTG,,1.3659,2.1598,...,-1.4483,-1.3863,-1.3688,-1.4264,-1.3239,,,,,
1,BRDN0001483495,BR00116997,A02,CATSPER4,trt,,CCGACCGTAGGACTCGTGAA,,-0.57723,-0.3286,...,-0.78766,-0.78295,-0.75735,-0.77082,-0.75818,,,,,
2,BRDN0001147364,BR00116997,A03,DDR2,trt,,CCGTGACAAACCGAGCACTG,,-0.34826,-0.077757,...,-0.61214,-0.60621,-0.58238,-0.6049,-0.5694,,,,,
3,BRDN0001490272,BR00116997,A04,OPRL1,trt,,AACGGGAACACCGACAACAG,,-0.076636,0.30289,...,0.001638,0.005891,0.029338,-0.014926,0.055581,,,,,
4,BRDN0001480510,BR00116997,A05,SLC7A11,trt,,GAAGAGATTCAAGTATTACG,,-0.28287,-0.064418,...,-0.43116,-0.40155,-0.37484,-0.42303,-0.34384,,,,,


## Feature alignment
In this section, we are identifying the shared features present in both the cell injury and JUMP datasets. 
Once these features are identified, we update the JUMP dataset to include only those features that are shared between both profiles for our machine learning application.

First we identify the CellProfiler (CP) features present in the JUMP data. 
We accomplish this with `pycytominer`'s `infer_cp_features()` function, which identifies the CP features in the JUMP dataset.

In [4]:
# Prefix that marks metadata (non-measurement) columns in the profile
metadata_prefix = "Metadata_"

# Compartment names are taken from the first "_"-delimited token of every
# non-metadata column. A set removes duplicates; sorting makes the resulting
# list deterministic from run to run.
compartments = sorted(
    {
        feature_name.split("_")[0]
        for feature_name in jump_df.columns
        if not feature_name.startswith(metadata_prefix)
    }
)

# find CP features in JUMP dataset (columns belonging to the compartments above)
jump_cp_features = infer_cp_features(jump_df, compartments=compartments)
# with `metadata=True` the metadata columns are returned instead
meta_features = infer_cp_features(jump_df, compartments=compartments, metadata=True)

# display number of features of both profiles
print("Number of Metadata Features:", len(meta_features))
print(
    "Number of CP features that cell injury has",
    len(cell_injury_cp_feature_cols["feature_cols"]),
)
print("Number of CP features that JUMP has:", len(jump_cp_features))

Number of Metadata Features: 13
Number of CP features that cell injury has 346
Number of CP features that JUMP has: 5792


Now that we have identified the features present in both datasets, the next step is to align them. This involves identifying the common features between both profiles and utilizing these features to update our JUMP dataset for our machine learning model.

In [5]:
# feature names (in training order) used by the cell injury model
cell_injury_cp_features = cell_injury_cp_feature_cols["feature_cols"]

# Shared features = cell injury features that also exist in JUMP.
# Walking the reference list (de-duplicated with `dict.fromkeys`) instead of
# converting a set intersection to a list keeps the result in the cell injury
# feature order and makes it identical on every run; the order of
# `list(set_a & set_b)` changes between runs because string hashing is randomized.
jump_cp_feature_set = set(jump_cp_features)
aligned_features = [
    injury_feat
    for injury_feat in dict.fromkeys(cell_injury_cp_features)
    if injury_feat in jump_cp_feature_set
]

# displaying number of shared features between both profiles
print("Number of shapred features of both profiles", len(aligned_features))

Number of shapred features of both profiles 207


The objective of this step is to preserve the order of the feature space.

Having identified the shared feature space across both profiles, we still need to address the cell injury features that are missing from the JUMP dataset.
To maintain the feature space order, we use the cell injury feature space as the reference order, because our multi-class model was trained on features in this specific order.

Next, we address the features that were not found within the JUMP dataset.
These features are still included in the alignment process, but their values default to 0.

Ultimately, we generate a new profile called `aligned_jump_feats_df`, which contains the correctly aligned and ordered feature space from the cell injury dataset.

In [6]:
# Build the model input so its columns follow the cell injury (reference)
# feature order:
#   * features shared with JUMP are copied from `jump_df`
#   * features missing from JUMP are added and defaulted to 0.0
# A set gives constant-time membership checks, and `reindex` adds the missing
# columns in one vectorized step instead of building a Python list per column.
aligned_feature_set = set(aligned_features)
shared_feats_in_ref_order = [
    injury_feat
    for injury_feat in cell_injury_cp_features
    if injury_feat in aligned_feature_set
]

# `fill_value` only applies to the newly introduced columns; values already
# present in the JUMP profiles (including NaN) are left untouched
aligned_jump_feats_df = (
    jump_df[shared_feats_in_ref_order]
    .reindex(columns=cell_injury_cp_features, fill_value=0.0)
    .reset_index(drop=True)
)

# sanity check: see if the feature order in the `cell_injury_cp_feature_cols` is the same with
# the newly generated aligned JUMP dataset
assert (
    cell_injury_cp_features == aligned_jump_feats_df.columns.tolist()
), "feature space are not aligned"
assert check_feature_order(
    ref_feat_order=cell_injury_cp_features,
    input_feat_order=aligned_jump_feats_df.columns.tolist(),
), "feature space do not follow the same order"

## Applying to our Multi-Class trained model

We apply the aligned JUMP dataset to our trained multi-class model and measure the probabilities of which cell injury each well possesses.

In [7]:
# loading in the multi-class model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
multi_class_cell_injury_model = joblib.load(multi_class_model_path)

In [8]:
# run the multi-class model on the aligned JUMP features
pred_proba = multi_class_cell_injury_model.predict_proba(aligned_jump_feats_df)

# wrap the probabilities in a dataframe; columns start out as positional indices
pred_proba_df = pd.DataFrame(data=pred_proba)

# translate each positional column index into its injury class name
injury_decoder = injury_codes["decoder"]
injury_class_names = []
for class_idx in pred_proba_df.columns:
    injury_class_names.append(injury_decoder[str(class_idx)])
pred_proba_df.columns = injury_class_names

# flag these rows as coming from the non-shuffled model
pred_proba_df.insert(0, "shuffled_model", False)

# report the shape and preview the first rows
print("Probability shape:", pred_proba_df.shape)
pred_proba_df.head()

Probability shape: (19498, 16)


Unnamed: 0,shuffled_model,Control,Cytoskeletal,Hsp90,Kinase,Genotoxin,Miscellaneous,Redox,HDAC,mTOR,Proteasome,Saponin,Mitochondria,Ferroptosis,Tannin,Nonspecific reactive
0,False,7.657605e-16,8.982991000000001e-43,1.0,1.0,0.9999999,1.5179260000000002e-17,1.411549e-12,1.0,0.9999998,4.057728e-10,6.991383e-20,7.475609e-13,1.0,6.89196e-51,1.0
1,False,0.9999981,6.155592e-27,1.0,1.0,7.666709999999999e-19,4.0154589999999994e-19,1.0,1.0,0.007511064,1.0,1.474707e-23,0.05653147,1.0,3.6353829999999997e-50,1.0
2,False,0.1766748,2.861212e-10,1.0,1.0,0.001031294,1.6144959999999997e-19,0.9987153,1.0,2.58865e-09,1.0,1.871569e-05,2.378226e-06,1.0,1.8258829999999997e-50,1.0
3,False,0.902123,3.37846e-23,1.0,1.0,9.020802e-09,1.667038e-10,1.0,0.999995,5.331151e-05,0.9999999,3.8767850000000004e-17,1.0,1.0,4.50233e-57,1.0
4,False,1.0,2.777067e-20,1.0,1.0,0.9999972,1.605561e-13,1.0,1.0,0.7591227,1.0,1.202547e-16,0.9818902,0.999619,1.270453e-67,1.0


## Applying to our Shuffled Multi-Class trained model

We apply the aligned JUMP dataset to our trained shuffled multi-class model and measure the probabilities of which cell injury each well possesses.

In [9]:
# loading the shuffled multi-class model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
shuffled_multi_class_cell_injury_model = joblib.load(shuffled_multi_class_model_path)

In [10]:
# run the shuffled multi-class model on the aligned JUMP features
shuffled_pred_proba = shuffled_multi_class_cell_injury_model.predict_proba(
    aligned_jump_feats_df
)

# wrap the probabilities in a dataframe; columns start out as positional indices
shuffled_pred_proba_df = pd.DataFrame(data=shuffled_pred_proba)

# translate each positional column index into its injury class name
shuffled_injury_decoder = injury_codes["decoder"]
shuffled_injury_class_names = []
for class_idx in shuffled_pred_proba_df.columns:
    shuffled_injury_class_names.append(shuffled_injury_decoder[str(class_idx)])
shuffled_pred_proba_df.columns = shuffled_injury_class_names

# flag these rows as coming from the shuffled model
shuffled_pred_proba_df.insert(0, "shuffled_model", True)

Saving all probabilities from both the shuffled and regular models.

In [11]:
# stack the regular-model rows on top of the shuffled-model rows and
# renumber the rows with a fresh 0..N-1 index
all_probas = pd.concat([pred_proba_df, shuffled_pred_proba_df], ignore_index=True)

# save the combined probabilities (the row index is written as the first,
# unnamed column of the CSV)
proba_output_path = jump_analysis_dir / "JUMP_injury_proba.csv.gz"
all_probas.to_csv(proba_output_path)

# report the shape, confirm both model labels are present, and preview the rows
print("Shape of the probabilities", all_probas.shape)
print("Unique Models", list(all_probas["shuffled_model"].unique()))
all_probas.head()

Shape of the probabilities (38996, 16)
Unique Models [False, True]


Unnamed: 0,shuffled_model,Control,Cytoskeletal,Hsp90,Kinase,Genotoxin,Miscellaneous,Redox,HDAC,mTOR,Proteasome,Saponin,Mitochondria,Ferroptosis,Tannin,Nonspecific reactive
0,False,7.657605e-16,8.982991000000001e-43,1.0,1.0,0.9999999,1.5179260000000002e-17,1.411549e-12,1.0,0.9999998,4.057728e-10,6.991383e-20,7.475609e-13,1.0,6.89196e-51,1.0
1,False,0.9999981,6.155592e-27,1.0,1.0,7.666709999999999e-19,4.0154589999999994e-19,1.0,1.0,0.007511064,1.0,1.474707e-23,0.05653147,1.0,3.6353829999999997e-50,1.0
2,False,0.1766748,2.861212e-10,1.0,1.0,0.001031294,1.6144959999999997e-19,0.9987153,1.0,2.58865e-09,1.0,1.871569e-05,2.378226e-06,1.0,1.8258829999999997e-50,1.0
3,False,0.902123,3.37846e-23,1.0,1.0,9.020802e-09,1.667038e-10,1.0,0.999995,5.331151e-05,0.9999999,3.8767850000000004e-17,1.0,1.0,4.50233e-57,1.0
4,False,1.0,2.777067e-20,1.0,1.0,0.9999972,1.605561e-13,1.0,1.0,0.7591227,1.0,1.202547e-16,0.9818902,0.999619,1.270453e-67,1.0
