In [1]:
import pathlib

import joblib
import numpy as np
import pandas as pd
import pycytominer

In [2]:
# Input / output locations, written relative to the current working directory
# and resolved to absolute paths.
model_file_dir = pathlib.Path(
    "../models/multi_regression_model_ntrees_1000.joblib"
).resolve()
shuffled_model_file_dir = pathlib.Path(
    "../models/shuffled_multi_regression_model_ntrees_1000.joblib"
).resolve()
train_test_wells_path = pathlib.Path(
    "../data_splits/train_test_wells.parquet"
).resolve()

predictions_save_path = pathlib.Path(
    "../results/predicted_terminal_profiles_from_all_time_points.parquet"
).resolve()

profile_data_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve()

# strict=True makes a missing column list fail here rather than at read time.
terminal_column_names_path = pathlib.Path("../results/terminal_columns.txt").resolve(
    strict=True
)
# One column name per line; blank lines are skipped so they cannot become
# empty column names in the prediction frames built later.
terminal_column_names = [
    line.strip()
    for line in terminal_column_names_path.read_text().splitlines()
    if line.strip()
]

# Load the well -> data split table and the profiles.
data_split_df = pd.read_parquet(train_test_wells_path)
df = pd.read_parquet(profile_data_path)

# Columns are classified purely by whether their name contains "Metadata".
metadata_cols = [cols for cols in df.columns if "Metadata" in cols]
features_cols = [cols for cols in df.columns if "Metadata" not in cols]

# Median of every feature per (well, time point).
aggregate_df = pycytominer.aggregate(
    population_df=df,
    strata=["Metadata_Well", "Metadata_Time"],
    features=features_cols,
    operation="median",
)

# Re-attach one row of metadata per (well, time point) to the aggregated
# features; drop_duplicates keeps the first row seen for each pair.
metadata_df = df[metadata_cols]
metadata_df = metadata_df.drop_duplicates(subset=["Metadata_Well", "Metadata_Time"])
metadata_df = metadata_df.reset_index(drop=True)
aggregate_df = pd.merge(
    metadata_df, aggregate_df, on=["Metadata_Well", "Metadata_Time"]
)
print(aggregate_df.shape)
aggregate_df.head()

(390, 2373)


Unnamed: 0,Metadata_Well,Metadata_Time,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,C-02,0.0,1,180,Staurosporine,0.0,negative,1,1,101,...,0.208938,-0.07363,0.008897,-0.055376,0.131249,-0.053869,0.12186,0.006765,-0.151495,0.121353
1,C-02,1.0,1,178,Staurosporine,0.0,negative,2,1,104,...,-0.080875,0.047516,-0.007985,0.057872,-0.307675,0.226858,-0.097667,0.154435,0.009346,-0.100267
2,C-02,10.0,1,180,Staurosporine,0.0,negative,11,1,10,...,-0.104633,0.049896,0.15582,0.073132,-0.514341,0.12568,0.01213,0.206564,-0.001114,-0.069553
3,C-02,11.0,1,179,Staurosporine,0.0,negative,12,1,13,...,-0.087787,-0.101442,0.275884,0.209213,-0.459635,0.113541,-0.091702,0.251352,0.081938,-0.135812
4,C-02,12.0,1,180,Staurosporine,0.0,negative,13,1,10,...,0.009973,-0.036503,0.473306,0.311591,-0.641266,0.173728,0.024222,0.272495,0.057495,-0.097702


In [3]:
# Attach each well's data split label to the aggregated profiles and move the
# label to the first column.
well_to_split = data_split_df.set_index("Metadata_Well")["data_split"]
aggregate_df["Metadata_data_split"] = aggregate_df["Metadata_Well"].map(well_to_split)

# pop + insert(0, ...) relocates the freshly added column to the front
data_split = aggregate_df.pop("Metadata_data_split")
aggregate_df.insert(0, "Metadata_data_split", data_split)

# Store time points as floats
time_as_float = aggregate_df["Metadata_Time"].astype(float)
aggregate_df["Metadata_Time"] = time_as_float

# Show which split labels are present
aggregate_df["Metadata_data_split"].unique()

array(['test', 'train'], dtype=object)

In [4]:
# Preview the first 15 rows of the aggregated profiles
# (Metadata_data_split was inserted as the first column in the previous cell).
aggregate_df.head(15)

Unnamed: 0,Metadata_data_split,Metadata_Well,Metadata_Time,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,test,C-02,0.0,1,180,Staurosporine,0.0,negative,1,1,...,0.208938,-0.07363,0.008897,-0.055376,0.131249,-0.053869,0.12186,0.006765,-0.151495,0.121353
1,test,C-02,1.0,1,178,Staurosporine,0.0,negative,2,1,...,-0.080875,0.047516,-0.007985,0.057872,-0.307675,0.226858,-0.097667,0.154435,0.009346,-0.100267
2,test,C-02,10.0,1,180,Staurosporine,0.0,negative,11,1,...,-0.104633,0.049896,0.15582,0.073132,-0.514341,0.12568,0.01213,0.206564,-0.001114,-0.069553
3,test,C-02,11.0,1,179,Staurosporine,0.0,negative,12,1,...,-0.087787,-0.101442,0.275884,0.209213,-0.459635,0.113541,-0.091702,0.251352,0.081938,-0.135812
4,test,C-02,12.0,1,180,Staurosporine,0.0,negative,13,1,...,0.009973,-0.036503,0.473306,0.311591,-0.641266,0.173728,0.024222,0.272495,0.057495,-0.097702
5,test,C-02,2.0,1,176,Staurosporine,0.0,negative,3,1,...,-0.01841,0.015301,0.11341,0.051032,-0.281085,0.129891,-0.077894,0.073283,-0.030785,-0.088513
6,test,C-02,3.0,1,175,Staurosporine,0.0,negative,4,1,...,-0.030772,-0.005703,0.111358,0.015614,-0.30506,0.172338,0.021422,0.147393,-0.012494,-0.138317
7,test,C-02,4.0,1,174,Staurosporine,0.0,negative,5,1,...,-0.045352,0.037487,0.029287,-0.053088,-0.378882,0.200138,-0.19677,0.148845,0.015237,-0.109499
8,test,C-02,5.0,1,174,Staurosporine,0.0,negative,6,1,...,-0.108421,-0.00204,0.020643,0.026117,-0.411899,0.219469,-0.043483,0.195249,0.061166,-0.063724
9,test,C-02,6.0,1,175,Staurosporine,0.0,negative,7,1,...,-0.077427,0.062856,0.1838,-0.042906,-0.38928,0.131029,-0.051551,0.13684,0.06642,-0.073412


In [5]:
# Relabel rows whose split is "train" but whose time point is not 12 as
# "non_trained_pair"; every other row keeps its existing label.
# Done with a vectorized boolean mask instead of a row-wise apply(axis=1):
# no Python call per row, and an empty frame is handled without special cases.
is_train_row = aggregate_df["Metadata_data_split"] == "train"
is_not_time_12 = aggregate_df["Metadata_Time"] != 12.0
aggregate_df["Metadata_data_split"] = aggregate_df["Metadata_data_split"].mask(
    is_train_row & is_not_time_12, "non_trained_pair"
)

In [6]:
# Load the model saved at model_file_dir.
model = joblib.load(model_file_dir)

# Split the aggregated profiles into metadata and model inputs; a column is
# metadata when its name contains "Metadata_".
metadata_columns = [name for name in aggregate_df.columns if "Metadata_" in name]
metadata_df = aggregate_df[metadata_columns]
features = aggregate_df.drop(columns=metadata_columns)

# Predict the terminal feature space for every profile.
predictions = model.predict(features)
predictions_df = pd.DataFrame(predictions, columns=terminal_column_names)

# Put the metadata in front of the predictions. The metadata columns are
# reversed so the layout equals inserting each one at position 0 in turn.
predictions_df = pd.concat(
    [metadata_df[metadata_columns[::-1]], predictions_df], axis=1
)
predictions_df["shuffled"] = False

In [7]:
# Load the model saved at shuffled_model_file_dir.
shuffled_model = joblib.load(shuffled_model_file_dir)

metadata_columns = [x for x in aggregate_df.columns if "Metadata_" in x]

# Permute every column independently to break the row-wise relationship
# between values. A seeded Generator makes the shuffled baseline identical on
# every Restart & Run All (the unseeded global RNG gave a new result each run).
SHUFFLE_SEED = 0
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)
shuffled_profile_df = aggregate_df.copy()
for col in shuffled_profile_df.columns:
    shuffled_profile_df[col] = shuffle_rng.permutation(
        shuffled_profile_df[col].to_numpy()
    )

# Model inputs come from the shuffled frame; the metadata attached to the
# predictions comes from the unshuffled frame.
features = shuffled_profile_df.drop(columns=metadata_columns)
metadata_df = aggregate_df[metadata_columns]

# Predict the terminal feature space from the shuffled profiles.
predictions = shuffled_model.predict(features)
shuffled_predictions_df = pd.DataFrame(predictions, columns=terminal_column_names)

# Put the metadata in front of the predictions in a single concat. The
# metadata columns are reversed so the layout equals inserting each one at
# position 0 in turn.
shuffled_predictions_df = pd.concat(
    [metadata_df[metadata_columns[::-1]], shuffled_predictions_df], axis=1
)
shuffled_predictions_df["shuffled"] = True

In [8]:
# Stack the real and shuffled predictions; the "shuffled" column tells them apart.
final_predictions_df = pd.concat([predictions_df, shuffled_predictions_df], axis=0)

# Make sure the output directory exists so the write cannot fail on a fresh checkout.
predictions_save_path.parent.mkdir(parents=True, exist_ok=True)

# Save the predictions (index=False: the row index is not written to the file).
final_predictions_df.to_parquet(predictions_save_path, index=False)
final_predictions_df.head()

Unnamed: 0,Metadata_image_path,Metadata_distance,Metadata_coordinates_y,Metadata_parent_id,Metadata_parent_track_id,Metadata_id,Metadata_x,Metadata_y,Metadata_t,Metadata_track_id,...,Terminal_Intensity_MADIntensity_DNA,Terminal_Intensity_MaxIntensity_AnnexinV,Terminal_Intensity_MaxIntensity_DNA,Terminal_Intensity_MeanIntensity_AnnexinV,Terminal_Intensity_MeanIntensity_DNA,Terminal_Intensity_StdIntensity_AnnexinV,Terminal_Intensity_StdIntensity_DNA,Terminal_Intensity_UpperQuartileIntensity_AnnexinV,Terminal_Intensity_UpperQuartileIntensity_DNA,shuffled
0,../0.pre-process_images/data/processed_images/...,0.303702,"[1583.0, 1079.0]",-1.0,-1,1000096.0,1583.0,1079.0,0.0,17,...,8.010986e-07,-0.451732,0.094681,0.544503,0.581019,0.299668,1.180284,-0.100999,-0.453398,False
1,../0.pre-process_images/data/processed_images/...,0.333772,"[882.0, 1105.0]",-1.0,-1,2000099.0,882.0,1105.0,1.0,99,...,5.493248e-07,-0.418905,0.180874,0.010006,0.356017,0.15383,0.664223,-0.304624,-0.322697,False
2,../0.pre-process_images/data/processed_images/...,0.183682,"[1523.0, 59.0]",10000005.0,-1,11000009.0,1523.0,59.0,10.0,2,...,7.01915e-07,-0.450462,0.162781,-0.080458,0.306073,-0.254199,0.384115,-0.513559,0.048675,False
3,../0.pre-process_images/data/processed_images/...,0.528709,"[372.0, 88.0]",11000013.0,-1,12000009.0,372.0,88.0,11.0,165,...,6.027314e-07,-0.4541,0.146417,-0.190411,0.305931,-0.342674,0.38396,-0.638237,0.100054,False
4,../0.pre-process_images/data/processed_images/...,0.539604,"[374.0, 76.0]",12000009.0,-1,13000008.0,374.0,76.0,12.0,165,...,5.722133e-07,-0.464113,0.255976,-0.129145,0.113562,-0.488816,0.312721,-0.282783,0.112673,False
