In [1]:
import pathlib

import joblib
import numpy as np
import pandas as pd

In [2]:
# load the training data
profile_file_dir = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve(strict=True)
model_file_dir = pathlib.Path("../models/multi_regression_model.joblib").resolve(
    strict=True
)
shuffled_model_file_dir = pathlib.Path(
    "../models/shuffled_multi_regression_model.joblib"
).resolve(strict=True)
terminal_column_names = pathlib.Path("../results/terminal_columns.txt").resolve(
    strict=True
)
predictions_save_path = pathlib.Path(
    "../results/predicted_terminal_profiles.parquet"
).resolve()
terminal_column_names = [
    line.strip() for line in terminal_column_names.read_text().splitlines()
]
results_dir = pathlib.Path("../results/").resolve()
results_dir.mkdir(parents=True, exist_ok=True)
profile_df = pd.read_parquet(profile_file_dir)
print(profile_df.shape)
profile_df.head()

(390, 2341)


Unnamed: 0,Metadata_Well,Metadata_Time,Metadata_dose,Cells_AreaShape_BoundingBoxArea_CP,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_FormFactor_CP,Cells_AreaShape_MinorAxisLength_CP,Cells_AreaShape_Solidity_CP,Cells_AreaShape_Zernike_1_1_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,C-02,0.0,0.0,-0.29176,-0.312209,0.311346,0.138225,-0.253509,0.408095,-0.049747,...,0.208938,-0.07363,0.008897,-0.055376,0.131249,-0.053869,0.12186,0.006765,-0.151495,0.121353
1,C-02,1.0,0.0,-0.028198,-0.394414,0.313888,0.26044,0.312064,0.460261,-0.253127,...,-0.080875,0.047516,-0.007985,0.057872,-0.307675,0.226858,-0.097667,0.154435,0.009346,-0.100267
2,C-02,10.0,0.0,0.182481,-0.209134,0.13744,-0.00637,0.309906,0.357322,-0.434222,...,-0.104633,0.049896,0.15582,0.073132,-0.514341,0.12568,0.01213,0.206564,-0.001114,-0.069553
3,C-02,11.0,0.0,0.141672,-0.285466,0.255858,0.099816,0.399589,0.385191,-0.438585,...,-0.087787,-0.101442,0.275884,0.209213,-0.459635,0.113541,-0.091702,0.251352,0.081938,-0.135812
4,C-02,12.0,0.0,0.152554,-0.308668,0.294745,0.133101,0.450444,0.414189,-0.436021,...,0.009973,-0.036503,0.473306,0.311591,-0.641266,0.173728,0.024222,0.272495,0.057495,-0.097702


## Get the non-shuffled predictions

In [3]:
# load the model
model = joblib.load(model_file_dir)
shuffled_model = joblib.load(shuffled_model_file_dir)

metadata_columns = [x for x in profile_df.columns if "Metadata_" in x]
# remove metadata columns
features = profile_df.drop(columns=metadata_columns)
metadata_df = profile_df[metadata_columns]
# predict the terminal feature space
predictions = model.predict(features)
predictions_df = pd.DataFrame(predictions, columns=terminal_column_names)
# insert the metadata columns
for col in metadata_columns:
    predictions_df.insert(0, col, metadata_df[col])
predictions_df["shuffled"] = False

## Get the shuffled predictions

In [4]:
# load the model
shuffled_model = joblib.load(shuffled_model_file_dir)

metadata_columns = [x for x in profile_df.columns if "Metadata_" in x]
shuffled_profile_df = profile_df.copy()
for col in shuffled_profile_df.columns:
    shuffled_profile_df[col] = np.random.permutation(shuffled_profile_df[col])
# remove metadata columns
features = shuffled_profile_df.drop(columns=metadata_columns)
metadata_df = profile_df[metadata_columns]


# predict the terminal feature space
predictions = shuffled_model.predict(features)
shuffled_predictions_df = pd.DataFrame(predictions, columns=terminal_column_names)
# insert the metadata columns
for col in metadata_columns:
    shuffled_predictions_df.insert(0, col, metadata_df[col])
shuffled_predictions_df["shuffled"] = True

In [5]:
final_predictions_df = pd.concat([predictions_df, shuffled_predictions_df], axis=0)
# save the predictions
final_predictions_df.to_parquet(predictions_save_path, index=False)
final_predictions_df

Unnamed: 0,Metadata_dose,Metadata_Time,Metadata_Well,Terminal_Intensity_LowerQuartileIntensity_AnnexinV,Terminal_Intensity_MADIntensity_AnnexinV,Terminal_Intensity_MADIntensity_DNA,Terminal_Intensity_MaxIntensity_AnnexinV,Terminal_Intensity_MaxIntensity_DNA,Terminal_Intensity_MeanIntensity_AnnexinV,Terminal_Intensity_MeanIntensity_DNA,Terminal_Intensity_StdIntensity_AnnexinV,Terminal_Intensity_StdIntensity_DNA,Terminal_Intensity_UpperQuartileIntensity_AnnexinV,Terminal_Intensity_UpperQuartileIntensity_DNA,shuffled
0,0.0,0.0,C-02,0.030151,-0.636370,8.010986e-07,-0.451732,0.094681,0.544503,0.581019,0.299668,1.180284,-0.100999,-0.453398,False
1,0.0,1.0,C-02,0.099994,-0.417208,5.493248e-07,-0.418905,0.180874,0.010006,0.356017,0.153830,0.664223,-0.304624,-0.322697,False
2,0.0,10.0,C-02,0.273039,-0.256578,7.019150e-07,-0.450462,0.162781,-0.080458,0.306073,-0.254199,0.384115,-0.513559,0.048675,False
3,0.0,11.0,C-02,0.286799,-0.219585,6.027314e-07,-0.454100,0.146417,-0.190411,0.305931,-0.342674,0.383960,-0.638237,0.100054,False
4,0.0,12.0,C-02,0.201319,0.001138,5.722133e-07,-0.464113,0.255976,-0.129145,0.113562,-0.488816,0.312721,-0.282783,0.112673,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,156.25,5.0,E-11,-0.095568,-2.315429,5.416953e-07,-0.475433,0.304448,0.417923,0.578268,0.186894,3.516998,0.037406,-0.558860,True
386,156.25,6.0,E-11,-0.088479,-1.180410,6.103609e-07,-0.533272,0.382523,0.398987,0.670454,0.484910,3.228519,-0.209068,-0.411934,True
387,156.25,7.0,E-11,-0.087853,-2.225660,6.713970e-07,-0.542207,0.321930,0.535170,0.816470,0.265230,2.577458,0.036496,-0.552551,True
388,156.25,8.0,E-11,-0.030936,-2.281724,7.934691e-07,-0.537734,0.329312,0.474940,0.520858,0.497283,2.393843,-0.260562,-0.555255,True
