In [1]:
import itertools
import json
import pathlib

import numpy as np
import pandas as pd

In [2]:
# Input locations. strict=True makes resolve() raise if a file is missing,
# so a bad path surfaces here rather than at read time.
bulk_data_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve(strict=True)
whole_image_final_data_file_path = pathlib.Path(
    "../../data/CP_aggregated/endpoints/aggregated_whole_image.parquet"
).resolve(strict=True)
ground_truth_file_path = pathlib.Path(
    "../../1.ground_truth/data/0.ground_truth/ground_truth.csv"
).resolve(strict=True)

# Output directory for the split files; created if it does not exist yet.
data_splits_dir = pathlib.Path("../data_splits/").resolve()
data_splits_dir.mkdir(parents=True, exist_ok=True)

# Read the three input tables; dose and time are cast to float64 on load.
bulk_df = pd.read_parquet(bulk_data_file_path).astype(
    {"Metadata_dose": "float64", "Metadata_Time": "float64"}
)
ground_truth_df = pd.read_csv(ground_truth_file_path)
whole_image_final_df = pd.read_parquet(whole_image_final_data_file_path)

# Keep only the rows recorded at the last timepoint, then drop the time
# column (it holds a single value after the filter).
final_timepoint = bulk_df["Metadata_Time"].max()
bulk_df = bulk_df.loc[bulk_df["Metadata_Time"] == final_timepoint].drop(
    columns=["Metadata_Time"]
)
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Cells_AreaShape_BoundingBoxArea_CP,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_FormFactor_CP,Cells_AreaShape_MinorAxisLength_CP,Cells_AreaShape_Solidity_CP,Cells_AreaShape_Zernike_1_1_CP,Cells_AreaShape_Zernike_2_0_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
4,C-02,0.0,0.152554,-0.308668,0.294745,0.133101,0.450444,0.414189,-0.436021,0.427976,...,0.009973,-0.036503,0.473306,0.311591,-0.641266,0.173728,0.024222,0.272495,0.057495,-0.097702
17,C-03,0.61,0.151874,-0.375467,0.298199,0.231699,0.548092,0.458562,-0.351042,0.372777,...,0.153134,0.058,0.507691,0.309948,-0.690561,0.234124,0.046362,0.176689,0.104796,-0.049741
30,C-04,1.22,0.101287,-0.396605,0.34122,0.263786,0.535394,0.38546,-0.12228,0.397354,...,-0.107273,-0.065923,0.459767,0.244821,-0.565626,0.206574,0.020261,0.245567,0.168188,0.002886
43,C-05,2.44,0.060478,-0.471116,0.378293,0.38047,0.469752,0.4099,-0.187871,0.307147,...,-0.068936,-0.122155,0.382944,0.246435,-0.441174,0.262557,0.110904,0.371812,0.074964,0.02169
56,C-06,4.88,-0.017911,-0.519354,0.420167,0.459129,0.463801,0.42887,-0.12883,0.369042,...,-0.092081,0.009313,0.383891,0.215235,-0.631917,0.232691,-0.063059,0.206657,0.076344,0.021697


In [3]:
# Prepend "Terminal_" to every column of the whole image final dataframe
# except the two key columns ("Metadata_dose", "Metadata_Well"), which must
# keep their names because the later merge joins on them.
#
# The mapping is built once and applied in a single rename instead of one
# in-place rename per column. Columns that already carry the prefix are
# skipped, so re-running this cell does not produce "Terminal_Terminal_...".
terminal_prefix = "Terminal_"
merge_key_columns = {"Metadata_dose", "Metadata_Well"}
terminal_column_names = {
    col: terminal_prefix + col
    for col in whole_image_final_df.columns
    if col not in merge_key_columns and not col.startswith(terminal_prefix)
}
whole_image_final_df = whole_image_final_df.rename(columns=terminal_column_names)

In [4]:
# Report the dimensions of both tables before they are merged.
bulk_shape = bulk_df.shape
whole_image_final_shape = whole_image_final_df.shape
print("Bulk data shape: ", bulk_shape)
print("Whole image final data shape: ", whole_image_final_shape)

Bulk data shape:  (30, 2340)
Whole image final data shape:  (30, 13)


In [5]:
# Both merges below are left joins that should only add columns. Remember the
# row count so an unexpected fan-out (duplicated keys on the right-hand side)
# is caught here instead of silently inflating the train/test splits.
n_rows_before_merge = len(bulk_df)

# Attach the apoptosis ground-truth label of each dose.
bulk_df = pd.merge(
    bulk_df,
    ground_truth_df[["Metadata_dose", "apoptosis"]],
    how="left",
    on="Metadata_dose",
)
assert len(bulk_df) == n_rows_before_merge, (
    "Ground truth merge changed the number of rows; "
    "check ground_truth_df for duplicated Metadata_dose values."
)

# Move the label next to the leading columns under a "Metadata_" name.
# NOTE(review): position 3 places the label after the first feature column
# (see the rendered table) - confirm whether position 2 was intended.
gt = bulk_df.pop("apoptosis")
bulk_df.insert(3, "Metadata_apoptosis_ground_truth", gt)

# Attach the whole-image features of the same dose and well.
bulk_df = pd.merge(
    bulk_df,
    whole_image_final_df,
    how="left",
    on=["Metadata_dose", "Metadata_Well"],
)
assert len(bulk_df) == n_rows_before_merge, (
    "Whole image merge changed the number of rows; "
    "check whole_image_final_df for duplicated (Metadata_dose, Metadata_Well) pairs."
)
bulk_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Cells_AreaShape_BoundingBoxArea_CP,Metadata_apoptosis_ground_truth,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_FormFactor_CP,Cells_AreaShape_MinorAxisLength_CP,Cells_AreaShape_Solidity_CP,Cells_AreaShape_Zernike_1_1_CP,...,Terminal_Intensity_MADIntensity_AnnexinV,Terminal_Intensity_MADIntensity_DNA,Terminal_Intensity_MaxIntensity_AnnexinV,Terminal_Intensity_MaxIntensity_DNA,Terminal_Intensity_MeanIntensity_AnnexinV,Terminal_Intensity_MeanIntensity_DNA,Terminal_Intensity_StdIntensity_AnnexinV,Terminal_Intensity_StdIntensity_DNA,Terminal_Intensity_UpperQuartileIntensity_AnnexinV,Terminal_Intensity_UpperQuartileIntensity_DNA
0,C-02,0.0,0.152554,control,-0.308668,0.294745,0.133101,0.450444,0.414189,-0.436021,...,-0.480589,0.0,1.246418,0.266248,-0.921335,0.667563,-0.138017,0.144151,-1.003397,0.0
1,C-03,0.61,0.151874,negative,-0.375467,0.298199,0.231699,0.548092,0.458562,-0.351042,...,-0.151765,0.0,-0.453394,1.025694,-1.044321,0.626815,-1.099832,0.270661,-1.230912,0.901388
2,C-04,1.22,0.101287,negative,-0.396605,0.34122,0.263786,0.535394,0.38546,-0.12228,...,-0.89162,0.0,-0.672737,1.154089,-1.788135,1.011471,-1.249263,1.244758,-1.230912,1.802776
3,C-05,2.44,0.060478,negative,-0.471116,0.378293,0.38047,0.469752,0.4099,-0.187871,...,-1.138238,0.0,-0.712917,0.484793,-1.784165,-0.594062,-0.768595,0.250849,-1.685941,0.0
4,C-06,4.88,-0.017911,negative,-0.519354,0.420167,0.459129,0.463801,0.42887,-0.12883,...,-0.645001,0.0,-0.739316,0.757975,-0.777688,0.087873,-0.609561,0.862966,-1.079236,0.0


In [6]:
# Unique (dose, well) pairs present in the merged table, re-indexed from 0.
dose_wells = (
    bulk_df[["Metadata_dose", "Metadata_Well"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# there are 10 doses, with three wells each
# one randomly chosen well per dose is held out for testing;
# the remaining wells of each dose are used for training
#
# A fixed seed makes the split reproducible: without it every
# "Restart & Run All" writes a different train/test partition to disk.
split_seed = 0
rng = np.random.default_rng(split_seed)

test_wells = []
for dose in dose_wells["Metadata_dose"].unique():
    wells = dose_wells.loc[
        dose_wells["Metadata_dose"] == dose, "Metadata_Well"
    ].tolist()
    selected_well = rng.choice(wells)
    print(f"Selected well {selected_well} for dose {dose}")
    # rng.choice returns a numpy string scalar; store a plain Python str
    test_wells.append(str(selected_well))

# every well that was not selected for testing is a training well
train_wells = dose_wells.loc[
    ~dose_wells["Metadata_Well"].isin(test_wells), "Metadata_Well"
].tolist()

Selected well C-02 for dose 0.0
Selected well D-03 for dose 0.61
Selected well C-04 for dose 1.22
Selected well D-05 for dose 2.44
Selected well E-06 for dose 4.88
Selected well D-07 for dose 9.77
Selected well C-08 for dose 19.53
Selected well C-09 for dose 39.06
Selected well E-10 for dose 78.13
Selected well E-11 for dose 156.25


In [8]:
# Partition the merged table by well membership and restart each index at 0.
in_train_wells = bulk_df["Metadata_Well"].isin(train_wells)
in_test_wells = bulk_df["Metadata_Well"].isin(test_wells)
train_df = bulk_df[in_train_wells].reset_index(drop=True)
test_df = bulk_df[in_test_wells].reset_index(drop=True)

# Persist both splits as parquet files (index not written).
train_df_file_path = data_splits_dir / "train.parquet"
train_df.to_parquet(train_df_file_path, index=False)
test_df_file_path = data_splits_dir / "test.parquet"
test_df.to_parquet(test_df_file_path, index=False)

In [9]:
# Show the size of the training split, then preview its first rows.
train_shape = train_df.shape
print("Train data shape: ", train_shape)
train_df.head(5)

Train data shape:  (20, 2352)


Unnamed: 0,Metadata_Well,Metadata_dose,Cells_AreaShape_BoundingBoxArea_CP,Metadata_apoptosis_ground_truth,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_FormFactor_CP,Cells_AreaShape_MinorAxisLength_CP,Cells_AreaShape_Solidity_CP,Cells_AreaShape_Zernike_1_1_CP,...,Terminal_Intensity_MADIntensity_AnnexinV,Terminal_Intensity_MADIntensity_DNA,Terminal_Intensity_MaxIntensity_AnnexinV,Terminal_Intensity_MaxIntensity_DNA,Terminal_Intensity_MeanIntensity_AnnexinV,Terminal_Intensity_MeanIntensity_DNA,Terminal_Intensity_StdIntensity_AnnexinV,Terminal_Intensity_StdIntensity_DNA,Terminal_Intensity_UpperQuartileIntensity_AnnexinV,Terminal_Intensity_UpperQuartileIntensity_DNA
0,C-03,0.61,0.151874,negative,-0.375467,0.298199,0.231699,0.548092,0.458562,-0.351042,...,-0.151765,0.0,-0.453394,1.025694,-1.044321,0.626815,-1.099832,0.270661,-1.230912,0.901388
1,C-05,2.44,0.060478,negative,-0.471116,0.378293,0.38047,0.469752,0.4099,-0.187871,...,-1.138238,0.0,-0.712917,0.484793,-1.784165,-0.594062,-0.768595,0.250849,-1.685941,0.0
2,C-06,4.88,-0.017911,negative,-0.519354,0.420167,0.459129,0.463801,0.42887,-0.12883,...,-0.645001,0.0,-0.739316,0.757975,-0.777688,0.087873,-0.609561,0.862966,-1.079236,0.0
3,C-07,9.77,-0.371168,negative,-0.629097,0.656105,0.64796,0.227454,0.527867,-0.197815,...,-0.89162,0.0,-0.881986,-0.103914,-0.953797,0.808517,-0.614739,1.012193,-1.989294,0.0
4,C-10,78.13,-1.254695,positive,-0.853136,0.927698,1.082197,-1.138856,0.537082,-0.439466,...,-5.659572,0.0,-0.514538,1.085794,1.279305,1.296806,1.934838,6.683572,0.740881,-1.802776


In [10]:
# Show the size of the test split, then preview its first rows.
test_shape = test_df.shape
print("Test data shape: ", test_shape)
test_df.head(5)

Test data shape:  (10, 2352)


Unnamed: 0,Metadata_Well,Metadata_dose,Cells_AreaShape_BoundingBoxArea_CP,Metadata_apoptosis_ground_truth,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_FormFactor_CP,Cells_AreaShape_MinorAxisLength_CP,Cells_AreaShape_Solidity_CP,Cells_AreaShape_Zernike_1_1_CP,...,Terminal_Intensity_MADIntensity_AnnexinV,Terminal_Intensity_MADIntensity_DNA,Terminal_Intensity_MaxIntensity_AnnexinV,Terminal_Intensity_MaxIntensity_DNA,Terminal_Intensity_MeanIntensity_AnnexinV,Terminal_Intensity_MeanIntensity_DNA,Terminal_Intensity_StdIntensity_AnnexinV,Terminal_Intensity_StdIntensity_DNA,Terminal_Intensity_UpperQuartileIntensity_AnnexinV,Terminal_Intensity_UpperQuartileIntensity_DNA
0,C-02,0.0,0.152554,control,-0.308668,0.294745,0.133101,0.450444,0.414189,-0.436021,...,-0.480589,0.0,1.246418,0.266248,-0.921335,0.667563,-0.138017,0.144151,-1.003397,0.0
1,C-04,1.22,0.101287,negative,-0.396605,0.34122,0.263786,0.535394,0.38546,-0.12228,...,-0.89162,0.0,-0.672737,1.154089,-1.788135,1.011471,-1.249263,1.244758,-1.230912,1.802776
2,C-08,19.53,-0.674519,negative,-0.727575,0.791423,0.830161,-0.192977,0.53744,-0.274119,...,-2.864565,0.0,-0.360804,0.479329,-0.511432,0.465917,0.839373,1.756901,-1.685941,-0.901388
3,C-09,39.06,-1.066801,positive,-0.782627,0.761164,0.93779,-0.899838,0.501545,-0.374778,...,-3.851038,0.0,-0.610622,0.476597,0.452641,1.913038,1.087145,5.860365,-0.017501,-1.802776
4,D-03,0.61,0.05002,negative,-0.397819,0.323782,0.265641,0.489456,0.415536,-0.182578,...,-0.151765,0.0,-0.407972,0.96013,-1.015113,1.126583,-0.87666,0.803096,-0.851721,0.901388


In [None]:
# Build a lookup table that records, for every well, whether it belongs to the
# train or the test split. Only the well name and the split label are stored;
# doses are not part of this table.
test_well_df = pd.DataFrame(test_wells, columns=["Metadata_Well"]).assign(
    data_split="test"
)
train_well_df = pd.DataFrame(train_wells, columns=["Metadata_Well"]).assign(
    data_split="train"
)
# training wells first, then test wells
train_test_well_df = pd.concat([train_well_df, test_well_df], axis=0)

# Save the well-to-split table as a parquet file (index not written).
train_test_well_file_path = data_splits_dir / "train_test_wells.parquet"
train_test_well_df.to_parquet(train_test_well_file_path, index=False)