In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# ---- input locations -------------------------------------------------------
# CLS feature CSVs, one per imaging channel (kept as raw strings here and
# resolved together below; strict=True raises if a file is missing)
_raw_cls_paths = {
    "channel_DNA": "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/channel_DNA_model_dino_deitsmall16_pretrain_full_checkpoint_features.csv",
    "channel488-1": "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/channel_channel488-1_model_dino_deitsmall16_pretrain_full_checkpoint_features.csv",
    "channel488-2": "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/channel_channel488-2_model_dino_deitsmall16_pretrain_full_checkpoint_features.csv",
    "channel561": "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/channel_channel561_model_dino_deitsmall16_pretrain_full_checkpoint_features.csv",
}
cls_path_dict = {
    channel_key: pathlib.Path(raw_path).resolve(strict=True)
    for channel_key, raw_path in _raw_cls_paths.items()
}

# CSV listing the image path that belongs to each feature row
image_paths_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/image_paths.csv"
).resolve(strict=True)

# plate layout (well -> treatment annotations)
plate_map_path = pathlib.Path("../../../data/platemap_6hr_4ch.csv").resolve(strict=True)

# ---- output location -------------------------------------------------------
# not resolved strictly: the parquet file is created by this notebook
output_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
).resolve()

In [3]:
# read each channel's CLS feature table (the CSVs have no header row);
# insertion order here determines the column order after concatenation
cls_df_dict = {
    frame_key: pd.read_csv(cls_path_dict[path_key], header=None)
    for frame_key, path_key in (
        ("channel488-1_df", "channel488-1"),
        ("channel488-2_df", "channel488-2"),
        ("channel561_df", "channel561"),
        ("channel_DNA_df", "channel_DNA"),
    )
}

In [4]:
# plate map: first row is the header
plate_map_df = pd.read_csv(filepath_or_buffer=plate_map_path)
# image paths: single headerless column, one path per feature row
image_paths_df = pd.read_csv(filepath_or_buffer=image_paths_path, header=None)

In [5]:
# sanity check: every channel table and the image path table should have the
# same number of rows
frames_to_check = [
    cls_df_dict["channel488-1_df"],
    cls_df_dict["channel488-2_df"],
    cls_df_dict["channel561_df"],
    cls_df_dict["channel_DNA_df"],
    image_paths_df,
]
print(*(len(frame) for frame in frames_to_check))
print(*(frame.shape for frame in frames_to_check))
print(plate_map_df.shape)

145489 145489 145489 145489 145489
(145489, 384) (145489, 384) (145489, 384) (145489, 384) (145489, 1)
(30, 5)


In [6]:
# loop through each channel and prefix its feature columns with the channel name
frame_suffix = "_df"
for frame_key, channel_df in cls_df_dict.items():
    # Remove the trailing "_df" as a suffix. str.strip("_df") would instead strip
    # any of the characters "_", "d", "f" from BOTH ends of the key, which mangles
    # channel names that start or end with one of those characters.
    if frame_key.endswith(frame_suffix):
        channel = frame_key[: -len(frame_suffix)]
    else:
        channel = frame_key
    channel_df.columns = [
        f"{channel}_cls_feature_{i}" for i in range(channel_df.shape[1])
    ]

# merge the cls dataframes side by side (one block of columns per channel)
cls_df = pd.concat(list(cls_df_dict.values()), axis=1)
cls_df.head()

Unnamed: 0,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_2,channel488-1_cls_feature_3,channel488-1_cls_feature_4,channel488-1_cls_feature_5,channel488-1_cls_feature_6,channel488-1_cls_feature_7,channel488-1_cls_feature_8,channel488-1_cls_feature_9,...,channel_DNA_cls_feature_374,channel_DNA_cls_feature_375,channel_DNA_cls_feature_376,channel_DNA_cls_feature_377,channel_DNA_cls_feature_378,channel_DNA_cls_feature_379,channel_DNA_cls_feature_380,channel_DNA_cls_feature_381,channel_DNA_cls_feature_382,channel_DNA_cls_feature_383
0,-0.012335,-0.039824,-0.014177,0.014345,-0.018855,0.073014,-0.05681,0.001735,-0.02451,0.049549,...,0.044997,0.074432,-0.002679,-0.002545,0.04147,0.081154,0.028066,0.044984,0.006272,0.058049
1,-0.01216,-0.019531,-0.014679,0.009506,0.091985,0.056522,-0.03244,0.008592,-0.021424,0.08527,...,0.048427,0.060888,0.007161,0.002641,0.003478,0.080152,0.028939,0.006982,0.050384,0.063222
2,0.011821,-0.032649,-0.027432,-0.011297,0.013974,0.073502,-0.040491,0.042519,-0.002226,0.055581,...,0.052072,0.082642,0.004239,-0.025199,0.008767,0.040814,0.002945,0.050048,-0.010751,0.075197
3,-0.008769,-0.037554,-0.000875,0.014342,0.040138,0.049598,-0.058837,-0.015216,-0.053646,0.026459,...,0.029964,0.063882,0.001875,-0.046493,-0.001972,0.093558,-0.002234,0.061149,0.027082,0.059765
4,-0.000989,-0.098983,0.007407,0.052806,0.06231,0.040308,-0.038187,-0.046912,-0.054462,0.076807,...,0.01875,0.063096,-0.007278,0.027512,-0.017827,0.067983,-0.009539,0.007986,0.004013,0.057065


In [7]:
# give the single headerless column a descriptive name
image_paths_df.columns = pd.Index(["image_paths"])

In [8]:
# combine data
# Column assignment aligns on the index, so a row-count mismatch between the
# feature table and the image path table would silently produce NaN or dropped
# paths. Fail loudly instead.
assert len(cls_df) == len(image_paths_df), (
    f"feature rows ({len(cls_df)}) != image path rows ({len(image_paths_df)})"
)
cls_df["Metadata_image_path"] = image_paths_df["image_paths"]
cls_df.head(1)

Unnamed: 0,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_2,channel488-1_cls_feature_3,channel488-1_cls_feature_4,channel488-1_cls_feature_5,channel488-1_cls_feature_6,channel488-1_cls_feature_7,channel488-1_cls_feature_8,channel488-1_cls_feature_9,...,channel_DNA_cls_feature_375,channel_DNA_cls_feature_376,channel_DNA_cls_feature_377,channel_DNA_cls_feature_378,channel_DNA_cls_feature_379,channel_DNA_cls_feature_380,channel_DNA_cls_feature_381,channel_DNA_cls_feature_382,channel_DNA_cls_feature_383,Metadata_image_path
0,-0.012335,-0.039824,-0.014177,0.014345,-0.018855,0.073014,-0.05681,0.001735,-0.02451,0.049549,...,0.074432,-0.002679,-0.002545,0.04147,0.081154,0.028066,0.044984,0.006272,0.058049,../0.pre-process_images/data/processed_images/...


In [9]:
# derive metadata columns from the file name component of each image path
crop_file_names = cls_df["Metadata_image_path"].apply(
    lambda image_path: pathlib.Path(image_path).name
)

# fields read by position after splitting the file name on "_"
for column_name, token_position in (
    ("Metadata_Well", 0),
    ("Metadata_FOV", 1),
    ("Metadata_Time", 2),
    ("Metadata_ImageNumber", 7),
):
    cls_df[column_name] = crop_file_names.apply(
        lambda file_name, pos=token_position: file_name.split("_")[pos]
    )

# object number is the text between "cell_number_" and "_crop"
cls_df["Metadata_Nuclei_Number_Object_Number"] = crop_file_names.apply(
    lambda file_name: file_name.split("cell_number_")[1].split("_crop")[0]
)

cls_df.head()

Unnamed: 0,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_2,channel488-1_cls_feature_3,channel488-1_cls_feature_4,channel488-1_cls_feature_5,channel488-1_cls_feature_6,channel488-1_cls_feature_7,channel488-1_cls_feature_8,channel488-1_cls_feature_9,...,channel_DNA_cls_feature_380,channel_DNA_cls_feature_381,channel_DNA_cls_feature_382,channel_DNA_cls_feature_383,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number
0,-0.012335,-0.039824,-0.014177,0.014345,-0.018855,0.073014,-0.05681,0.001735,-0.02451,0.049549,...,0.028066,0.044984,0.006272,0.058049,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,102.0
1,-0.01216,-0.019531,-0.014679,0.009506,0.091985,0.056522,-0.03244,0.008592,-0.021424,0.08527,...,0.028939,0.006982,0.050384,0.063222,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,106.0
2,0.011821,-0.032649,-0.027432,-0.011297,0.013974,0.073502,-0.040491,0.042519,-0.002226,0.055581,...,0.002945,0.050048,-0.010751,0.075197,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,122.0
3,-0.008769,-0.037554,-0.000875,0.014342,0.040138,0.049598,-0.058837,-0.015216,-0.053646,0.026459,...,-0.002234,0.061149,0.027082,0.059765,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,128.0
4,-0.000989,-0.098983,0.007407,0.052806,0.06231,0.040308,-0.038187,-0.046912,-0.054462,0.076807,...,-0.009539,0.007986,0.004013,0.057065,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,13.0


In [10]:
# Drop the plate column and prefix the remaining annotations with "Metadata_".
# Written as a single non-mutating chain so the cell can be re-run safely:
# errors="ignore" skips the drop when "plate" is already gone, and rename
# ignores source names that are no longer present.
plate_map_df = plate_map_df.drop(columns=["plate"], errors="ignore").rename(
    columns={
        "well": "Metadata_Well",
        "compound": "Metadata_compound",
        "dose": "Metadata_dose",
        "control": "Metadata_control",
    },
)
plate_map_df.head()

Unnamed: 0,Metadata_Well,Metadata_compound,Metadata_dose,Metadata_control
0,E-10,Staurosporine,78.13,test
1,C-06,Staurosporine,4.88,test
2,E-02,Staurosporine,0.0,negative
3,C-05,Staurosporine,2.44,test
4,C-11,Staurosporine,156.25,test


In [11]:
# merge cls_df with plate_map_df
# validate="many_to_one" makes pandas raise a MergeError if a well appears more
# than once in the plate map, which would otherwise silently duplicate cell rows.
cls_df = cls_df.merge(
    plate_map_df, how="left", on="Metadata_Well", validate="many_to_one"
)
Metadata_cols = cls_df.columns[cls_df.columns.str.contains("Metadata")]
# move Metadata columns to the front
# NOTE(review): Index.difference returns a sorted result, so the feature columns
# end up in lexicographic order (..._feature_98, ..._feature_99 last) rather than
# numeric order; pass sort=False if the original order should be kept.
cls_df = cls_df[
    Metadata_cols.tolist() + cls_df.columns.difference(Metadata_cols).tolist()
]
print(cls_df.shape)
cls_df.head()

(145489, 1545)


Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_compound,Metadata_dose,Metadata_control,channel488-1_cls_feature_0,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,102.0,Staurosporine,0.0,negative,-0.012335,...,0.035068,0.000629,-0.034905,0.056939,-0.077406,0.033168,0.005295,0.015653,0.036244,-0.013741
1,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,106.0,Staurosporine,0.0,negative,-0.01216,...,-0.005525,-0.032507,-0.043178,0.01319,-0.073641,0.029824,0.032743,-0.017366,0.053009,-0.006046
2,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,122.0,Staurosporine,0.0,negative,0.011821,...,0.015511,0.027061,0.002058,0.039717,-0.118989,0.010754,0.00684,0.048121,0.046048,0.018789
3,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,128.0,Staurosporine,0.0,negative,-0.008769,...,-0.009289,-0.016169,-0.035088,0.037067,-0.101932,-0.030957,0.027611,0.001648,0.09071,-0.000729
4,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,13.0,Staurosporine,0.0,negative,-0.000989,...,0.026018,-0.003725,0.002089,0.032937,-0.053131,0.023977,-0.018216,0.041922,0.045374,-0.033786


In [12]:
# strip the letter marker from the FOV ("F") and Time ("T") values
for metadata_column, marker in (("Metadata_FOV", "F"), ("Metadata_Time", "T")):
    cls_df[metadata_column] = cls_df[metadata_column].str.replace(marker, "")

In [13]:
# write the annotated CLS feature table to parquet
cls_df.to_parquet(path=output_path)