In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# ---- paths ----
# Directory holding the scDINO CLS feature tables for this experiment.
cls_features_dir = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features"
)

# File name of the per-channel CLS feature CSV, keyed by channel.
cls_feature_file_names = {
    "channel_DNA": "channel_DNA_model_dino_deitsmall16_pretrain_full_checkpoint_features.csv",
    "channel488-1": "channel_channel488-1_model_dino_deitsmall16_pretrain_full_checkpoint_features.csv",
    "channel488-2": "channel_channel488-2_model_dino_deitsmall16_pretrain_full_checkpoint_features.csv",
    "channel561": "channel_channel561_model_dino_deitsmall16_pretrain_full_checkpoint_features.csv",
}

# input data: strict resolution raises FileNotFoundError when a file is missing
cls_path_dict = {
    channel_key: (cls_features_dir / file_name).resolve(strict=True)
    for channel_key, file_name in cls_feature_file_names.items()
}

# image paths (one crop path per row of the feature tables)
image_paths_path = (cls_features_dir / "image_paths.csv").resolve(strict=True)

# plate map
plate_map_path = pathlib.Path("../../../data/platemap_6hr_4ch.csv").resolve(strict=True)

# output path for the merged table (not strict: the file may not exist yet)
output_path = (cls_features_dir / "CLS_features_annotated.parquet").resolve()

In [3]:
# Read each channel's CLS feature table; the CSVs are read without a header row.
# The tuple below fixes the insertion order of the dictionary.
channel_load_order = ("channel488-1", "channel488-2", "channel561", "channel_DNA")
cls_df_dict = {
    f"{channel_name}_df": pd.read_csv(cls_path_dict[channel_name], header=None)
    for channel_name in channel_load_order
}

In [4]:
# image paths table: read without a header row, so the single column is named 0
image_paths_df = pd.read_csv(filepath_or_buffer=image_paths_path, header=None)
# plate map: the first row of the file supplies the column names
plate_map_df = pd.read_csv(filepath_or_buffer=plate_map_path)

In [5]:
# Sanity check: print the row counts, then the shapes, of the four channel
# tables followed by the image-path table; the plate map shape goes last.
frames_to_report = [
    cls_df_dict[frame_key]
    for frame_key in (
        "channel488-1_df",
        "channel488-2_df",
        "channel561_df",
        "channel_DNA_df",
    )
]
frames_to_report.append(image_paths_df)

print(*[len(frame) for frame in frames_to_report])
print(*[frame.shape for frame in frames_to_report])
print(plate_map_df.shape)

243970 243970 243970 243970 243970
(243970, 384) (243970, 384) (243970, 384) (243970, 384) (243970, 1)
(30, 5)


In [6]:
# Give every channel table unique, channel-prefixed column names.
# The dictionary keys look like "<channel>_df"; only that exact suffix is
# removed. (str.strip("_df") would strip any run of the characters "_", "d"
# and "f" from BOTH ends of the key, which is not a suffix removal.)
key_suffix = "_df"
for frame_key, channel_df in cls_df_dict.items():
    if frame_key.endswith(key_suffix):
        channel_name = frame_key[: -len(key_suffix)]
    else:
        channel_name = frame_key
    channel_df.columns = [
        f"{channel_name}_cls_feature_{i}" for i in range(channel_df.shape[1])
    ]

# merge the cls dataframes side by side (rows are aligned on the index)
cls_df = pd.concat(list(cls_df_dict.values()), axis=1)
cls_df.head()

Unnamed: 0,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_2,channel488-1_cls_feature_3,channel488-1_cls_feature_4,channel488-1_cls_feature_5,channel488-1_cls_feature_6,channel488-1_cls_feature_7,channel488-1_cls_feature_8,channel488-1_cls_feature_9,...,channel_DNA_cls_feature_374,channel_DNA_cls_feature_375,channel_DNA_cls_feature_376,channel_DNA_cls_feature_377,channel_DNA_cls_feature_378,channel_DNA_cls_feature_379,channel_DNA_cls_feature_380,channel_DNA_cls_feature_381,channel_DNA_cls_feature_382,channel_DNA_cls_feature_383
0,-0.013294,-0.044794,0.041722,0.014056,-0.022456,0.07386,-0.015844,-0.019516,-0.029371,-0.005611,...,0.045141,0.076995,-0.000727,-0.003429,-0.033548,0.050659,0.001172,0.065055,0.041562,0.066025
1,0.003657,-0.044746,0.033793,0.007538,0.059636,0.047369,-0.020354,0.000413,-0.027407,0.065242,...,0.048771,0.068213,-0.007675,0.012108,-0.04635,0.101172,-0.017807,0.014505,-0.007682,0.066058
2,0.013813,-0.050277,0.005833,0.000924,0.01013,0.063986,-0.044673,-0.002333,-0.022488,0.011187,...,0.054014,0.05347,0.023144,-0.011415,-0.013116,0.0559,0.004171,-0.009915,0.035614,0.051017
3,0.026417,-0.047056,0.01821,-0.012879,0.004298,0.073675,0.005814,0.027006,-0.00032,0.058339,...,0.037942,0.064386,0.006445,-0.031742,0.000301,0.108493,-0.026688,0.016617,0.010483,0.067955
4,-0.002667,-0.047412,0.017486,-0.003195,-0.024446,0.075968,-0.013097,0.020677,-0.001588,0.039774,...,0.040023,0.063539,0.005399,-0.020769,-0.026474,0.076105,-0.010477,0.013235,0.064873,0.074581


In [7]:
# give the single, header-less column of the image-path table a real name
image_paths_df.columns = pd.Index(["image_paths"])

In [8]:
# combine data
# Column assignment aligns on the index, so a mismatch between the two frames
# would silently yield NaN paths or drop paths. Fail loudly instead.
if not cls_df.index.equals(image_paths_df.index):
    raise ValueError(
        "cls_df and image_paths_df have different indexes "
        f"({len(cls_df)} vs {len(image_paths_df)} rows); "
        "cannot attach image paths to the CLS features."
    )
cls_df["Metadata_image_path"] = image_paths_df["image_paths"]
cls_df.head(1)

Unnamed: 0,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_2,channel488-1_cls_feature_3,channel488-1_cls_feature_4,channel488-1_cls_feature_5,channel488-1_cls_feature_6,channel488-1_cls_feature_7,channel488-1_cls_feature_8,channel488-1_cls_feature_9,...,channel_DNA_cls_feature_375,channel_DNA_cls_feature_376,channel_DNA_cls_feature_377,channel_DNA_cls_feature_378,channel_DNA_cls_feature_379,channel_DNA_cls_feature_380,channel_DNA_cls_feature_381,channel_DNA_cls_feature_382,channel_DNA_cls_feature_383,Metadata_image_path
0,-0.013294,-0.044794,0.041722,0.014056,-0.022456,0.07386,-0.015844,-0.019516,-0.029371,-0.005611,...,0.076995,-0.000727,-0.003429,-0.033548,0.050659,0.001172,0.065055,0.041562,0.066025,../../data/processed_images/crops/C-02/image_n...


In [9]:
# Parse metadata out of each crop's file name.
# The file name (last path component) is split on "_" and the tokens at the
# positions below become new Metadata_* columns:
# Well, FOV, Time, ImageNumber, Nuclei_Number_Object_Number
filename_token_positions = {
    "Metadata_Well": 0,
    "Metadata_FOV": 1,
    "Metadata_Time": 2,
    "Metadata_ImageNumber": 7,
    "Metadata_Nuclei_Number_Object_Number": 8,
}

# Tokenize every file name once instead of re-parsing the path per column.
filename_tokens = [
    pathlib.Path(image_path).name.split("_")
    for image_path in cls_df["Metadata_image_path"]
]

# A file name with too few tokens raises IndexError here.
for column_name, token_position in filename_token_positions.items():
    cls_df[column_name] = [tokens[token_position] for tokens in filename_tokens]

cls_df.head()

Unnamed: 0,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_2,channel488-1_cls_feature_3,channel488-1_cls_feature_4,channel488-1_cls_feature_5,channel488-1_cls_feature_6,channel488-1_cls_feature_7,channel488-1_cls_feature_8,channel488-1_cls_feature_9,...,channel_DNA_cls_feature_380,channel_DNA_cls_feature_381,channel_DNA_cls_feature_382,channel_DNA_cls_feature_383,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number
0,-0.013294,-0.044794,0.041722,0.014056,-0.022456,0.07386,-0.015844,-0.019516,-0.029371,-0.005611,...,0.001172,0.065055,0.041562,0.066025,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,10
1,0.003657,-0.044746,0.033793,0.007538,0.059636,0.047369,-0.020354,0.000413,-0.027407,0.065242,...,-0.017807,0.014505,-0.007682,0.066058,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,100
2,0.013813,-0.050277,0.005833,0.000924,0.01013,0.063986,-0.044673,-0.002333,-0.022488,0.011187,...,0.004171,-0.009915,0.035614,0.051017,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,101
3,0.026417,-0.047056,0.01821,-0.012879,0.004298,0.073675,0.005814,0.027006,-0.00032,0.058339,...,-0.026688,0.016617,0.010483,0.067955,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,102
4,-0.002667,-0.047412,0.017486,-0.003195,-0.024446,0.075968,-0.013097,0.020677,-0.001588,0.039774,...,-0.010477,0.013235,0.064873,0.074581,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,103


In [10]:
# Drop the plate column and prefix the remaining columns with "Metadata_".
# Done as one chained expression without inplace=True so the cell can be
# re-run: errors="ignore" tolerates "plate" already being gone, and rename
# skips source names that are no longer present.
plate_map_df = plate_map_df.drop(columns=["plate"], errors="ignore").rename(
    columns={
        "well": "Metadata_Well",
        "compound": "Metadata_compound",
        "dose": "Metadata_dose",
        "control": "Metadata_control",
    },
)
plate_map_df.head()

Unnamed: 0,Metadata_Well,Metadata_compound,Metadata_dose,Metadata_control
0,E-10,Staurosporine,78.13,test
1,C-06,Staurosporine,4.88,test
2,E-02,Staurosporine,0.0,negative
3,C-05,Staurosporine,2.44,test
4,C-11,Staurosporine,156.25,test


In [11]:
# merge cls_df with plate_map_df
# validate="many_to_one" makes pandas raise MergeError if a well appears more
# than once in the plate map; without it a duplicated well would silently
# duplicate every cell row from that well.
cls_df = cls_df.merge(
    plate_map_df,
    how="left",
    on="Metadata_Well",
    validate="many_to_one",
)

# move Metadata columns to the front
Metadata_cols = cls_df.columns[cls_df.columns.str.contains("Metadata")]
# NOTE: Index.difference returns the remaining (feature) columns in sorted,
# i.e. lexicographic, order (feature_0, feature_1, feature_10, ...).
feature_cols = cls_df.columns.difference(Metadata_cols)
cls_df = cls_df[Metadata_cols.tolist() + feature_cols.tolist()]
cls_df.head()

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_compound,Metadata_dose,Metadata_control,channel488-1_cls_feature_0,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,10,Staurosporine,0.0,negative,-0.013294,...,-0.026835,-0.025143,0.022814,0.044236,-0.048172,0.003977,0.005565,0.033877,0.082223,0.009103
1,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,100,Staurosporine,0.0,negative,0.003657,...,0.02401,0.002705,-0.059467,0.032855,-0.05753,0.031927,0.017482,0.051654,0.024463,-0.034733
2,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,101,Staurosporine,0.0,negative,0.013813,...,-0.007732,0.024938,0.027292,0.034904,-0.127702,-0.014732,0.033218,0.008977,0.031269,-0.031651
3,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,102,Staurosporine,0.0,negative,0.026417,...,-0.041826,-0.028302,-0.034485,0.032456,-0.054537,0.021049,0.028299,-0.006374,0.101494,-0.018018
4,../../data/processed_images/crops/C-02/image_n...,C-02,F0001,T0010,10,103,Staurosporine,0.0,negative,-0.002667,...,-0.029698,-0.017408,-0.011866,0.045469,-0.077298,0.000747,0.076386,-0.007291,0.015712,-0.011359


In [12]:
# Strip the letter markers from the field-of-view and time-point labels:
# every "F" is removed from Metadata_FOV and every "T" from Metadata_Time.
# The values remain strings.
for label_column, marker in (("Metadata_FOV", "F"), ("Metadata_Time", "T")):
    cls_df[label_column] = cls_df[label_column].str.replace(marker, "")

In [13]:
# write the annotated CLS feature table to disk as parquet
cls_df.to_parquet(path=output_path)