In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# ---- path configuration ----
# Directory that holds the per-channel CLS feature CSVs and the image path list
cls_features_dir = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features"
)

# The channel part of each feature file name, keyed by the name used in this notebook.
# Only the DNA channel uses a key that differs from its file-name token.
channel_file_tokens = {
    "channel_DNA": "DNA",
    "channel488-1": "channel488-1",
    "channel488-2": "channel488-2",
    "channel561": "channel561",
}

# input data: one feature CSV per channel; strict resolution raises if a file is missing
cls_path_dict = {
    channel: (
        cls_features_dir
        / f"channel_{token}_model_dino_deitsmall16_pretrain_full_checkpoint_features.csv"
    ).resolve(strict=True)
    for channel, token in channel_file_tokens.items()
}

# image paths (must exist)
image_paths_path = (cls_features_dir / "image_paths.csv").resolve(strict=True)

# plate map (must exist)
plate_map_path = pathlib.Path("../../../data/platemap_6hr_4ch.csv").resolve(strict=True)

# output path for the merged table (not required to exist yet, so no strict check)
output_path = (cls_features_dir / "CLS_features_annotated.parquet").resolve()

In [3]:
# Read each channel's feature CSV (the files have no header row) into a dict keyed
# "<channel>_df". The channel order is spelled out because the dict's insertion
# order determines the column order when the frames are concatenated later.
cls_df_dict = {
    f"{channel}_df": pd.read_csv(cls_path_dict[channel], header=None)
    for channel in ("channel488-1", "channel488-2", "channel561", "channel_DNA")
}

In [4]:
# plate map: this CSV is read with its own header row
plate_map_df = pd.read_csv(plate_map_path)
# image path list: read without a header row, so its column is unnamed at this point
image_paths_df = pd.read_csv(image_paths_path, header=None)

In [5]:
# Sanity check: the four channel tables and the image path list are combined
# row-by-row later on, so their row counts should agree.
frames_to_compare = [
    cls_df_dict["channel488-1_df"],
    cls_df_dict["channel488-2_df"],
    cls_df_dict["channel561_df"],
    cls_df_dict["channel_DNA_df"],
    image_paths_df,
]
# row counts on one line, full shapes on the next
print(*(len(frame) for frame in frames_to_compare))
print(*(frame.shape for frame in frames_to_compare))
print(plate_map_df.shape)

182804 182804 182804 182804 182804
(182804, 384) (182804, 384) (182804, 384) (182804, 384) (182804, 1)
(30, 5)


In [6]:
# Prefix every feature column with its channel name so the columns stay unique
# once the per-channel tables are placed side by side.
key_suffix = "_df"
for dict_key, channel_df in cls_df_dict.items():
    # Remove the literal "_df" suffix from the dict key. str.strip("_df") is not a
    # suffix removal: it strips any run of the characters "_", "d" and "f" from BOTH
    # ends, which would corrupt a channel name that starts or ends with one of them.
    channel = (
        dict_key[: -len(key_suffix)] if dict_key.endswith(key_suffix) else dict_key
    )
    channel_df.columns = [
        f"{channel}_cls_feature_{i}" for i in range(channel_df.shape[1])
    ]

# merge the cls dataframes column-wise (axis=1 aligns rows on the index)
cls_df = pd.concat(list(cls_df_dict.values()), axis=1)
cls_df.head()

Unnamed: 0,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_2,channel488-1_cls_feature_3,channel488-1_cls_feature_4,channel488-1_cls_feature_5,channel488-1_cls_feature_6,channel488-1_cls_feature_7,channel488-1_cls_feature_8,channel488-1_cls_feature_9,...,channel_DNA_cls_feature_374,channel_DNA_cls_feature_375,channel_DNA_cls_feature_376,channel_DNA_cls_feature_377,channel_DNA_cls_feature_378,channel_DNA_cls_feature_379,channel_DNA_cls_feature_380,channel_DNA_cls_feature_381,channel_DNA_cls_feature_382,channel_DNA_cls_feature_383
0,-0.012062,-0.043927,-0.018934,0.016534,-0.015077,0.074192,-0.052186,0.012729,-0.016323,0.050879,...,0.043387,0.076849,-0.004747,-0.003402,0.040119,0.081051,0.023229,0.045192,0.009245,0.058797
1,-0.014538,-0.018479,0.013503,0.00215,0.080237,0.072019,-0.018354,0.019143,-0.009941,0.076986,...,0.055838,0.06367,0.013152,0.010394,0.011247,0.081992,0.015077,0.005999,0.059859,0.06179
2,-0.010044,-0.093711,0.022481,0.044029,0.093712,0.036702,-0.030992,-0.060077,-0.077113,0.065323,...,0.028506,0.075572,-0.006176,0.014053,-0.004615,0.042724,-0.007382,0.042903,0.008142,0.053775
3,0.021043,-0.030516,-0.012435,-0.015096,0.027948,0.065034,-0.02856,0.043659,-0.002623,0.037215,...,0.05134,0.083915,0.008224,-0.02435,0.00693,0.041619,0.001577,0.049175,-0.012604,0.075224
4,-0.009109,-0.043035,-0.004597,0.01229,0.045244,0.051989,-0.058144,-0.008549,-0.050373,0.031558,...,0.030677,0.064902,0.003703,-0.047008,-0.001545,0.094221,-0.003457,0.060076,0.026861,0.058533


In [7]:
# Label the single column of the image path table; the frame is rebound to the
# relabelled result instead of having its .columns attribute overwritten.
image_paths_df = image_paths_df.set_axis(["image_paths"], axis="columns")

In [8]:
# Add the image path as a metadata column; pandas matches the values to the
# feature rows by index label.
cls_df = cls_df.assign(Metadata_image_path=image_paths_df["image_paths"])
cls_df.head(1)

Unnamed: 0,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_2,channel488-1_cls_feature_3,channel488-1_cls_feature_4,channel488-1_cls_feature_5,channel488-1_cls_feature_6,channel488-1_cls_feature_7,channel488-1_cls_feature_8,channel488-1_cls_feature_9,...,channel_DNA_cls_feature_375,channel_DNA_cls_feature_376,channel_DNA_cls_feature_377,channel_DNA_cls_feature_378,channel_DNA_cls_feature_379,channel_DNA_cls_feature_380,channel_DNA_cls_feature_381,channel_DNA_cls_feature_382,channel_DNA_cls_feature_383,Metadata_image_path
0,-0.012062,-0.043927,-0.018934,0.016534,-0.015077,0.074192,-0.052186,0.012729,-0.016323,0.050879,...,0.076849,-0.004747,-0.003402,0.040119,0.081051,0.023229,0.045192,0.009245,0.058797,../0.pre-process_images/data/processed_images/...


In [9]:
# Derive metadata columns from the image file name: the name (last path component)
# is split on "_" and selected tokens are stored as columns.
#
# Each file name is parsed exactly once here; the columns are then filled by
# indexing into the cached token lists instead of re-parsing every path per column.
#
# NOTE(review): in the displayed output Metadata_Nuclei_Number_Object_Number is the
# literal string "cell" for every row shown, so token 8 is probably not the object
# number. Confirm the token positions against an actual file name before relying
# on that column.
name_tokens = [
    pathlib.Path(image_path).name.split("_")
    for image_path in cls_df["Metadata_image_path"]
]

# output column -> position of its token in the "_"-separated file name
token_index_by_column = {
    "Metadata_Well": 0,
    "Metadata_FOV": 1,
    "Metadata_Time": 2,
    "Metadata_ImageNumber": 7,
    "Metadata_Nuclei_Number_Object_Number": 8,
}

for column_name, token_index in token_index_by_column.items():
    # Plain indexing (not .str[...]) so a file name with too few tokens still
    # raises IndexError instead of silently producing missing values.
    cls_df[column_name] = [tokens[token_index] for tokens in name_tokens]

cls_df.head()

Unnamed: 0,channel488-1_cls_feature_0,channel488-1_cls_feature_1,channel488-1_cls_feature_2,channel488-1_cls_feature_3,channel488-1_cls_feature_4,channel488-1_cls_feature_5,channel488-1_cls_feature_6,channel488-1_cls_feature_7,channel488-1_cls_feature_8,channel488-1_cls_feature_9,...,channel_DNA_cls_feature_380,channel_DNA_cls_feature_381,channel_DNA_cls_feature_382,channel_DNA_cls_feature_383,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number
0,-0.012062,-0.043927,-0.018934,0.016534,-0.015077,0.074192,-0.052186,0.012729,-0.016323,0.050879,...,0.023229,0.045192,0.009245,0.058797,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell
1,-0.014538,-0.018479,0.013503,0.00215,0.080237,0.072019,-0.018354,0.019143,-0.009941,0.076986,...,0.015077,0.005999,0.059859,0.06179,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell
2,-0.010044,-0.093711,0.022481,0.044029,0.093712,0.036702,-0.030992,-0.060077,-0.077113,0.065323,...,-0.007382,0.042903,0.008142,0.053775,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell
3,0.021043,-0.030516,-0.012435,-0.015096,0.027948,0.065034,-0.02856,0.043659,-0.002623,0.037215,...,0.001577,0.049175,-0.012604,0.075224,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell
4,-0.009109,-0.043035,-0.004597,0.01229,0.045244,0.051989,-0.058144,-0.008549,-0.050373,0.031558,...,-0.003457,0.060076,0.026861,0.058533,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell


In [10]:
# Drop the plate column and give the remaining annotation columns a "Metadata_"
# prefix so they match the naming of the columns in cls_df.
#
# The frame is rebound to a new object instead of being mutated with inplace=True,
# and an already-missing "plate" column is tolerated (errors="ignore"), so this
# cell can be re-run without raising a KeyError. rename() leaves columns that are
# not listed in the mapping untouched.
plate_map_df = plate_map_df.drop(columns=["plate"], errors="ignore").rename(
    columns={
        "well": "Metadata_Well",
        "compound": "Metadata_compound",
        "dose": "Metadata_dose",
        "control": "Metadata_control",
    },
)
plate_map_df.head()

Unnamed: 0,Metadata_Well,Metadata_compound,Metadata_dose,Metadata_control
0,E-10,Staurosporine,78.13,test
1,C-06,Staurosporine,4.88,test
2,E-02,Staurosporine,0.0,negative
3,C-05,Staurosporine,2.44,test
4,C-11,Staurosporine,156.25,test


In [11]:
# merge cls_df with plate_map_df: every row receives the annotations of its well.
# how="left" keeps all rows of cls_df; a well that is absent from the plate map
# gets missing values in the annotation columns.
# validate="many_to_one" makes pandas raise a MergeError if a well occurs more than
# once in the plate map, instead of silently duplicating rows.
cls_df = cls_df.merge(
    plate_map_df,
    how="left",
    on="Metadata_Well",
    validate="many_to_one",
)
Metadata_cols = cls_df.columns[cls_df.columns.str.contains("Metadata")]
# move Metadata columns to the front
# Note: Index.difference returns the remaining (feature) columns sorted by name,
# so they end up in lexicographic order rather than their original order.
cls_df = cls_df[
    Metadata_cols.tolist() + cls_df.columns.difference(Metadata_cols).tolist()
]
print(cls_df.shape)
cls_df.head()

(182804, 1545)


Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_compound,Metadata_dose,Metadata_control,channel488-1_cls_feature_0,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell,Staurosporine,0.0,negative,-0.012062,...,0.034462,-0.005022,-0.032672,0.056519,-0.081751,0.035719,0.00865,0.017762,0.038345,-0.015309
1,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell,Staurosporine,0.0,negative,-0.014538,...,0.008132,-0.048781,-0.029867,0.013783,-0.089631,0.029462,0.013753,-0.041825,0.045902,-0.025322
2,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell,Staurosporine,0.0,negative,-0.010044,...,0.016777,-0.02297,-0.012937,0.031086,-0.063206,0.017341,0.006725,0.058906,0.051218,-0.021713
3,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell,Staurosporine,0.0,negative,0.021043,...,0.016239,0.026569,9.4e-05,0.040027,-0.120004,0.01214,0.008501,0.04885,0.048612,0.017343
4,../0.pre-process_images/data/processed_images/...,C-02,F0001,T0001,1,cell,Staurosporine,0.0,negative,-0.009109,...,-0.009832,-0.015505,-0.03555,0.037506,-0.102835,-0.030543,0.026912,-0.000418,0.092182,-0.001539


In [12]:
# Remove the letter marker from the FOV and Time values, e.g. "F0001" -> "0001"
# and "T0001" -> "0001". Every occurrence of the letter is removed and the
# values remain strings.
for column_name, marker_letter in (("Metadata_FOV", "F"), ("Metadata_Time", "T")):
    cls_df[column_name] = cls_df[column_name].str.replace(marker_letter, "")

In [13]:
# Persist the annotated feature table as a parquet file at output_path
cls_df.to_parquet(path=output_path)