In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# set paths
# input data
cls_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/test_run/CLS_features/channel_DNA_channel488-1_channel488-2_channel561_blank_model_sc-ViT_checkpoint0100_vitsmall16_features.csv"
).resolve(strict=True)
image_paths_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/test_run/CLS_features/image_paths.csv"
).resolve(strict=True)

# plate map
plate_map_path = pathlib.Path("../../data/platemap_6hr_4ch.csv").resolve(strict=True)

# output path for the merged table
output_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/test_run/CLS_features/CLS_features_annotated.parquet"
).resolve()

In [3]:
# load data
cls_df = pd.read_csv(cls_path, header=None)
image_paths_df = pd.read_csv(image_paths_path, header=None)
plate_map_df = pd.read_csv(plate_map_path)

In [4]:
print(len(cls_df), len(image_paths_df))

193146 193146


In [5]:
# rename each column to have cls_ prefix
cls_df.columns = [f"cls_{i}" for i in range(cls_df.shape[1])]
# rename the columns
image_paths_df.columns = ["image_paths"]

In [6]:
# combine data
cls_df["Metadata_image_path"] = image_paths_df["image_paths"]
cls_df.head(1)

Unnamed: 0,cls_0,cls_1,cls_2,cls_3,cls_4,cls_5,cls_6,cls_7,cls_8,cls_9,...,cls_375,cls_376,cls_377,cls_378,cls_379,cls_380,cls_381,cls_382,cls_383,Metadata_image_path
0,0.031278,-0.054149,0.107055,0.002328,0.034356,-0.048974,0.037127,-0.014773,0.026619,0.037709,...,-0.174261,-0.020648,0.015223,0.041441,-0.055121,0.038324,-0.046185,0.089597,0.006296,/home/lippincm/Documents/4TB/data/live_cell_ti...


In [7]:
# split column into multiple columns
cls_df["Metadata_Well"] = cls_df["Metadata_image_path"].apply(
    lambda x: pathlib.Path(x).name.split("_")[0]
)
cls_df["Metadata_FOV"] = cls_df["Metadata_image_path"].apply(
    lambda x: pathlib.Path(x).name.split("_")[1]
)
cls_df["Metadata_Time"] = cls_df["Metadata_image_path"].apply(
    lambda x: pathlib.Path(x).name.split("_")[2]
)
cls_df["Metadata_Channel"] = cls_df["Metadata_image_path"].apply(
    lambda x: pathlib.Path(x).name.split("_")[4]
)
cls_df["Metadata_Cell_id"] = cls_df["Metadata_image_path"].apply(
    lambda x: pathlib.Path(x).name.split("_")[6]
)
# move Metadata columns to the front
# pop out the Metadata columns


cls_df.head()

Unnamed: 0,cls_0,cls_1,cls_2,cls_3,cls_4,cls_5,cls_6,cls_7,cls_8,cls_9,...,cls_380,cls_381,cls_382,cls_383,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_Channel,Metadata_Cell_id
0,0.031278,-0.054149,0.107055,0.002328,0.034356,-0.048974,0.037127,-0.014773,0.026619,0.037709,...,0.038324,-0.046185,0.089597,0.006296,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,100
1,0.031099,-0.064783,0.12332,-0.037688,0.08013,-0.053484,0.046317,-0.004994,0.013629,0.04386,...,0.052623,-0.040115,0.084618,0.026104,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,101
2,0.038233,-0.058997,0.125674,-0.02844,0.03533,-0.06845,0.044963,-7.7e-05,0.051532,0.046608,...,0.052651,-0.044537,0.081724,0.001703,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,102
3,0.037757,-0.071355,0.12697,-0.010027,0.064131,-0.044636,0.03566,-0.005977,0.013472,0.043078,...,0.046203,-0.05223,0.084074,-0.003276,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,103
4,0.02529,-0.05107,0.091758,0.000504,-0.003291,-0.076319,0.029776,0.010253,0.047732,0.052157,...,0.04466,-0.013037,0.089427,0.003103,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,104


In [8]:
# drop the plate column
plate_map_df.drop(columns=["plate"], inplace=True)
# rename columns
plate_map_df = plate_map_df.rename(
    columns={
        "well": "Metadata_Well",
        "compound": "Metadata_compound",
        "dose": "Metadata_dose",
        "control": "Metadata_control",
    },
)
plate_map_df.head()

Unnamed: 0,Metadata_Well,Metadata_compound,Metadata_dose,Metadata_control
0,E-10,Staurosporine,78.13,test
1,C-06,Staurosporine,4.88,test
2,E-02,Staurosporine,0.0,negative
3,C-05,Staurosporine,2.44,test
4,C-11,Staurosporine,156.25,test


In [9]:
# merge cls_df with plate_map_df
cls_df = cls_df.merge(plate_map_df, how="left", on="Metadata_Well")
Metadata_cols = cls_df.columns[cls_df.columns.str.contains("Metadata")]
# move Metadata columns to the front
cls_df = cls_df[
    Metadata_cols.tolist() + cls_df.columns.difference(Metadata_cols).tolist()
]
cls_df.head()

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_Channel,Metadata_Cell_id,Metadata_compound,Metadata_dose,Metadata_control,cls_0,...,cls_90,cls_91,cls_92,cls_93,cls_94,cls_95,cls_96,cls_97,cls_98,cls_99
0,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,100,Staurosporine,0.0,negative,0.031278,...,0.103771,-0.002917,-0.088505,-0.054172,0.009552,-0.053764,0.092698,0.052247,0.047284,0.025616
1,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,101,Staurosporine,0.0,negative,0.031099,...,0.082508,-0.004323,-0.041911,-0.060461,0.007774,-0.06558,0.110302,-3.4e-05,0.024379,0.01959
2,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,102,Staurosporine,0.0,negative,0.038233,...,0.090952,-0.004582,-0.037789,-0.04801,0.004603,-0.0582,0.084165,0.034247,0.05079,0.051865
3,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,103,Staurosporine,0.0,negative,0.037757,...,0.085263,0.001482,-0.084153,-0.051854,-0.000954,-0.049379,0.102376,0.030741,0.029512,0.040069
4,/home/lippincm/Documents/4TB/data/live_cell_ti...,C-02,F0001,T0001,C01,104,Staurosporine,0.0,negative,0.02529,...,0.098721,-0.009961,-0.039805,-0.040878,0.022797,-0.054397,0.045918,0.046192,0.052029,0.035139


In [10]:
# save the data
cls_df.to_parquet(output_path, index=False)