This notebook combines the CellProfiler-extracted morphology features and the scDINO-extracted morphology features into a single feature space. Downstream notebooks normalize the data and perform feature selection.

In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# Input/output locations, relative to this notebook's working directory.
# Both inputs are resolved strictly, so a missing file fails here rather
# than later at read time.

# CellProfiler annotated features (all time points)
cellprofiler_annotated_features_path = pathlib.Path(
    "../../5.process_CP_features/data/3.combined_data/profiles/combined_data.parquet"
).resolve(strict=True)

# scDINO CLS features, annotated (all time points)
scdino_features = pathlib.Path(
    "../../6.scDINO_analysis/1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
).resolve(strict=True)

# destination of the combined table; it does not have to exist yet
output_path = pathlib.Path(
    "../data/CP_scDINO_features/combined_CP_scDINO_data.parquet"
).resolve(strict=False)

# create the destination folder (and any missing ancestors) up front
output_path.parent.mkdir(exist_ok=True, parents=True)

In [3]:
# read both parquet tables into memory (CellProfiler first, then scDINO)
cellprofiler_data, scdino_data = (
    pd.read_parquet(parquet_file)
    for parquet_file in (cellprofiler_annotated_features_path, scdino_features)
)

# report (rows, columns) of each table as loaded
print(f"cellprofiler data shape: {cellprofiler_data.shape}")
print(f"scDINO data shape: {scdino_data.shape}")

cellprofiler data shape: (209311, 2332)
scDINO data shape: (182804, 1545)


In [4]:
# Tag every feature column with the tool that produced it ("_CP" or "_scDINO").
# Columns whose name contains "Metadata" keep their name, so the columns both
# tables share stay identical and can be used as join keys.
# Each mapping is built once and applied in a single rename call instead of
# issuing one in-place rename per column.
cellprofiler_data = cellprofiler_data.rename(
    columns={
        col: f"{col}_CP"
        for col in cellprofiler_data.columns
        if "Metadata" not in col
    }
)
scdino_data = scdino_data.rename(
    columns={
        col: f"{col}_scDINO"
        for col in scdino_data.columns
        if "Metadata" not in col
    }
)

In [5]:
# Metadata columns present in both the CellProfiler and the scDINO tables.
# The list is reused below as the sort order and as the merge keys.
cellprofiler_metadata_columns = (
    # Well, FOV and Time
    ["Metadata_Well", "Metadata_FOV", "Metadata_Time"]
    # image number and nuclei object number
    + ["Metadata_ImageNumber", "Metadata_Nuclei_Number_Object_Number"]
    # compound, dose and control annotations
    + ["Metadata_compound", "Metadata_dose", "Metadata_control"]
)

In [6]:
# Order both tables by the shared metadata columns (ascending) and replace the
# index with a fresh 0..n-1 range so row labels follow the sorted order.
cellprofiler_data = (
    cellprofiler_data
    .sort_values(by=cellprofiler_metadata_columns, ascending=True)
    .reset_index(drop=True)
)

scdino_data = (
    scdino_data
    .sort_values(by=cellprofiler_metadata_columns, ascending=True)
    .reset_index(drop=True)
)

In [7]:
# Convert the scDINO time column to float and shift it down by one.
# NOTE(review): presumably scDINO time points are 1-based while the
# CellProfiler ones are 0-based - confirm against the upstream notebooks.
# This cell is not idempotent: every re-run subtracts 1 again, so run it
# exactly once per fresh load of `scdino_data`.
scdino_data["Metadata_Time"] = scdino_data["Metadata_Time"].astype(float) - 1
scdino_data.head()

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_compound,Metadata_dose,Metadata_control,channel488-1_cls_feature_0_scDINO,...,channel_DNA_cls_feature_90_scDINO,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO
0,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,100.0,Staurosporine,0.0,negative,-0.012062,...,0.034462,-0.005022,-0.032672,0.056519,-0.081751,0.035719,0.00865,0.017762,0.038345,-0.015309
1,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,104.0,Staurosporine,0.0,negative,-0.014538,...,0.008132,-0.048781,-0.029867,0.013783,-0.089631,0.029462,0.013753,-0.041825,0.045902,-0.025322
2,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,12.0,Staurosporine,0.0,negative,-0.010044,...,0.016777,-0.02297,-0.012937,0.031086,-0.063206,0.017341,0.006725,0.058906,0.051218,-0.021713
3,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,120.0,Staurosporine,0.0,negative,0.021043,...,0.016239,0.026569,9.4e-05,0.040027,-0.120004,0.01214,0.008501,0.04885,0.048612,0.017343
4,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,126.0,Staurosporine,0.0,negative,-0.009109,...,-0.009832,-0.015505,-0.03555,0.037506,-0.102835,-0.030543,0.026912,-0.000418,0.092182,-0.001539


In [8]:
# Validate first, convert second: checking every key column in both tables
# before touching any data means a missing column cannot leave the frames
# half-converted in the kernel.
for col in cellprofiler_metadata_columns:
    if col not in cellprofiler_data.columns:
        raise ValueError(f"{col} not found in cellprofiler data.")
    if col not in scdino_data.columns:
        raise ValueError(f"{col} not found in scDINO data.")

# Cast the key columns to str in both tables so they share one dtype for the
# merge. Note that the string form must agree exactly on both sides
# (e.g. "1" and "1.0" are different keys).
for col in cellprofiler_metadata_columns:
    cellprofiler_data[col] = cellprofiler_data[col].astype(str)
    scdino_data[col] = scdino_data[col].astype(str)

In [9]:
print(f"cellprofiler data shape after sorting: {cellprofiler_data.shape}")
print(f"scDINO data shape after sorting: {scdino_data.shape}")

# A right join keeps every scDINO row even when no CellProfiler row shares its
# keys; such rows come out with missing CellProfiler features. Count them on
# the key columns only (cheap compared with the full tables) so that a key
# mismatch between the two tables does not go unnoticed.
cellprofiler_keys = cellprofiler_data[cellprofiler_metadata_columns].drop_duplicates()
key_match = scdino_data[cellprofiler_metadata_columns].merge(
    cellprofiler_keys,
    how="left",
    on=cellprofiler_metadata_columns,
    indicator=True,
)
n_unmatched = int((key_match["_merge"] == "left_only").sum())
print(f"scDINO rows without a CellProfiler match: {n_unmatched}")

# combine the two feature sets, keeping all rows of the scDINO table
merged_df = pd.merge(
    cellprofiler_data,
    scdino_data,
    how="right",
    on=cellprofiler_metadata_columns,
)
print(f"merged data shape: {merged_df.shape}")
# the join emits one row per matching pair, so repeated keys multiply rows;
# keep only the last row for each key combination
merged_df = merged_df.drop_duplicates(
    subset=cellprofiler_metadata_columns,
    keep="last",
)
print(f"merged data shape after dropping duplicates: {merged_df.shape}")

cellprofiler data shape after sorting: (209311, 2332)
scDINO data shape after sorting: (182804, 1545)
merged data shape: (184115, 3869)
merged data shape after dropping duplicates: (182804, 3869)


In [10]:
# report the final dimensions, write the combined table to disk as parquet,
# and preview the first rows
print(f"merged_df shape: {merged_df.shape}")
merged_df.to_parquet(output_path)
merged_df.head()

merged_df shape: (182804, 3869)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_90_scDINO,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,101,...,0.034462,-0.005022,-0.032672,0.056519,-0.081751,0.035719,0.00865,0.017762,0.038345,-0.015309
1,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,111,...,0.008132,-0.048781,-0.029867,0.013783,-0.089631,0.029462,0.013753,-0.041825,0.045902,-0.025322
2,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,11,...,0.016777,-0.02297,-0.012937,0.031086,-0.063206,0.017341,0.006725,0.058906,0.051218,-0.021713
3,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,128,...,0.016239,0.026569,9.4e-05,0.040027,-0.120004,0.01214,0.008501,0.04885,0.048612,0.017343
4,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,132,...,-0.009832,-0.015505,-0.03555,0.037506,-0.102835,-0.030543,0.026912,-0.000418,0.092182,-0.001539
