This notebook combines the CellProfiler-extracted morphology features and the scDINO-extracted morphology features into a single feature space by merging the two tables on their shared metadata columns. Downstream notebooks normalize the data and perform feature selection.

In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# define data paths for import
# NOTE: every path below is relative, so resolve() anchors it to the kernel's
# current working directory (presumably this notebook's own directory — confirm)

# annotated features from cellprofiler including all time points
# strict=True makes resolve() raise FileNotFoundError immediately if the
# upstream file is missing, instead of failing later at read time
cellprofiler_fs_features_path = pathlib.Path(
    "../../6.process_CP_features/data/3.combined_data/profiles/combined_data.parquet"
).resolve(strict=True)

# scDINO features from the scDINO analysis including all time points
# (same fail-fast existence check as above)
scdino_features = pathlib.Path(
    "../../7.scDINO_analysis/1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
).resolve(strict=True)

# set the output path
# no strict=True here: this file is written by this notebook and may not exist yet
output_path = pathlib.Path(
    "../data/CP_scDINO_features/combined_CP_scDINO_data.parquet"
).resolve()

# make the parent directory
# exist_ok=True keeps re-runs from failing when the directory is already there
output_path.parent.mkdir(parents=True, exist_ok=True)

In [3]:
# read both feature tables from their parquet files
cellprofiler_data = pd.read_parquet(path=cellprofiler_fs_features_path)
scdino_data = pd.read_parquet(path=scdino_features)

# report (rows, columns) of each table as a quick sanity check
print(
    f"cellprofiler data shape: {cellprofiler_data.shape}",
    f"scDINO data shape: {scdino_data.shape}",
    sep="\n",
)

cellprofiler data shape: (158431, 2332)
scDINO data shape: (140235, 1546)


In [4]:
# keep each row's index label as a regular column: "Metadata_original_index"
# is one of the merge keys used later, and the scDINO table already has it
# NOTE(review): presumably the scDINO values were derived from this same index — confirm
cellprofiler_data["Metadata_original_index"] = cellprofiler_data.index

In [5]:
# preview a single scDINO row to see which metadata columns the table carries
scdino_data.head(1)

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102.0,145086,Staurosporine,0.0,negative,...,0.035033,-0.000629,-0.034413,0.056733,-0.078023,0.033735,0.006015,0.016154,0.03666,-0.014219


In [6]:
def _tag_feature_column(col: str, suffix: str) -> str:
    """Return ``col`` with ``suffix`` appended, unless it must stay as is.

    A name is returned unchanged when it contains "Metadata" (metadata
    columns are not tagged) or when it already ends with ``suffix`` (so that
    re-running this cell cannot stack suffixes such as ``..._CP_CP``).

    Parameters
    ----------
    col : str
        Original column name.
    suffix : str
        Tag identifying the feature source, e.g. "_CP" or "_scDINO".

    Returns
    -------
    str
        The (possibly) tagged column name.
    """
    if "Metadata" in col or col.endswith(suffix):
        return col
    return f"{col}{suffix}"


# append either CP or scDINO to the feature column names
# A single rename per frame replaces the former one-rename-per-column loop;
# inplace=True keeps the original cell's in-place semantics.
cellprofiler_data.rename(
    columns=lambda col: _tag_feature_column(col, "_CP"), inplace=True
)
scdino_data.rename(
    columns=lambda col: _tag_feature_column(col, "_scDINO"), inplace=True
)

In [7]:
# Merge keys: the metadata columns that are common to the CellProfiler and
# scDINO tables. Each entry is the shared "Metadata_" prefix plus a field name;
# the order matches the original hand-written list.
cellprofiler_metadata_columns = [
    f"Metadata_{field}"
    for field in (
        "Well",
        "FOV",
        "Time",
        "ImageNumber",
        "Nuclei_Number_Object_Number",
        "compound",
        "dose",
        "control",
        "original_index",
    )
]

In [8]:
# Align the scDINO time values with CellProfiler's before merging:
# convert time to float and shift it down by one (the first scDINO row
# previewed earlier has Metadata_Time == 1).
# NOTE(review): presumably CellProfiler's Metadata_Time is a 0-based float — confirm.
# WARNING: this cell is not idempotent; re-running it without reloading the
# data subtracts 1 a second time.
scdino_data["Metadata_Time"] = scdino_data["Metadata_Time"].astype(float) - 1
scdino_data.head()

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90_scDINO,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO
0,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,102.0,145086,Staurosporine,0.0,negative,...,0.035033,-0.000629,-0.034413,0.056733,-0.078023,0.033735,0.006015,0.016154,0.03666,-0.014219
1,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,106.0,145087,Staurosporine,0.0,negative,...,-0.005406,-0.033099,-0.04295,0.012875,-0.074455,0.030203,0.032813,-0.017182,0.052587,-0.005798
2,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,122.0,145088,Staurosporine,0.0,negative,...,0.01569,0.027015,0.001716,0.039788,-0.119122,0.010958,0.007134,0.048436,0.046638,0.01856
3,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,128.0,145090,Staurosporine,0.0,negative,...,-0.009345,-0.016075,-0.035206,0.037123,-0.102122,-0.030821,0.02755,0.001286,0.091023,-0.000872
4,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,13.0,145073,Staurosporine,0.0,negative,...,0.026046,-0.004125,0.002168,0.03304,-0.054002,0.024171,-0.018173,0.041654,0.04535,-0.033588


In [9]:
# Verify that every merge key exists in BOTH tables before modifying either
# one, and report all missing keys at once rather than only the first.
for table_name, table in (
    ("cellprofiler", cellprofiler_data),
    ("scDINO", scdino_data),
):
    missing_cols = [
        col for col in cellprofiler_metadata_columns if col not in table.columns
    ]
    if missing_cols:
        raise ValueError(f"{missing_cols} not found in {table_name} data.")

# Cast the merge keys to str in both tables so the join compares values of the
# same type.
# NOTE(review): string keys match on exact text, so values that differ only in
# formatting (e.g. "1" vs "1.0") would silently fail to match in the inner
# merge — confirm both tables store each key with the same numeric dtype.
for col in cellprofiler_metadata_columns:
    cellprofiler_data[col] = cellprofiler_data[col].astype(str)
    scdino_data[col] = scdino_data[col].astype(str)

In [10]:
# shapes going into the merge (the previous "after sorting" labels were
# misleading: nothing is sorted in this notebook)
print(f"cellprofiler data shape before merging: {cellprofiler_data.shape}")
print(f"scDINO data shape before merging: {scdino_data.shape}")

# inner join on the shared metadata keys: only rows whose key combination is
# present in both tables are kept
# NOTE(review): any non-key column present in both tables would come out with
# pandas' default "_x"/"_y" suffixes — confirm none overlap.
merged_df = pd.merge(
    cellprofiler_data,
    scdino_data,
    how="inner",
    on=cellprofiler_metadata_columns,
)
print(f"merged data shape: {merged_df.shape}")

# drop duplicates
# if several merged rows share the same key combination, keep only the last one
merged_df = merged_df.drop_duplicates(
    subset=cellprofiler_metadata_columns,
    keep="last",
)
print(f"merged data shape after dropping duplicates: {merged_df.shape}")

cellprofiler data shape after sorting: (158431, 2333)
scDINO data shape after sorting: (140235, 1546)
merged data shape: (140235, 3870)
merged data shape after dropping duplicates: (140235, 3870)


In [11]:
# shape before the NaN filter
print(f"merged_df shape: {merged_df.shape}")
# drop only the rows in which EVERY column is NaN (how="all"); rows that have
# some missing values are kept
# NOTE(review): the recorded output shows the same shape before and after, so
# this removed nothing in that run — confirm whether a stricter filter was intended
merged_df = merged_df.dropna(axis=0, how="all")
# write the combined table; index=False leaves the row index out of the file
merged_df.to_parquet(output_path, index=False)
# shape after the NaN filter (this is what was written to disk)
print(f"merged_df shape: {merged_df.shape}")
merged_df.head()

merged_df shape: (140235, 3870)
merged_df shape: (140235, 3870)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_90_scDINO,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO
0,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,1.0,...,-0.009135,0.007438,-0.020541,0.058015,-0.090802,0.019711,0.001946,0.053973,0.035416,0.000113
1,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,15.0,...,-0.005651,0.022047,-0.013575,0.010399,-0.081259,0.035008,-0.027851,0.025665,0.052722,-0.001162
2,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,16.0,...,0.013007,-0.023978,-0.022457,0.021881,-0.052918,0.006701,-0.023775,-0.006807,0.058732,0.007511
3,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,26.0,...,0.072824,-0.040654,0.00464,0.01782,-0.113373,0.077095,0.004394,0.012186,0.004668,-0.01223
4,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,37.0,...,-0.042243,-0.032191,-0.024605,0.069248,-0.113662,0.003017,0.020446,7.7e-05,0.083102,0.015543
