This notebook combines the CellProfiler-extracted morphology features and the scDINO-extracted morphology features into a single feature space. Downstream notebooks normalize the combined data and perform feature selection.

In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# Input and output locations, written relative to the notebook's working
# directory. Inputs are resolved with strict=True so that a missing file raises
# here rather than later at read time.

# CellProfiler profile: annotated features including all time points
cellprofiler_profile_relpath = pathlib.Path(
    "../../5.process_CP_features/data/5.feature_select/profiles/features_selected_profile.parquet"
)
cellprofiler_fs_features_path = cellprofiler_profile_relpath.resolve(strict=True)

# scDINO CLS features: annotated features including all time points
scdino_profile_relpath = pathlib.Path(
    "../../6.scDINO_analysis/1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
)
scdino_features = scdino_profile_relpath.resolve(strict=True)

# Combined output file; resolved non-strictly because it may not exist yet.
combined_output_relpath = pathlib.Path(
    "../data/CP_scDINO_features/combined_CP_scDINO_data.parquet"
)
output_path = combined_output_relpath.resolve()

# Create the output directory (and any missing parents) ahead of the write.
output_path.parent.mkdir(parents=True, exist_ok=True)

In [3]:
# Read both feature tables from parquet into memory.
cellprofiler_data = pd.read_parquet(path=cellprofiler_fs_features_path)
scdino_data = pd.read_parquet(path=scdino_features)

# Report (rows, columns) of each table as a quick sanity check.
print(f"cellprofiler data shape: {cellprofiler_data.shape}")
print(f"scDINO data shape: {scdino_data.shape}")

cellprofiler data shape: (158431, 901)
scDINO data shape: (148829, 1546)


In [4]:
# Store the CellProfiler frame's current row index as a column so it can be
# used as one of the join keys against the scDINO table later on.
# NOTE(review): this assumes the index of this parquet file is the same index
# that scDINO stored in its own Metadata_original_index column - confirm.
cellprofiler_row_index = cellprofiler_data.index
cellprofiler_data["Metadata_original_index"] = cellprofiler_row_index

In [5]:
# Preview one scDINO row to inspect its metadata and feature column names.
scdino_data.head(n=1)

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102.0,153148,Staurosporine,0.0,negative,...,0.035034,-0.000642,-0.034409,0.05673,-0.078027,0.033738,0.00602,0.016161,0.036666,-0.014224


In [6]:
# Tag every feature column with the tool that produced it ("_CP" or "_scDINO")
# so the two feature sets stay distinguishable after merging. Columns whose
# name contains "Metadata" are left untouched.
def _with_feature_suffix(columns, suffix):
    """Return the column labels with ``_<suffix>`` appended to feature columns.

    Parameters
    ----------
    columns : iterable of str
        Existing column labels.
    suffix : str
        Tag to append (without the leading underscore).

    Returns
    -------
    list of str
        Labels in the original order; any label containing "Metadata" is
        returned unchanged.
    """
    return [col if "Metadata" in col else f"{col}_{suffix}" for col in columns]


# Relabel each frame in a single pass. Calling rename(..., inplace=True) once
# per column is slow on wide frames, and a freshly renamed label that equals a
# later column name would be suffixed a second time.
# NOTE: re-running this cell appends the suffix again.
cellprofiler_data.columns = _with_feature_suffix(cellprofiler_data.columns, "CP")
scdino_data.columns = _with_feature_suffix(scdino_data.columns, "scDINO")

In [7]:
# Join keys: the metadata columns used to match rows between the CellProfiler
# and scDINO tables. Despite the variable name, every column listed here must
# exist in BOTH tables; the same list is also used as the duplicate-detection
# subset after the merge.
cellprofiler_metadata_columns = [
    # acquisition identifiers
    "Metadata_Well",
    "Metadata_FOV",
    "Metadata_Time",
    "Metadata_ImageNumber",
    # object identifier
    "Metadata_Nuclei_Number_Object_Number",
    # treatment annotations
    "Metadata_compound",
    "Metadata_dose",
    "Metadata_control",
    # row index column (added to the CellProfiler table from its index)
    "Metadata_original_index",
]

In [8]:
# Cast the scDINO time column to float and shift it down by one (the preview
# above shows it starting at 1).
# NOTE(review): presumably this aligns it with the CellProfiler time values
# before the key columns are cast to strings - confirm.
# NOTE: this updates the column in place, so re-running the cell subtracts 1 again.
scdino_time_as_float = scdino_data["Metadata_Time"].astype(float)
scdino_data["Metadata_Time"] = scdino_time_as_float - 1
scdino_data.head()

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90_scDINO,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO
0,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,102.0,153148,Staurosporine,0.0,negative,...,0.035034,-0.000642,-0.034409,0.05673,-0.078027,0.033738,0.00602,0.016161,0.036666,-0.014224
1,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,106.0,153149,Staurosporine,0.0,negative,...,-0.005401,-0.033103,-0.042951,0.012873,-0.07446,0.030204,0.03281,-0.017176,0.052583,-0.005795
2,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,122.0,153150,Staurosporine,0.0,negative,...,0.015695,0.027015,0.001708,0.039789,-0.119119,0.010958,0.007135,0.048439,0.046645,0.018558
3,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,128.0,153152,Staurosporine,0.0,negative,...,-0.009343,-0.016073,-0.03521,0.037125,-0.102121,-0.030821,0.027548,0.001282,0.091025,-0.000873
4,../0.pre-process_images/data/processed_images/...,C-02,1,0.0,1,13.0,153135,Staurosporine,0.0,negative,...,0.026049,-0.004126,0.002165,0.033042,-0.05401,0.02417,-0.018175,0.041652,0.04535,-0.033585


In [9]:
# Check that every join key exists in both tables BEFORE changing anything, so
# a missing column is reported without leaving the frames partially converted.
# All missing keys of a table are reported together.
for table_name, table in (
    ("cellprofiler", cellprofiler_data),
    ("scDINO", scdino_data),
):
    missing_keys = [
        col for col in cellprofiler_metadata_columns if col not in table.columns
    ]
    if missing_keys:
        raise ValueError(f"{missing_keys} not found in {table_name} data.")

# Cast every join key to string in both tables so the merge compares text.
# NOTE(review): string keys only match when their text is identical, e.g.
# "102" and "102.0" do not match. If a key is stored as int in one table and
# float in the other, those rows are silently dropped by the inner join -
# confirm the key dtypes agree between the two tables.
for col in cellprofiler_metadata_columns:
    cellprofiler_data[col] = cellprofiler_data[col].astype(str)
    scdino_data[col] = scdino_data[col].astype(str)

In [10]:
# Shapes of both inputs immediately before the join (no sorting is performed
# in this notebook, so the labels say "before merge").
print(f"cellprofiler data shape before merge: {cellprofiler_data.shape}")
print(f"scDINO data shape before merge: {scdino_data.shape}")

# Inner join on the shared metadata keys: only rows whose key values are
# present in BOTH tables survive.
merged_df = pd.merge(
    cellprofiler_data,
    scdino_data,
    how="inner",
    on=cellprofiler_metadata_columns,
)
print(f"merged data shape: {merged_df.shape}")

# Keep a single row (the last occurrence) per combination of join keys.
merged_df = merged_df.drop_duplicates(
    subset=cellprofiler_metadata_columns,
    keep="last",
)
print(f"merged data shape after dropping duplicates: {merged_df.shape}")

# Make the row loss of the inner join explicit.
# NOTE(review): the run recorded in this notebook kept 2,697 rows out of
# 158,431 CellProfiler rows and 148,829 scDINO rows. Confirm this is expected
# and not caused by mismatched key values between the two tables.
print(
    f"rows retained: {len(merged_df)} of {len(cellprofiler_data)} cellprofiler rows "
    f"and {len(scdino_data)} scDINO rows"
)

cellprofiler data shape after sorting: (158431, 902)
scDINO data shape after sorting: (148829, 1546)
merged data shape: (2697, 2439)
merged data shape after dropping duplicates: (2697, 2439)


In [11]:
# Shape before removing incomplete rows.
print(f"merged_df shape: {merged_df.shape}")

# Discard every row that has a missing value in any column (metadata or
# feature), then write the result to parquet without the index.
merged_df = merged_df.dropna(axis="index", how="any")
merged_df.to_parquet(path=output_path, index=False)

# Shape after removing incomplete rows, i.e. the shape that was written to disk.
print(f"merged_df shape: {merged_df.shape}")
merged_df.head()

merged_df shape: (2697, 2439)
merged_df shape: (2640, 2439)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_90_scDINO,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO
0,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,1.0,...,-0.009129,0.007438,-0.02054,0.058015,-0.09081,0.019709,0.001941,0.05396,0.035419,0.000103
1,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,15.0,...,-0.005639,0.022051,-0.013566,0.010396,-0.081235,0.035007,-0.027851,0.02566,0.052723,-0.00117
2,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,16.0,...,0.013015,-0.023979,-0.022454,0.021878,-0.052903,0.006697,-0.023782,-0.006809,0.058739,0.007515
3,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,26.0,...,0.07283,-0.040655,0.00464,0.017822,-0.113372,0.077085,0.004389,0.012183,0.004662,-0.012226
4,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,37.0,...,-0.042158,-0.032217,-0.024601,0.069246,-0.113645,0.003064,0.020424,8.1e-05,0.083128,0.015535
