This notebook combines the CellProfiler-extracted morphology features and the scDINO-extracted morphology features into a single feature space. Downstream notebooks normalize the data and perform feature selection.

In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# --- inputs -----------------------------------------------------------------
# Annotated CellProfiler single-cell feature table.
# resolve(strict=True) raises right here if the file does not exist.
cellprofiller_annotated_features_path = pathlib.Path(
    "../../4.process_CP_features/data/annotated_data/run_20231017ChromaLive_6hr_4ch_MaxIP_sc.parquet"
).resolve(strict=True)

# Annotated scDINO CLS feature table (same strict existence check).
scdino_features = pathlib.Path(
    "../../5.scDINO_analysis/1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
).resolve(strict=True)

# --- output -----------------------------------------------------------------
# Destination of the combined table; resolved without strict=True because the
# file is only written at the end of the notebook.
output_path = pathlib.Path(
    "../data/20231017ChromaLive_6hr_4ch_MaxIP_combined_data.parquet"
).resolve()

# Create the destination folder (and any missing parents) ahead of time.
output_dir = output_path.parent
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Read both feature tables from their parquet files.
cellprofiller_data = pd.read_parquet(cellprofiller_annotated_features_path)
scdino_data = pd.read_parquet(scdino_features)

# Report (rows, columns) of each table, one per line.
print(
    f"CellProfiller data shape: {cellprofiller_data.shape}",
    f"scDINO data shape: {scdino_data.shape}",
    sep="\n",
)

# Preview the CellProfiler table as the cell output.
cellprofiller_data.head()

CellProfiller data shape: (244887, 1939)
scDINO data shape: (243970, 1545)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Nuclei_TrackObjects_DistanceTraveled_50,Nuclei_TrackObjects_FinalAge_50,Nuclei_TrackObjects_IntegratedDistance_50,Nuclei_TrackObjects_Label_50,Nuclei_TrackObjects_Lifetime_50,Nuclei_TrackObjects_Linearity_50,Nuclei_TrackObjects_ParentImageNumber_50,Nuclei_TrackObjects_ParentObjectNumber_50,Nuclei_TrackObjects_TrajectoryX_50,Nuclei_TrackObjects_TrajectoryY_50
0,1,E-10,6743,Staurosporine,78.13,test,1457,1,1,1,...,0.0,,0.0,1,1,1.0,0,0,0,0
1,1,E-10,6743,Staurosporine,78.13,test,1457,1,1,2,...,0.0,,0.0,3,1,1.0,0,0,0,0
2,1,E-10,6743,Staurosporine,78.13,test,1457,1,1,3,...,0.0,,0.0,4,1,1.0,0,0,0,0
3,1,E-10,6743,Staurosporine,78.13,test,1457,1,1,4,...,0.0,,0.0,5,1,1.0,0,0,0,0
4,1,E-10,6743,Staurosporine,78.13,test,1457,1,1,5,...,0.0,,0.0,6,1,1.0,0,0,0,0


In [4]:
# Preview the first rows of the scDINO feature table (metadata + CLS features).
scdino_data.head()

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_compound,Metadata_dose,Metadata_control,channel488-1_cls_feature_0,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../../data/processed_images/crops/C-02/image_n...,C-02,1,10,10,10,Staurosporine,0.0,negative,-0.013294,...,-0.026835,-0.025143,0.022814,0.044236,-0.048172,0.003977,0.005565,0.033877,0.082223,0.009103
1,../../data/processed_images/crops/C-02/image_n...,C-02,1,10,10,100,Staurosporine,0.0,negative,0.003657,...,0.02401,0.002705,-0.059467,0.032855,-0.05753,0.031927,0.017482,0.051654,0.024463,-0.034733
2,../../data/processed_images/crops/C-02/image_n...,C-02,1,10,10,101,Staurosporine,0.0,negative,0.013813,...,-0.007732,0.024938,0.027292,0.034904,-0.127702,-0.014732,0.033218,0.008977,0.031269,-0.031651
3,../../data/processed_images/crops/C-02/image_n...,C-02,1,10,10,102,Staurosporine,0.0,negative,0.026417,...,-0.041826,-0.028302,-0.034485,0.032456,-0.054537,0.021049,0.028299,-0.006374,0.101494,-0.018018
4,../../data/processed_images/crops/C-02/image_n...,C-02,1,10,10,103,Staurosporine,0.0,negative,-0.002667,...,-0.029698,-0.017408,-0.011866,0.045469,-0.077298,0.000747,0.076386,-0.007291,0.015712,-0.011359


In [5]:
# Names of every CellProfiler column carrying the "Metadata_" prefix.
metadata_cols = [
    col for col in cellprofiller_data.columns if col.startswith("Metadata_")
]
# Independent copy of just the metadata columns, so later edits to this frame
# do not write back into cellprofiller_data. The frame is already limited to
# metadata_cols, so no second column selection is needed.
metadata_cp_df = cellprofiller_data[metadata_cols].copy()
# Show a single row to inspect which metadata fields are available.
metadata_cp_df.head(1)

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Metadata_Nuclei_Number_Object_Number,Metadata_Image_FileName_488_1,Metadata_Image_FileName_488_2,Metadata_Image_FileName_561,Metadata_Image_FileName_DNA,Metadata_Image_PathName_488_2,Metadata_Image_PathName_561,Metadata_Image_PathName_DNA,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y
0,1,E-10,6743,Staurosporine,78.13,test,1457,1,1,1,...,1,E-10_F0001_T0001_Z0001_C02_illumcorrect.tiff,E-10_F0001_T0001_Z0001_C03_illumcorrect.tiff,E-10_F0001_T0001_Z0001_C04_illumcorrect.tiff,E-10_F0001_T0001_Z0001_C01_illumcorrect.tiff,/home/lippincm/Documents/4TB/data/live_cell_ti...,/home/lippincm/Documents/4TB/data/live_cell_ti...,/home/lippincm/Documents/4TB/data/live_cell_ti...,1315.861871,28.504556


In [6]:
# Names of every scDINO column carrying the "Metadata_" prefix.
# NOTE: this rebinds metadata_cols, which previously held the CellProfiler list.
metadata_cols = [col for col in scdino_data.columns if col.startswith("Metadata_")]
# Independent copy of just the metadata columns, so later edits to this frame
# do not write back into scdino_data. The frame is already limited to
# metadata_cols, so no second column selection is needed.
metadata_scdino_df = scdino_data[metadata_cols].copy()
# Show a single row to inspect which metadata fields are available.
metadata_scdino_df.head(1)

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_compound,Metadata_dose,Metadata_control
0,../../data/processed_images/crops/C-02/image_n...,C-02,1,10,10,10,Staurosporine,0.0,negative


In [7]:
# Columns that together identify one cell in both feature tables.
merge_columns = [
    "Metadata_Well",
    "Metadata_FOV",
    "Metadata_Time",
    "Metadata_ImageNumber",
    "Metadata_Nuclei_Number_Object_Number",
]

# Cast every merge column to str in the two feature tables and in the two
# metadata tables, so the keys have the same type on both sides when compared.
for frame in (cellprofiller_data, scdino_data, metadata_cp_df, metadata_scdino_df):
    for col in merge_columns:
        frame[col] = frame[col].astype(str)

In [8]:
# Move the merge columns out of the column set and into the row index of each
# feature table, so the tables can be compared and aligned by those keys.
cellprofiller_data = cellprofiller_data.set_index(merge_columns)
scdino_data = scdino_data.set_index(merge_columns)

In [9]:
# Row keys (built from the merge columns) of each feature table.
scdino_index = scdino_data.index
cellprofiller_index = cellprofiller_data.index

# Compute every summary exactly once; the results are reused both for the
# report below and for the filtering at the end of the cell.
unique_scdino_index = scdino_index.nunique()
unique_cellprofiller_index = cellprofiller_index.nunique()
scdino_index_not_in_cp = scdino_index.difference(cellprofiller_index)
cp_index_not_in_scdino = cellprofiller_index.difference(scdino_index)
index_in_both = scdino_index.intersection(cellprofiller_index)

# Report how well the two tables overlap.
print(f"Unique scDINO index values: {unique_scdino_index}")
print(f"Unique CellProfiller index values: {unique_cellprofiller_index}")
print(f"scDINO index values not in CellProfiller: {scdino_index_not_in_cp.nunique()}")
print(f"CellProfiller index values not in scDINO: {cp_index_not_in_scdino.nunique()}")
print(f"Index values in both: {index_in_both.nunique()}")

# Difference between the two unique-key counts (CellProfiler minus scDINO).
print(
    f"Unique CP - scDINO index values: {unique_cellprofiller_index - unique_scdino_index}"
)

# Restrict both tables to the shared keys, selected with the same key list so
# the two tables are requested in the same key order.
scdino_data = scdino_data.loc[index_in_both]
cellprofiller_data = cellprofiller_data.loc[index_in_both]

Unique scDINO index values: 240165
Unique CellProfiller index values: 244887
scDINO index values not in CellProfiller: 117
CellProfiller index values not in scDINO: 4839
Index values in both: 240048
Unique CP - scDINO index values: 4722


In [10]:
# Place the CellProfiler and scDINO columns side by side, aligned on the index.
combined_data = pd.concat([cellprofiller_data, scdino_data], axis=1)
# Shape right after concatenation, i.e. before repeated column names are removed.
print(combined_data.shape)

# Columns whose name already appeared further left (the first occurrence is kept).
is_repeated_column = combined_data.columns.duplicated()

# Drop the repeated columns, then turn the merge-column index back into
# ordinary columns.
combined_data = combined_data.loc[:, ~is_repeated_column].reset_index()
combined_data.head()

(240048, 3474)


Unnamed: 0,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,C-02,1,10,10,10,1,7914,Staurosporine,0.0,negative,...,-0.026835,-0.025143,0.022814,0.044236,-0.048172,0.003977,0.005565,0.033877,0.082223,0.009103
1,C-02,1,10,10,100,1,7914,Staurosporine,0.0,negative,...,0.02401,0.002705,-0.059467,0.032855,-0.05753,0.031927,0.017482,0.051654,0.024463,-0.034733
2,C-02,1,10,10,101,1,7914,Staurosporine,0.0,negative,...,-0.007732,0.024938,0.027292,0.034904,-0.127702,-0.014732,0.033218,0.008977,0.031269,-0.031651
3,C-02,1,10,10,102,1,7914,Staurosporine,0.0,negative,...,-0.041826,-0.028302,-0.034485,0.032456,-0.054537,0.021049,0.028299,-0.006374,0.101494,-0.018018
4,C-02,1,10,10,103,1,7914,Staurosporine,0.0,negative,...,-0.029698,-0.017408,-0.011866,0.045469,-0.077298,0.000747,0.076386,-0.007291,0.015712,-0.011359


In [11]:
# Write the combined CellProfiler + scDINO table to the parquet file at
# output_path (its parent directory was created when the paths were defined).
combined_data.to_parquet(output_path)