In [1]:
import pathlib

import numpy as np
import pandas as pd
import umap

In [2]:
# --- paths ---
# Input: annotated CLS features; strict=True raises immediately if the
# parquet file is not present.
data_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
).resolve(strict=True)

# Outputs: one CSV next to the input features, one copy for the shiny app.
output_path = pathlib.Path(
    "../../1.scDINO_run/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated_umap.csv"
).resolve()
shiny_output_path = pathlib.Path(
    "../temporal_shiny_app/CLS_features_annotated_umap.csv"
).resolve()

# make sure both destination directories exist before anything is written
for destination in (output_path, shiny_output_path):
    destination.parent.mkdir(parents=True, exist_ok=True)

In [3]:
# Read the annotated CLS feature table from the parquet file set up above,
# then preview its first five rows.
cls_df = pd.read_parquet(data_path)
cls_df.head(n=5)

Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,...,channel_DNA_cls_feature_90,channel_DNA_cls_feature_91,channel_DNA_cls_feature_92,channel_DNA_cls_feature_93,channel_DNA_cls_feature_94,channel_DNA_cls_feature_95,channel_DNA_cls_feature_96,channel_DNA_cls_feature_97,channel_DNA_cls_feature_98,channel_DNA_cls_feature_99
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102.0,153148,Staurosporine,0.0,negative,...,0.035034,-0.000642,-0.034409,0.05673,-0.078027,0.033738,0.00602,0.016161,0.036666,-0.014224
1,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,106.0,153149,Staurosporine,0.0,negative,...,-0.005401,-0.033103,-0.042951,0.012873,-0.07446,0.030204,0.03281,-0.017176,0.052583,-0.005795
2,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,122.0,153150,Staurosporine,0.0,negative,...,0.015695,0.027015,0.001708,0.039789,-0.119119,0.010958,0.007135,0.048439,0.046645,0.018558
3,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,128.0,153152,Staurosporine,0.0,negative,...,-0.009343,-0.016073,-0.03521,0.037125,-0.102121,-0.030821,0.027548,0.001282,0.091025,-0.000873
4,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,13.0,153135,Staurosporine,0.0,negative,...,0.026049,-0.004126,0.002165,0.033042,-0.05401,0.02417,-0.018175,0.041652,0.04535,-0.033585


In [4]:
# Split the table column-wise: every column whose name contains "Metadata"
# goes into metadata_df, all remaining columns go into feature_df.
is_metadata_column = cls_df.columns.str.contains("Metadata")
metadata_df = cls_df[cls_df.columns[is_metadata_column]]
feature_df = cls_df.drop(columns=metadata_df.columns)

# report both shapes as a quick sanity check on the split
print(f"metadata_df shape: {metadata_df.shape}")
print(f"feature_df shape: {feature_df.shape}")

metadata_df shape: (148829, 10)
feature_df shape: (148829, 1536)


In [5]:
# define the UMAP model
# A fixed random_state keeps the embedding reproducible between runs; as the
# warning in this cell's output notes, it also forces n_jobs to 1.
umap_model = umap.UMAP(
    n_components=2, random_state=0, n_neighbors=30, min_dist=0.1, metric="euclidean"
)

# fit the UMAP model on the feature columns only and project each row to 2-D
umap_embedding = umap_model.fit_transform(feature_df)

# Give the embedding the same row labels as the features it was computed from.
# pd.concat(axis=1) aligns rows by index label, so a fresh RangeIndex here would
# silently mis-pair (and NaN-pad) rows whenever cls_df does not carry a default
# 0..n-1 index.
umap_embedding_df = pd.DataFrame(
    umap_embedding, columns=["UMAP1", "UMAP2"], index=feature_df.index
)
# add the metadata back
umap_embedding_df = pd.concat([metadata_df, umap_embedding_df], axis=1)
umap_embedding_df.head()

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Unnamed: 0,Metadata_image_path,Metadata_Well,Metadata_FOV,Metadata_Time,Metadata_ImageNumber,Metadata_Nuclei_Number_Object_Number,Metadata_original_index,Metadata_compound,Metadata_dose,Metadata_control,UMAP1,UMAP2
0,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,102.0,153148,Staurosporine,0.0,negative,1.395388,1.723467
1,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,106.0,153149,Staurosporine,0.0,negative,1.695359,0.815478
2,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,122.0,153150,Staurosporine,0.0,negative,1.579764,1.776482
3,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,128.0,153152,Staurosporine,0.0,negative,-1.738279,-1.807134
4,../0.pre-process_images/data/processed_images/...,C-02,1,1,1,13.0,153135,Staurosporine,0.0,negative,2.388962,0.785667


In [6]:
# Write the UMAP embeddings (metadata + UMAP1/UMAP2) as CSV to both
# destinations: the scDINO output directory and the shiny app directory.
# to_csv is called with its defaults, so the row index is written as well.
for csv_destination in (output_path, shiny_output_path):
    umap_embedding_df.to_csv(csv_destination)