# Generate UMAP embeddings per plate and profile (single-cell or organoid)

In [1]:
import glob
import pathlib
import pandas as pd
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

## Set paths and constants

In [2]:
# Directory that receives the per-dataset UMAP tables; created up front
# (including any missing parents) so later writes cannot fail on a missing folder
output_dir = pathlib.Path("results")
output_dir.mkdir(exist_ok=True, parents=True)

# UMAP settings: dimensionality of the embedding and the seed handed to the
# UMAP estimator as its random state
umap_n_components = 2
umap_random_seed = 0

## Identify file paths to process

In [3]:
# Set input paths
data_dir = pathlib.Path("../4.preprocess_features/data/single_cell_profiles")

# Select only the feature selected files
file_suffix = "*feature_selected.parquet"

# Obtain file paths for all feature selected plates.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to make the processing order identical on every machine and every run.
fs_files = sorted(glob.glob(f"{data_dir}/{file_suffix}"))
fs_files

['../4.preprocess_features/data/single_cell_profiles/NF0014_organoid_feature_selected.parquet',
 '../4.preprocess_features/data/single_cell_profiles/NF0014_sc_feature_selected.parquet']

In [4]:
# Load feature data into a dictionary, keyed on the profile file name
# (e.g. "NF0014_sc_feature_selected.parquet").
# pathlib extracts the file name so the key is correct regardless of the
# path separator used by the operating system.
cp_dfs = {pathlib.Path(x).name: pd.read_parquet(x) for x in fs_files}

# Print out useful information about each dataset
print(cp_dfs.keys())
[cp_dfs[x].shape for x in cp_dfs]

dict_keys(['NF0014_organoid_feature_selected.parquet', 'NF0014_sc_feature_selected.parquet'])


[(152, 530), (2142, 1334)]

In [5]:
# Metadata columns to carry alongside the UMAP coordinates in the output tables.
# Columns missing from a given dataset are silently skipped.
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_treatment",
    "Metadata_dose",
    "Metadata_ZSlice",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
]

# Fit UMAP features per dataset and save
for plate in cp_dfs:
    # The first two underscore-separated parts of the file name identify the
    # dataset, e.g. "NF0014_sc_feature_selected.parquet" -> ["NF0014", "sc"]
    plate_name_parts = pathlib.Path(plate).stem.split("_")[:2]
    plate_name = "_".join(plate_name_parts)
    print("UMAP embeddings being generated for", plate_name)

    # Set compartments based on the second part of the plate name
    if plate_name_parts[1] == "sc":
        compartments = ["nuclei", "cells", "cytoplasm"]
    elif plate_name_parts[1] == "organoid":
        compartments = ["organoids"]
    else:
        # Fail fast with a clear message: without compartments no feature
        # columns can be selected, so nothing downstream could succeed.
        raise ValueError(
            f"Unrecognized profile type '{plate_name_parts[1]}' in file '{plate}'; "
            "expected 'sc' or 'organoid'"
        )

    # A fresh estimator per dataset so every embedding is fit independently
    umap_fit = umap.UMAP(
        random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
    )

    # Select one plate at a time to process
    cp_df = cp_dfs[plate]

    # Separate feature versus metadata
    cp_features = infer_cp_features(cp_df, compartments=compartments)
    meta_features = infer_cp_features(cp_df, metadata=True, compartments=compartments)
    filtered_meta_features = [
        feature for feature in meta_features if feature in desired_columns
    ]

    # Drop any feature column containing missing values (UMAP cannot fit on NaN)
    cp_df = feature_select(
        cp_df, features=cp_features, operation="drop_na_columns", na_cutoff=0
    )
    # Keep the feature list in sync with the filtered frame; otherwise selecting
    # a dropped column below would raise a KeyError.
    cp_features = [feature for feature in cp_features if feature in cp_df.columns]

    embeddings = pd.DataFrame(
        umap_fit.fit_transform(cp_df.loc[:, cp_features]),
        columns=[f"UMAP{x}" for x in range(umap_n_components)],
    )
    print(embeddings.shape)

    # Reset the metadata index so rows align positionally with the embeddings,
    # which carry a default 0..n-1 index.
    cp_umap_with_metadata_df = pd.concat(
        [cp_df.loc[:, filtered_meta_features].reset_index(drop=True), embeddings],
        axis=1,
    )
    # Shuffle rows with a fixed seed (presumably so row order does not bias
    # downstream plotting -- confirm with the plotting code).
    cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

    output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}.tsv")
    cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")

UMAP embeddings being generated for NF0014_organoid
(152, 2)
UMAP embeddings being generated for NF0014_sc
(2142, 2)


In [6]:
# Preview the table produced for the last dataset handled by the loop above:
# report its dimensions, then display the leading rows
n_preview_rows = 10
print(cp_umap_with_metadata_df.shape)
cp_umap_with_metadata_df.head(n_preview_rows)

(2142, 10)


Unnamed: 0,Metadata_treatment,Metadata_dose,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_ZSlice,UMAP0,UMAP1
1719,DMSO,1,455.845159,1097.101441,NF0014,G9,2,ZS060,10.488344,3.322388
175,Onalespib,1,997.921257,587.684398,NF0014,F2,1,ZS016,9.02552,1.798351
1220,Cabozantinib,1,933.373082,828.089261,NF0014,E7,2,ZS028,14.360832,2.497238
562,DMSO,1,594.749104,876.462366,NF0014,E4,1,ZS048,12.116467,0.906233
1184,Cabozantinib,1,732.368894,395.807737,NF0014,D7,1,ZS046,13.056751,-0.201787
1342,Binimetinib,10,645.430652,1187.03968,NF0014,D8,1,ZS034,13.33567,-0.699606
2057,Selumetinib,1,919.729866,1270.625839,NF0014,D11,2,ZS060,13.62449,3.410051
1693,DMSO,1,1266.556122,539.344388,NF0014,F9,2,ZS046,13.370322,3.341992
391,DMSO,1,1105.579903,677.227935,NF0014,C4,2,ZS032,12.200756,2.486074
1275,Binimetinib,1,851.306344,734.030211,NF0014,G7,1,ZS024,11.024634,4.511296
