## Extract UMAP embeddings for CFReT CP Features

In [1]:
import glob
import pathlib
import pandas as pd
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Set constants (previously set prior, normally use 0 but the change in coordinates will impact already generated single-cell crops)
umap_random_seed = 1234
umap_n_components = 2

output_dir = pathlib.Path("results")
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Set input paths
data_dir = pathlib.Path("..", "..", "..", "3.process_cfret_features", "data", "single_cell_profiles")

# Select only the feature selected files
file_suffix = "*sc_feature_selected.parquet"

# Obtain file paths for all feature selected plates
fs_files = glob.glob(f"{data_dir}/{file_suffix}")
fs_files

['../../../3.process_cfret_features/data/single_cell_profiles/localhost220513100001_KK22-05-198_FactinAdjusted_sc_feature_selected.parquet',
 '../../../3.process_cfret_features/data/single_cell_profiles/localhost220512140003_KK22-05-198_sc_feature_selected.parquet',
 '../../../3.process_cfret_features/data/single_cell_profiles/localhost230405150001_sc_feature_selected.parquet',
 '../../../3.process_cfret_features/data/single_cell_profiles/localhost231120090001_sc_feature_selected.parquet']

In [4]:
# Load feature data into a dictionary, keyed on plate name
cp_dfs = {x.split("/")[-1]: pd.read_parquet(x) for x in fs_files}

# Print out useful information about each dataset
print(cp_dfs.keys())
[cp_dfs[x].shape for x in cp_dfs]

dict_keys(['localhost220513100001_KK22-05-198_FactinAdjusted_sc_feature_selected.parquet', 'localhost220512140003_KK22-05-198_sc_feature_selected.parquet', 'localhost230405150001_sc_feature_selected.parquet', 'localhost231120090001_sc_feature_selected.parquet'])


[(17536, 642), (42288, 725), (26471, 697), (21370, 711)]

In [5]:
# Fit UMAP features per dataset and save
for plate in cp_dfs:
    plate_name = pathlib.Path(plate).stem
    # Make sure to reinitialize UMAP instance per plate
    umap_fit = umap.UMAP(
        random_state=umap_random_seed,
        n_components=umap_n_components
    )
    
    # Remove NA columns
    cp_df = cp_dfs[plate]
    cp_df = feature_select(
        cp_df,
        operation="drop_na_columns",
        na_cutoff=0
    )
    
    # Process cp_df to separate features and metadata
    cp_features = infer_cp_features(cp_df)
    meta_features = infer_cp_features(cp_df, metadata=True)
    
    # Fit UMAP and convert to pandas DataFrame
    embeddings = pd.DataFrame(
        umap_fit.fit_transform(cp_df.loc[:, cp_features]),
        columns=[f"UMAP{x}" for x in range(0, umap_n_components)]
    )
    print(embeddings.shape)
    
    # Combine with metadata
    cp_umap_with_metadata_df = pd.concat([
        cp_df.loc[:, meta_features],
        embeddings
    ], axis=1)
    
    # Generate output file, drop unnamed column, and save 
    output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}.tsv.gz")
    cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")

(17536, 2)
(42288, 2)
(26471, 2)
(21370, 2)


In [6]:
# Print an example output file
cp_umap_with_metadata_df.head()

Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Nuclei_Number_Object_Number,Metadata_Site,UMAP0,UMAP1
0,B,2,2,Healthy,,,520.849209,277.58342,20,1.0,,B02,6.0,6.0,8.0,8.0,f00,2.027209,0.060758
1,B,2,2,Healthy,,,313.66111,374.449986,20,1.0,,B02,7.0,7.0,9.0,9.0,f00,-0.215,-2.519607
2,B,2,2,Healthy,,,709.496383,379.652932,20,1.0,,B02,8.0,8.0,10.0,10.0,f00,-1.686028,-0.96724
3,B,2,2,Healthy,,,869.851378,511.154606,20,1.0,,B02,9.0,9.0,12.0,12.0,f00,1.831661,-0.902232
4,B,2,2,Healthy,,,588.967372,503.936707,20,1.0,,B02,10.0,10.0,13.0,13.0,f00,-0.611805,-0.944782
