# Generate UMAP coordinates for each plate

## Import libraries

In [1]:
import glob
import pathlib
import pandas as pd
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features


  from .autonotebook import tqdm as notebook_tqdm


## Set constants

In [2]:
# Set constants
umap_random_seed = 0
umap_n_components = 2

output_dir = pathlib.Path("results")
output_dir.mkdir(parents=True, exist_ok=True)


## Create list of paths to feature selected data per plate

In [3]:
# Set input paths
data_dir = pathlib.Path("../4.preprocess_features/data/single_cell_profiles")

# Select only the feature selected files
file_suffix = "*sc_feature_selected.parquet"

# Obtain file paths for all feature selected plates
fs_files = glob.glob(f"{data_dir}/{file_suffix}")
fs_files


['../4.preprocess_features/data/single_cell_profiles/slide1_sc_feature_selected.parquet',
 '../4.preprocess_features/data/single_cell_profiles/slide3_sc_feature_selected.parquet',
 '../4.preprocess_features/data/single_cell_profiles/slide4_sc_feature_selected.parquet',
 '../4.preprocess_features/data/single_cell_profiles/slide2_sc_feature_selected.parquet']

In [4]:
# Load feature data into a dictionary, keyed on plate name
cp_dfs = {x.split("/")[-1]: pd.read_parquet(x) for x in fs_files}

# Print out useful information about each dataset
print(cp_dfs.keys())
[cp_dfs[x].shape for x in cp_dfs]


dict_keys(['slide1_sc_feature_selected.parquet', 'slide3_sc_feature_selected.parquet', 'slide4_sc_feature_selected.parquet', 'slide2_sc_feature_selected.parquet'])


[(71962, 285), (58106, 289), (65351, 287), (53637, 289)]

## Generate UMAP coordinates for each plate

**Note:** Only metadata that is common between plates are included in final data frame.

In [5]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_CellLine",
    "Metadata_Condition",
    "Metadata_Nuclei_Site_Count",
]

# Fit UMAP features per dataset and save
for plate in cp_dfs:
    plate_name = pathlib.Path(plate).stem
    print("UMAP embeddings being generated for", plate_name)

    # Make sure to reinitialize UMAP instance per plate
    umap_fit = umap.UMAP(random_state=umap_random_seed, n_components=umap_n_components)

    # Make sure NA columns have been removed
    cp_df = cp_dfs[plate]
    cp_df = feature_select(cp_df, operation="drop_na_columns", na_cutoff=0)

    # Process cp_df to separate features and metadata
    cp_features = infer_cp_features(cp_df)
    meta_features = infer_cp_features(cp_df, metadata=True)
    filtered_meta_features = [
        feature for feature in meta_features if feature in desired_columns
    ]

    # Fit UMAP and convert to pandas DataFrame
    embeddings = pd.DataFrame(
        umap_fit.fit_transform(cp_df.loc[:, cp_features]),
        columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
    )
    print(embeddings.shape)

    # Combine with metadata
    cp_umap_with_metadata_df = pd.concat(
        [cp_df.loc[:, filtered_meta_features].reset_index(drop=True), embeddings],
        axis=1,
    )

    # randomize the rows of the dataframe to plot the order of the data evenly
    cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

    # Generate output file and save
    output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}.tsv")
    cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")

UMAP embeddings being generated for slide1_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(71962, 2)
UMAP embeddings being generated for slide3_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(58106, 2)
UMAP embeddings being generated for slide4_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(65351, 2)
UMAP embeddings being generated for slide2_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(53637, 2)


In [6]:
# Print an example output file
print(cp_umap_with_metadata_df.shape)
cp_umap_with_metadata_df.head(10)

(53637, 8)


Unnamed: 0,Metadata_CellLine,Metadata_Condition,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Nuclei_Site_Count,UMAP0,UMAP1
47160,293T,untreated,slide2,B4,M57,130,-0.797395,11.367114
37058,293T,untreated,slide2,B4,M3,982,-2.343521,4.681797
28510,293T,untreated,slide2,B4,M27,630,-1.208591,7.816617
47632,293T,untreated,slide2,B4,M59,169,-0.261996,7.061363
11492,786O,TMEM259 kd5,slide2,B3,M29,133,2.90888,6.136668
28451,293T,untreated,slide2,B4,M27,630,-0.631544,3.043315
46798,293T,untreated,slide2,B4,M56,939,-1.742979,6.620913
20686,293T,untreated,slide2,B4,M16,1077,-2.819395,5.980897
15378,293T,untreated,slide2,B4,M10,775,-1.846369,6.406137
37421,293T,untreated,slide2,B4,M3,982,0.255176,2.220255


In [7]:
# Sort the DataFrame by Metadata_Nuclei_Site_Count in ascending order
sorted_df = cp_umap_with_metadata_df.sort_values(by='Metadata_Nuclei_Site_Count')

# Print the shape of the sorted DataFrame
print(sorted_df.shape)

# Display the first 10 rows of the sorted DataFrame
sorted_df.head(10)


(53637, 8)


Unnamed: 0,Metadata_CellLine,Metadata_Condition,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Nuclei_Site_Count,UMAP0,UMAP1
1909,786O,NTC,slide2,A1,M60,12,5.189864,4.746127
1916,786O,NTC,slide2,A1,M60,12,6.257066,5.037874
1910,786O,NTC,slide2,A1,M60,12,6.180382,5.005356
1913,786O,NTC,slide2,A1,M60,12,7.088726,5.255714
1907,786O,NTC,slide2,A1,M60,12,4.083712,5.311757
1905,786O,NTC,slide2,A1,M60,12,6.811673,4.748197
1911,786O,NTC,slide2,A1,M60,12,5.82826,4.762316
1914,786O,NTC,slide2,A1,M60,12,6.953853,4.63711
1906,786O,NTC,slide2,A1,M60,12,4.312699,5.310297
1915,786O,NTC,slide2,A1,M60,12,7.18483,4.751488


In [8]:
# Filter rows where UMAP1 column has values above 10
filtered_df = cp_umap_with_metadata_df[cp_umap_with_metadata_df['UMAP1'] > 10]

# Print the shape of the filtered DataFrame
print(filtered_df.shape)

# Display the first 10 rows of the filtered DataFrame
filtered_df.head(10)


(2589, 8)


Unnamed: 0,Metadata_CellLine,Metadata_Condition,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Nuclei_Site_Count,UMAP0,UMAP1
47160,293T,untreated,slide2,B4,M57,130,-0.797395,11.367114
34982,293T,untreated,slide2,B4,M36,349,1.012857,11.672754
6491,786O,SARNP kd3,slide2,B2,M22,194,2.542357,12.203351
19508,293T,untreated,slide2,B4,M15,1038,2.137926,12.044132
21563,293T,untreated,slide2,B4,M19,597,2.323619,12.531433
42950,293T,untreated,slide2,B4,M49,492,2.458835,11.711025
16102,293T,untreated,slide2,B4,M11,809,2.148402,12.422447
31781,293T,untreated,slide2,B4,M30,750,2.311525,12.533288
43838,293T,untreated,slide2,B4,M4,882,0.716558,11.352507
6376,786O,SARNP kd3,slide2,B2,M21,175,3.05252,12.205164
