# Generate UMAP coordinates for each plate

## Import libraries

In [1]:
import glob
import pathlib
import pandas as pd
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

## Set constants

In [2]:
# Set constants
umap_random_seed = 0
umap_n_components = 2

output_dir = pathlib.Path("results")
output_dir.mkdir(parents=True, exist_ok=True)

## Create list of paths to feature selected data per plate

In [3]:
# Set input paths
data_dir = pathlib.Path("../../../3.processing_features/data/single_cell_profiles/")

# Select only the feature selected files
file_suffix = "*sc_feature_selected.parquet"

# Obtain file paths for all feature selected plates
fs_files = glob.glob(f"{data_dir}/{file_suffix}")
fs_files

['../../../3.processing_features/data/single_cell_profiles/Plate_5_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_1_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_4_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_2_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_6_sc_feature_selected.parquet']

### Set dictionary for all plates to be processed independently

In [4]:
# Load feature data into a dictionary, keyed on plate name
cp_dfs = {x.split("/")[-1]: pd.read_parquet(x) for x in fs_files}

# Print out useful information about each dataset
print(cp_dfs.keys())
[cp_dfs[x].shape for x in cp_dfs]

dict_keys(['Plate_5_sc_feature_selected.parquet', 'Plate_3_sc_feature_selected.parquet', 'Plate_1_sc_feature_selected.parquet', 'Plate_4_sc_feature_selected.parquet', 'Plate_3_prime_sc_feature_selected.parquet', 'Plate_2_sc_feature_selected.parquet', 'Plate_6_sc_feature_selected.parquet'])


[(5793, 1174),
 (11286, 1171),
 (241, 849),
 (7308, 1163),
 (5506, 1146),
 (1714, 856),
 (7383, 1152)]

### Create list of specific files for concat UMAP for plates used in modelling

In [5]:
# Select file paths for plates 5, 3, and 3 prime only
selected_plates = ["Plate_5", "Plate_3", "Plate_3_prime"]

# Filter and concatenate the selected plates
selected_dfs = []
for file_path in fs_files:
    plate_name = pathlib.Path(file_path).stem.replace("_sc_feature_selected", "")

    # Only read in selected plates
    if plate_name in selected_plates:
        df = pd.read_parquet(file_path)

        selected_dfs.append(df)

### Get specific features used in the model

In [6]:
# Get the column names of all DataFrames in selected_dfs
column_sets = [set(df.columns) for df in selected_dfs]

# Find the common column names across all DataFrames which are used in the model
common_columns = list(set.intersection(*column_sets))

# Exclude columns that start with "Metadata" to print the number of features
model_columns = [col for col in common_columns if not col.startswith("Metadata")]

# Print length of only features
len(model_columns)

907

## Generate UMAP coordinates for each plate

**Note:** Only metadata that is common between plates are included in final data frame.

In [7]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Fit UMAP features per dataset and save
for plate in cp_dfs:
    plate_name = pathlib.Path(plate).stem
    print("UMAP embeddings being generated for", plate_name)

    # Make sure to reinitialize UMAP instance per plate
    umap_fit = umap.UMAP(
        random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
    )

    # Make sure NA columns have been removed
    cp_df = cp_dfs[plate]
    cp_df = feature_select(cp_df, operation="drop_na_columns", na_cutoff=0)

    # Make sure that the Plate_3_prime has correct name in Metadata_Plate column
    if plate_name.replace("_sc_feature_selected", "") == "Plate_3_prime":
        cp_df["Metadata_Plate"] = "Plate_3_prime"

    # Remove rows with genotype HET for Plate_6
    if plate_name.replace("_sc_feature_selected", "") == "Plate_6":
        cp_df = cp_df[cp_df["Metadata_genotype"] != "HET"]

    # Process cp_df to separate features and metadata
    cp_features = infer_cp_features(cp_df)
    meta_features = infer_cp_features(cp_df, metadata=True)
    filtered_meta_features = [
        feature for feature in meta_features if feature in desired_columns
    ]

    # Fit UMAP and convert to pandas DataFrame
    embeddings = pd.DataFrame(
        umap_fit.fit_transform(cp_df.loc[:, cp_features]),
        columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
    )
    print(embeddings.shape)

    # Combine with metadata
    cp_umap_with_metadata_df = pd.concat(
        [cp_df.loc[:, filtered_meta_features].reset_index(drop=True), embeddings],
        axis=1,
    )

    # randomize the rows of the dataframe to plot the order of the data evenly
    cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

    # Generate output file and save
    output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}.tsv")
    cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")

UMAP embeddings being generated for Plate_5_sc_feature_selected




(5793, 2)
UMAP embeddings being generated for Plate_3_sc_feature_selected




(11286, 2)
UMAP embeddings being generated for Plate_1_sc_feature_selected




(241, 2)
UMAP embeddings being generated for Plate_4_sc_feature_selected




(7308, 2)
UMAP embeddings being generated for Plate_3_prime_sc_feature_selected




(5506, 2)
UMAP embeddings being generated for Plate_2_sc_feature_selected




(1714, 2)
UMAP embeddings being generated for Plate_6_sc_feature_selected




(4698, 2)


In [8]:
# Print an example output file
print(cp_umap_with_metadata_df.shape)
cp_umap_with_metadata_df.head()

(4698, 7)


Unnamed: 0,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_genotype,Metadata_Plate,UMAP0,UMAP1
1422,D8,16,62,Null,Plate_6,7.546376,-1.156975
825,B9,21,135,WT,Plate_6,8.928883,-0.217073
971,B11,13,122,WT,Plate_6,8.568033,-0.431594
2833,E9,11,251,WT,Plate_6,1.046101,-0.248198
2815,E8,19,215,WT,Plate_6,1.78118,1.303337


## Create UMAP embeddings with the plates used to the train the model combined

### Save all plate data features together as parquet file

In [9]:
# Filter each DataFrame in selected_dfs to include only common columns
selected_dfs_filtered = [df.loc[:, common_columns] for df in selected_dfs]

# Concatenate the filtered dataframes along the rows
concatenated_df = pd.concat(selected_dfs_filtered, ignore_index=True)

# Ensure column consistency in the concatenated dataframe
concatenated_df = concatenated_df[sorted(concatenated_df.columns)]

# Save the concatenated dataframe to a file
output_concatenated_file = pathlib.Path(
    output_dir, "concatenated_norm_fs_plates_5_3_3prime.parquet"
)
concatenated_df.to_parquet(output_concatenated_file, index=False)

print(concatenated_df.shape)
concatenated_df.head()

(22585, 924)


Unnamed: 0,Cells_AreaShape_FormFactor,Cells_AreaShape_MedianRadius,Cells_AreaShape_Solidity,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_3_1,Cells_AreaShape_Zernike_4_0,Cells_AreaShape_Zernike_4_2,Cells_AreaShape_Zernike_5_1,Cells_AreaShape_Zernike_5_3,...,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_03_256
0,-0.537061,-0.114043,-0.452353,0.177763,-0.507576,0.7448,-0.593624,0.407819,0.897751,0.053598,...,0.778703,0.773044,0.446068,0.551483,-0.60931,-0.547578,-0.674383,-1.069853,-0.904776,-0.55027
1,0.13132,0.968984,0.784405,-1.050301,0.283189,-0.532453,1.5988,0.796724,-0.572857,0.176497,...,0.688567,-0.23481,-0.442285,0.046892,-0.574767,0.301496,-1.13317,-1.194594,-0.828107,-0.702868
2,-0.143029,0.443322,-0.759608,2.149026,0.136533,1.59427,-0.086187,-1.669112,-1.070604,-0.447571,...,-0.210697,-0.067782,-0.208035,-0.016246,-0.455077,-0.404736,-0.127992,-0.504128,-0.202547,0.325203
3,-0.130645,-0.207787,0.364933,-1.412007,-0.345663,-1.3586,1.695586,0.410564,-1.221655,-1.568193,...,0.494065,0.551486,-0.129755,-0.119069,0.2363,0.169281,0.48135,-0.392029,-0.4712,-0.169241
4,0.752448,2.595117,1.521807,1.892454,0.962098,1.138753,-0.471784,-0.259837,0.11104,-1.243963,...,-0.985316,-1.63619,-1.861623,-1.038659,-1.321303,0.64523,0.49264,0.033404,0.850012,-0.124964


In [10]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Make sure to reinitialize UMAP instance
umap_fit = umap.UMAP(
    random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
)

# Process cp_df to separate features and metadata
cp_features = infer_cp_features(concatenated_df)
meta_features = infer_cp_features(concatenated_df, metadata=True)
filtered_meta_features = [
    feature for feature in meta_features if feature in desired_columns
]

# Fit UMAP and convert to pandas DataFrame
embeddings = pd.DataFrame(
    umap_fit.fit_transform(concatenated_df.loc[:, cp_features]),
    columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
)
print(embeddings.shape)

# Combine with metadata
cp_umap_with_metadata_df = pd.concat(
    [concatenated_df.loc[:, filtered_meta_features], embeddings], axis=1
)

# randomize the rows of the dataframe to plot the order of the data evenly
cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

# Generate output file and save
output_umap_file = pathlib.Path(
    output_dir, "UMAP_concat_model_plates_sc_feature_selected.tsv"
)
cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")



(22585, 2)


## Generate Plate 6 UMAP embeddings using the model features specifically

In [11]:
# Load in Plate 6 normalized data to then filter down the features with the model_columns
plate_6_norm_df = pd.read_parquet(
    pathlib.Path(data_dir, "Plate_6_sc_normalized.parquet")
)

# Drop rows where Metadata_genotype is HET
plate_6_norm_df = plate_6_norm_df[plate_6_norm_df["Metadata_genotype"] != "HET"]

# Filter the plate_6 data for the columns in model_columns
plate_6_filtered_features = plate_6_norm_df[model_columns]

# Add the metadata columns back
metadata_columns = [
    col for col in plate_6_norm_df.columns if col.startswith("Metadata_")
]
plate_6_filtered_df = pd.concat(
    [plate_6_norm_df[metadata_columns], plate_6_filtered_features], axis=1
)

# Drop rows with NaN values in the feature columns
plate_6_filtered_df = plate_6_filtered_df.dropna(
    subset=[
        col for col in plate_6_filtered_df.columns if not col.startswith("Metadata_")
    ]
)

# Change Metadata_Plate for all rows to Plate_6_filtered to avoid issues downstream loading in plates
plate_6_filtered_df["Metadata_Plate"] = "Plate_6_filtered"

# Confirm that index is reset to avoid any NaN issues
plate_6_filtered_df = plate_6_filtered_df.reset_index(drop=True)

print(len(plate_6_filtered_features.columns))

# Display the filtered dataframe
print(plate_6_filtered_df.shape)
plate_6_filtered_df.head()

907
(4695, 930)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_Institution,Metadata_seed_density,Metadata_ImageNumber,...,Nuclei_RadialDistribution_ZernikeMagnitude_DAPI_2_2,Cytoplasm_RadialDistribution_ZernikeMagnitude_CY5_4_2,Cells_AreaShape_Zernike_6_0,Cytoplasm_Intensity_MADIntensity_DAPI,Nuclei_RadialDistribution_ZernikePhase_RFP_1_1,Cytoplasm_RadialDistribution_ZernikePhase_GFP_9_5,Cytoplasm_RadialDistribution_ZernikeMagnitude_DAPI_1_1,Nuclei_AreaShape_Zernike_8_6,Nuclei_RadialDistribution_RadialCV_GFP_1of4,Cytoplasm_Correlation_Correlation_DAPI_RFP
0,B,2,B2,22,129,NF1,WT,iNFixion,1000,80,...,1.475054,1.544858,1.959758,2.25309,-0.224372,0.681641,-0.512263,0.491676,-0.335859,1.29388
1,B,2,B2,2,129,NF1,WT,iNFixion,1000,88,...,1.504317,1.419296,-0.191983,-0.112935,0.5429,0.769917,0.197861,-0.736728,-0.617016,0.859221
2,B,2,B2,31,129,NF1,WT,iNFixion,1000,90,...,-0.302108,-1.02196,-1.319145,-0.455556,-0.600631,-1.186243,-0.422875,-1.793821,0.296374,1.996709
3,B,2,B2,5,129,NF1,WT,iNFixion,1000,98,...,1.23025,0.207371,1.55779,0.029285,0.96091,-0.888825,0.162054,1.039178,-0.031683,1.14651
4,B,2,B2,9,129,NF1,WT,iNFixion,1000,102,...,-1.734354,2.773808,1.719666,4.18599,1.044073,-0.318797,4.494823,1.593645,0.315793,-2.327942


In [12]:
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Make sure to reinitialize UMAP instance
umap_fit = umap.UMAP(
    random_state=umap_random_seed, n_components=umap_n_components, n_jobs=1
)

# Process cp_df to separate features and metadata
cp_features = infer_cp_features(plate_6_filtered_df)
meta_features = infer_cp_features(plate_6_filtered_df, metadata=True)
filtered_meta_features = [
    feature for feature in meta_features if feature in desired_columns
]

# Fit UMAP and convert to pandas DataFrame
embeddings = pd.DataFrame(
    umap_fit.fit_transform(plate_6_filtered_df.loc[:, cp_features]),
    columns=[f"UMAP{x}" for x in range(0, umap_n_components)],
)
print(embeddings.shape)

# Combine with metadata
cp_umap_with_metadata_df = pd.concat(
    [plate_6_filtered_df.loc[:, filtered_meta_features], embeddings], axis=1
)

# randomize the rows of the dataframe to plot the order of the data evenly
cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

# Generate output file and save
output_umap_file = pathlib.Path(output_dir, "UMAP_Plate_6_sc_only_model_features.tsv")
cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")



(4695, 2)
