# Generate UMAP coordinates for each plate

## Import libraries

In [None]:
import glob
import pathlib
import pandas as pd
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

## Set constants

In [None]:
# Directory that receives the files written by this notebook; it is created up
# front (with any missing parents) and left alone if it already exists
output_dir = pathlib.Path("results")
output_dir.mkdir(exist_ok=True, parents=True)

# UMAP settings: number of embedding dimensions and the seed passed to UMAP
umap_n_components = 2
umap_random_seed = 0

## Create list of paths to feature selected data per plate

In [None]:
# Directory holding the per-plate single-cell profiles
data_dir = pathlib.Path("../../../3.processing_features/data/single_cell_profiles/")

# Glob pattern that matches only the feature selected files
file_suffix = "*sc_feature_selected.parquet"

# Obtain file paths (as strings) for all feature selected plates.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them to
# make the order in which plates are processed identical on every run.
fs_files = sorted(glob.glob(f"{data_dir}/{file_suffix}"))
fs_files

['../../../3.processing_features/data/single_cell_profiles/Plate_5_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_1_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_4_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_2_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_6_sc_feature_selected.parquet']

In [None]:
# Load feature data into a dictionary keyed on the parquet file name
# (e.g. "Plate_5_sc_feature_selected.parquet"). pathlib extracts the file name
# using the platform's own separator rules instead of assuming "/".
cp_dfs = {pathlib.Path(path).name: pd.read_parquet(path) for path in fs_files}

# Print out the keys and the (rows, columns) shape of each dataset
print(cp_dfs.keys())
[df.shape for df in cp_dfs.values()]

dict_keys(['Plate_5_sc_feature_selected.parquet', 'Plate_3_sc_feature_selected.parquet', 'Plate_1_sc_feature_selected.parquet', 'Plate_4_sc_feature_selected.parquet', 'Plate_3_prime_sc_feature_selected.parquet', 'Plate_2_sc_feature_selected.parquet', 'Plate_6_sc_feature_selected.parquet'])


[(5793, 1174),
 (11286, 1171),
 (241, 849),
 (7308, 1163),
 (5506, 1146),
 (1714, 856),
 (7383, 1152)]

## Generate UMAP coordinates for each plate

**Note:** Only metadata columns that are common across plates are included in the final data frame.

In [None]:
# Metadata columns to carry into each output file; a plate contributes only the
# ones it actually contains
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Embed every plate independently and write one TSV per plate
for plate, plate_df in cp_dfs.items():
    plate_name = pathlib.Path(plate).stem
    plate_id = plate_name.replace("_sc_feature_selected", "")
    print("UMAP embeddings being generated for", plate_name)

    # Make sure NA columns have been removed (na_cutoff=0)
    cp_df = feature_select(plate_df, operation="drop_na_columns", na_cutoff=0)

    if plate_id == "Plate_3_prime":
        # Plate_3_prime must carry its own name in the Metadata_Plate column
        cp_df["Metadata_Plate"] = "Plate_3_prime"
    elif plate_id == "Plate_5":
        # Rows with genotype HET are excluded for Plate_5
        cp_df = cp_df[cp_df["Metadata_genotype"] != "HET"]

    # Split the columns into features and the metadata columns to keep
    cp_features = infer_cp_features(cp_df)
    meta_features = infer_cp_features(cp_df, metadata=True)
    filtered_meta_features = [
        column for column in meta_features if column in desired_columns
    ]

    # Use a fresh UMAP instance for every plate, then fit and embed its features
    umap_fit = umap.UMAP(n_components=umap_n_components, random_state=umap_random_seed)
    coordinates = umap_fit.fit_transform(cp_df.loc[:, cp_features])
    embeddings = pd.DataFrame(
        coordinates,
        columns=[f"UMAP{x}" for x in range(umap_n_components)],
    )
    print(embeddings.shape)

    # Place metadata and coordinates side by side; the metadata index is reset so
    # that it matches the fresh 0..n-1 index of the embeddings
    metadata_df = cp_df.loc[:, filtered_meta_features].reset_index(drop=True)
    cp_umap_with_metadata_df = pd.concat([metadata_df, embeddings], axis=1)

    # Shuffle the rows with a fixed seed so that the plotting order of the points
    # does not follow the original row order
    cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

    # Write the embedding for this plate as a tab-separated file
    output_umap_file = output_dir / f"UMAP_{plate_name}.tsv"
    cp_umap_with_metadata_df.to_csv(output_umap_file, sep="\t", index=False)

UMAP embeddings being generated for Plate_5_sc_feature_selected


  warn(


(5793, 2)
UMAP embeddings being generated for Plate_3_sc_feature_selected


  warn(


(11286, 2)
UMAP embeddings being generated for Plate_1_sc_feature_selected


  warn(


(241, 2)
UMAP embeddings being generated for Plate_4_sc_feature_selected


  warn(


(7308, 2)
UMAP embeddings being generated for Plate_3_prime_sc_feature_selected


  warn(


(5506, 2)
UMAP embeddings being generated for Plate_2_sc_feature_selected


  warn(


(1714, 2)
UMAP embeddings being generated for Plate_6_sc_feature_selected


  warn(


(7383, 2)


In [None]:
# Preview the embedding table left over from the last plate processed in the loop
example_shape = cp_umap_with_metadata_df.shape
print(example_shape)
cp_umap_with_metadata_df.head(n=5)

(7383, 7)


Unnamed: 0,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_genotype,Metadata_Plate,UMAP0,UMAP1
6307,G2,10,114,Null,Plate_6,13.606967,3.857197
5036,F5,27,189,HET,Plate_6,13.040964,4.8061
1995,D3,2,55,Null,Plate_6,14.647306,-1.431455
4156,E11,20,232,WT,Plate_6,11.372808,3.504234
6328,G2,10,114,Null,Plate_6,12.755859,3.256218


## Create UMAP embeddings with the plates used to train the model combined

In [None]:
# Plates to combine: plates 5, 3, and 3 prime only
selected_plates = ["Plate_5", "Plate_3", "Plate_3_prime"]

# Read in the selected plates, in the order they appear in fs_files
selected_dfs = []
for file_path in fs_files:
    plate_name = pathlib.Path(file_path).stem.replace("_sc_feature_selected", "")

    # Skip plates that are not part of the selection
    if plate_name not in selected_plates:
        continue

    df = pd.read_parquet(file_path)

    # Apply the same correction as the per-plate embeddings: Plate_3_prime must
    # carry its own name in Metadata_Plate so it stays distinguishable once the
    # plates are concatenated
    if plate_name == "Plate_3_prime":
        df["Metadata_Plate"] = "Plate_3_prime"

    # NOTE(review): the per-plate embeddings drop genotype HET rows from Plate_5;
    # they are kept here. Confirm that this is intended for the combined data.
    selected_dfs.append(df)

In [None]:
# Fail early with a clear message instead of the opaque TypeError that
# set.intersection() raises when it is called without any sets
if not selected_dfs:
    raise ValueError("selected_dfs is empty; no selected plate files were found")

# Get the column names of all DataFrames in selected_dfs
column_sets = [set(df.columns) for df in selected_dfs]

# Find the column names shared by all DataFrames
shared_columns = set.intersection(*column_sets)

# Keep the column order of the first DataFrame. Converting the set directly to a
# list gives an order that depends on string hashing and can change between
# interpreter runs, which would make the saved column order irreproducible.
common_columns = [col for col in selected_dfs[0].columns if col in shared_columns]

# Exclude columns that start with "Metadata" to count only the features
feature_columns = [col for col in common_columns if not col.startswith("Metadata")]

# Display the number of shared features
len(feature_columns)

907

### Save the features from the selected plates together as a parquet file

In [None]:
# Restrict every selected plate to the shared columns so the frames line up
selected_dfs_filtered = [
    plate_df.loc[:, common_columns] for plate_df in selected_dfs
]

# Stack the plates row-wise under a fresh 0..n-1 index
concatenated_df = pd.concat(selected_dfs_filtered, axis=0, ignore_index=True)

# Write the combined data to the output directory as parquet
concatenated_file_name = "concatenated_norm_fs_plates_5_3_3prime.parquet"
output_concatenated_file = output_dir / concatenated_file_name
concatenated_df.to_parquet(output_concatenated_file, index=False)

# Report the combined shape and show the first rows
print(concatenated_df.shape)
concatenated_df.head()

(22585, 924)


Unnamed: 0,Cytoplasm_RadialDistribution_ZernikeMagnitude_CY5_8_6,Cytoplasm_RadialDistribution_ZernikePhase_DAPI_7_7,Cells_Texture_Correlation_DAPI_3_02_256,Cytoplasm_RadialDistribution_ZernikeMagnitude_RFP_9_1,Nuclei_RadialDistribution_ZernikeMagnitude_DAPI_4_4,Nuclei_RadialDistribution_ZernikeMagnitude_DAPI_9_9,Cytoplasm_RadialDistribution_ZernikePhase_CY5_6_4,Cytoplasm_RadialDistribution_ZernikeMagnitude_RFP_4_0,Cells_Neighbors_FirstClosestDistance_Adjacent,Cytoplasm_RadialDistribution_ZernikeMagnitude_CY5_4_0,...,Cells_RadialDistribution_ZernikePhase_GFP_8_4,Nuclei_Correlation_Overlap_CY5_DAPI,Cytoplasm_RadialDistribution_ZernikePhase_DAPI_6_6,Nuclei_RadialDistribution_ZernikeMagnitude_CY5_8_6,Cells_RadialDistribution_ZernikeMagnitude_RFP_6_0,Nuclei_Intensity_MinIntensityEdge_GFP,Cytoplasm_RadialDistribution_ZernikePhase_CY5_3_3,Cytoplasm_RadialDistribution_ZernikePhase_DAPI_8_2,Nuclei_RadialDistribution_ZernikeMagnitude_RFP_9_1,Cytoplasm_AreaShape_MinorAxisLength
0,0.265286,1.677765,0.076674,0.267877,-0.952152,-0.021434,-0.0286,-0.730467,-0.682761,-0.676159,...,-1.460473,0.367525,-0.034545,0.527203,-1.008854,1.977849,1.094638,1.301773,-0.033985,0.665844
1,-0.594315,-0.35402,0.288133,-0.42473,-0.696601,-0.776204,-1.484421,-0.042732,3.063331,0.538387,...,0.255279,1.491258,-1.552236,0.134838,0.568971,1.074115,0.399387,0.733389,0.293142,-0.187398
2,-0.754955,-0.339014,0.352881,-0.87669,-0.435444,0.129837,0.027724,0.050477,-0.490331,-0.064076,...,-1.451149,-0.651553,-0.271857,0.385662,-0.353175,0.174312,-1.00624,-1.515276,-0.655531,0.709797
3,0.605487,-1.537686,-0.224682,0.478217,0.821912,-0.348275,0.294595,0.884025,-0.490331,0.30738,...,-1.54878,0.509328,0.924358,-0.433322,1.578459,0.464376,-0.9311,1.332815,1.479997,-0.872115
4,-0.785451,-0.061756,0.473039,-1.05622,-0.078474,-0.479195,0.472753,-0.619875,3.180456,-0.939778,...,1.729036,1.298129,-0.346023,0.02105,-0.824535,0.547177,1.706737,0.805289,-0.760059,1.363364


In [None]:
# Metadata columns to carry into the output file (only those present are kept)
desired_columns = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_number_of_singlecells",
    "Metadata_genotype",
]

# Make sure to reinitialize UMAP instance
umap_fit = umap.UMAP(random_state=umap_random_seed, n_components=umap_n_components)

# Process concatenated_df to separate features and metadata
cp_features = infer_cp_features(concatenated_df)
meta_features = infer_cp_features(concatenated_df, metadata=True)
filtered_meta_features = [
    feature for feature in meta_features if feature in desired_columns
]

# Fit UMAP and convert to pandas DataFrame
embeddings = pd.DataFrame(
    umap_fit.fit_transform(concatenated_df.loc[:, cp_features]),
    columns=[f"UMAP{x}" for x in range(umap_n_components)],
)
print(embeddings.shape)

# Combine with metadata. The metadata index is reset, as in the per-plate
# embeddings, so the column-wise concat pairs rows by position with the fresh
# 0..n-1 index of the embeddings regardless of concatenated_df's index.
cp_umap_with_metadata_df = pd.concat(
    [
        concatenated_df.loc[:, filtered_meta_features].reset_index(drop=True),
        embeddings,
    ],
    axis=1,
)

# Shuffle the rows with a fixed seed so that the plotting order of the points
# does not follow the plate order
cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

# Generate output file and save (plain string: the name has no placeholders)
output_umap_file = pathlib.Path(
    output_dir, "UMAP_concat_model_plates_sc_feature_selected.tsv"
)
cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")

  warn(


(22585, 2)
