# Generate UMAP coordinates for each plate

## Import libraries

In [1]:
import glob
import pathlib
import pandas as pd
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features


## Set constants

In [2]:
# Set constants
umap_random_seed = 0
umap_n_components = 2

output_dir = pathlib.Path("results")
output_dir.mkdir(parents=True, exist_ok=True)


## Create list of paths to feature selected data per plate

In [3]:
# Set input paths
data_dir = pathlib.Path("../../../3.processing_features/data/single_cell_profiles/")

# Select only the feature selected files
file_suffix = "*sc_feature_selected.parquet"

# Obtain file paths for all feature selected plates
fs_files = glob.glob(f"{data_dir}/{file_suffix}")
fs_files


['../../../3.processing_features/data/single_cell_profiles/Plate_1_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_4_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_5_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_2_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_sc_feature_selected.parquet']

In [4]:
# Load feature data into a dictionary, keyed on plate name
cp_dfs = {x.split("/")[-1]: pd.read_parquet(x) for x in fs_files}

# Print out useful information about each dataset
print(cp_dfs.keys())
[cp_dfs[x].shape for x in cp_dfs]


dict_keys(['Plate_1_sc_feature_selected.parquet', 'Plate_3_prime_sc_feature_selected.parquet', 'Plate_4_sc_feature_selected.parquet', 'Plate_5_sc_feature_selected.parquet', 'Plate_2_sc_feature_selected.parquet', 'Plate_3_sc_feature_selected.parquet'])


[(241, 849),
 (5506, 1146),
 (7308, 1163),
 (5793, 1174),
 (1714, 856),
 (11286, 1171)]

## Generate UMAP coordinates for each plate

**Note:** Only metadata that is common between plates are included in final data frame.

In [5]:
desired_columns = ["Metadata_Plate","Metadata_Well", "Metadata_Site", "Metadata_number_of_singlecells", "Metadata_genotype"]

# Fit UMAP features per dataset and save
for plate in cp_dfs:
    plate_name = pathlib.Path(plate).stem
    print("UMAP embeddings being generated for", plate_name)

     # Make sure to reinitialize UMAP instance per plate
    umap_fit = umap.UMAP(
        random_state=umap_random_seed,
        n_components=umap_n_components
    )

    # Make sure NA columns have been removed
    cp_df = cp_dfs[plate]
    cp_df = feature_select(
        cp_df,
        operation="drop_na_columns",
        na_cutoff=0
    )

    # Make sure that the Plate_3_prime has correct name in Metadata_Plate column
    if plate_name.replace("_sc_feature_selected", "") == "Plate_3_prime":
        cp_df["Metadata_Plate"] = "Plate_3_prime"

    # Remove rows with genotype HET for Plate_5
    if plate_name.replace("_sc_feature_selected", "") == "Plate_5":
        cp_df = cp_df[cp_df["Metadata_genotype"] != "HET"]
    
    # Process cp_df to separate features and metadata
    cp_features = infer_cp_features(cp_df)
    meta_features = infer_cp_features(cp_df, metadata=True)
    filtered_meta_features = [feature for feature in meta_features if feature in desired_columns]
    
    # Fit UMAP and convert to pandas DataFrame
    embeddings = pd.DataFrame(
        umap_fit.fit_transform(cp_df.loc[:, cp_features]),
        columns=[f"UMAP{x}" for x in range(0, umap_n_components)]
    )
    print(embeddings.shape)
    
    # Combine with metadata
    cp_umap_with_metadata_df = pd.concat([
        cp_df.loc[:, filtered_meta_features].reset_index(drop=True),
        embeddings
    ], axis=1)
    
    # randomize the rows of the dataframe to plot the order of the data evenly
    cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

    # Generate output file and save
    output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}.tsv")
    cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")


UMAP embeddings being generated for Plate_1_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(241, 2)
UMAP embeddings being generated for Plate_3_prime_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(5506, 2)
UMAP embeddings being generated for Plate_4_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(7308, 2)
UMAP embeddings being generated for Plate_5_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(5793, 2)
UMAP embeddings being generated for Plate_2_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(1714, 2)
UMAP embeddings being generated for Plate_3_sc_feature_selected


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(11286, 2)


In [6]:
# Print an example output file
print(cp_umap_with_metadata_df.shape)
cp_umap_with_metadata_df.head()


(11286, 7)


Unnamed: 0,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_genotype,Metadata_Plate,UMAP0,UMAP1
4136,D4,8,636,WT,Plate_3,8.818437,4.097239
5457,B10,24,86,Null,Plate_3,9.01261,6.500041
10346,F12,17,647,Null,Plate_3,8.104309,5.912871
3701,D4,1,636,WT,Plate_3,10.596231,3.488836
9596,E12,14,607,Null,Plate_3,9.566523,3.041514


In [7]:
# Set input paths
data_dir = pathlib.Path("../../../3.processing_features/data/single_cell_profiles/")

# Select only the feature selected files
fs_suffix = "*sc_feature_selected.parquet"

# Obtain file paths for all feature selected plates
fs_files = glob.glob(f"{data_dir}/{fs_suffix}")
fs_files

['../../../3.processing_features/data/single_cell_profiles/Plate_1_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_4_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_5_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_2_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_sc_feature_selected.parquet']

In [8]:
# Select file paths for plates 5, 3, and 3 prime only
selected_plates = ["Plate_5", "Plate_3", "Plate_3_prime", "Plate_4"]

# Filter and concatenate the selected plates
selected_dfs = []
for file_path in fs_files:
    plate_name = pathlib.Path(file_path).stem.replace("_sc_feature_selected", "")

    # Only read in selected plates
    if plate_name in selected_plates:
        df = pd.read_parquet(file_path)

        # Update Metadata_Plate for Plate_3_prime
        if plate_name == "Plate_3_prime":
            df["Metadata_Plate"] = "Plate_3_prime"

        # For Plate_4, only include rows where Metadata_siRNA is "None"
        if plate_name == "Plate_4":
            # Fill NaN values in "Metadata_siRNA" column with "No Construct"
            df["Metadata_siRNA"].fillna("No Construct", inplace=True)

            # Filter out rows where "Metadata_siRNA" is "No Construct"
            df = df[df["Metadata_siRNA"] == "No Construct"]

        selected_dfs.append(df)

In [9]:
# Get the column names of all DataFrames in selected_dfs
column_sets = [set(df.columns) for df in selected_dfs]

# Find the common column names across all DataFrames
common_columns = list(set.intersection(*column_sets))

len(common_columns)

869

In [10]:
# Filter each DataFrame in selected_dfs to include only common columns
selected_dfs_filtered = [df.loc[:, common_columns] for df in selected_dfs]

# Concatenate the filtered dataframes along the rows
concatenated_df = pd.concat(selected_dfs_filtered, ignore_index=True)

# Save the concatenated dataframe to a file
output_concatenated_file = pathlib.Path(output_dir, "concatenated_norm_fs_plates_5_3_3prime_4controls.parquet")
concatenated_df.to_parquet(output_concatenated_file, index=False)

print(concatenated_df.shape)
concatenated_df.head()


(24237, 869)


Unnamed: 0,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Cells_RadialDistribution_ZernikePhase_GFP_3_3,Nuclei_Correlation_RWC_CY5_DAPI,Cells_AreaShape_Zernike_4_2,Cytoplasm_RadialDistribution_ZernikePhase_DAPI_3_1,Cells_Texture_Correlation_DAPI_3_03_256,Cells_RadialDistribution_RadialCV_CY5_3of4,Cells_RadialDistribution_ZernikeMagnitude_RFP_4_0,Cytoplasm_RadialDistribution_ZernikePhase_DAPI_5_3,Cells_RadialDistribution_ZernikePhase_DAPI_5_5,...,Nuclei_RadialDistribution_ZernikeMagnitude_RFP_6_4,Cytoplasm_RadialDistribution_MeanFrac_RFP_2of4,Nuclei_RadialDistribution_ZernikePhase_CY5_9_9,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Cytoplasm_Texture_Contrast_DAPI_3_01_256,Metadata_number_of_singlecells,Cytoplasm_RadialDistribution_ZernikePhase_CY5_7_5,Nuclei_RadialDistribution_ZernikeMagnitude_CY5_5_3,Nuclei_Correlation_RWC_CY5_RFP,Cytoplasm_RadialDistribution_ZernikeMagnitude_GFP_9_1
0,0.141945,-1.250977,-0.774238,-0.598308,-0.32482,0.469953,-0.16879,2.174741,0.234069,-0.810408,...,0.648051,0.702338,1.39833,-1.125453,0.255068,30,1.181074,-0.912926,-1.520393,1.966928
1,0.165176,0.716965,1.488602,-1.11713,-0.831449,-0.247176,0.379996,0.455417,-0.97752,0.813808,...,-0.086966,0.399675,1.135891,-0.568796,-0.314933,30,1.134162,-0.707515,0.759211,1.635376
2,0.896395,-1.278784,1.140682,-1.236014,0.489929,0.215079,0.443063,1.276863,0.085449,0.936329,...,-0.214997,0.501861,1.725139,-1.224239,-0.511855,30,-1.691796,-0.897987,0.726766,-0.142542
3,0.182652,-1.029697,-0.201231,0.099055,-0.321274,0.262069,2.796305,0.895989,0.161515,-0.915909,...,0.061657,-0.793286,0.297918,0.423809,0.244527,30,-1.661782,2.126933,0.70069,-0.451321
4,-0.339741,0.190367,1.291133,-0.038245,1.171063,0.946991,-0.905929,2.459778,-1.600329,0.638119,...,-1.1341,1.70305,-1.163913,0.609913,-0.019757,30,-0.839158,-0.933506,0.97742,0.16272


In [11]:
desired_columns = ["Metadata_Plate","Metadata_Well", "Metadata_Site", "Metadata_number_of_singlecells", "Metadata_genotype"]

# Make sure to reinitialize UMAP instance
umap_fit = umap.UMAP(
    random_state=umap_random_seed,
    n_components=umap_n_components
)

# Process cp_df to separate features and metadata
cp_features = infer_cp_features(concatenated_df)
meta_features = infer_cp_features(concatenated_df, metadata=True)
filtered_meta_features = [feature for feature in meta_features if feature in desired_columns]

# Fit UMAP and convert to pandas DataFrame
embeddings = pd.DataFrame(
    umap_fit.fit_transform(concatenated_df.loc[:, cp_features]),
    columns=[f"UMAP{x}" for x in range(0, umap_n_components)]
)
print(embeddings.shape)

# Combine with metadata
cp_umap_with_metadata_df = pd.concat([
    concatenated_df.loc[:, filtered_meta_features],
    embeddings
], axis=1)

# randomize the rows of the dataframe to plot the order of the data evenly
cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

# Generate output file and save
output_umap_file = pathlib.Path(output_dir, f"UMAP_Concat_sc_feature_selected.tsv")
cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(24237, 2)
