# Generate UMAP coordinates for each plate

## Import libraries

In [1]:
import glob
import pathlib
import pandas as pd
import umap

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features


## Set constants

In [2]:
# Settings shared by every UMAP fit in this notebook:
# the random seed and the number of embedding dimensions
umap_random_seed, umap_n_components = 0, 2

# Folder that receives the output files; create it (and any parents) if missing
output_dir = pathlib.Path("results")
output_dir.mkdir(exist_ok=True, parents=True)


## Create list of paths to feature selected data per plate

In [3]:
# Directory that holds the single-cell profiles
data_dir = pathlib.Path("../../../3.processing_features/data/single_cell_profiles/")

# Glob pattern that matches only the feature selected parquet files
file_suffix = "*sc_feature_selected.parquet"

# Collect the path (as a string) of every file matching the pattern, then display the list
fs_files = glob.glob("/".join([str(data_dir), file_suffix]))
fs_files


['../../../3.processing_features/data/single_cell_profiles/Plate_1_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_4_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_5_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_2_sc_feature_selected.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_sc_feature_selected.parquet']

In [4]:
# Load each feature selected profile into a dictionary keyed on its file name
# (e.g. "Plate_1_sc_feature_selected.parquet"). pathlib takes the final path
# component using the platform's separator instead of assuming "/".
cp_dfs = {pathlib.Path(x).name: pd.read_parquet(x) for x in fs_files}

# Print the dictionary keys, then display the (rows, columns) shape of each dataset
print(cp_dfs.keys())
[df.shape for df in cp_dfs.values()]


dict_keys(['Plate_1_sc_feature_selected.parquet', 'Plate_3_prime_sc_feature_selected.parquet', 'Plate_4_sc_feature_selected.parquet', 'Plate_5_sc_feature_selected.parquet', 'Plate_2_sc_feature_selected.parquet', 'Plate_3_sc_feature_selected.parquet'])


[(241, 849),
 (7300, 1140),
 (7308, 1163),
 (7759, 1153),
 (1714, 856),
 (18038, 1168)]

## Generate UMAP coordinates for each plate

**Note:** Only metadata columns that are common to all plates are included in the final data frame.

In [5]:
# Metadata columns to carry into the output; a column is kept only if the plate has it
desired_columns = ["Metadata_Plate","Metadata_Well", "Metadata_Site", "Metadata_number_of_singlecells", "Metadata_genotype"]

# Fit UMAP features per dataset and save
for plate in cp_dfs:
    # Dictionary keys are file names, so the stem drops the ".parquet" extension
    plate_name = pathlib.Path(plate).stem

    # Make sure to reinitialize UMAP instance per plate
    umap_fit = umap.UMAP(
        random_state=umap_random_seed,
        n_components=umap_n_components
    )

    # Make sure NA columns have been removed
    cp_df = feature_select(
        cp_dfs[plate],
        operation="drop_na_columns",
        na_cutoff=0
    )

    # Process cp_df to separate features and metadata
    cp_features = infer_cp_features(cp_df)
    meta_features = infer_cp_features(cp_df, metadata=True)
    filtered_meta_features = [feature for feature in meta_features if feature in desired_columns]

    # Fit UMAP and convert to pandas DataFrame. Reuse the index of cp_df so the
    # column-wise concat below pairs each embedding with its own metadata row
    # even when the loaded profile does not have a default 0..n-1 index.
    embeddings = pd.DataFrame(
        umap_fit.fit_transform(cp_df.loc[:, cp_features]),
        columns=[f"UMAP{x}" for x in range(umap_n_components)],
        index=cp_df.index
    )
    print(embeddings.shape)

    # Combine with metadata (pd.concat with axis=1 aligns rows on the index)
    cp_umap_with_metadata_df = pd.concat([
        cp_df.loc[:, filtered_meta_features],
        embeddings
    ], axis=1)

    # randomize the rows of the dataframe to plot the order of the data evenly
    cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

    # Generate output file and save as a tab-separated file without the index
    output_umap_file = pathlib.Path(output_dir, f"UMAP_{plate_name}.tsv")
    cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(241, 2)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(7300, 2)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(7308, 2)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(7759, 2)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(1714, 2)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(18038, 2)


In [6]:
# Preview the first five rows of the most recently generated UMAP data frame
cp_umap_with_metadata_df.head(n=5)


Unnamed: 0,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_genotype,Metadata_Plate,UMAP0,UMAP1
980,B3,23,297,WT,Plate_3,1.599025,5.475614
15023,B12,6,531,Null,Plate_3,-1.793289,0.987861
4946,G4,23,386,WT,Plate_3,0.444975,-2.371571
17693,G12,1,481,Null,Plate_3,0.670747,2.854921
7156,D7,16,235,HET,Plate_3,0.737558,5.601034


In [7]:
# Feature selection operations to apply to the concatenated plates
feature_select_ops = "variance_threshold correlation_threshold blocklist drop_na_columns".split()

# Directory that holds the single-cell profiles
data_dir = pathlib.Path("../../../3.processing_features/data/single_cell_profiles/")

# Glob pattern that matches only the normalized parquet files
norm_suffix = "*sc_normalized.parquet"

# Collect the path (as a string) of every normalized plate file, then display the list
norm_files = glob.glob("/".join([str(data_dir), norm_suffix]))
norm_files

['../../../3.processing_features/data/single_cell_profiles/Plate_5_sc_normalized.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_4_sc_normalized.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_2_sc_normalized.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_1_sc_normalized.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_sc_normalized.parquet',
 '../../../3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_normalized.parquet']

In [8]:
# Plates to combine: plates 5, 3, and 3 prime only
selected_plates = ["Plate_5", "Plate_3", "Plate_3_prime"]

# Read in the normalized profile of each selected plate
selected_dfs = []
for file_path in norm_files:
    # The plate name is the file stem without the "_sc_normalized" suffix
    plate_name = pathlib.Path(file_path).stem.replace("_sc_normalized", "")

    # Only read in selected plates
    if plate_name not in selected_plates:
        continue

    df = pd.read_parquet(file_path)

    # Overwrite Metadata_Plate for Plate_3_prime with the name taken from the file
    # NOTE(review): presumably so its rows can be told apart from Plate_3 - confirm
    if plate_name == "Plate_3_prime":
        df["Metadata_Plate"] = "Plate_3_prime"

    selected_dfs.append(df)

# Fail with a clear message instead of pandas' generic "No objects to concatenate"
if not selected_dfs:
    raise ValueError(
        f"No normalized profiles found for plates {selected_plates} in {data_dir}"
    )

# Concatenate the dataframes along the rows
concatenated_df = pd.concat(selected_dfs, ignore_index=True)

# perform feature selection with all plates concat
concat_cp_df = feature_select(
    concatenated_df,
    operation=feature_select_ops,
    na_cutoff=0
)

# Save the concatenated, feature selected dataframe to a file. The file name
# marks the data as normalized and feature selected ("norm_fs"), so the
# output of feature_select is written rather than the unselected concatenation.
output_concatenated_file = pathlib.Path(output_dir, "concatenated_norm_fs_plates_5_3_3prime.parquet")
concat_cp_df.to_parquet(output_concatenated_file, index=False)

print(concat_cp_df.shape)
concat_cp_df

(33097, 1289)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,...,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_01_256,Nuclei_Texture_SumEntropy_RFP_3_00_256,Nuclei_Texture_SumVariance_CY5_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_03_256,Nuclei_Texture_SumVariance_GFP_3_01_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Metadata_seed_density
0,B,1,B1,10,79,NF1,WT,870.435339,133.774194,863.193505,...,-0.696943,-1.101186,-0.939207,-0.434638,0.874974,-0.588359,-0.602702,-0.537493,0.019817,
1,B,1,B1,11,79,NF1,WT,827.549320,342.283025,810.793536,...,-1.169745,-1.229256,-0.861044,-0.990924,0.388708,-0.719303,-0.756992,-0.466952,-0.285410,
2,B,1,B1,11,79,NF1,WT,427.937346,356.977306,406.334199,...,-0.133861,-0.520363,-0.223285,0.861188,-0.428275,0.025442,0.282479,0.578176,-0.374451,
3,B,1,B1,11,79,NF1,WT,272.036245,389.802436,282.897144,...,0.494096,-0.405273,-0.497176,0.601272,0.247370,-0.466676,-0.217448,-0.286037,-0.320494,
4,B,1,B1,11,79,NF1,WT,944.416824,736.917498,963.654663,...,0.505731,0.031514,0.849798,0.149430,-0.958885,-0.139885,-0.172680,0.198356,-0.458774,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33092,G,12,G12,24,187,NF1,Null,513.159193,509.002883,522.814499,...,0.416924,-0.351049,-0.456108,0.065769,0.531813,-0.177669,-0.568974,-0.419269,-0.140771,4000.0
33093,G,12,G12,24,187,NF1,Null,791.977499,650.624862,802.050294,...,-0.009982,-0.626664,-0.686808,0.139159,-0.083279,-0.503511,-0.627214,-0.437504,-0.532715,4000.0
33094,G,12,G12,24,187,NF1,Null,743.007224,211.270353,793.794442,...,0.858997,0.188559,0.424676,0.023472,-1.284763,-0.237832,-0.664309,-0.376546,-0.728277,4000.0
33095,G,12,G12,24,187,NF1,Null,777.312246,352.744411,755.145858,...,0.326765,-0.114988,0.194936,0.521099,-0.774525,-0.420123,-0.078034,-0.374089,-0.639521,4000.0


In [9]:
# Metadata columns to carry into the output; a column is kept only if it is present
desired_columns = ["Metadata_Plate","Metadata_Well", "Metadata_Site", "Metadata_number_of_singlecells", "Metadata_genotype"]

# Make sure to reinitialize UMAP instance
umap_fit = umap.UMAP(
    random_state=umap_random_seed,
    n_components=umap_n_components
)

# Process concat_cp_df to separate features and metadata
cp_features = infer_cp_features(concat_cp_df)
meta_features = infer_cp_features(concat_cp_df, metadata=True)
filtered_meta_features = [feature for feature in meta_features if feature in desired_columns]

# Fit UMAP and convert to pandas DataFrame. Reuse the index of concat_cp_df so
# the column-wise concat below pairs each embedding with its own metadata row.
embeddings = pd.DataFrame(
    umap_fit.fit_transform(concat_cp_df.loc[:, cp_features]),
    columns=[f"UMAP{x}" for x in range(umap_n_components)],
    index=concat_cp_df.index
)
print(embeddings.shape)

# Combine with metadata (pd.concat with axis=1 aligns rows on the index)
cp_umap_with_metadata_df = pd.concat([
    concat_cp_df.loc[:, filtered_meta_features],
    embeddings
], axis=1)

# randomize the rows of the dataframe to plot the order of the data evenly
cp_umap_with_metadata_df = cp_umap_with_metadata_df.sample(frac=1, random_state=0)

# Generate output file and save as a tab-separated file without the index
output_umap_file = pathlib.Path(output_dir, "UMAP_Concat_sc_feature_selected.tsv")
cp_umap_with_metadata_df.to_csv(output_umap_file, index=False, sep="\t")


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(33097, 2)
